From de399236e240743ad2dd10d719c37b97ddf31996 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Wed, 18 May 2022 19:17:30 +0200 Subject: ucounts: Split rlimit and ucount values and max values Since the semantics of maximum rlimit values are different from those of ucounts, it is better not to mix ucount and rlimit values. This will prevent the error of using inc_ucount/dec_ucount for rlimit parameters. This patch also renames the functions to emphasize the lack of connection between rlimit and ucount. v3: - Fix BUG:KASAN:use-after-free_in_dec_ucount. v2: - Fix the array-index-out-of-bounds that was found by the lkp project. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Link: https://lkml.kernel.org/r/20220518171730.l65lmnnjtnxnftpq@example.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 33a4240e6a6f..45f09bec02c4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -54,15 +54,17 @@ enum ucount_type { UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif + UCOUNT_COUNTS, +}; + +enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, - UCOUNT_COUNTS, + UCOUNT_RLIMIT_COUNTS, }; -#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC - struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -99,6 +101,7 @@ struct user_namespace { #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; + long rlimit_max[UCOUNT_RLIMIT_COUNTS]; } __randomize_layout; struct ucounts { @@ -107,6 +110,7 @@ struct ucounts { kuid_t uid; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; + atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; @@ -120,21 +124,26 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); -static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type) +static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { - return atomic_long_read(&ucounts->ucount[type]); + return atomic_long_read(&ucounts->rlimit[type]); } -long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); -bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type); -void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type); -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); +long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type); +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); +bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); + +static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) +{ + return READ_ONCE(ns->rlimit_max[type]); +} -static inline void set_rlimit_ucount_max(struct user_namespace *ns, - enum ucount_type type, unsigned long max) +static inline void set_userns_rlimit_max(struct user_namespace *ns, + enum rlimit_type type, unsigned long max) { - ns->ucount_max[type] = max <= LONG_MAX ? max : LONG_MAX; + ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS -- cgit v1.2.3
From d80f7d7b2c75c5954d335dffbccca62a5002c3e0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 21 Jun 2022 14:38:52 -0500 Subject: signal: Guarantee that SIGNAL_GROUP_EXIT is set on process exit Track how many threads have not started exiting and when the last thread starts exiting set SIGNAL_GROUP_EXIT. This guarantees that SIGNAL_GROUP_EXIT will get set when a process exits. In practice this achieves nothing as glibc's implementation of _exit calls sys_group_exit then sys_exit, while glibc's implementation of pthread_exit calls exit (which cleans up and calls _exit) if it is the last thread, and sys_exit if it is not the last thread. This means the only way the kernel might observe a process that does not call exit_group is if the language runtime does not use glibc. With more cleanups I hope to move the decrement of quick_threads earlier. Link: https://lkml.kernel.org/r/87bkukd4tc.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index cafbe03eed01..20099268fa25 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -94,6 +94,7 @@ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; + int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ -- cgit v1.2.3
From fa96b24204af42274ec13dfb2f2e6990d7510e55 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 5 Aug 2022 14:48:14 -0700 Subject: btf: Add a new kfunc flag which allows to mark a function to be sleepable This allows declaring a kfunc as sleepable and prevents its use in a non-sleepable program. Signed-off-by: Benjamin Tissoires Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220805214821.1058337-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cdb376d53238..976cbdd2981f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,6 +49,7 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ struct btf; struct btf_member; -- cgit v1.2.3
From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time-sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping.
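As a rough sketch of the intended usage (illustrative only; the program type, section name and printout are assumptions, not part of this patch), a tc BPF program could stamp events with TAI time:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    /* Stamp each packet seen on egress with CLOCK_TAI so that
     * PTP-synchronized hosts can correlate the event network-wide. */
    SEC("tc")
    int stamp_egress(struct __sk_buff *skb)
    {
            __u64 tai_ns = bpf_ktime_get_tai_ns();

            bpf_printk("egress at %llu ns TAI", tai_ns);
            return 0; /* TC_ACT_OK */
    }

    char LICENSE[] SEC("license") = "GPL";

On the kernel side the helper is a thin wrapper around the fast accessor introduced by the commit referenced above.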
Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -- cgit v1.2.3 From 4dd48c6f1f83290d4bc61b43e61d86f8bc6c310e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:03 +0200 Subject: bpf: add destructive kfunc flag Add KF_DESTRUCTIVE flag for destructive functions. Functions with this flag set will require CAP_SYS_BOOT capabilities. Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-2-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 976cbdd2981f..ad93c2d9cc1c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,7 +49,8 @@ * for this case. 
*/ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ -#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ struct btf; struct btf_member; -- cgit v1.2.3
From fb12ad5e3179c9667dfcbafe2c5da91f9e700e53 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 11 Aug 2022 16:11:36 -0700 Subject: Input: bma150 - fix a typo in some comments Remove some extra '0' s/BMA0150_RANGE_xxx/BMA150_RANGE_xxx/ s/BMA0150_BW_xxx/BMA150_BW_xxx Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/a331a6244a1dfbf34dc85f1be6995fa91500c801.1659802757.git.christophe.jaillet@wanadoo.fr Signed-off-by: Dmitry Torokhov --- include/linux/bma150.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bma150.h b/include/linux/bma150.h index 31c9e323a391..4d4a62d49341 100644 --- a/include/linux/bma150.h +++ b/include/linux/bma150.h @@ -33,8 +33,8 @@ struct bma150_cfg { unsigned char lg_hyst; /* Low-G hysterisis */ unsigned char lg_dur; /* Low-G duration */ unsigned char lg_thres; /* Low-G threshold */ - unsigned char range; /* one of BMA0150_RANGE_xxx */ - unsigned char bandwidth; /* one of BMA0150_BW_xxx */ + unsigned char range; /* one of BMA150_RANGE_xxx */ + unsigned char bandwidth; /* one of BMA150_BW_xxx */ }; struct bma150_platform_data { -- cgit v1.2.3
From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:22:25 -0400 Subject: NFS: Fix another fsync() issue after a server reboot Currently, when the writeback code detects a server reboot, it redirties any pages that were not committed to disk, and it sets the flag NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor that dirtied the file. While this allows the file descriptor in question to redrive its own writes, it violates the fsync() requirement that we should be synchronising all writes to disk. While the problem is infrequent, we do see corner cases where an untimely server reboot causes the fsync() call to abandon its attempt to sync data to disk, causing data corruption issues due to missed error conditions or similar. In order to tighten up the client's ability to deal with this situation without introducing livelocks, add a counter that records the number of times pages are redirtied due to a server reboot-like condition, and use that in fsync() to redrive the sync to disk.
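A sketch of the redrive idea described above (a simplified illustration, not the actual fs/nfs change; example_write_and_commit() is a made-up helper):

    #include <linux/nfs_fs.h>

    static int example_fsync(struct inode *inode)
    {
            struct nfs_inode *nfsi = NFS_I(inode);
            long before;
            int ret;

            do {
                    /* Snapshot the counter before pushing writes and commits. */
                    before = atomic_long_read(&nfsi->redirtied_pages);
                    ret = example_write_and_commit(inode); /* hypothetical */
                    /* Go around again if a reboot-like event redirtied pages
                     * behind our back in the meantime. */
            } while (!ret && atomic_long_read(&nfsi->redirtied_pages) != before);

            return ret;
    }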
Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b32ed68e7dc4..f08e581f0161 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -182,6 +182,7 @@ struct nfs_inode { /* Regular file */ struct { atomic_long_t nrequests; + atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; -- cgit v1.2.3 From 5f6277a0c15e1ea54b6fd3d78c9fff7bfe42556c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:51:45 -0400 Subject: NFS: Cleanup to remove unused flag NFS_CONTEXT_RESEND_WRITES Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f08e581f0161..7931fa472561 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -83,7 +83,6 @@ struct nfs_open_context { fmode_t mode; unsigned long flags; -#define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) -- cgit v1.2.3 From 93ce557679e1cf7742ad327d40a1499e7d8535b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Aug 2022 23:33:59 +0300 Subject: regmap: mmio: Introduce IO accessors that can talk to IO port Some users may use regmap MMIO for IO ports, and this can be done by assigning ioreadXX()/iowriteXX() and their Big Endian counterparts to the regmap context. Add IO port support with a corresponding flag added. While doing that, make sure that user won't select relaxed MMIO access along with IO port because the latter have no relaxed variants. Signed-off-by: Andy Shevchenko Acked-by: William Breathitt Gray Link: https://lore.kernel.org/r/20220808203401.35153-4-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 7cf2157134ac..8cccc247cd37 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -311,6 +311,8 @@ typedef void (*regmap_unlock)(void *); * This field is a duplicate of a similar file in * 'struct regmap_bus' and serves exact same purpose. * Use it only for "no-bus" cases. + * @io_port: Support IO port accessors. Makes sense only when MMIO vs. IO port + * access can be distinguished. * @max_register: Optional, specifies the maximum valid register address. * @wr_table: Optional, points to a struct regmap_access_table specifying * valid ranges for write access. @@ -399,6 +401,7 @@ struct regmap_config { size_t max_raw_write; bool fast_io; + bool io_port; unsigned int max_register; const struct regmap_access_table *wr_table; -- cgit v1.2.3 From 0a90ed8d0cfa29735a221eba14d9cb6c735d35b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:31 +0300 Subject: platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask On Intel hardware the SLP_TYPx bitfield occupies bits 10-12 as per ACPI specification (see Table 4.13 "PM1 Control Registers Fixed Hardware Feature Control Bits" for the details). Fix the mask and other related definitions accordingly. 
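Note that the old constant 0xFFFFECFF was an inverted clear-style mask whose complement, 0x1300, covers bits 8, 9 and 12 rather than the architectural 12:10 field, which is what the GENMASK() form fixes. A sketch of what the corrected definitions imply for users of the register (illustrative helper, not from the patch):

    #include <linux/platform_data/x86/pmc_atom.h>

    /* Request S5: program the SLP_TYPx field (bits 12:10) and set the
     * sleep enable bit (bit 13). */
    static u32 example_request_s5(u32 pm1_cnt)
    {
            pm1_cnt &= ~SLEEP_TYPE_MASK;    /* clear bits 12:10, GENMASK(12, 10) */
            pm1_cnt |= SLEEP_TYPE_S5;       /* 0x1C00, i.e. type 0x7 << 10 */
            pm1_cnt |= SLEEP_ENABLE;        /* BIT(13) */
            return pm1_cnt;
    }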
Fixes: 93e5eadd1f6e ("x86/platform: New Intel Atom SOC power management controller driver") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 3edfb6d4e67a..dd81f510e4cf 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -7,6 +7,8 @@ #ifndef PMC_ATOM_H #define PMC_ATOM_H +#include + /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C /* CherryTrail Power Control Unit PCI Device ID */ @@ -139,9 +141,9 @@ #define ACPI_MMIO_REG_LEN 0x100 #define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_MASK GENMASK(12, 10) #define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 +#define SLEEP_ENABLE BIT(13) extern int pmc_atom_read(int offset, u32 *value); -- cgit v1.2.3 From 7f203bc89eb66d6afde7eae91347fc0352090cc3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Jul 2022 13:10:16 -1000 Subject: cgroup: Replace cgroup->ancestor_ids[] with ->ancestors[] Every cgroup knows all its ancestors through its ->ancestor_ids[]. There's no advantage to remembering the IDs instead of the pointers directly and this makes the array useless for finding an actual ancestor cgroup forcing cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up from cgroup_ancestor(). While at it, improve comments around cgroup_root->cgrp_ancestor_storage. This patch shouldn't cause user-visible behavior differences. v2: Update cgroup_ancestor() to use ->ancestors[]. v3: cgroup_root->cgrp_ancestor_storage's type is updated to match cgroup->ancestors[]. Better comments. Signed-off-by: Tejun Heo Acked-by: Namhyung Kim --- include/linux/cgroup-defs.h | 16 ++++++++++------ include/linux/cgroup.h | 8 +++----- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..1283993d7ea8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -384,7 +384,7 @@ struct cgroup { /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with - * ancestor_ids[] can determine whether a given cgroup is a + * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; @@ -504,8 +504,8 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; - /* ids of the ancestors at each level including self */ - u64 ancestor_ids[]; + /* All ancestors including self */ + struct cgroup *ancestors[]; }; /* @@ -522,11 +522,15 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* The root cgroup. Root is destroyed on its release. */ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the + * following field. cgrp_ancestor_storage must immediately follow. 
+ */ struct cgroup cgrp; - /* for cgrp->ancestor_ids[0] */ - u64 cgrp_ancestor_id_storage; + /* must follow cgrp for cgrp->ancestors[0], see above */ + struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..4d143729b246 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); + return cgrp->ancestors[ancestor->level] == ancestor; } /** @@ -591,11 +591,9 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { - if (cgrp->level < ancestor_level) + if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; - while (cgrp && cgrp->level > ancestor_level) - cgrp = cgroup_parent(cgrp); - return cgrp; + return cgrp->ancestors[ancestor_level]; } /** -- cgit v1.2.3
From 1e64b9c5f9a01f1a752438724bc83180c451e1c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Fri, 15 Jul 2022 14:28:53 +0200 Subject: iio: inkern: move to fwnode properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves the IIO in kernel interface to use fwnode properties and thus be firmware agnostic. Note that the interface is not yet fully firmware agnostic: at this point we have both OF and fwnode interfaces so that we don't break any user. On top of this we also want a per-driver conversion, and that is the main reason we have both of_xlate() and fwnode_xlate() support. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220715122903.332535-6-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 36 ++++++++++++++++++++++------------------ include/linux/iio/iio.h | 5 +++++ 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 5fa5957586cf..2adb1306da3e 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -7,6 +7,7 @@ #ifndef _IIO_INKERN_CONSUMER_H_ #define _IIO_INKERN_CONSUMER_H_ +#include #include #include @@ -14,6 +15,7 @@ struct iio_dev; struct iio_chan_spec; struct device; struct device_node; +struct fwnode_handle; /** * struct iio_channel - everything needed for a consumer to use a channel @@ -99,26 +101,20 @@ void iio_channel_release_all(struct iio_channel *chan); struct iio_channel *devm_iio_channel_get_all(struct device *dev); /** - * of_iio_channel_get_by_name() - get description of all that is needed to access channel. - * @np: Pointer to consumer device tree node + * fwnode_iio_channel_get_by_name() - get description of all that is needed to access channel. + * @fwnode: Pointer to consumer Firmware node * @consumer_channel: Unique name to identify the channel on the consumer * side. This typically describes the channels use within * the consumer. E.g. 'battery_voltage' */ -#ifdef CONFIG_OF -struct iio_channel *of_iio_channel_get_by_name(struct device_node *np, const char *name); -#else -static inline struct iio_channel * -of_iio_channel_get_by_name(struct device_node *np, const char *name) -{ - return NULL; -} -#endif +struct iio_channel *fwnode_iio_channel_get_by_name(struct fwnode_handle *fwnode, + const char *name); /** - * devm_of_iio_channel_get_by_name() - Resource managed version of of_iio_channel_get_by_name(). + * devm_fwnode_iio_channel_get_by_name() - Resource managed version of + * fwnode_iio_channel_get_by_name(). * @dev: Pointer to consumer device. - * @np: Pointer to consumer device tree node + * @fwnode: Pointer to consumer Firmware node * @consumer_channel: Unique name to identify the channel on the consumer * side. This typically describes the channels use within * the consumer. E.g. 'battery_voltage' @@ -129,9 +125,17 @@ of_iio_channel_get_by_name(struct device_node *np, const char *name) * The allocated iio channel is automatically released when the device is * unbound. */ -struct iio_channel *devm_of_iio_channel_get_by_name(struct device *dev, - struct device_node *np, - const char *consumer_channel); +struct iio_channel *devm_fwnode_iio_channel_get_by_name(struct device *dev, + struct fwnode_handle *fwnode, + const char *consumer_channel); + +static inline struct iio_channel +*devm_of_iio_channel_get_by_name(struct device *dev, struct device_node *np, + const char *consumer_channel) +{ + return devm_fwnode_iio_channel_get_by_name(dev, of_fwnode_handle(np), + consumer_channel); +} struct iio_cb_buffer; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 5dfbfc991c69..6002f8a0baf1 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -18,6 +18,7 @@ */ struct of_phandle_args; +struct fwnode_reference_args; enum iio_shared_by { IIO_SEPARATE, @@ -429,6 +430,8 @@ struct iio_trigger; /* forward declaration */ * provide a custom of_xlate function that reads the * *args* and returns the appropriate index in registered * IIO channels array. + * @fwnode_xlate: fwnode based function pointer to obtain channel specifier index. * Functionally the same as @of_xlate. * @hwfifo_set_watermark: function pointer to set the current hardware * fifo watermark level; see hwfifo_* entries in * Documentation/ABI/testing/sysfs-bus-iio for details on @@ -510,6 +513,8 @@ struct iio_info { unsigned *readval); int (*of_xlate)(struct iio_dev *indio_dev, const struct of_phandle_args *iiospec); + int (*fwnode_xlate)(struct iio_dev *indio_dev, + const struct fwnode_reference_args *iiospec); int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned val); int (*hwfifo_flush_to_buffer)(struct iio_dev *indio_dev, unsigned count); -- cgit v1.2.3
From b22bc4d6072e74b768c4c19e2ff3585ba5927904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Fri, 15 Jul 2022 14:29:02 +0200 Subject: iio: inkern: remove OF dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since all users of the OF dependent API are now converted to use the firmware-agnostic alternative, we can drop OF dependencies from the IIO in kernel interface.
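A sketch of a consumer on the fwnode-based API (the driver skeleton and channel name are invented for illustration; the signature comes from the header above):

    #include <linux/iio/consumer.h>
    #include <linux/platform_device.h>
    #include <linux/property.h>

    static int example_probe(struct platform_device *pdev)
    {
            struct iio_channel *chan;

            chan = devm_fwnode_iio_channel_get_by_name(&pdev->dev,
                                                       dev_fwnode(&pdev->dev),
                                                       "battery_voltage");
            if (IS_ERR(chan))
                    return PTR_ERR(chan);

            /* ... use iio_read_channel_raw(chan, &val) etc. as before ... */
            return 0;
    }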
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220715122903.332535-15-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 10 ---------- include/linux/iio/iio.h | 3 --- 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 2adb1306da3e..6802596b017c 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -7,14 +7,12 @@ #ifndef _IIO_INKERN_CONSUMER_H_ #define _IIO_INKERN_CONSUMER_H_ -#include #include #include struct iio_dev; struct iio_chan_spec; struct device; -struct device_node; struct fwnode_handle; /** @@ -129,14 +127,6 @@ struct iio_channel *devm_fwnode_iio_channel_get_by_name(struct device *dev, struct fwnode_handle *fwnode, const char *consumer_channel); -static inline struct iio_channel -*devm_of_iio_channel_get_by_name(struct device *dev, struct device_node *np, - const char *consumer_channel) -{ - return devm_fwnode_iio_channel_get_by_name(dev, of_fwnode_handle(np), - consumer_channel); -} - struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 6002f8a0baf1..f0ec8a5e5a7a 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -17,7 +17,6 @@ * Currently assumes nano seconds. */ -struct of_phandle_args; struct fwnode_reference_args; enum iio_shared_by { @@ -511,8 +510,6 @@ struct iio_info { int (*debugfs_reg_access)(struct iio_dev *indio_dev, unsigned reg, unsigned writeval, unsigned *readval); - int (*of_xlate)(struct iio_dev *indio_dev, - const struct of_phandle_args *iiospec); int (*fwnode_xlate)(struct iio_dev *indio_dev, const struct fwnode_reference_args *iiospec); int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned val); int (*hwfifo_flush_to_buffer)(struct iio_dev *indio_dev, unsigned count); -- cgit v1.2.3
From 5c64990b99aa91622cf044a9a2f7a78de8f770a2 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 26 Jun 2022 13:29:32 +0100 Subject: iio: core: Introduce _zeropoint for differential channels Address an ABI gap for devices where the offset of both lines in a differential pair may be controlled so as to allow a wider range of inputs, but without any direct effect on the differential measurement. _offset cannot be used because, to remain in line with existing usage, userspace would be expected to apply it as (_raw + _offset) * _scale, whereas _zeropoint is not applied. I.e. if we were computing the differential in software it would be: ((positive_raw + _zeropoint) - (negative_raw + _zeropoint) + _offset) * _scale = ((positive_raw - negative_raw) + _offset) * _scale = (differential_raw + _offset) * _scale Similarly calibbias is expected to tweak the measurement seen, not to adjust the two lines of the differential pair. Needed for in_capacitanceX-capacitanceY_zeropoint for the AD7746 CDC driver.
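A quick worked example with invented numbers: take positive_raw = 800, negative_raw = 300, _zeropoint = 100, _offset = 10 and _scale = 0.5. The hardware sees (800 + 100) - (300 + 100) = 500, so userspace computes (500 + 10) * 0.5 = 255 - exactly the value it would get with _zeropoint = 0, which is why _zeropoint, unlike _offset, does not appear in the userspace scaling formula.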
Signed-off-by: Jonathan Cameron Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220626122938.582107-12-jic23@kernel.org --- include/linux/iio/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index a7aa91f3a8dc..27143b03909d 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -63,6 +63,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_OVERSAMPLING_RATIO, IIO_CHAN_INFO_THERMOCOUPLE_TYPE, IIO_CHAN_INFO_CALIBAMBIENT, + IIO_CHAN_INFO_ZEROPOINT, }; #endif /* _IIO_TYPES_H_ */ -- cgit v1.2.3 From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:09 +0800 Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create() psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- include/linux/psi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index 89784763d19e..dd74411ac21d 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -27,7 +27,7 @@ void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res); + char *buf, enum psi_res res); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, -- cgit v1.2.3 From d7ae5818c3fa3007dee13f9d99832e7f26b8bc44 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:10 +0800 Subject: sched/psi: Remove redundant cgroup_psi() when !CONFIG_CGROUPS cgroup_psi() is only called under CONFIG_CGROUPS. We don't need cgroup_psi() when !CONFIG_CGROUPS, so we can remove it in this case. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..ac5d0515680e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -734,11 +734,6 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return NULL; -} - static inline bool cgroup_psi_enabled(void) { return false; -- cgit v1.2.3 From 6a8f359c3132e4f51bdb263ad74ec632c65e55fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 15 Aug 2022 10:02:29 +0200 Subject: gpio: pca953x: Make platform teardown callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All platforms that provide a teardown callback return 0. New users are supposed to not make use of platform support, so there is no functionality lost. This patch is a preparation for making i2c remove callbacks return void. 
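For illustration, board code using the changed callback would now look roughly like this (the names are made up; with a void return, errors in teardown can only be logged, not propagated):

    #include <linux/i2c.h>
    #include <linux/platform_data/pca953x.h>

    static void example_teardown(struct i2c_client *client, unsigned gpio,
                                 unsigned ngpio, void *context)
    {
            /* Undo whatever example_setup() did; no return value to check. */
    }

    static struct pca953x_platform_data example_pdata = {
            .teardown = example_teardown,
    };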
Reviewed-by: Andy Shevchenko Signed-off-by: Uwe Kleine-König Acked-by: Bartosz Golaszewski Signed-off-by: Wolfram Sang --- include/linux/platform_data/pca953x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/pca953x.h b/include/linux/platform_data/pca953x.h index 4eb53e023997..96c1a14ab365 100644 --- a/include/linux/platform_data/pca953x.h +++ b/include/linux/platform_data/pca953x.h @@ -22,7 +22,7 @@ struct pca953x_platform_data { int (*setup)(struct i2c_client *client, unsigned gpio, unsigned ngpio, void *context); - int (*teardown)(struct i2c_client *client, + void (*teardown)(struct i2c_client *client, unsigned gpio, unsigned ngpio, void *context); const char *const *names; -- cgit v1.2.3
From ed5c2f5fd10dda07263f79f338a512c0f49f76f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 15 Aug 2022 10:02:30 +0200 Subject: i2c: Make remove callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value returned by an i2c driver's remove function is mostly ignored. (If the value is non-zero, only an error message is printed and the error is then ignored.) So change the prototype of the remove function to return no value. This way driver authors are not tempted to assume that passing an error to the upper layer is a good idea. All drivers are adapted accordingly. There is no intended change of behaviour, all callbacks were prepared to return 0 before. Reviewed-by: Peter Senna Tschudin Reviewed-by: Jeremy Kerr Reviewed-by: Benjamin Mugnier Reviewed-by: Javier Martinez Canillas Reviewed-by: Crt Mori Reviewed-by: Heikki Krogerus Acked-by: Greg Kroah-Hartman Acked-by: Marek Behún # for leds-turris-omnia Acked-by: Andy Shevchenko Reviewed-by: Petr Machata # for mlxsw Reviewed-by: Maximilian Luz # for surface3_power Acked-by: Srinivas Pandruvada # for bmc150-accel-i2c + kxcjk-1013 Reviewed-by: Hans Verkuil # for media/* + staging/media/* Acked-by: Miguel Ojeda # for auxdisplay/ht16k33 + auxdisplay/lcd2s Reviewed-by: Luca Ceresoli # for versaclock5 Reviewed-by: Ajay Gupta # for ucsi_ccg Acked-by: Jonathan Cameron # for iio Acked-by: Peter Rosin # for i2c-mux-*, max9860 Acked-by: Adrien Grassein # for lontium-lt8912b Reviewed-by: Jean Delvare # for hwmon, i2c-core and i2c/muxes Acked-by: Corey Minyard # for IPMI Reviewed-by: Vladimir Oltean Acked-by: Dmitry Torokhov Acked-by: Sebastian Reichel # for drivers/power Acked-by: Krzysztof Hałasa Signed-off-by: Uwe Kleine-König Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 8eab5017bff3..f7c49bbdb8a1 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -273,7 +273,7 @@ struct i2c_driver { /* Standard driver model interfaces */ int (*probe)(struct i2c_client *client, const struct i2c_device_id *id); - int (*remove)(struct i2c_client *client); + void (*remove)(struct i2c_client *client); /* New driver model interface to aid the seamless removal of the * current probe()'s, more commonly unused than used second parameter. 
-- cgit v1.2.3 From 680f8666baf6b4a6cc368dfff6614010ec23c51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 18 Jul 2022 14:14:08 +0200 Subject: interconnect: Make icc_provider_del() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users ignore the return value of icc_provider_del(). Consequently make it not return an error code. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220718121409.171773-8-u.kleine-koenig@pengutronix.de Signed-off-by: Georgi Djakov --- include/linux/interconnect-provider.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h index 6bd01f7159c6..cd5c5a27557f 100644 --- a/include/linux/interconnect-provider.h +++ b/include/linux/interconnect-provider.h @@ -123,7 +123,7 @@ void icc_node_add(struct icc_node *node, struct icc_provider *provider); void icc_node_del(struct icc_node *node); int icc_nodes_remove(struct icc_provider *provider); int icc_provider_add(struct icc_provider *provider); -int icc_provider_del(struct icc_provider *provider); +void icc_provider_del(struct icc_provider *provider); struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec); void icc_sync_state(struct device *dev); @@ -172,9 +172,8 @@ static inline int icc_provider_add(struct icc_provider *provider) return -ENOTSUPP; } -static inline int icc_provider_del(struct icc_provider *provider) +static inline void icc_provider_del(struct icc_provider *provider) { - return -ENOTSUPP; } static inline struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec) -- cgit v1.2.3
From 7cd4c5c2101cb092db00f61f69d24380cf7a0ee8 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:25 -0500 Subject: security, lsm: Introduce security_create_user_ns() User namespaces are an effective tool to allow programs to run with privilege without requiring the program to run as root. User namespaces may also be used as a sandboxing technique. However, attackers sometimes leverage user namespaces as an initial attack vector to perform some exploit. [1,2,3] While it is not the unprivileged user namespace functionality itself that causes the kernel to be exploitable, users/administrators might want to more granularly limit or at least monitor how various processes use this functionality, while vulnerable kernel subsystems are being patched. Preventing user namespace creation already comes in a few forms, in order of granularity: 1. /proc/sys/user/max_user_namespaces sysctl 2. Distro specific patch(es) 3. CONFIG_USER_NS To block a task based on its attributes, the LSM hook cred_prepare is a decent candidate for use because it provides more granular control, and it is called before create_user_ns(): cred = prepare_creds() security_prepare_creds() call_int_hook(cred_prepare, ... if (cred) create_user_ns(cred) Since security_prepare_creds() is meant for LSMs to copy and prepare credentials, access control is an unintended use of the hook. [4] Further, security_prepare_creds() will always return ENOMEM if the hook returns any non-zero error code. This hook also does not handle the clone3 case which requires us to access a user space pointer to know if we're in the CLONE_NEW_USER call path, which may be subject to a TOCTTOU attack.
Lastly, cred_prepare is called in many call paths, and a targeted hook further limits the frequency of calls which is a beneficial outcome. Therefore introduce a new function security_create_user_ns() with an accompanying userns_create LSM hook. With the new userns_create hook, users will have more control over the observability and access control over user namespace creation. Users should expect that normal operation of user namespaces will behave as usual, and only be impacted when controls are implemented by users or administrators. This hook takes the prepared creds for LSM authors to write policy against. On success, the new namespace is applied to credentials, otherwise an error is returned. Links: 1. https://nvd.nist.gov/vuln/detail/CVE-2022-0492 2. https://nvd.nist.gov/vuln/detail/CVE-2022-25636 3. https://nvd.nist.gov/vuln/detail/CVE-2022-34918 4. https://lore.kernel.org/all/1c4b1c0d-12f6-6e9e-a6a3-cdce7418110c@schaufler-ca.com/ Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: KP Singh Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 4 ++++ include/linux/security.h | 6 ++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..aa7272e83626 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -224,6 +224,7 @@ LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) +LSM_HOOK(int, 0, userns_create, const struct cred *cred) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, u32 *secid) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 84a0d7e02176..2e11a2a22ed1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -806,6 +806,10 @@ * security attributes, e.g. for /proc/pid inodes. * @p contains the task_struct for the task. * @inode contains the inode structure for the inode. + * @userns_create: + * Check permission prior to creating a new user namespace. + * @cred points to prepared creds. + * Return 0 if successful, otherwise < 0 error code. * * Security hooks for Netlink messaging. 
* diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..767802fe9bfa 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -437,6 +437,7 @@ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); +int security_create_user_ns(const struct cred *cred); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); int security_msg_msg_alloc(struct msg_msg *msg); @@ -1194,6 +1195,11 @@ static inline int security_task_prctl(int option, unsigned long arg2, static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) { } +static inline int security_create_user_ns(const struct cred *cred) +{ + return 0; +} + static inline int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { -- cgit v1.2.3 From 0630f64d25a0f0a8c6a9ce9fde8750b3b561e6f5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 15 Aug 2022 12:07:47 -0700 Subject: net: phy: broadcom: Implement suspend/resume for AC131 and BCM5241 Implement the suspend/resume procedure for the Broadcom AC131 and BCM5241 type of PHYs (10/100 only) by entering the standard power down followed by the proprietary standby mode in the auxiliary mode 4 shadow register. On resume, the PHY software reset is enough to make it come out of standby mode so we can utilize brcm_fet_config_init() as the resume hook. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6ff567ece34a..9e77165f3ef6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -293,6 +293,7 @@ #define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ #define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ +#define MII_BRCM_FET_SHDW_AM4_STANDBY 0x0008 /* Standby enable */ #define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 #define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 -- cgit v1.2.3 From c20cc099b30abd50f563e422aa72edcd7f92da55 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 16 Aug 2022 22:48:31 +0200 Subject: regmap: Support accelerated noinc operations Several architectures have accelerated operations for MMIO operations writing to a single register, such as writesb, writesw, writesl, writesq, readsb, readsw, readsl and readsq but regmap currently cannot use them because we have no hooks for providing an accelerated noinc back-end for MMIO. Solve this by providing reg_[read/write]_noinc callbacks for the bus abstraction, so that the regmap-mmio bus can use this. Currently I do not see a need to support this for custom regmaps so it is only added to the bus. Callbacks are passed a void * with the array of values and a count which is the number of items of the byte chunk size for the specific register width. 
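To make the new hooks concrete, here is a sketch of what an MMIO-style bus back-end might plug into them for 32-bit registers (the context structure is invented, and the real regmap-mmio code also handles other register widths and endianness):

    #include <linux/io.h>

    struct example_mmio_ctx {
            void __iomem *regs;
    };

    static int example_reg_noinc_read(void *context, unsigned int reg,
                                      void *val, size_t val_count)
    {
            struct example_mmio_ctx *ctx = context;

            /* Accelerated FIFO read: val_count 32-bit items from one register. */
            readsl(ctx->regs + reg, val, val_count);
            return 0;
    }

    static int example_reg_noinc_write(void *context, unsigned int reg,
                                       const void *val, size_t val_count)
    {
            struct example_mmio_ctx *ctx = context;

            /* Accelerated FIFO write to the same register address. */
            writesl(ctx->regs + reg, val, val_count);
            return 0;
    }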
Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220816204832.265837-1-linus.walleij@linaro.org Signed-off-by: Mark Brown --- include/linux/regmap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 8cccc247cd37..ca3434dca3a0 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -492,8 +492,12 @@ typedef int (*regmap_hw_read)(void *context, void *val_buf, size_t val_size); typedef int (*regmap_hw_reg_read)(void *context, unsigned int reg, unsigned int *val); +typedef int (*regmap_hw_reg_noinc_read)(void *context, unsigned int reg, + void *val, size_t val_count); typedef int (*regmap_hw_reg_write)(void *context, unsigned int reg, unsigned int val); +typedef int (*regmap_hw_reg_noinc_write)(void *context, unsigned int reg, + const void *val, size_t val_count); typedef int (*regmap_hw_reg_update_bits)(void *context, unsigned int reg, unsigned int mask, unsigned int val); typedef struct regmap_async *(*regmap_hw_async_alloc)(void); @@ -514,6 +518,8 @@ typedef void (*regmap_hw_free_context)(void *context); * must serialise with respect to non-async I/O. * @reg_write: Write a single register value to the given register address. This * write operation has to complete when returning from the function. + * @reg_write_noinc: Write multiple register value to the same register. This + * write operation has to complete when returning from the function. * @reg_update_bits: Update bits operation to be used against volatile * registers, intended for devices supporting some mechanism * for setting clearing bits without having to @@ -541,9 +547,11 @@ struct regmap_bus { regmap_hw_gather_write gather_write; regmap_hw_async_write async_write; regmap_hw_reg_write reg_write; + regmap_hw_reg_noinc_write reg_noinc_write; regmap_hw_reg_update_bits reg_update_bits; regmap_hw_read read; regmap_hw_reg_read reg_read; + regmap_hw_reg_noinc_read reg_noinc_read; regmap_hw_free_context free_context; regmap_hw_async_alloc async_alloc; u8 read_flag_mask; -- cgit v1.2.3 From 874de459488b8741afc0e9888d39f2e15a962b3d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 14 Jul 2022 09:10:40 +0800 Subject: soundwire: add read_ping_status helper definition in manager ops The existing manager ops provide callbacks to transfer read/write commands, but don't allow for direct access to PING status register. This is accessible in all existing IP, and would help diagnose timeouts or resume issues by reporting the 'true' status instead of the internal status reported by the IP. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20220714011043.46059-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 39058c841469..eb840a07c25c 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -839,6 +839,8 @@ struct sdw_defer { * @set_bus_conf: Set the bus configuration * @pre_bank_switch: Callback for pre bank switch * @post_bank_switch: Callback for post bank switch + * @read_ping_status: Read status from PING frames, reported with two bits per Device. + * Bits 31:24 are reserved. 
*/ struct sdw_master_ops { int (*read_prop)(struct sdw_bus *bus); @@ -855,6 +857,7 @@ struct sdw_master_ops { struct sdw_bus_params *params); int (*pre_bank_switch)(struct sdw_bus *bus); int (*post_bank_switch)(struct sdw_bus *bus); + u32 (*read_ping_status)(struct sdw_bus *bus); }; -- cgit v1.2.3
From 79fe02cb7547fcc09e83b850cfd32896d7dc6289 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 14 Jul 2022 09:10:42 +0800 Subject: soundwire: add sdw_show_ping_status() helper This helper provides an optional delay parameter to wait for devices to resync in case of errors, and checks that devices are indeed attached on the bus. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20220714011043.46059-4-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index eb840a07c25c..822599957b35 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -922,6 +922,8 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, struct fwnode_handle *fwnode); void sdw_bus_master_delete(struct sdw_bus *bus); +void sdw_show_ping_status(struct sdw_bus *bus, bool sync_delay); + /** * sdw_port_config: Master or Slave Port configuration * -- cgit v1.2.3
From 3fd6d6e2b4e80fe45bfd1c8f01dff7d30a0f9b53 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 5 Aug 2022 00:43:17 +0200 Subject: thermal/of: Rework the thermal device tree initialization The following changes rework the thermal device tree initialization entirely. The old version is kept until the different drivers using it are converted to the new API. The old approach creates the different actors independently. This approach is the source of the code duplication in the thermal OF code, because a thermal zone is created first but the sensor is only registered afterwards. The thermal zones are created unconditionally with a fake sensor at init time, thus forcing the core to provide fake ops and store all the thermal zone related information in duplicated structures. Then the sensor is initialized and the code looks up the thermal zone name using the device tree. Then the sensor is associated with the thermal zone, and the sensor specific ops are called with a second level of indirection from the thermal zone ops. When a sensor is removed (with a module unload), the thermal zone stays there with the fake sensor. The cooling device associated with a thermal zone and a trip point is stored in a list, again duplicating information, using the node name of the device tree to match the cooling devices afterwards. The new approach is simpler: it creates a thermal zone when the sensor is registered and destroys it when the sensor is removed. All the matching between the cooling devices, trip points and thermal zones is done using the device tree, as well as the bindings. The ops are no longer specific but use the generic ones provided by the thermal framework. Once the old code no longer has any users, it can be removed and the remaining thermal OF code will be much simpler.
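A sketch of what a converted sensor driver looks like on the new interface (the sensor structure and readout function are invented; the registration signature is the one added below):

    #include <linux/thermal.h>

    static int example_get_temp(struct thermal_zone_device *tz, int *temp)
    {
            struct example_sensor *s = tz->devdata;  /* hypothetical driver data */

            *temp = example_read_mcelsius(s);        /* hypothetical readout */
            return 0;
    }

    static const struct thermal_zone_device_ops example_ops = {
            .get_temp = example_get_temp,
    };

    /* in probe(): the zone now lives and dies with the sensor */
    tzd = devm_thermal_of_zone_register(dev, 0, sensor, &example_ops);
    if (IS_ERR(tzd))
            return PTR_ERR(tzd);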
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220804224349.1926752-2-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1386c713885d..e2ac9d473bd6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -325,6 +325,16 @@ struct thermal_zone_of_device_ops { /* Function declarations */ #ifdef CONFIG_THERMAL_OF +struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, + const struct thermal_zone_device_ops *ops); + +struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data, + const struct thermal_zone_device_ops *ops); + +void thermal_of_zone_unregister(struct thermal_zone_device *tz); + +void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz); + int thermal_zone_of_get_sensor_id(struct device_node *tz_np, struct device_node *sensor_np, u32 *id); @@ -366,6 +376,14 @@ static inline struct thermal_zone_device *devm_thermal_zone_of_sensor_register( return ERR_PTR(-ENODEV); } +static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz) +{ +} + +static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) +{ +} + static inline void devm_thermal_zone_of_sensor_unregister(struct device *dev, struct thermal_zone_device *tz) -- cgit v1.2.3 From f59ac19b7f44cab23df84810e2da5f57bdd3a7e7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 5 Aug 2022 00:43:49 +0200 Subject: thermal/of: Remove old OF code All the drivers are converted to the new OF API, remove the old OF code. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220804224349.1926752-34-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 77 ++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index e2ac9d473bd6..86c24ddd5985 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -296,33 +296,6 @@ struct thermal_zone_params { int offset; }; -/** - * struct thermal_zone_of_device_ops - callbacks for handling DT based zones - * - * Mandatory: - * @get_temp: a pointer to a function that reads the sensor temperature. - * - * Optional: - * @get_trend: a pointer to a function that reads the sensor temperature trend. - * @set_trips: a pointer to a function that sets a temperature window. When - * this window is left the driver must inform the thermal core via - * thermal_zone_device_update. - * @set_emul_temp: a pointer to a function that sets sensor emulated - * temperature. - * @set_trip_temp: a pointer to a function that sets the trip temperature on - * hardware. - * @change_mode: a pointer to a function that notifies the thermal zone - * mode change. 
- */ -struct thermal_zone_of_device_ops { - int (*get_temp)(void *, int *); - int (*get_trend)(void *, int, enum thermal_trend *); - int (*set_trips)(void *, int, int); - int (*set_emul_temp)(void *, int); - int (*set_trip_temp)(void *, int, int); - int (*change_mode) (void *, enum thermal_device_mode); -}; - /* Function declarations */ #ifdef CONFIG_THERMAL_OF struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, @@ -335,61 +308,41 @@ void thermal_of_zone_unregister(struct thermal_zone_device *tz); void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz); +void thermal_of_zone_unregister(struct thermal_zone_device *tz); + int thermal_zone_of_get_sensor_id(struct device_node *tz_np, struct device_node *sensor_np, u32 *id); -struct thermal_zone_device * -thermal_zone_of_sensor_register(struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops); -void thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz); -struct thermal_zone_device *devm_thermal_zone_of_sensor_register( - struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops); -void devm_thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz); #else - -static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np, - struct device_node *sensor_np, - u32 *id) -{ - return -ENOENT; -} -static inline struct thermal_zone_device * -thermal_zone_of_sensor_register(struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops) -{ - return ERR_PTR(-ENODEV); -} - static inline -void thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz) +struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, + const struct thermal_zone_device_ops *ops) { + return ERR_PTR(-ENOTSUPP); } -static inline struct thermal_zone_device *devm_thermal_zone_of_sensor_register( - struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops) +static inline +struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data, + const struct thermal_zone_device_ops *ops) { - return ERR_PTR(-ENODEV); + return ERR_PTR(-ENOTSUPP); } static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz) { } -static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) +static inline void devm_thermal_of_zone_unregister(struct device *dev, + struct thermal_zone_device *tz) { } -static inline -void devm_thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz) +static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np, + struct device_node *sensor_np, + u32 *id) { + return -ENOENT; } - #endif #ifdef CONFIG_THERMAL -- cgit v1.2.3 From 25885a35a72007cf28ec5f9ba7169c5c798f7167 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Aug 2022 11:57:56 -0400 Subject: Change calling conventions for filldir_t filldir_t instances (directory iterators callbacks) used to return 0 for "OK, keep going" or -E... for "stop". Note that it's *NOT* how the error values are reported - the rules for those are callback-dependent and ->iterate{,_shared}() instances only care about zero vs. non-zero (look at emit_dir() and friends). So let's just return bool ("should we keep going?") - it's less confusing that way. 
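For illustration, a minimal actor under the new convention (a made-up consumer, not from this patch) counts entries and asks the iterator to continue:

    #include <linux/fs.h>

    struct example_count_ctx {
            struct dir_context ctx;
            unsigned long count;
    };

    static bool example_filldir(struct dir_context *ctx, const char *name,
                                int namelen, loff_t offset, u64 ino,
                                unsigned int d_type)
    {
            struct example_count_ctx *buf =
                    container_of(ctx, struct example_count_ctx, ctx);

            buf->count++;
            return true;    /* keep going; returning false stops iteration */
    }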
The choice between "true means keep going" and "true means stop" is bikesheddable; we have two groups of callbacks - "do something for everything in the directory, until we run into a problem" and "find an entry in the directory and do something to it". The former tended to use 0/-E... conventions - -E on failure. The latter tended to use 0/1, 1 being "stop, we are done". The callers treated anything non-zero as "stop", ignoring which non-zero value they got. "true means stop" would be more natural for the second group; "true means keep going" - for the first one. I tried both variants, and things like "if allocation failed: something = -ENOMEM; return true;" just looked unnatural and asking for trouble. [folded suggestion from Matthew Wilcox ] Acked-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- include/linux/fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..c40f68f62941 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2038,9 +2038,10 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns, * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. + * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; -typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, +typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { @@ -3540,17 +3541,17 @@ static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { - return ctx->actor(ctx, name, namelen, ctx->pos, ino, type) == 0; + return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, - file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0; + file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, - parent_ino(file->f_path.dentry), DT_DIR) == 0; + parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { -- cgit v1.2.3
From e34cfee65ec891a319ce79797dda18083af33a76 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Wed, 17 Aug 2022 14:43:24 +0800 Subject: stmmac: intel: remove unused 'has_crossts' flag The 'has_crossts' flag was not used anywhere in the stmmac driver, so remove it from both the header file and the dwmac-intel driver.
Signed-off-by: Wong Vee Khee Reviewed-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220817064324.10025-1-veekhee@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8df475db88c0..fb2e88614f5d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -257,7 +257,6 @@ struct plat_stmmacenet_data { u8 vlan_fail_q; unsigned int eee_usecs_rate; struct pci_dev *pdev; - bool has_crossts; int int_snapshot_num; int ext_snapshot_num; bool int_snapshot_en; -- cgit v1.2.3 From da279e6965b3838e99e5c0ab8f76b87bf86b31a5 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Fri, 12 Aug 2022 13:10:37 +0300 Subject: regulator: Add devm helpers for get and enable A few regulator consumer drivers seem to be just getting a regulator, enabling it and registering a devm-action to disable the regulator at the driver detach and then forget about it. We can simplify this a bit by adding a devm-helper for this pattern. Add devm_regulator_get_enable() and devm_regulator_get_enable_optional() Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/ed7b8841193bb9749d426f3cb3b199c9460794cd.1660292316.git.mazziesaccount@gmail.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index bc6cda706d1f..ee3b4a014611 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -207,6 +207,8 @@ struct regulator *__must_check regulator_get_optional(struct device *dev, const char *id); struct regulator *__must_check devm_regulator_get_optional(struct device *dev, const char *id); +int devm_regulator_get_enable(struct device *dev, const char *id); +int devm_regulator_get_enable_optional(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); @@ -244,12 +246,15 @@ int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); +void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); int __must_check devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, struct regulator_bulk_data **out_consumers); int __must_check regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers); +int devm_regulator_bulk_get_enable(struct device *dev, int num_consumers, + const char * const *id); int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers); int regulator_bulk_force_disable(int num_consumers, @@ -354,6 +359,17 @@ devm_regulator_get_exclusive(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } +static inline int devm_regulator_get_enable(struct device *dev, const char *id) +{ + return -ENODEV; +} + +static inline int devm_regulator_get_enable_optional(struct device *dev, + const char *id) +{ + return -ENODEV; +} + static inline struct regulator *__must_check regulator_get_optional(struct device *dev, const char *id) { @@ -375,6 +391,10 @@ static inline void devm_regulator_put(struct regulator *regulator) { } +static inline void devm_regulator_bulk_put(struct 
regulator_bulk_data *consumers) +{ +} + static inline int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, @@ -465,6 +485,13 @@ static inline int regulator_bulk_enable(int num_consumers, return 0; } +static inline int devm_regulator_bulk_get_enable(struct device *dev, + int num_consumers, + const char * const *id) +{ + return 0; +} + static inline int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers) { -- cgit v1.2.3 From b5a5b9d5f28d23b84f06b45c61dcad95b07d41bc Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Aug 2022 15:38:58 +0200 Subject: serial: document start_rx member at struct uart_ops Fix this doc build warning: ./include/linux/serial_core.h:397: warning: Function parameter or member 'start_rx' not described in 'uart_ops' Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/5d07ae2eec8fbad87e623160f9926b178bef2744.1660829433.git.mchehab@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aef3145f2032..6e4f4765d209 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -141,6 +141,14 @@ struct gpio_desc; * Locking: none. * Interrupts: caller dependent. * + * @start_rx: ``void ()(struct uart_port *port)`` + * + * Start receiving characters. + * + * Locking: @port->lock taken. + * Interrupts: locally disabled. + * This call must not sleep + * * @stop_rx: ``void ()(struct uart_port *port)`` * * Stop receiving characters; the @port is in the process of being closed. -- cgit v1.2.3 From c1e5c2f0cb8a22ec2e14af92afc7006491bebabb Mon Sep 17 00:00:00 2001 From: Pablo Sun Date: Thu, 4 Aug 2022 11:48:03 +0800 Subject: usb: typec: altmodes/displayport: correct pin assignment for UFP receptacles Fix incorrect pin assignment values when connecting to a monitor with a Type-C receptacle instead of a plug. According to the specification, a UFP_D receptacle's pin assignments should come from the UFP_D pin assignments field (bits 23:16), while a UFP_D plug's assignments are described in the DFP_D pin assignments field (bits 15:8) during Mode Discovery. For example, the LG 27 UL850-W is a monitor with a Type-C receptacle. The monitor responds to the MODE DISCOVERY command with the following DisplayPort Capability flag: dp->alt->vdo=0x140045 The existing logic only takes care of the UFP_D plug case, and would take bits 15:8 for this 0x140045 case. This results in a non-existent pin assignment of 0x0 in dp_altmode_configure. To fix this problem, a new set of macros is introduced to take plug/receptacle differences into consideration.
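As an illustrative sketch (not part of the patch; it assumes DP_CAP_RECEPTACLE is BIT(6) as defined elsewhere in typec_dp.h), the new macro resolves the correct field for the example VDO above:

	u32 vdo = 0x140045;	/* bit 6 (receptacle) set, UFP_D field = 0x14 */

	/* The old code unconditionally read bits 15:8 here and got 0x0.  */
	/* The new macro sees the receptacle bit and reads bits 23:16:    */
	u8 pins = DP_CAP_PIN_ASSIGN_UFP_D(vdo);	/* 0x14: assignments C and E */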
Fixes: 0e3bb7d6894d ("usb: typec: Add driver for DisplayPort alternate mode") Cc: stable@vger.kernel.org Co-developed-by: Pablo Sun Co-developed-by: Macpaul Lin Reviewed-by: Guillaume Ranquet Reviewed-by: Heikki Krogerus Signed-off-by: Pablo Sun Signed-off-by: Macpaul Lin Link: https://lore.kernel.org/r/20220804034803.19486-1-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_dp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h index cfb916cccd31..8d09c2f0a9b8 100644 --- a/include/linux/usb/typec_dp.h +++ b/include/linux/usb/typec_dp.h @@ -73,6 +73,11 @@ enum { #define DP_CAP_USB BIT(7) #define DP_CAP_DFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(15, 8)) >> 8) #define DP_CAP_UFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(23, 16)) >> 16) +/* Get pin assignment taking plug & receptacle into consideration */ +#define DP_CAP_PIN_ASSIGN_UFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ + DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_)) +#define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ + DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_)) /* DisplayPort Status Update VDO bits */ #define DP_STATUS_CONNECTION(_status_) ((_status_) & 3) -- cgit v1.2.3 From 77947238dad3b83bbe085ebcd576ea55afb70425 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 16 Aug 2022 21:48:29 +0000 Subject: platform/chrome: Add Type-C mux set command definitions Copy EC header definitions for the USB Type-C Mux control command from the EC code base. Also pull in "TBT_UFP_REPLY" definitions, since that is the prior entry in the enum. These headers are already present in the EC code base. [1] [1] https://chromium.googlesource.com/chromiumos/platform/ec/+/b80f85a94a423273c1638ef7b662c56931a138dd/include/ec_commands.h Signed-off-by: Prashant Malani Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20220816214857.2088914-2-pmalani@chromium.org --- include/linux/platform_data/cros_ec_commands.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 8b1b795867a1..5744a2d746aa 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5724,8 +5724,21 @@ enum typec_control_command { TYPEC_CONTROL_COMMAND_EXIT_MODES, TYPEC_CONTROL_COMMAND_CLEAR_EVENTS, TYPEC_CONTROL_COMMAND_ENTER_MODE, + TYPEC_CONTROL_COMMAND_TBT_UFP_REPLY, + TYPEC_CONTROL_COMMAND_USB_MUX_SET, }; +/* Replies the AP may specify to the TBT EnterMode command as a UFP */ +enum typec_tbt_ufp_reply { + TYPEC_TBT_UFP_REPLY_NAK, + TYPEC_TBT_UFP_REPLY_ACK, +}; + +struct typec_usb_mux_set { + uint8_t mux_index; /* Index of the mux to set in the chain */ + uint8_t mux_flags; /* USB_PD_MUX_*-encoded USB mux state to set */ +} __ec_align1; + struct ec_params_typec_control { uint8_t port; uint8_t command; /* enum typec_control_command */ @@ -5739,6 +5752,8 @@ struct ec_params_typec_control { union { uint32_t clear_events_mask; uint8_t mode_to_enter; /* enum typec_mode */ + uint8_t tbt_ufp_reply; /* enum typec_tbt_ufp_reply */ + struct typec_usb_mux_set mux_params; uint8_t placeholder[128]; }; } __ec_align1; @@ -5817,6 +5832,9 @@ enum tcpc_cc_polarity { #define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) #define PD_STATUS_EVENT_HARD_RESET BIT(2) 
+#define PD_STATUS_EVENT_DISCONNECTED BIT(3) +#define PD_STATUS_EVENT_MUX_0_SET_DONE BIT(4) +#define PD_STATUS_EVENT_MUX_1_SET_DONE BIT(5) struct ec_params_typec_status { uint8_t port; -- cgit v1.2.3 From 24426654ed3ae83d1127511891fb782c54f49203 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:17 -0700 Subject: bpf: net: Avoid sk_setsockopt() taking sk lock when called from bpf Most of the code in bpf_setsockopt(SOL_SOCKET) is duplicated from sk_setsockopt(). The number of supported optnames keeps increasing, and so does the duplicated code. One issue in reusing sk_setsockopt() is that the bpf prog has already acquired the sk lock. This patch adds a has_current_bpf_ctx() to tell if sk_setsockopt() is called from a bpf prog. The bpf prog calling bpf_setsockopt() is either running in_task() or in_serving_softirq(). Both cases have the current->bpf_ctx initialized. Thus, has_current_bpf_ctx() only needs to test !!current->bpf_ctx. This patch also adds sockopt_{lock,release}_sock() helpers for sk_setsockopt() to use. These helpers will test has_current_bpf_ctx() before acquiring/releasing the lock. They are exported with EXPORT_SYMBOL for the ipv6 module to use in a later patch. Note on the change in sock_setbindtodevice(): sockopt_lock_sock() is done in sock_setbindtodevice() instead of doing the lock_sock in sock_bindtoindex(..., lock_sk = true). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061717.4175589-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a627a02cf8ab..39bd36359c1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1966,6 +1966,15 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } +/* Not all bpf prog type has the bpf_ctx. + * For the bpf prog type that has initialized the bpf_ctx, + * this function can be used to decide if a kernel function + * is called by a bpf program.
+ */ +static inline bool has_current_bpf_ctx(void) +{ + return !!current->bpf_ctx; +} #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2175,6 +2184,10 @@ static inline bool unprivileged_ebpf_enabled(void) return false; } +static inline bool has_current_bpf_ctx(void) +{ + return false; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -- cgit v1.2.3 From 2c8cc0946c14c39b02748fba34325ecae636530a Mon Sep 17 00:00:00 2001 From: Gene Chen Date: Fri, 5 Aug 2022 15:17:12 +0800 Subject: usb: typec: tcpci: Move function "tcpci_to_typec_cc" to common Move transition function "tcpci_to_typec_cc" to common header Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Gene Chen Link: https://lore.kernel.org/r/20220805071714.150882-7-gene.chen.richtek@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 20c0bedb8ec8..17657451c762 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -167,6 +167,11 @@ /* I2C_WRITE_BYTE_COUNT + 1 when TX_BUF_BYTE_x is only accessible I2C_WRITE_BYTE_COUNT */ #define TCPC_TRANSMIT_BUFFER_MAX_LEN 31 +#define tcpc_presenting_rd(reg, cc) \ + (!(TCPC_ROLE_CTRL_DRP & (reg)) && \ + (((reg) & (TCPC_ROLE_CTRL_## cc ##_MASK << TCPC_ROLE_CTRL_## cc ##_SHIFT)) == \ + (TCPC_ROLE_CTRL_CC_RD << TCPC_ROLE_CTRL_## cc ##_SHIFT))) + struct tcpci; /* @@ -207,4 +212,21 @@ irqreturn_t tcpci_irq(struct tcpci *tcpci); struct tcpm_port; struct tcpm_port *tcpci_get_tcpm_port(struct tcpci *tcpci); + +static inline enum typec_cc_status tcpci_to_typec_cc(unsigned int cc, bool sink) +{ + switch (cc) { + case 0x1: + return sink ? TYPEC_CC_RP_DEF : TYPEC_CC_RA; + case 0x2: + return sink ? TYPEC_CC_RP_1_5 : TYPEC_CC_RD; + case 0x3: + if (sink) + return TYPEC_CC_RP_3_0; + fallthrough; + case 0x0: + default: + return TYPEC_CC_OPEN; + } +} #endif /* __LINUX_USB_TCPCI_H */ -- cgit v1.2.3 From 77bfa0fc7536e8fa7dc6f12081827e0edd75b0f9 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Tue, 16 Aug 2022 16:23:52 +0800 Subject: phy: tegra: xusb: add utmi pad power on/down ops Add utmi_pad_power_on/down ops for each SOC instead of exporting tegra_phy_xusb_utmi_pad_power_on/down directly for Tegra186 chip. Signed-off-by: BH Hsieh Signed-off-by: Jim Lin Link: https://lore.kernel.org/r/20220816082353.13390-2-jilin@nvidia.com Signed-off-by: Greg Kroah-Hartman --- include/linux/phy/tegra/xusb.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy/tegra/xusb.h b/include/linux/phy/tegra/xusb.h index 3a35e74cdc61..70998e6dd6fd 100644 --- a/include/linux/phy/tegra/xusb.h +++ b/include/linux/phy/tegra/xusb.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
*/ #ifndef PHY_TEGRA_XUSB_H @@ -21,6 +21,8 @@ int tegra_xusb_padctl_usb3_set_lfps_detect(struct tegra_xusb_padctl *padctl, unsigned int port, bool enable); int tegra_xusb_padctl_set_vbus_override(struct tegra_xusb_padctl *padctl, bool val); +void tegra_phy_xusb_utmi_pad_power_on(struct phy *phy); +void tegra_phy_xusb_utmi_pad_power_down(struct phy *phy); int tegra_phy_xusb_utmi_port_reset(struct phy *phy); int tegra_xusb_padctl_get_usb3_companion(struct tegra_xusb_padctl *padctl, unsigned int port); -- cgit v1.2.3 From 36cb6494429bd64b27b7ff8b4af56f8e526da2b4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Jul 2022 18:22:20 +0800 Subject: hwrng: core - let sleep be interrupted when unregistering hwrng MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two deadlock scenarios that need addressing, which cause problems when the computer goes to sleep, the interface is set down, and hwrng_unregister() is called. When the deadlock is hit, sleep is delayed for tens of seconds, causing it to fail. These scenarios are: 1) The hwrng kthread can't be stopped while it's sleeping, because it uses msleep_interruptible() which does not react to kthread_stop. 2) A normal user thread can't be interrupted by hwrng_unregister() while it's sleeping, because hwrng_unregister() is called from elsewhere. We solve both issues by adding a completion object called dying that fulfils waiters once we have started the process in hwrng_unregister. At the same time, we clean up a common and useless dmesg splat in the same area. Cc: Reported-by: Gregory Erwin Fixes: fcd09c90c3c5 ("ath9k: use hw_random API instead of directly dumping into random.c") Link: https://lore.kernel.org/all/CAO+Okf6ZJC5-nTE_EJUGQtd8JiCkiEHytGgDsFGTEjs0c00giw@mail.gmail.com/ Link: https://lore.kernel.org/lkml/CAO+Okf5k+C+SE6pMVfPf-d8MfVPVq4PO7EY8Hys_DVXtent3HA@mail.gmail.com/ Link: https://bugs.archlinux.org/task/75138 Signed-off-by: Jason A.
Donenfeld Signed-off-by: Herbert Xu Acked-by: Toke Høiland-Jørgensen Acked-by: Kalle Valo Signed-off-by: Herbert Xu --- include/linux/hw_random.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index aa1d4da03538..77c2885c4c13 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -50,6 +50,7 @@ struct hwrng { struct list_head list; struct kref ref; struct completion cleanup_done; + struct completion dying; }; struct device; @@ -61,4 +62,6 @@ extern int devm_hwrng_register(struct device *dev, struct hwrng *rng); extern void hwrng_unregister(struct hwrng *rng); extern void devm_hwrng_unregister(struct device *dve, struct hwrng *rng); +extern long hwrng_msleep(struct hwrng *rng, unsigned int msecs); + #endif /* LINUX_HWRANDOM_H_ */ -- cgit v1.2.3 From 0f60d28828dd94779c6527440289e1c36a05115a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 30 Jan 2022 15:03:49 -0500 Subject: dynamic_dname(): drop unused dentry argument Signed-off-by: Al Viro --- include/linux/dcache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c78ed02b54..54d46518c481 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -287,8 +287,8 @@ static inline unsigned d_count(const struct dentry *dentry) /* * helper function for dentry_operations.d_dname() members */ -extern __printf(4, 5) -char *dynamic_dname(struct dentry *, char *, int, const char *, ...); +extern __printf(3, 4) +char *dynamic_dname(char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); -- cgit v1.2.3 From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Aug 2022 22:56:40 +0200 Subject: mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know that FOLL_FORCE can be possibly dangerous, especially if there are races that can be exploited by user space. Right now, it would be sufficient to have some code that sets a PTE of a R/O-mapped shared page dirty, in order for it to erroneously become writable by FOLL_FORCE. The implications of setting a write-protected PTE dirty might not be immediately obvious to everyone. And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map a shmem page R/O while marking the pte dirty. This can be used by unprivileged user space to modify tmpfs/shmem file content even if the user does not have write permissions to the file, and to bypass memfd write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590). To fix such security issues for good, the insight is that we really only need that fancy retry logic (FOLL_COW) for COW mappings that are not writable (!VM_WRITE). And in a COW mapping, we really only broke COW if we have an exclusive anonymous page mapped. If we have something else mapped, or the mapped anonymous page might be shared (!PageAnonExclusive), we have to trigger a write fault to break COW. If we don't find an exclusive anonymous page when we retry, we have to trigger COW breaking once again because something intervened. 
Let's move away from this mandatory-retry + dirty handling and rely on our PageAnonExclusive() flag for making a similar decision, to use the same COW logic as in other kernel parts here as well. In case we stumble over a PTE in a COW mapping that does not map an exclusive anonymous page, COW was not properly broken and we have to trigger a fake write-fault to break COW. Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection") and commit 76aefad628aa ("mm/mprotect: fix soft-dirty check in can_change_pte_writable()"), take care of softdirty and uffd-wp manually. For example, a write() via /proc/self/mem to a uffd-wp-protected range has to fail instead of silently granting write access and bypassing the userspace fault handler. Note that FOLL_FORCE is not only used for debug access, but also triggered by applications without debug intentions, for example, when pinning pages via RDMA. This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So let's just get rid of it. Thanks to Nadav Amit for pointing out that the pte_dirty() check in FOLL_FORCE code is problematic and might be exploitable. Note 1: We don't check for the PTE being dirty because it doesn't matter for making a "was COWed" decision anymore, and whoever modifies the page has to set the page dirty either way. Note 2: Kernels before extended uffd-wp support and before PageAnonExclusive (< 5.19) can simply revert the problematic commit instead and be safe regarding UFFDIO_CONTINUE. A backport to v5.19 requires minor adjustments due to lack of vma_soft_dirty_enabled(). Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte") Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Axel Rasmussen Cc: Nadav Amit Cc: Peter Xu Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: John Hubbard Cc: Jason Gunthorpe Cc: David Laight Cc: [5.16] Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bedc449c14d..982f2607180b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -- cgit v1.2.3 From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sun, 7 Aug 2022 15:44:42 +0000 Subject: mm: add DEVICE_ZONE to FOR_ALL_ZONES FOR_ALL_ZONES should be consistent with enum zone_type. Otherwise, __count_zid_vm_events() has the potential to add the count to the wrong item when zid is ZONE_DEVICE.
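As an illustration (not part of the patch), on a configuration with all zone types enabled, the reworked macro in the diff below now expands as follows; each per-zone macro carries its own trailing comma, which is why the enum body lists the FOR_ALL_ZONES() entries without separating commas:

	/* FOR_ALL_ZONES(PGALLOC) expands to:
	 *   PGALLOC_DMA, PGALLOC_DMA32, PGALLOC_NORMAL, PGALLOC_HIGH,
	 *   PGALLOC_MOVABLE, PGALLOC_DEVICE,
	 * matching enum zone_type, with the device zone no longer missing.
	 */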
Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io Signed-off-by: Hao Lee Cc: David Hildenbrand Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 404024486fa5..f3fc36cd2276 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -20,12 +20,19 @@ #define HIGHMEM_ZONE(xx) #endif -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE +#ifdef CONFIG_ZONE_DEVICE +#define DEVICE_ZONE(xx) xx##_DEVICE, +#else +#define DEVICE_ZONE(xx) +#endif + +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \ + HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx) enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, - FOR_ALL_ZONES(PGALLOC), - FOR_ALL_ZONES(ALLOCSTALL), - FOR_ALL_ZONES(PGSCAN_SKIP), + FOR_ALL_ZONES(PGALLOC) + FOR_ALL_ZONES(ALLOCSTALL) + FOR_ALL_ZONES(PGSCAN_SKIP) PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, -- cgit v1.2.3 From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 16:13:40 -0400 Subject: mm/uffd: reset write protection when unregister with wp-mode The motivation for this patch comes from a recent report and patch fix from David Hildenbrand on the hugetlb shared handling of wr-protected pages [1]. With the reproducer provided in the commit message of [1], one can leverage the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect not only the attacker process, but also the whole system. The lazy-reset mechanism of uffd-wp was used to make unregister faster, but it carries an assumption that any leftover pgtable entries should only affect the process itself: not only should the user be aware of anything it does, but it should also not affect anything outside the process. It seems that this is not true, and the mechanism can also be utilized to make some exploits easier. So far there's no indication that the lazy-reset is important to any userfaultfd users, because normally the unregister will only happen once for a specific range of memory over the lifecycle of the process. Considering all of the above, what this patch proposes is to do explicit pte resets when unregistering an uffd region with wr-protect mode enabled. It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false) right before ioctl(UFFDIO_UNREGISTER) for the user. So potentially it'll make the unregister slower. From that point of view it's a very slight ABI change, but hopefully nothing should break with this change either. Regarding the change itself - the core of the uffd write [un]protect operation is moved into a separate function (uffd_wp_range()) and it is reused in the unregister code path. Note that the new function will not check for anything, e.g. ranges or memory types, because they should have been checked during the previous UFFDIO_REGISTER or it should have failed already. It also doesn't check mmap_changing because we hold the mmap write lock anyway. I added a Fixes tag on the commit introducing uffd-wp for shmem+hugetlbfs, because that's the only issue reported so far and that's the commit with which David's reproducer starts working (v5.19+). But the whole idea actually applies not only to file-backed memory but also to anonymous memory. It's just that we don't need to fix anonymous memory prior to v5.19 because there's no known way to exploit it.
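For reference, a user-space sketch of the equivalence described above (assuming the usual uffd setup, with uffd, start and len from an earlier wp-mode registration; error handling omitted):

	struct uffdio_writeprotect wp = {
		.range = { .start = start, .len = len },
		.mode  = 0,	/* wp=false: clear write protection */
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);

	struct uffdio_range unreg = { .start = start, .len = len };
	ioctl(uffd, UFFDIO_UNREGISTER, &unreg);

After this patch, the UFFDIO_UNREGISTER call alone behaves like this sequence for wp-mode registrations.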
IOW, this patch can also fix the issue reported in [1], as patch 2 of that series does. [1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/ Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: David Hildenbrand Cc: Mike Rapoport Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 732b522bacb7..e1b8a915e9e9 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); +extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, + unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, -- cgit v1.2.3 From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:51:09 -0700 Subject: mm/shmem: fix chattr fsflags support in tmpfs ext[234] have always allowed unimplemented chattr flags to be set, but other filesystems have tended to be stricter. Follow the stricter approach for tmpfs: I don't want to have to explain why csu attributes don't actually work, and we won't need to update the chattr(1) manpage; and it's never wrong to start off strict, relaxing later if persuaded. Allow only 'a' (append only), 'i' (immutable), 'A' (no atime) and 'd' (no dump). Although lsattr showed 'A' inherited, the NOATIME behavior was not being inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME. Add shmem_set_inode_flags() to sync the flags, using inode_set_flags() to avoid that instant of lost immutability during fileattr_set(). But that change switched generic/079 from passing to failing: because FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the INHERITED fsflags: remove them and generic/079 is back to passing. Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs") Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Radoslaw Burny Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1b6c4013f691..ff0b990de83d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -29,15 +29,10 @@ struct shmem_inode_info { struct inode vfs_inode; }; -#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE -#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE -#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -/* Flags that are appropriate for non-directories/regular files.
*/ -#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE \ + (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ -- cgit v1.2.3 From 5e61fe157a27afc7c0d4f7bcbceefdca536c015f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:52 +0200 Subject: net: phy: Introduce QUSGMII PHY mode The QUSGMII mode is a derivative of Cisco's USXGMII standard. This standard is pretty similar to SGMII, but allows for faster speeds, and has the built-in bits for Quad and Octa variants (like QSGMII). The main difference with SGMII/QSGMII is that USXGMII/QUSGMII re-uses the preamble to carry various information, named 'Extensions'. As of today, the USXGMII standard only mentions the "PCH" extension, which is used to convey timestamps, allowing in-band signaling of PTP timestamps without having to modify the frame itself. This commit adds support for that mode. When no extension is in use, it behaves exactly like QSGMII, although it's not compatible with QSGMII. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 87638c55d844..9eeab9b9a74c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -115,6 +115,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -152,6 +153,7 @@ typedef enum { PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, + PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -267,6 +269,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; + case PHY_INTERFACE_MODE_QUSGMII: + return "qusgmii"; default: return "unknown"; } -- cgit v1.2.3 From c04ade27cb7b952b6b9b9a0efa0a6129cc63f2ae Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:54 +0200 Subject: net: phy: Add helper to derive the number of ports from a phy mode Some phy modes such as QSGMII multiplex several MAC<->PHY links on one single physical interface. QSGMII used to be the only one supported, but other modes such as QUSGMII also carry multiple links. This helper allows getting the number of links that are multiplexed on a given interface. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9eeab9b9a74c..7c49ab95441b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -968,6 +968,8 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +int phy_interface_num_ports(phy_interface_t interface); + /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ -- cgit v1.2.3 From 1202cdd665315c525b5237e96e0bedc76d7e754f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 17 Aug 2022 17:43:21 -0700 Subject: Remove DECnet support from kernel DECnet is an obsolete network protocol that receives more attention from kernel janitors than users. It belongs in a computer protocol history museum, not in the Linux kernel. It has been "Orphaned" in the kernel since 2010. The iproute2 support for DECnet was dropped in the 5.0 release. The documentation link on Sourceforge says it is abandoned there as well. Leave the UAPI alone to keep userspace programs compiling. This means that there is still an empty neighbour table for AF_DECNET. The table of /proc/sys/net entries was updated to match current directories and reformatted to be alphabetical. Signed-off-by: Stephen Hemminger Acked-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ---- include/linux/netfilter.h | 5 ----- include/linux/netfilter_defs.h | 8 -------- 3 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..64e8662632f8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1837,7 +1837,6 @@ enum netdev_ml_priv_type { * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data - * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering @@ -2133,9 +2132,6 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct dn_dev __rcu *dn_ptr; -#endif #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c2c6f332fb90..d8817d381c14 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,11 +243,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); - break; -#endif default: WARN_ON_ONCE(1); break; } diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h index 8dddfb151f00..a5f7bef1b3a4 100644 --- a/include/linux/netfilter_defs.h +++ b/include/linux/netfilter_defs.h @@ -7,14 +7,6 @@ /* in/out/forward only */ #define NF_ARP_NUMHOOKS 3 -/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */ -#define NF_DN_NUMHOOKS 7 -#if IS_ENABLED(CONFIG_DECNET) /* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ -#define NF_MAX_HOOKS NF_DN_NUMHOOKS -#else #define NF_MAX_HOOKS NF_INET_NUMHOOKS -#endif #endif -- cgit v1.2.3 From c6ea70604249bc357ce09e9f8e16c29df0fb2fa2 Mon Sep 17 00:00:00 2001 From: "dougmill@linux.vnet.ibm.com" Date: Tue, 16 Aug 2022 15:07:13 +0100
Subject: block: sed-opal: Add ioctl to return device status Provide a mechanism to retrieve basic status information about the device, including the "supported" flag indicating whether SED-OPAL is supported. The information returned is from the various feature descriptors received during the discovery0 step, and so this ioctl does nothing more than perform the discovery0 step and then save the information received. See "struct opal_status" and OPAL_FL_* bits for the status information currently returned. This is necessary to be able to check whether a device is OPAL enabled, set up, locked or unlocked from userspace programs like systemd-cryptsetup and libcryptsetup. Right now we just have to assume the user 'knows' or blindly attempt setup/lock/unlock operations. Signed-off-by: Douglas Miller Tested-by: Luca Boccassi Reviewed-by: Scott Bauer Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816140713.84893-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1ac0d712a9c3..6f837bb6c715 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -43,6 +43,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: case IOC_OPAL_GENERIC_TABLE_RW: + case IOC_OPAL_GET_STATUS: return true; } return false; -- cgit v1.2.3 From a4e1d0b76e7b32c0839e72679c530445172a2564 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 15 Aug 2022 10:00:43 -0700 Subject: block: Change the return type of blk_mq_map_queues() into void Since blk_mq_map_queues() and the .map_queues() callbacks always return 0, change their return type into void. Most callers ignore the returned value anyway. Cc: Christoph Hellwig Cc: Jason Wang Cc: Keith Busch Cc: Martin K. Petersen Cc: Doug Gilbert Cc: Michael S. 
Tsirkin Signed-off-by: Bart Van Assche Reviewed-by: John Garry Acked-by: Md Haris Iqbal Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220815170043.19489-3-bvanassche@acm.org [axboe: fold in fix from Bart] Signed-off-by: Jens Axboe --- include/linux/blk-mq-pci.h | 4 ++-- include/linux/blk-mq-rdma.h | 2 +- include/linux/blk-mq-virtio.h | 2 +- include/linux/blk-mq.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h index 0b1f45c62623..ca544e1d3508 100644 --- a/include/linux/blk-mq-pci.h +++ b/include/linux/blk-mq-pci.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct pci_dev; -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset); #endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h index 5cc5f0f36218..53b58c610e76 100644 --- a/include/linux/blk-mq-rdma.h +++ b/include/linux/blk-mq-rdma.h @@ -5,7 +5,7 @@ struct blk_mq_tag_set; struct ib_device; -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec); #endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h index 687ae287e1dc..13226e9b22dd 100644 --- a/include/linux/blk-mq-virtio.h +++ b/include/linux/blk-mq-virtio.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct virtio_device; -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec); #endif /* _LINUX_BLK_MQ_VIRTIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92294a5fb083..c38575209d51 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -630,7 +630,7 @@ struct blk_mq_ops { * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ - int (*map_queues)(struct blk_mq_tag_set *set); + void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** @@ -880,7 +880,7 @@ void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); -- cgit v1.2.3 From f5d632d15e9e0a037339601680d82bb840f85d10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2022 16:39:04 -0600 Subject: block: shrink rq_map_data a bit We don't need full ints for several of these members. Change the page_order and nr_entries to unsigned shorts, and the true/false from_user and null_mapped to booleans. This shrinks the struct from 32 to 24 bytes on 64-bit archs. 
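A layout sketch of the size change (illustrative only; exact padding depends on the ABI, assuming typical 64-bit alignment rules):

	/* before: pages(8) + page_order(4) + nr_entries(4) + offset(8) +
	 *         null_mapped(4) + from_user(4)                   = 32 bytes
	 * after:  pages(8) + offset(8) + page_order(2) + nr_entries(2) +
	 *         null_mapped(1) + from_user(1) + 2 bytes padding = 24 bytes
	 */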
Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c38575209d51..74b99d716b0b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -963,11 +963,11 @@ blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; - int page_order; - int nr_entries; unsigned long offset; - int null_mapped; - int from_user; + unsigned short page_order; + unsigned short nr_entries; + bool null_mapped; + bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, -- cgit v1.2.3 From 1ecb7d27b1af6705e9a4e94415b4d8cc8cf2fbfb Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 17 Aug 2022 18:27:27 +0100 Subject: firmware: arm_scmi: Improve checks in the info_get operations SCMI protocols abstract and expose a number of protocol specific resources like clocks, sensors and so on. Information about such specific domain resources are generally exposed via an `info_get` protocol operation. Improve the sanity check on these operations where needed. Link: https://lore.kernel.org/r/20220817172731.1185305-3-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index a193884ecf2b..4f765bc788ff 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -84,7 +84,7 @@ struct scmi_protocol_handle; struct scmi_clk_proto_ops { int (*count_get)(const struct scmi_protocol_handle *ph); - const struct scmi_clock_info *(*info_get) + const struct scmi_clock_info __must_check *(*info_get) (const struct scmi_protocol_handle *ph, u32 clk_id); int (*rate_get)(const struct scmi_protocol_handle *ph, u32 clk_id, u64 *rate); @@ -466,7 +466,7 @@ enum scmi_sensor_class { */ struct scmi_sensor_proto_ops { int (*count_get)(const struct scmi_protocol_handle *ph); - const struct scmi_sensor_info *(*info_get) + const struct scmi_sensor_info __must_check *(*info_get) (const struct scmi_protocol_handle *ph, u32 sensor_id); int (*trip_point_config)(const struct scmi_protocol_handle *ph, u32 sensor_id, u8 trip_id, u64 trip_value); -- cgit v1.2.3 From d59b73a66e5e0682442b6d7b4965364e57078b80 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 3 Aug 2022 10:49:23 +0300 Subject: net/mlx5: Avoid false positive lockdep warning by adding lock_class_key Add a lock_class_key per mlx5 device to avoid a false positive "possible circular locking dependency" warning by lockdep, on flows which lock more than one mlx5 device, such as adding SF. kernel log: ====================================================== WARNING: possible circular locking dependency detected 5.19.0-rc8+ #2 Not tainted ------------------------------------------------------ kworker/u20:0/8 is trying to acquire lock: ffff88812dfe0d98 (&dev->intf_state_mutex){+.+.}-{3:3}, at: mlx5_init_one+0x2e/0x490 [mlx5_core] but task is already holding lock: ffff888101aa7898 (&(¬ifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&(¬ifier->n_head)->rwsem){++++}-{3:3}: down_write+0x90/0x150 blocking_notifier_chain_register+0x53/0xa0 mlx5_sf_table_init+0x369/0x4a0 [mlx5_core] mlx5_init_one+0x261/0x490 [mlx5_core] probe_one+0x430/0x680 [mlx5_core] local_pci_probe+0xd6/0x170 work_for_cpu_fn+0x4e/0xa0 process_one_work+0x7c2/0x1340 worker_thread+0x6f6/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 -> #0 (&dev->intf_state_mutex){+.+.}-{3:3}: __lock_acquire+0x2fc7/0x6720 lock_acquire+0x1c1/0x550 __mutex_lock+0x12c/0x14b0 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 bus_for_each_drv+0x123/0x1a0 __device_attach+0x1a3/0x460 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 __auxiliary_device_add+0x88/0xc0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] process_one_work+0x7c2/0x1340 worker_thread+0x59d/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(¬ifier->n_head)->rwsem); lock(&dev->intf_state_mutex); lock(&(¬ifier->n_head)->rwsem); lock(&dev->intf_state_mutex); *** DEADLOCK *** 4 locks held by kworker/u20:0/8: #0: ffff888150612938 ((wq_completion)mlx5_events){+.+.}-{0:0}, at: process_one_work+0x6e2/0x1340 #1: ffff888100cafdb8 ((work_completion)(&work->work)#3){+.+.}-{0:0}, at: process_one_work+0x70f/0x1340 #2: ffff888101aa7898 (&(¬ifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 #3: ffff88813682d0e8 (&dev->mutex){....}-{3:3}, at:__device_attach+0x76/0x460 stack backtrace: CPU: 6 PID: 8 Comm: kworker/u20:0 Not tainted 5.19.0-rc8+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core] Call Trace: dump_stack_lvl+0x57/0x7d check_noncircular+0x278/0x300 ? print_circular_bug+0x460/0x460 ? lock_chain_count+0x20/0x20 ? register_lock_class+0x1880/0x1880 __lock_acquire+0x2fc7/0x6720 ? register_lock_class+0x1880/0x1880 ? register_lock_class+0x1880/0x1880 lock_acquire+0x1c1/0x550 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? lockdep_hardirqs_on_prepare+0x400/0x400 __mutex_lock+0x12c/0x14b0 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? _raw_read_unlock+0x1f/0x30 ? mutex_lock_io_nested+0x1320/0x1320 ? __ioremap_caller.constprop.0+0x306/0x490 ? mlx5_sf_dev_probe+0x269/0x370 [mlx5_core] ? iounmap+0x160/0x160 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] ? mlx5_sf_dev_remove+0x130/0x130 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 ? auxiliary_match_id+0xe9/0x140 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 ? driver_allows_async_probing+0x140/0x140 bus_for_each_drv+0x123/0x1a0 ? bus_for_each_dev+0x1a0/0x1a0 ? lockdep_hardirqs_on_prepare+0x286/0x400 ? trace_hardirqs_on+0x2d/0x100 __device_attach+0x1a3/0x460 ? device_driver_attach+0x1e0/0x1e0 ? kobject_uevent_env+0x22d/0xf10 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 ? dev_set_name+0xab/0xe0 ? __fw_devlink_link_to_suppliers+0x260/0x260 ? memset+0x20/0x40 ? lockdep_init_map_type+0x21a/0x7d0 __auxiliary_device_add+0x88/0xc0 ? 
auxiliary_device_init+0x86/0xa0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] ? mlx5_vhca_event_arm+0x100/0x100 [mlx5_core] ? lock_downgrade+0x6e0/0x6e0 ? lockdep_hardirqs_on_prepare+0x286/0x400 process_one_work+0x7c2/0x1340 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? pwq_dec_nr_in_flight+0x230/0x230 ? rwlock_bug.part.0+0x90/0x90 worker_thread+0x59d/0xec0 ? process_one_work+0x1340/0x1340 kthread+0x28f/0x330 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: 6a3273217469 ("net/mlx5: SF, Port function state change support") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96b16fbe1aa4..7b7ce602c808 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -779,6 +779,7 @@ struct mlx5_core_dev { enum mlx5_device_state state; /* sync interface state */ struct mutex intf_state_mutex; + struct lock_class_key lock_key; unsigned long intf_state; struct mlx5_priv priv; struct mlx5_profile profile; -- cgit v1.2.3 From 272ac1500372183ffd54b0c9f43f52afc482e610 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Aug 2022 11:45:01 -0700 Subject: fscrypt: remove fscrypt_set_test_dummy_encryption() Now that all its callers have been converted to fscrypt_parse_test_dummy_encryption() and fscrypt_add_test_dummy_key() instead, fscrypt_set_test_dummy_encryption() can be removed. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220513231605.175121-6-ebiggers@kernel.org --- include/linux/fscrypt.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 7d2f1e0f23b1..b95b8601b9c1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -295,8 +295,6 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool -- cgit v1.2.3 From a9da7251ac8bcc2f2358513868f1903ac2809b3d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Aug 2022 17:14:58 -0700 Subject: Input: gameport - move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
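The conversion is safe here because the return value is ignored; a brief sketch of the difference, based on the documented semantics of the two helpers:

	/* strlcpy returns strlen(src), which forces a read of the whole
	 * source string even when it exceeds the destination; strscpy
	 * returns the number of bytes copied, or -E2BIG on truncation,
	 * and never reads past what it needs. With the result unused,
	 * the call is a 1:1 swap:
	 */
	strscpy(gameport->name, name, sizeof(gameport->name));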
Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220818210156.8143-1-wsa+renesas@sang-engineering.com Signed-off-by: Dmitry Torokhov --- include/linux/gameport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 69081d899492..8c2f00018e89 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -110,7 +110,7 @@ static inline void gameport_free_port(struct gameport *gameport) static inline void gameport_set_name(struct gameport *gameport, const char *name) { - strlcpy(gameport->name, name, sizeof(gameport->name)); + strscpy(gameport->name, name, sizeof(gameport->name)); } /* -- cgit v1.2.3 From 13a8e0f6b01b14b2e28ba144e112c883f03a3db2 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 19 Aug 2022 15:16:11 -0700 Subject: Revert "driver core: Delete driver_deferred_probe_check_state()" This reverts commit 9cbffc7a59561be950ecc675d19a3d2b45202b2b. There are a few more issues to fix that have been reported in the thread for the original series [1]. We'll need to fix those before this will work. So, revert it for now. [1] - https://lore.kernel.org/lkml/20220601070707.3946847-1-saravanak@google.com/ Fixes: 9cbffc7a5956 ("driver core: Delete driver_deferred_probe_check_state()") Tested-by: Tony Lindgren Tested-by: Peng Fan Tested-by: Douglas Anderson Tested-by: Alexander Stein Reviewed-by: Tony Lindgren Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20220819221616.2107893-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 7acaabde5396..2114d65b862f 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -242,6 +242,7 @@ driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) extern int driver_deferred_probe_timeout; void driver_deferred_probe_add(struct device *dev); +int driver_deferred_probe_check_state(struct device *dev); void driver_init(void); /** -- cgit v1.2.3 From 7997eff82828304b780dc0a39707e1946d6f1ebf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Aug 2022 17:38:37 +0200 Subject: netfilter: ebtables: reject blobs that don't provide all entry points Harshit Mogalapalli says: In ebt_do_table() function dereferencing 'private->hook_entry[hook]' can lead to NULL pointer dereference. [..] Kernel panic: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [..] RIP: 0010:ebt_do_table+0x1dc/0x1ce0 Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 5c 16 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b 6c df 08 48 8d 7d 2c 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 88 [..] Call Trace: nf_hook_slow+0xb1/0x170 __br_forward+0x289/0x730 maybe_deliver+0x24b/0x380 br_flood+0xc6/0x390 br_dev_xmit+0xa2e/0x12c0 For some reason ebtables rejects blobs that provide entry points that are not supported by the table, but what it should instead reject is the opposite: blobs that DO NOT provide an entry point supported by the table. t->valid_hooks is the bitmask of hooks (input, forward ...) that will see packets. 
Providing an entry point that is not supported is harmless (never called/used), but the inverse isn't: it results in a crash because the ebtables traverser doesn't expect a NULL blob for a location it's receiving packets for. Instead of fixing all the individual checks, do what iptables is doing and reject all blobs that differ from the expected hooks. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Harshit Mogalapalli Reported-by: syzkaller Signed-off-by: Florian Westphal --- include/linux/netfilter_bridge/ebtables.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a13296d6c7ce..fd533552a062 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -94,10 +94,6 @@ struct ebt_table { struct ebt_replace_kernel *table; unsigned int valid_hooks; rwlock_t lock; - /* e.g. could be the table explicitly only allows certain - * matches, targets, ... 0 == let it in */ - int (*check)(const struct ebt_table_info *info, - unsigned int valid_hooks); /* the data used by the kernel */ struct ebt_table_info *private; struct nf_hook_ops *ops; -- cgit v1.2.3 From 12cda13cfd5310bbfefdfe32a82489228e2e0381 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Aug 2022 15:43:25 -0400 Subject: fs: dlm: remove DLM_LSFL_FS from uapi The DLM_LSFL_FS flag is set in lockspaces created directly for a kernel user, as opposed to those lockspaces created for user space applications. The user space libdlm allowed this flag to be set for lockspaces created from user space, but then used by a kernel user. No kernel user has ever used this method, so remove the ability to do it. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- include/linux/dlm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dlm.h b/include/linux/dlm.h index ff951e9f6f20..f5f55c2138ae 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -56,9 +56,6 @@ struct dlm_lockspace_ops { * DLM_LSFL_TIMEWARN * The dlm should emit netlink messages if locks have been waiting * for a configurable amount of time. (Unused.) - * DLM_LSFL_FS - * The lockspace user is in the kernel (i.e. filesystem). Enables - * direct bast/cast callbacks. * DLM_LSFL_NEWEXCL * dlm_new_lockspace() should return -EEXIST if the lockspace exists. * -- cgit v1.2.3 From 56171e0db23a5e0edce1596dd2792b95ffe57bd3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Aug 2022 15:43:27 -0400 Subject: fs: dlm: const void resource name parameter The resource name parameter should never be changed by DLM, so we declare it as const. At some point it was handled as a char pointer, but a resource name can be a non-printable ASCII string as well. This patch changes it to be handled as a void pointer, as it is offered by the DLM API.
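A hypothetical caller sketch (the lockspace, lksb and callback names are made up; the argument order follows the dlm_lock() prototype in dlm.h) showing that binary, non-printable names can now be passed without casts:

	static const unsigned char resname[] = { 0x01, 0x00, 0xff, 0x42 };

	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0,
			 resname, sizeof(resname), 0,
			 my_lockast, my_astarg, my_bast);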
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- include/linux/dlm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dlm.h b/include/linux/dlm.h index f5f55c2138ae..c6bc2b5ee7e6 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -131,7 +131,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, int mode, struct dlm_lksb *lksb, uint32_t flags, - void *name, + const void *name, unsigned int namelen, uint32_t parent_lkid, void (*lockast) (void *astarg), -- cgit v1.2.3 From 0ba985024ae7db226776725d9aa436b5c1c9fca2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:16 +0300 Subject: flow_dissector: Make 'bpf_flow_dissect' return the bpf program retcode Let 'bpf_flow_dissect' callers know the BPF program's retcode and act accordingly. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-2-shmulik.ladkani@gmail.com --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..87921996175c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1460,8 +1460,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); struct bpf_flow_dissector; -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +	       __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, -- cgit v1.2.3 From dea6a4e17013382b20717664ebf3d7cc405e0952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:51 -0700 Subject: bpf: Introduce cgroup_{common,current}_func_proto Split cgroup_base_func_proto into the following: * cgroup_common_func_proto - common helpers for all cgroup hooks * cgroup_current_func_proto - common helpers for all cgroup hooks running in the process context (== have meaningful 'current'). Move bpf_{g,s}et_retval and other cgroup-related helpers into kernel/bpf/cgroup.c so they are closer to where they are being used.
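The intended chaining pattern for a per-hook func_proto handler then looks roughly like this (illustrative sketch, not a literal excerpt from the patch):

    static const struct bpf_func_proto *
    example_cg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
    {
            const struct bpf_func_proto *func_proto;

            /* helpers valid for every cgroup hook */
            func_proto = cgroup_common_func_proto(func_id, prog);
            if (func_proto)
                    return func_proto;

            /* helpers that need a meaningful 'current' */
            func_proto = cgroup_current_func_proto(func_id, prog);
            if (func_proto)
                    return func_proto;

            /* ... hook-specific helpers, then fall back to the base set ... */
            return bpf_base_func_proto(func_id);
    }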
Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220823222555.523590-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2bd1b5f8de9b..57e9e109257e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); + +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + +static inline const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( -- cgit v1.2.3 From bed89185af0de0d417e29ca1798df50f161b0231 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:52 -0700 Subject: bpf: Use cgroup_{common,current}_func_proto in more hooks The following hooks are per-cgroup hooks but they are not using cgroup_{common,current}_func_proto, fix it: * BPF_PROG_TYPE_CGROUP_SKB (cg_skb) * BPF_PROG_TYPE_CGROUP_SOCK_ADDR (cg_sock_addr) * BPF_PROG_TYPE_CGROUP_SOCK (cg_sock) * BPF_PROG_TYPE_LSM+BPF_LSM_CGROUP Also: * move common func_proto's into cgroup func_proto handlers * make sure bpf_{g,s}et_retval are not accessible from recvmsg, getpeername and getsockname (return/errno is ignored in these places) * as a side effect, expose get_current_pid_tgid, get_current_comm_proto, get_current_ancestor_cgroup_id, get_cgroup_classid to more cgroup hooks Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39bd36359c1e..99fc7a64564f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2375,6 +2375,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; -- cgit v1.2.3 From e8bf17d58a4db4b4f38617925414097f12e0d509 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Mon, 22 Aug 2022 14:40:40 -0700 Subject: platform/chrome: cros_ec: Expose 
suspend_timeout_ms in debugfs In modern Chromebooks, the embedded controller has a mechanism where it will watch a hardware-controlled line that toggles in suspend, and wake the system up if an expected sleep transition didn't occur. This can be very useful for detecting power management issues where the system appears to suspend, but doesn't actually reach its lowest expected power states. Sometimes it's useful in debug and test scenarios to be able to control the duration of that timeout, or even disable the EC timeout mechanism altogether. Add a debugfs control to set the timeout to values other than the EC-defined default, for more convenient debug and development iteration. Signed-off-by: Evan Green Reviewed-by: Prashant Malani Reviewed-by: Stephen Boyd Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220822144026.v3.1.Idd188ff3f9caddebc17ac357a13005f93333c21f@changeid [tzungbi: fix one nit in Documentation/ABI/testing/debugfs-cros-ec.] Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 408b29ca4004..e43107e0bee1 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -169,6 +169,7 @@ struct cros_ec_device { int event_size; u32 host_event_wake_mask; u32 last_resume_result; + u16 suspend_timeout_ms; ktime_t last_event_time; struct notifier_block notifier_ready; -- cgit v1.2.3 From c205cc7534a97f2d6fbd2a23a94ed7c036c6e2aa Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 21 Aug 2022 13:18:58 +0800 Subject: net: skb: prevent the split of kfree_skb_reason() by gcc Sometimes, gcc will optimize the function by splitting it into two or more functions. In this case, kfree_skb_reason() is split into kfree_skb_reason and kfree_skb_reason.part.0. However, the function/tracepoint trace_kfree_skb() in it needs the return address of kfree_skb_reason(). This split makes the call chain become: kfree_skb_reason() -> kfree_skb_reason.part.0 -> trace_kfree_skb() which makes the return address passed to trace_kfree_skb() be kfree_skb(). Therefore, introduce '__fix_address', which is the combination of '__noclone' and 'noinline', and apply it to kfree_skb_reason() to prevent it from being split or inlined. (Is it better to simply apply '__noclone noinline' to kfree_skb_reason? I'm thinking maybe other functions have the same problem.) Meanwhile, wrap 'skb_unref()' with 'unlikely()', as the compiler thinks it is likely to return true and splits kfree_skb_reason(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/compiler_attributes.h | 7 +++++++ include/linux/skbuff.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..fc93c9488c76 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,11 @@ */ #define __weak __attribute__((__weak__)) +/* + * Used by functions that use '__builtin_return_address'. These function + * don't want to be splited or made inline, which can make + * the '__builtin_return_address' get unexpected address.
+ */ +#define __fix_address noinline __noclone + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..b51d07a727c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1195,7 +1195,8 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason -- cgit v1.2.3 From af67508ea6cbf0e4ea27f8120056fa2efce127dd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:56 -0700 Subject: net: Fix data-races around sysctl_fb_tunnels_only_for_init_net. While reading sysctl_fb_tunnels_only_for_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..6d3a33fd0cdb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -640,9 +640,14 @@ extern int sysctl_devconf_inherit_init_net; */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return !IS_ENABLED(CONFIG_SYSCTL) || - !sysctl_fb_tunnels_only_for_init_net || - (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1); +#if IS_ENABLED(CONFIG_SYSCTL) + int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); + + return !fb_tunnels_only_for_init_net || + (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); +#else + return true; +#endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) -- cgit v1.2.3 From a5612ca10d1aa05624ebe72633e0c8c792970833 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:57 -0700 Subject: net: Fix data-races around sysctl_devconf_inherit_init_net. While reading sysctl_devconf_inherit_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 856c395cfa63 ("net: introduce a knob to control whether to inherit devconf config") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6d3a33fd0cdb..05d6f3facd5a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -650,6 +650,15 @@ static inline bool net_has_fallback_tunnels(const struct net *net) #endif } +static inline int net_inherit_devconf(void) +{ +#if IS_ENABLED(CONFIG_SYSCTL) + return READ_ONCE(sysctl_devconf_inherit_init_net); +#else + return 0; +#endif +} + static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) -- cgit v1.2.3 From f78a03f6e28be0283f73d3c18b54837b638a8ccf Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:12 +0900 Subject: mm/slab_common: remove CONFIG_NUMA ifdefs for common kmalloc functions Now that slab_alloc_node() is available for SLAB when CONFIG_NUMA=n, remove CONFIG_NUMA ifdefs for common kmalloc functions. 
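Callers consequently need no CONFIG_NUMA guards of their own: the node-aware entry points always exist, and NUMA_NO_NODE preserves the non-NUMA behaviour (hypothetical caller, for illustration):

    struct foo *f = kmalloc_node(sizeof(*f), GFP_KERNEL, NUMA_NO_NODE);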
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..4754c834b0e3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -456,38 +456,18 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; -#else -static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __kmalloc(size, flags); -} - -static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) -{ - return kmem_cache_alloc(s, flags); -} -#endif #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) __assume_slab_alignment __alloc_size(3); -#ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) __assume_slab_alignment __alloc_size(4); -#else -static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, size_t size) -{ - return kmem_cache_alloc_trace(s, gfpflags, size); -} -#endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, @@ -701,20 +681,12 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t } -#ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) -#else /* CONFIG_NUMA */ - -#define kmalloc_node_track_caller(size, flags, node) \ - kmalloc_track_caller(size, flags) - -#endif /* CONFIG_NUMA */ - /* * Shortcuts */ -- cgit v1.2.3 From c45248db04f8e3aca4798d67a394fb9cc2168118 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:13 +0900 Subject: mm/slab_common: cleanup kmalloc_track_caller() Make kmalloc_track_caller() wrapper of kmalloc_node_track_caller(). 
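The resulting expansion is purely mechanical, mirroring the macro in the diff below (illustrative):

    kmalloc_track_caller(size, flags)
        /* now expands to */
    __kmalloc_node_track_caller(size, flags, NUMA_NO_NODE, _RET_IP_)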
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4754c834b0e3..a0e57df3d5a4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -651,6 +651,12 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag return kmalloc_array(n, size, flags | __GFP_ZERO); } +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +#define kmalloc_node_track_caller(size, flags, node) \ + __kmalloc_node_track_caller(size, flags, node, \ + _RET_IP_) + /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead @@ -659,9 +665,9 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, _RET_IP_) + __kmalloc_node_track_caller(size, flags, \ + NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, int node) @@ -680,13 +686,6 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } - -extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node_track_caller(size, flags, node, \ - _RET_IP_) - /* * Shortcuts */ -- cgit v1.2.3 From e4c98d68959e51646c379e157bad36ef0d7bf467 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:15 +0900 Subject: mm/slab_common: fold kmalloc_order_trace() into kmalloc_large() There is no caller of kmalloc_order_trace() except kmalloc_large(). Fold it into kmalloc_large() and remove kmalloc_order{,_trace}(). Also add tracepoint in kmalloc_large() that was previously in kmalloc_order_trace(). 
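A simplified sketch of the folded helper's shape (illustrative only; the real body lives in mm/slab_common.c and also handles accounting and tracing):

    void *kmalloc_large(size_t size, gfp_t flags)
    {
            unsigned int order = get_order(size);
            struct page *page = alloc_pages(flags | __GFP_COMP, order);

            /* the kmalloc tracepoint formerly emitted by
             * kmalloc_order_trace() now fires here (call elided) */
            return page ? page_address(page) : NULL;
    }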
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index a0e57df3d5a4..15a4c59da59e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -489,26 +489,8 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __alloc_size(1); - -#ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __alloc_size(1); -#else -static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, - unsigned int order) -{ - return kmalloc_order(size, flags, order); -} -#endif - -static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) -{ - unsigned int order = get_order(size); - return kmalloc_order_trace(size, flags, order); -} - +void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment + __alloc_size(1); /** * kmalloc - allocate memory * @size: how many bytes of memory are required. -- cgit v1.2.3 From a0c3b940023eef3fa005b2bc37d9312712331dcb Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:16 +0900 Subject: mm/slub: move kmalloc_large_node() to slab_common.c In later patch SLAB will also pass requests larger than order-1 page to page allocator. Move kmalloc_large_node() to slab_common.c. Fold kmalloc_large_node_hook() into kmalloc_large_node() as there is no other caller. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a4c59da59e..082499306098 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -491,6 +491,10 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment __alloc_size(1); + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment + __alloc_size(1); + /** * kmalloc - allocate memory * @size: how many bytes of memory are required. -- cgit v1.2.3 From bf37d791022ecfb1279ac88c5448a53f1ae40a59 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:17 +0900 Subject: mm/slab_common: kmalloc_node: pass large requests to page allocator Now that kmalloc_large_node() is in common code, pass large requests to page allocator in kmalloc_node() using kmalloc_large_node(). One problem is that currently there is no tracepoint in kmalloc_large_node(). Instead of simply putting tracepoint in it, use kmalloc_large_node{,_notrace} depending on its caller to show useful address for both inlined kmalloc_node() and __kmalloc_node_track_caller() when large objects are allocated. 
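The effect on a hypothetical constant-size caller: a request above KMALLOC_MAX_CACHE_SIZE (two pages with SLUB) now compiles straight to the page-allocator path,

    buf = kmalloc_node(4 * PAGE_SIZE, GFP_KERNEL, nid);	/* -> kmalloc_large_node() */

while variable sizes still go through __kmalloc_node(), which performs the equivalent check at run time.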
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 082499306098..fd2e129fc813 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -571,23 +571,35 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } +#ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { -#ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE) { - unsigned int i = kmalloc_index(size); + if (__builtin_constant_p(size)) { + unsigned int index; - if (!i) + if (size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + index = kmalloc_index(size); + + if (!index) return ZERO_SIZE_PTR; return kmem_cache_alloc_node_trace( - kmalloc_caches[kmalloc_type(flags)][i], + kmalloc_caches[kmalloc_type(flags)][index], flags, node, size); } -#endif return __kmalloc_node(size, flags, node); } +#else +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + return __kmalloc_node(size, flags, node); +} +#endif /** * kmalloc_array - allocate memory for an array. -- cgit v1.2.3 From d6a71648dbc0ca5520cba16a8fdce8d37ae74218 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:19 +0900 Subject: mm/slab: kmalloc: pass requests larger than order-1 page to page allocator There is not much benefit for serving large objects in kmalloc(). Let's pass large requests to page allocator like SLUB for better maintenance of common code. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index fd2e129fc813..4ee5b2fed164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,27 +243,17 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLAB /* - * The largest kmalloc size supported by the SLAB allocators is - * 32 megabyte (2^25) or the maximum allocatable page order if that is - * less than 32 MB. - * - * WARNING: Its not easy to increase this value since the allocators have - * to do various tricks to work around compiler limitations in order to - * ensure proper constant folding. + * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ - (MAX_ORDER + PAGE_SHIFT - 1) : 25) -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif #endif #ifdef CONFIG_SLUB -/* - * SLUB directly allocates requests fitting in to an order-1 page - * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
- */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW @@ -415,10 +405,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size, if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; - if (size <= 4 * 1024 * 1024) return 22; - if (size <= 8 * 1024 * 1024) return 23; - if (size <= 16 * 1024 * 1024) return 24; - if (size <= 32 * 1024 * 1024) return 25; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); @@ -428,6 +414,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, /* Will never be reached. Needed because the compiler may complain */ return -1; } +static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ -- cgit v1.2.3 From ebc97a52b5d6cd5fb0c15a3fc9cdd6eb924646a1 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 23 Aug 2022 00:46:36 +0000 Subject: mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses. We keep track of several kernel memory stats (total kernel memory, page tables, stack, vmalloc, etc) on multiple levels (global, per-node, per-memcg, etc). These stats give users insight into how much memory is used by the kernel and for what purposes. Currently, memory used by the KVM mmu is not accounted in any of those kernel memory stats. This patch series accounts the memory pages used by KVM for page tables in those stats in a new NR_SECONDARY_PAGETABLE stat. This stat can later be extended to account for other types of secondary page tables (e.g. iommu page tables). KVM has a decent number of large allocations that aren't for page tables, but for most of them, the number/size of those allocations scales linearly with either the number of vCPUs or the amount of memory assigned to the VM. KVM's secondary page table allocations do not scale linearly, especially when nested virtualization is in use. From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's per-VM pages_{4k,2m,1g} stats unless the guest is doing something bizarre (e.g. accessing only 4KB chunks of 2MB pages so that KVM is forced to allocate a large number of page tables even though the guest isn't accessing that much memory). However, someone would need to either understand how KVM works to make that connection, or know (or be told) to go look at KVM's stats if they're running VMs to better decipher the stats. Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE is informative. For example, when backing a VM with THP vs. HugeTLB, NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order of magnitude higher with THP. So having this stat will at the very least prove to be useful for understanding tradeoffs between VM backing types, and likely even steer folks towards potential optimizations. The original discussion with more details about the rationale: https://lore.kernel.org/all/87ilqoi77b.wl-maz@kernel.org This stat will be used by subsequent patches to count KVM mmu memory usage.
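Accounting at an allocation site would then look roughly like this (sketch of what the follow-up KVM patches do, not part of this hunk; 'page' stands for a just-allocated secondary page-table page):

    /* charge one secondary page-table page to the node/memcg stats */
    mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, 1);

    /* ... and uncharge it again when the page is freed */
    mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -1);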
Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20220823004639.2387269-2-yosryahmed@google.com Signed-off-by: Sean Christopherson --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e24b40c52468..355d842d2731 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -216,6 +216,7 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif -- cgit v1.2.3 From 9d9d00ac29d0ef7ce426964de46fa6b380357d0a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:31:25 +0200 Subject: bpf: Fix reference state management for synchronous callbacks Currently, verifier verifies callback functions (sync and async) as if they will be executed once, (i.e. it explores execution state as if the function was being called once). The next insn to explore is set to start of subprog and the exit from nested frame is handled using curframe > 0 and prepare_func_exit. In case of async callback it uses a customized variant of push_stack simulating a kind of branch to set up custom state and execution context for the async callback. While this approach is simple and works when callback really will be executed only once, it is unsafe for all of our current helpers which are for_each style, i.e. they execute the callback multiple times. A callback releasing acquired references of the caller may do so multiple times, but currently verifier sees it as one call inside the frame, which then returns to caller. Hence, it thinks it released some reference that the cb e.g. got access through callback_ctx (register filled inside cb from spilled typed register on stack). Similarly, it may see that an acquire call is unpaired inside the callback, so the caller will copy the reference state of callback and then will have to release the register with new ref_obj_ids. But again, the callback may execute multiple times, but the verifier will only account for acquired references for a single symbolic execution of the callback, which will cause leaks. Note that for async callback case, things are different. While currently we have bpf_timer_set_callback which only executes it once, even for multiple executions it would be safe, as reference state is NULL and check_reference_leak would force program to release state before BPF_EXIT. The state is also unaffected by analysis for the caller frame. Hence async callback is safe. Since we want the reference state to be accessible, e.g. for pointers loaded from stack through callback_ctx's PTR_TO_STACK, we still have to copy caller's reference_state to callback's bpf_func_state, but we enforce that whatever references it adds to that reference_state has been released before it hits BPF_EXIT. This requires introducing a new callback_ref member in the reference state to distinguish between caller vs callee references. Hence, check_reference_leak now errors out if it sees we are in callback_fn and we have not released callback_ref refs. Since there can be multiple nested callbacks, like frame 0 -> cb1 -> cb2 etc. we need to also distinguish between whether this particular ref belongs to this callback frame or parent, and only error for our own, so we store state->frameno (which is always non-zero for callbacks). 
In short, callbacks can read parent reference_state, but cannot mutate it, to be able to use pointers acquired by the caller. They must only undo their changes (by releasing their own acquired_refs before BPF_EXIT) on top of caller reference_state before returning (at which point the caller and callback state will match anyway, so no need to copy it back to caller). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013125.24938-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bad8640dc..1fdddbf3546b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,17 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. + */ + int callback_ref; }; /* state of the program: -- cgit v1.2.3 From ea5cba269fb1fe22b84f4d01bb3d56320e6ffa3e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Aug 2022 11:26:23 +0200 Subject: wifi: cfg80211/mac80211: check EHT capability size correctly For AP/non-AP the EHT MCS/NSS subfield size differs, the 4-octet subfield is only used for 20 MHz-only non-AP STA. Pass an argument around everywhere to be able to parse it properly. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..6f70394417ac 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2886,7 +2886,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ static inline u8 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, - const struct ieee80211_eht_cap_elem_fixed *eht_cap) + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) { u8 count = 0; @@ -2907,7 +2908,10 @@ ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) count += 3; - return count ? count : 4; + if (count) + return count; + + return from_ap ? 
3 : 4; } /* 802.11be EHT PPE Thresholds */ @@ -2943,7 +2947,8 @@ ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) } static inline bool -ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) { const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); @@ -2952,7 +2957,8 @@ ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) return false; needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, - (const void *)data); + (const void *)data, + from_ap); if (len < needed) return false; -- cgit v1.2.3 From d8c04e27d93eb0afd750d5ea60119a92b146a755 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:31 +0300 Subject: platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask On Intel hardware the SLP_TYPx bitfield occupies bits 10-12 as per ACPI specification (see Table 4.13 "PM1 Control Registers Fixed Hardware Feature Control Bits" for the details). Fix the mask and other related definitions accordingly. Fixes: 93e5eadd1f6e ("x86/platform: New Intel Atom SOC power management controller driver") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 3edfb6d4e67a..dd81f510e4cf 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -7,6 +7,8 @@ #ifndef PMC_ATOM_H #define PMC_ATOM_H +#include + /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C /* CherryTrail Power Control Unit PCI Device ID */ @@ -139,9 +141,9 @@ #define ACPI_MMIO_REG_LEN 0x100 #define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_MASK GENMASK(12, 10) #define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 +#define SLEEP_ENABLE BIT(13) extern int pmc_atom_read(int offset, u32 *value); -- cgit v1.2.3 From 5a88ace44d0f99e2bac55d827f4cd6e8bc07e00b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:34 +0300 Subject: platform/x86: pmc_atom: Amend comment style and grammar The style of the comments is not uniform, make it so and fix a few grammar issues. While at it, update Copyright years. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-4-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index dd81f510e4cf..b8a701c77fd0 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Intel Atom SOC Power Management Controller Header File - * Copyright (c) 2014, Intel Corporation. + * Intel Atom SoC Power Management Controller Header File + * Copyright (c) 2014-2015,2022 Intel Corporation. */ #ifndef PMC_ATOM_H -- cgit v1.2.3 From 01ef026ab36357a818c7d8324a36dbb8beff6ff5 Mon Sep 17 00:00:00 2001 From: "Luke D. 
Jones" Date: Sat, 13 Aug 2022 21:26:24 +1200 Subject: platform/x86: asus-wmi: Support the hardware GPU MUX on some laptops Support the hardware GPU MUX switch available on some models. This switch can toggle the MUX between: - 0, Dedicated mode - 1, Optimus mode Optimus mode is the regular iGPU + dGPU available, while dedicated mode switches the system to have only the dGPU available. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220813092624.6228-1-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 98f2b2f20f3e..70d2347bf6ca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -99,6 +99,9 @@ /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 +/* gpu mux switch, 0 = dGPU, 1 = Optimus */ +#define ASUS_WMI_DEVID_GPU_MUX 0x00090016 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From e397c3c460bf3849384f2f55516d1887617cfca9 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 13 Aug 2022 21:27:53 +1200 Subject: platform/x86: asus-wmi: Add support for ROG X13 tablet mode Add quirk for ASUS ROG X13 Flow 2-in-1 to enable tablet mode with lid flip (all screen rotations). Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220813092753.6635-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 70d2347bf6ca..6e8a95c10d17 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -65,6 +65,7 @@ #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 +#define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 /* Storage */ #define ASUS_WMI_DEVID_CARDREADER 0x00080013 -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - process only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data bpf program intends to send to user space, the number of cgroups that can be walked is limited. 
For example, given that the current buffer size is 8 * PAGE_SIZE, if the program sends 64B of data for each cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. To work around this, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In the future, we may extend bpf_iter flags to allow customizing the buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, -- cgit v1.2.3 From 40bfe7a86d84cf08ac6a8fe2f0c8bf7a43edd110 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 24 Aug 2022 17:32:56 +0200 Subject: of/device: Fix up of_dma_configure_id() stub Since the stub version of of_dma_configure_id() was added in commit a081bd4af4ce ("of/device: Add input id to of_dma_configure()"), it has not matched the signature of the full function, leading to build failure reports when code using this function is built on !OF configurations. Fixes: a081bd4af4ce ("of/device: Add input id to of_dma_configure()") Cc: stable@vger.kernel.org Signed-off-by: Thierry Reding Reviewed-by: Frank Rowand Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220824153256.1437483-1-thierry.reding@gmail.com Signed-off-by: Rob Herring --- include/linux/of_device.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 1d7992a02e36..1a803e4335d3 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -101,8 +101,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } static inline int of_dma_configure_id(struct device *dev, - struct device_node *np, - bool force_dma) + struct device_node *np, + bool force_dma, + const u32 *id) { return 0; } -- cgit v1.2.3 From b9030780971b56c0c455c3b66244efd96608846d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:32:43 +0200 Subject: netdev: Use try_cmpxchg in napi_if_scheduled_mark_missed Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in napi_if_scheduled_mark_missed. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
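A condensed before/after of the loop (the early returns of the real function are omitted here; see the full diff below):

    /* before: reload and compare by hand on every iteration */
    do {
            val = READ_ONCE(n->state);
            new = val | NAPIF_STATE_MISSED;
    } while (cmpxchg(&n->state, val, new) != val);

    /* after: try_cmpxchg() refreshes 'val' itself on failure */
    val = READ_ONCE(n->state);
    do {
            new = val | NAPIF_STATE_MISSED;
    } while (!try_cmpxchg(&n->state, &val, new));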
Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20220822143243.2798-1-ubizjak@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64e8662632f8..8839abc1f941 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -546,8 +546,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; + val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); if (val & NAPIF_STATE_DISABLE) return true; @@ -555,7 +555,7 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) return false; new = val | NAPIF_STATE_MISSED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); return true; } -- cgit v1.2.3 From e00923c59e68b63c998a0fef4145b5279331968a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 9 Jun 2022 12:33:13 +0900 Subject: ata: libata: Rename ATA_DFLAG_NCQ_PRIO_ENABLE Rename ATA_DFLAG_NCQ_PRIO_ENABLE to ATA_DFLAG_NCQ_PRIO_ENABLED to match the fact that this flag indicates whether NCQ priority use is enabled by the user. Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 0269ff114f5a..a505cfb92ab3 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -101,7 +101,7 @@ enum { ATA_DFLAG_UNLOCK_HPA = (1 << 18), /* unlock HPA */ ATA_DFLAG_NCQ_SEND_RECV = (1 << 19), /* device supports NCQ SEND and RECV */ ATA_DFLAG_NCQ_PRIO = (1 << 20), /* device supports NCQ priority */ - ATA_DFLAG_NCQ_PRIO_ENABLE = (1 << 21), /* Priority cmds sent to dev */ + ATA_DFLAG_NCQ_PRIO_ENABLED = (1 << 21), /* Priority cmds sent to dev */ ATA_DFLAG_INIT_MASK = (1 << 24) - 1, ATA_DFLAG_DETACH = (1 << 24), -- cgit v1.2.3 From 12ff4c803d23218f9afea9f82019a5c9619f6744 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 26 Aug 2022 12:42:10 +1200 Subject: platform/x86: asus-wmi: Support the GPU fan on TUF laptops Add support for TUF laptops which have the ability to control the GPU fan. This will show as a second fan in hwmon, and has the ability to run as boost (full speed) or auto. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220826004210.356534-3-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 6e8a95c10d17..31d810e6569d 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -79,6 +79,7 @@ #define ASUS_WMI_DEVID_THERMAL_CTRL 0x00110011 #define ASUS_WMI_DEVID_FAN_CTRL 0x00110012 /* deprecated */ #define ASUS_WMI_DEVID_CPU_FAN_CTRL 0x00110013 +#define ASUS_WMI_DEVID_GPU_FAN_CTRL 0x00110014 #define ASUS_WMI_DEVID_CPU_FAN_CURVE 0x00110024 #define ASUS_WMI_DEVID_GPU_FAN_CURVE 0x00110025 -- cgit v1.2.3 From e305a71cea37a64c7558b8b979f6f08f657d0c3d Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 26 Aug 2022 11:22:50 +1200 Subject: platform/x86: asus-wmi: Implement TUF laptop keyboard LED modes Adds support for changing the laptop keyboard LED mode and colour.
The modes are visible effects such as static, rainbow, pulsing, and colour cycles. These sysfs attributes are added to asus::kbd_backlight: - kbd_rgb_mode - kbd_rgb_mode_index Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220825232251.345893-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 31d810e6569d..09e5477f1aea 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -104,6 +104,9 @@ /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 +/* TUF laptop RGB modes/colours */ +#define ASUS_WMI_DEVID_TUF_RGB_MODE 0x00100056 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 61f64515299e5f4885093656087ec1c0df8109c5 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 26 Aug 2022 11:22:51 +1200 Subject: platform/x86: asus-wmi: Implement TUF laptop keyboard power states Adds support for setting various power states of TUF keyboards. These states are combinations of: - boot, set if a boot animation is shown on keyboard - awake, set if the keyboard LEDs are visible while laptop is on - sleep, set if an animation is displayed while the laptop is suspended - keyboard (unknown effect) Adds two sysfs attributes to asus::kbd_backlight: - kbd_rgb_state - kbd_rgb_state_index Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220825232251.345893-3-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 09e5477f1aea..28234dc9fa6a 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -107,6 +107,9 @@ /* TUF laptop RGB modes/colours */ #define ASUS_WMI_DEVID_TUF_RGB_MODE 0x00100056 +/* TUF laptop RGB power/state */ +#define ASUS_WMI_DEVID_TUF_RGB_STATE 0x00100057 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 2a5840124009f133bd09fd855963551fb2cefe22 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 15 Jul 2022 12:16:22 -0700 Subject: lsm,io_uring: add LSM hooks for the new uring_cmd file op io-uring cmd support was added through ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd"); this extended the struct file_operations to allow a new command which each subsystem can use to enable command passthrough. Add an LSM hook specific to the command passthrough, which enables LSMs to inspect the command details. This was discussed long ago without reaching a clear conclusion, so this enables LSMs to at least reject this new file operation.
[0] https://lkml.kernel.org/r/8adf55db-7bab-f59d-d612-ed906b948d19@schaufler-ca.com Cc: stable@vger.kernel.org Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Signed-off-by: Luis Chamberlain Acked-by: Jens Axboe Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 3 +++ include/linux/security.h | 5 +++++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..60fff133c0b1 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -407,4 +407,5 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #ifdef CONFIG_IO_URING LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) +LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 84a0d7e02176..3aa6030302f5 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1582,6 +1582,9 @@ * Check whether the current task is allowed to spawn a io_uring polling * thread (IORING_SETUP_SQPOLL). * + * @uring_cmd: + * Check whether the file_operations uring_cmd is allowed to run. + * */ union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..7bd0c490703d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2060,6 +2060,7 @@ static inline int security_perf_event_write(struct perf_event *event) #ifdef CONFIG_SECURITY extern int security_uring_override_creds(const struct cred *new); extern int security_uring_sqpoll(void); +extern int security_uring_cmd(struct io_uring_cmd *ioucmd); #else static inline int security_uring_override_creds(const struct cred *new) { @@ -2069,6 +2070,10 @@ static inline int security_uring_sqpoll(void) { return 0; } +static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_IO_URING */ -- cgit v1.2.3 From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 26 Aug 2022 09:17:08 -0400 Subject: wait_on_bit: add an acquire memory barrier There are several places in the kernel where wait_on_bit is not followed by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read). On architectures with weak memory ordering, it may happen that memory accesses that follow wait_on_bit are reordered before wait_on_bit and they may return invalid data. Fix this class of bugs by introducing a new function "test_bit_acquire" that works like test_bit, but has acquire memory ordering semantics. 
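A typical use mirrors the buffer_uptodate() conversion below (sketch; loads issued after the test cannot be reordered before it):

    /* pairs with smp_mb__before_atomic() + set_bit() on the writer side */
    if (test_bit_acquire(BH_Uptodate, &bh->b_state))
            memcpy(dst, bh->b_data, len);	/* sees the fully published data */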
Signed-off-by: Mikulas Patocka Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 1 + include/linux/buffer_head.h | 2 +- include/linux/wait_bit.h | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf9bf65039f2..3b89c64bcfd8 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w); #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) +#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index def8b8d30ccc..089c9ade4325 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh) * make it consistent with folio_test_uptodate * pairs with smp_mb__before_atomic in set_buffer_uptodate */ - return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0; + return test_bit_acquire(BH_Uptodate, &bh->b_state); } #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 7dec36aecbd9..7725b7579b78 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -71,7 +71,7 @@ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, @@ -96,7 +96,7 @@ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, @@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, @@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } -- cgit v1.2.3 From fa7e439cf90ba23ea473d0b7d85efd02ae6ccf94 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 26 Aug 2022 18:52:37 +0200 Subject: cgroup: Homogenize cgroup_get_from_id() return value A cgroup id is a user-provided datum, hence extend the function's return domain to include possible error reasons (similar to cgroup_get_from_fd()). This change also fixes commit d4ccaf58a847 ("bpf: Introduce cgroup iter"), which would use NULL instead of proper error handling. Additionally, none of fc_appid_store, bpf_iter_attach_cgroup, and mem_cgroup_get_from_ino (callers of cgroup_get_from_id) is built without CONFIG_CGROUPS (they depend on it via CONFIG_BLK_CGROUP, directly, and via CONFIG_MEMCG, respectively), so drop the stub definition that is not needed with !CONFIG_CGROUPS.
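Callers accordingly switch from NULL checks to ERR_PTR handling (illustrative sketch):

    struct cgroup *cgrp = cgroup_get_from_id(id);

    if (IS_ERR(cgrp))
            return PTR_ERR(cgrp);	/* e.g. -ENOENT for a stale id */
    /* ... use cgrp ... */
    cgroup_put(cgrp);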
Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4d143729b246..30ee8ce2665e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -750,11 +750,6 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} - -static inline struct cgroup *cgroup_get_from_id(u64 id) -{ - return NULL; -} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS -- cgit v1.2.3 From 93315e46b000fc80fff5d53c3f444417fb3df6de Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:00 +0530 Subject: perf/core: Add speculation info to branch entries Add a new "spec" bitfield to branch entries for providing speculation information. This will be populated using hints provided by branch sampling features on supported hardware. The following cases are covered: * No branch speculation information is available * Branch is speculative but taken on the wrong path * Branch is non-speculative but taken on the correct path * Branch is speculative and taken on the correct path Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/834088c302faf21c7b665031dd111f424e509a64.1660211399.git.sandipan.das@amd.com --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee8b9ecdc03b..ae30c61957d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1078,6 +1078,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } -- cgit v1.2.3 From d2a4cbcb8bdc0e3d1cf85bf47a670695da0bc27a Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Fri, 12 Aug 2022 16:52:26 +0000 Subject: units: complement the set of Hz units Currently, Hz units do not have milli, micro and nano Hz coefficients. Some drivers (IIO especially) use their analogues to calculate appropriate Hz values. This patch includes them to units.h definitions, so they can be used from different kernel places. Signed-off-by: Dmitry Rokosov Link: https://lore.kernel.org/r/20220812165243.22177-3-ddrokosov@sberdevices.ru Signed-off-by: Jonathan Cameron --- include/linux/units.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index 681fc652e3d7..2793a41e73a2 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -20,6 +20,9 @@ #define PICO 1000000000000ULL #define FEMTO 1000000000000000ULL +#define NANOHZ_PER_HZ 1000000000UL +#define MICROHZ_PER_HZ 1000000UL +#define MILLIHZ_PER_HZ 1000UL #define HZ_PER_KHZ 1000UL #define KHZ_PER_MHZ 1000UL #define HZ_PER_MHZ 1000000UL -- cgit v1.2.3 From 1f5d7ea73c4b630dbb2c90818cb9fc0be54d2fe3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 22 Aug 2022 17:49:23 +0000 Subject: lib/string_helpers: Add str_read_write() helper Add str_read_write() helper to return 'read' or 'write' string literal. 
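A typical call site simply replaces an open-coded ternary; a sketch (dev, is_read and the size_t len are hypothetical):

  dev_dbg(dev, "%s of %zu bytes failed\n", str_read_write(is_read), len);

That is, the helper stands in for is_read ? "read" : "write".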
Signed-off-by: Andy Shevchenko Signed-off-by: Dmitry Rokosov Link: https://lore.kernel.org/r/20220822175011.2886-2-ddrokosov@sberdevices.ru Signed-off-by: Jonathan Cameron --- include/linux/string_helpers.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 4d72258d42fd..9e22cd78f3b8 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -126,4 +126,9 @@ static inline const char *str_enabled_disabled(bool v) return v ? "enabled" : "disabled"; } +static inline const char *str_read_write(bool v) +{ + return v ? "read" : "write"; +} + #endif -- cgit v1.2.3 From fcab34b433e2c13e333b2f53c4a8409eadc432c7 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 10 Aug 2022 10:53:59 -0600 Subject: mm: re-allow pinning of zero pfns (again) The below referenced commit makes the same error as 1c563432588d ("mm: fix is_pinnable_page against a cma page"), re-interpreting the logic to exclude pinning of the zero page, which breaks device assignment with vfio. To avoid further subtle mistakes, split the logic into discrete tests. [akpm@linux-foundation.org: simplify comment, per John] Link: https://lkml.kernel.org/r/166015037385.760108.16881097713975517242.stgit@omen Link: https://lore.kernel.org/all/165490039431.944052.12458624139225785964.stgit@omen Fixes: f25cbb7a95a2 ("mm: add zone device coherent type memory support") Signed-off-by: Alex Williamson Suggested-by: Matthew Wilcox Suggested-by: Felix Kuehling Tested-by: Slawomir Laba Reviewed-by: John Hubbard Cc: Alex Sierra Cc: Christoph Hellwig Cc: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 982f2607180b..21f8b27bd9fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1544,9 +1544,16 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !(is_device_coherent_page(page) || - is_zone_movable_page(page) || - is_zero_pfn(page_to_pfn(page))); + /* The zero page may always be pinned */ + if (is_zero_pfn(page_to_pfn(page))) + return true; + + /* Coherent device memory must always allow eviction. */ + if (is_device_coherent_page(page)) + return false; + + /* Otherwise, non-movable zone pages can be pinned. */ + return !is_zone_movable_page(page); } #else static inline bool is_longterm_pinnable_page(struct page *page) -- cgit v1.2.3 From dbb16df6443c59e8a1ef21c2272fcf387d600ddf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 17 Aug 2022 17:21:39 +0000 Subject: Revert "memcg: cleanup racy sum avoidance code" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 96e51ccf1af33e82f429a0d6baebba29c6448d0f. Recently we started running the kernel with rstat infrastructure on production traffic and begin to see negative memcg stats values. Particularly the 'sock' stat is the one which we observed having negative value. $ grep "sock " /mnt/memory/job/memory.stat sock 253952 total_sock 18446744073708724224 Re-run after couple of seconds $ grep "sock " /mnt/memory/job/memory.stat sock 253952 total_sock 53248 For now we are only seeing this issue on large machines (256 CPUs) and only with 'sock' stat. I think the networking stack increase the stat on one cpu and decrease it on another cpu much more often. 
So, this negative sock is due to rstat flusher flushing the stats on the CPU that has seen the decrement of sock but missed the CPU that has increments. A typical race condition. For easy stable backport, revert is the most simple solution. For long term solution, I am thinking of two directions. First is just reduce the race window by optimizing the rstat flusher. Second is if the reader sees a negative stat value, force flush and restart the stat collection. Basically retry but limited. Link: https://lkml.kernel.org/r/20220817172139.3141101-1-shakeelb@google.com Fixes: 96e51ccf1af33e8 ("memcg: cleanup racy sum avoidance code") Signed-off-by: Shakeel Butt Cc: "Michal Koutný" Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: David Hildenbrand Cc: Yosry Ahmed Cc: Greg Thelen Cc: [5.15] Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4d31ce55b1c0..6257867fbf95 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -987,19 +987,30 @@ static inline void mod_memcg_page_state(struct page *page, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - return READ_ONCE(memcg->vmstats.state[idx]); + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return READ_ONCE(pn->lruvec_stats.state[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, -- cgit v1.2.3 From bc12b70f7d216b36bd87701349374a13e486f8eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Aug 2022 13:36:40 +0200 Subject: x86/earlyprintk: Clean up pciserial While working on a GRUB patch to support PCI-serial, a number of cleanups were suggested that apply to the code I took inspiration from. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Bjorn Helgaas # pci_ids.h Reviewed-by: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/YwdeyCEtW+wa+QhH@worktop.programming.kicks-ass.net --- include/linux/pci_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 6feade66efdb..41b3fffdbb8e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -75,6 +75,9 @@ #define PCI_CLASS_COMMUNICATION_MODEM 0x0703 #define PCI_CLASS_COMMUNICATION_OTHER 0x0780 +/* Interface for SERIAL/MODEM */ +#define PCI_SERIAL_16550_COMPATIBLE 0x02 + #define PCI_BASE_CLASS_SYSTEM 0x08 #define PCI_CLASS_SYSTEM_PIC 0x0800 #define PCI_CLASS_SYSTEM_PIC_IOAPIC 0x080010 -- cgit v1.2.3 From 9c5d03d362519f36cd551aec596388f895c93d2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Aug 2022 17:18:30 -0700 Subject: genetlink: start to validate reserved header bytes We had historically not checked that genlmsghdr.reserved is 0 on input which prevents us from using those precious bytes in the future. One use case would be to extend the cmd field, which is currently just 8 bits wide and 256 is not a lot of commands for some core families. 
To make sure that new families do the right thing by default put the onus of opting out of validation on existing families. Signed-off-by: Jakub Kicinski Acked-by: Paul Moore (NetLabel) Signed-off-by: David S. Miller --- include/linux/genl_magic_func.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 939b1a8f571b..4a4b387181ad 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -294,6 +294,7 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, + .resv_start_op = 42, /* drbd is currently the only user */ .n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps), .module = THIS_MODULE, }; -- cgit v1.2.3 From e8013f8edaa30e17fd583a326daff1fa86b75c82 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Thu, 25 Aug 2022 11:28:35 +0200 Subject: ethernet: Add helpers to recognize addresses mapped to IP multicast IP multicast must sometimes be discriminated from non-IP multicast, e.g. when determining the forwarding behavior of a given group in the presence of multicast router ports on an offloaded bridge. Therefore, provide helpers to identify these groups. Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 92b10e67d5f8..a541f0c4f146 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -428,6 +428,28 @@ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, return true; } +static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ip_mcast(const u8 *addr) +{ + return ether_addr_is_ipv4_mcast(addr) || + ether_addr_is_ipv6_mcast(addr); +} + /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3 From dcf8e5633e2e69ad60b730ab5905608b756a032f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 23 Aug 2022 12:59:25 -0700 Subject: tracing: Define the is_signed_type() macro once There are two definitions of the is_signed_type() macro: one in and a second definition in . As suggested by Linus, move the definition of the is_signed_type() macro into the header file. Change the definition of the is_signed_type() macro to make sure that it does not trigger any sparse warnings with future versions of sparse for bitwise types. 
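The macro's behavior can be pinned down with build-time assertions; an illustrative sketch (these asserts are not part of the patch and assume the usual static_assert() from <linux/build_bug.h>):

  static_assert(is_signed_type(int));
  static_assert(!is_signed_type(unsigned long));
  static_assert(!is_signed_type(bool));           /* (bool)-1 is 1, not < 1 */
  static_assert(!is_signed_type(char *));         /* pointers compare as unsigned */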
Link: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/ Link: https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/ Cc: Rasmus Villemoes Cc: Steven Rostedt Acked-by: Kees Cook Signed-off-by: Bart Van Assche Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 6 ++++++ include/linux/overflow.h | 1 - include/linux/trace_events.h | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 01ce94b58b42..7713d7bcdaea 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * Whether 'type' is a signed type or an unsigned type. Supports scalar types, + * bool and also pointer types. + */ +#define is_signed_type(type) (((type)(-1)) < (__force type)1) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index f1221d11f8e5..0eb3b192f07a 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -30,7 +30,6 @@ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ -#define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index b18759a673c6..8401dec93c15 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); -#define is_signed_type(type) (((type)(-1)) < (type)1) - int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, -- cgit v1.2.3 From ff6d365898d4d31bd557954c7fc53f38977b491c Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 22 Aug 2022 08:34:35 -0700 Subject: soc: qcom: qmi: use const for struct qmi_elem_info Currently all usage of struct qmi_elem_info, which is used to define the QMI message encoding/decoding rules, does not use const. This prevents clients from registering const arrays. Since these arrays are always pre-defined, they should be const, so add the const qualifier to all places in the QMI interface where struct qmi_elem_info is used. Once this patch is in place, clients can independently update their pre-defined arrays to be const, as demonstrated in the QMI sample code. 
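With const propagated through the API, a client can keep its encoding rules in read-only data; a minimal hypothetical message description (struct my_resp_msg is made up for illustration):

  static const struct qmi_elem_info my_resp_ei[] = {
          {
                  .data_type = QMI_STRUCT,
                  .elem_len  = 1,
                  .elem_size = sizeof(struct qmi_response_type_v01),
                  .tlv_type  = 0x02,
                  .offset    = offsetof(struct my_resp_msg, resp),
                  .ei_array  = qmi_response_type_v01_ei,
          },
          { .data_type = QMI_EOTI },
  };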
Signed-off-by: Jeff Johnson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220822153435.7856-1-quic_jjohnson@quicinc.com --- include/linux/soc/qcom/qmi.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h index b1f80e756d2a..469e02d2aa0d 100644 --- a/include/linux/soc/qcom/qmi.h +++ b/include/linux/soc/qcom/qmi.h @@ -75,7 +75,7 @@ struct qmi_elem_info { enum qmi_array_type array_type; u8 tlv_type; u32 offset; - struct qmi_elem_info *ei_array; + const struct qmi_elem_info *ei_array; }; #define QMI_RESULT_SUCCESS_V01 0 @@ -102,7 +102,7 @@ struct qmi_response_type_v01 { u16 error; }; -extern struct qmi_elem_info qmi_response_type_v01_ei[]; +extern const struct qmi_elem_info qmi_response_type_v01_ei[]; /** * struct qmi_service - context to track lookup-results @@ -173,7 +173,7 @@ struct qmi_txn { struct completion completion; int result; - struct qmi_elem_info *ei; + const struct qmi_elem_info *ei; void *dest; }; @@ -189,7 +189,7 @@ struct qmi_msg_handler { unsigned int type; unsigned int msg_id; - struct qmi_elem_info *ei; + const struct qmi_elem_info *ei; size_t decoded_size; void (*fn)(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, @@ -249,23 +249,23 @@ void qmi_handle_release(struct qmi_handle *qmi); ssize_t qmi_send_request(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, struct qmi_txn *txn, int msg_id, size_t len, - struct qmi_elem_info *ei, const void *c_struct); + const struct qmi_elem_info *ei, const void *c_struct); ssize_t qmi_send_response(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, struct qmi_txn *txn, int msg_id, size_t len, - struct qmi_elem_info *ei, const void *c_struct); + const struct qmi_elem_info *ei, const void *c_struct); ssize_t qmi_send_indication(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, - int msg_id, size_t len, struct qmi_elem_info *ei, + int msg_id, size_t len, const struct qmi_elem_info *ei, const void *c_struct); void *qmi_encode_message(int type, unsigned int msg_id, size_t *len, - unsigned int txn_id, struct qmi_elem_info *ei, + unsigned int txn_id, const struct qmi_elem_info *ei, const void *c_struct); int qmi_decode_message(const void *buf, size_t len, - struct qmi_elem_info *ei, void *c_struct); + const struct qmi_elem_info *ei, void *c_struct); int qmi_txn_init(struct qmi_handle *qmi, struct qmi_txn *txn, - struct qmi_elem_info *ei, void *c_struct); + const struct qmi_elem_info *ei, void *c_struct); int qmi_txn_wait(struct qmi_txn *txn, unsigned long timeout); void qmi_txn_cancel(struct qmi_txn *txn); -- cgit v1.2.3 From c13d7d261e361dbaf5adbdc216ee4a1204c48001 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 25 Aug 2022 10:08:56 +0530 Subject: soc: qcom: llcc: Pass LLCC version based register offsets to EDAC driver The LLCC EDAC register offsets varies between each SoCs. Until now, the EDAC driver used the hardcoded register offsets. But this caused crash on SM8450 SoC where the register offsets has been changed. So to avoid this crash and also to make it easy to accommodate changes for new SoCs, let's pass the LLCC version specific register offsets to the EDAC driver. Currently, two set of offsets are used. One is starting from LLCC version v1.0.0 used by all SoCs other than SM8450. For SM8450, LLCC version starting from v2.1.0 is used. 
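The EDAC driver then indexes registers through the version-specific table instead of compile-time constants; a rough sketch of the access pattern (simplified, not the literal qcom_edac code):

  const struct llcc_edac_reg_offset *off = drv->edac_reg_offset;
  u32 status;
  int ret;

  ret = regmap_read(drv->bcast_regmap, off->trp_ecc_error_status0, &status);
  if (ret)
          return ret;
  /* ... decode the TRP ECC status bits ... */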
Signed-off-by: Manivannan Sadhasivam Reviewed-by: Sai Prakash Ranjan Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220825043859.30066-3-manivannan.sadhasivam@linaro.org --- include/linux/soc/qcom/llcc-qcom.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 9ed5384c5ca1..bc2fb8343a94 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -78,11 +78,40 @@ struct llcc_edac_reg_data { u8 ways_shift; }; +struct llcc_edac_reg_offset { + /* LLCC TRP registers */ + u32 trp_ecc_error_status0; + u32 trp_ecc_error_status1; + u32 trp_ecc_sb_err_syn0; + u32 trp_ecc_db_err_syn0; + u32 trp_ecc_error_cntr_clear; + u32 trp_interrupt_0_status; + u32 trp_interrupt_0_clear; + u32 trp_interrupt_0_enable; + + /* LLCC Common registers */ + u32 cmn_status0; + u32 cmn_interrupt_0_enable; + u32 cmn_interrupt_2_enable; + + /* LLCC DRP registers */ + u32 drp_ecc_error_cfg; + u32 drp_ecc_error_cntr_clear; + u32 drp_interrupt_status; + u32 drp_interrupt_clear; + u32 drp_interrupt_enable; + u32 drp_ecc_error_status0; + u32 drp_ecc_error_status1; + u32 drp_ecc_sb_err_syn0; + u32 drp_ecc_db_err_syn0; +}; + /** * struct llcc_drv_data - Data associated with the llcc driver * @regmap: regmap associated with the llcc device * @bcast_regmap: regmap associated with llcc broadcast offset * @cfg: pointer to the data structure for slice configuration + * @edac_reg_offset: Offset of the LLCC EDAC registers * @lock: mutex associated with each slice * @cfg_size: size of the config data table * @max_slices: max slices as read from device tree @@ -96,6 +125,7 @@ struct llcc_drv_data { struct regmap *regmap; struct regmap *bcast_regmap; const struct llcc_slice_config *cfg; + const struct llcc_edac_reg_offset *edac_reg_offset; struct mutex lock; u32 cfg_size; u32 max_slices; -- cgit v1.2.3 From c60561014257699d81ab392eb6cb6389ad8907df Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 12:50:03 +0800 Subject: soundwire: bus: allow device number to be unique at system level The SoundWire specification allows the device number to be allocated at will. When a system includes multiple SoundWire links, the device number scope is limited to the link to which the device is attached. However, for integration/debug it can be convenient to have a unique device number across the system. This patch adds a 'dev_num_ida_min' field at the bus level, which when set will be used to allocate an IDA. The allocation happens when a hardware device reports as ATTACHED. If any error happens during the enumeration, the allocated IDA is not freed - the device number will be reused if/when the device re-joins the bus. The IDA is only freed when the Linux device is unregistered. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823045004.2670658-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 39058c841469..a2b31d25ea27 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -889,6 +889,9 @@ struct sdw_master_ops { * meaningful if multi_link is set. 
If set to 1, hardware-based * synchronization will be used even if a stream only uses a single * SoundWire segment. + * @dev_num_ida_min: if set, defines the minimum values for the IDA + * used to allocate system-unique device numbers. This value needs to be + * identical across all SoundWire bus in the system. */ struct sdw_bus { struct device *dev; @@ -913,6 +916,7 @@ struct sdw_bus { u32 bank_switch_timeout; bool multi_link; int hw_sync_min_links; + int dev_num_ida_min; }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, -- cgit v1.2.3 From c5b81449f915a28bb9c7725e53aebab3ba39b4a2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:07 +0200 Subject: perf/hw_breakpoint: Provide hw_breakpoint_is_used() and use in test Provide hw_breakpoint_is_used() to check if breakpoints are in use on the system. Use it in the KUnit test to verify the global state before and after a test case. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-3-elver@google.com --- include/linux/hw_breakpoint.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 78dd7035d1e5..a3fb846705eb 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -74,6 +74,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); +extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -121,6 +122,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } +static inline bool hw_breakpoint_is_used(void) { return false; } + static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } -- cgit v1.2.3 From 0370dc314df35579b751d1b77c9169f071444962 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:09 +0200 Subject: perf/hw_breakpoint: Optimize list of per-task breakpoints On a machine with 256 CPUs, running the recently added perf breakpoint benchmark results in: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 236.418 [sec] | | 123134.794271 usecs/op | 7880626.833333 usecs/op/cpu The benchmark tests inherited breakpoint perf events across many threads. Looking at a perf profile, we can see that the majority of the time is spent in various hw_breakpoint.c functions, which execute within the 'nr_bp_mutex' critical sections which then results in contention on that mutex as well: 37.27% [kernel] [k] osq_lock 34.92% [kernel] [k] mutex_spin_on_owner 12.15% [kernel] [k] toggle_bp_slot 11.90% [kernel] [k] __reserve_bp_slot The culprit here is task_bp_pinned(), which has a runtime complexity of O(#tasks) due to storing all task breakpoints in the same list and iterating through that list looking for a matching task. Clearly, this does not scale to thousands of tasks. 
Instead, make use of the "rhashtable" variant "rhltable" which stores multiple items with the same key in a list. This results in average runtime complexity of O(1) for task_bp_pinned(). With the optimization, the benchmark shows: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.208 [sec] | | 108.422396 usecs/op | 6939.033333 usecs/op/cpu On this particular setup that's a speedup of ~1135x. While one option would be to make task_struct a breakpoint list node, this would only further bloat task_struct for infrequently used data. Furthermore, after all optimizations in this series, there's no evidence it would result in better performance: later optimizations make the time spent looking up entries in the hash table negligible (we'll reach the theoretical ideal performance i.e. no constraints). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com --- include/linux/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae30c61957d2..1999408a9cbb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks { }; #ifdef CONFIG_HAVE_HW_BREAKPOINT +#include #include #endif @@ -178,7 +179,7 @@ struct hw_perf_event { * creation and event initalization. */ struct arch_hw_breakpoint info; - struct list_head bp_list; + struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ -- cgit v1.2.3 From 9caf87be118f4639537404eeb67dd444a3716e9a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:12 +0200 Subject: perf/hw_breakpoint: Make hw_breakpoint_weight() inlinable Due to being a __weak function, hw_breakpoint_weight() will cause the compiler to always emit a call to it. This generates unnecessarily bad code (register spills etc.) for no good reason; in fact it appears in profiles of `perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512`: ... 0.70% [kernel] [k] hw_breakpoint_weight ... While a small percentage, no architecture defines its own hw_breakpoint_weight() nor are there users outside hw_breakpoint.c, which makes the fact it is currently __weak a poor choice. Change hw_breakpoint_weight()'s definition to follow a similar protocol to hw_breakpoint_slots(), such that if defines hw_breakpoint_weight(), we'll use it instead. The result is that it is inlined and no longer shows up in profiles. 
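The resulting convention mirrors hw_breakpoint_slots(): the generic code supplies the default only when the architecture has not defined the symbol. Roughly (a sketch of the generic fallback; an architecture overrides it by defining hw_breakpoint_weight in its asm/hw_breakpoint.h):

  /* kernel/events/hw_breakpoint.c, sketch: */
  #ifndef hw_breakpoint_weight
  static inline int hw_breakpoint_weight(struct perf_event *bp)
  {
          return 1;
  }
  #endif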
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-8-elver@google.com --- include/linux/hw_breakpoint.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a3fb846705eb..f319bd26b030 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -80,7 +80,6 @@ extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); -int hw_breakpoint_weight(struct perf_event *bp); int arch_reserve_bp_slot(struct perf_event *bp); void arch_release_bp_slot(struct perf_event *bp); void arch_unregister_hw_breakpoint(struct perf_event *bp); -- cgit v1.2.3 From 01fe8a3f818e1074a9a95d624be4549ee7ea2b2b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:15 +0200 Subject: locking/percpu-rwsem: Add percpu_is_write_locked() and percpu_is_read_locked() Implement simple accessors to probe percpu-rwsem's locked state: percpu_is_write_locked(), percpu_is_read_locked(). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-11-elver@google.com --- include/linux/percpu-rwsem.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5fda40f97fe9..36b942b67b7d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) +{ + return atomic_read(&sem->block); +} + extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); -- cgit v1.2.3 From 25af7406df5915f04d5f1c8f081dabb0ead1cdcc Mon Sep 17 00:00:00 2001 From: Isaac Manjarres Date: Thu, 18 Aug 2022 18:28:51 +0100 Subject: ARM: 9229/1: amba: Fix use-after-free in amba_read_periphid() After commit f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition"), it became possible for amba_read_periphid() to be invoked concurrently from two threads for a particular AMBA device. Consider the case where a thread (T0) is registering an AMBA driver, and searching for all of the devices it can match with on the AMBA bus. Suppose that another thread (T1) is executing the deferred probe work, and is searching through all of the AMBA drivers on the bus for a driver that matches a particular AMBA device. Assume that both threads begin operating on the same AMBA device and the device's peripheral ID is still unknown. In this scenario, the amba_match() function will be invoked for the same AMBA device by both threads, which means amba_read_periphid() can also be invoked by both threads, and both threads will be able to manipulate the AMBA device's pclk pointer without any synchronization. 
It's possible that one thread will initialize the pclk pointer, then the other thread will re-initialize it, overwriting the previous value, and both will race to free the same pclk, resulting in a use-after-free for whichever thread frees the pclk last. Add a lock per AMBA device to synchronize the handling with detecting the peripheral ID to avoid the use-after-free scenario. The following KFENCE bug report helped detect this problem: ================================================================== BUG: KFENCE: use-after-free read in clk_disable+0x14/0x34 Use-after-free read at 0x(ptrval) (in kfence-#19): clk_disable+0x14/0x34 amba_read_periphid+0xdc/0x134 amba_match+0x3c/0x84 __driver_attach+0x20/0x158 bus_for_each_dev+0x74/0xc0 bus_add_driver+0x154/0x1e8 driver_register+0x88/0x11c do_one_initcall+0x8c/0x2fc kernel_init_freeable+0x190/0x220 kernel_init+0x10/0x108 ret_from_fork+0x14/0x3c 0x0 kfence-#19: 0x(ptrval)-0x(ptrval), size=36, cache=kmalloc-64 allocated by task 8 on cpu 0 at 11.629931s: clk_hw_create_clk+0x38/0x134 amba_get_enable_pclk+0x10/0x68 amba_read_periphid+0x28/0x134 amba_match+0x3c/0x84 __device_attach_driver+0x2c/0xc4 bus_for_each_drv+0x80/0xd0 __device_attach+0xb0/0x1f0 bus_probe_device+0x88/0x90 deferred_probe_work_func+0x8c/0xc0 process_one_work+0x23c/0x690 worker_thread+0x34/0x488 kthread+0xd4/0xfc ret_from_fork+0x14/0x3c 0x0 freed by task 8 on cpu 0 at 11.630095s: amba_read_periphid+0xec/0x134 amba_match+0x3c/0x84 __device_attach_driver+0x2c/0xc4 bus_for_each_drv+0x80/0xd0 __device_attach+0xb0/0x1f0 bus_probe_device+0x88/0x90 deferred_probe_work_func+0x8c/0xc0 process_one_work+0x23c/0x690 worker_thread+0x34/0x488 kthread+0xd4/0xfc ret_from_fork+0x14/0x3c 0x0 Cc: Saravana Kannan Cc: patches@armlinux.org.uk Fixes: f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Isaac J. Manjarres Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index e94cdf235f1d..5001e14c5c06 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -67,6 +67,7 @@ struct amba_device { struct clk *pclk; struct device_dma_parameters dma_parms; unsigned int periphid; + struct mutex periphid_lock; unsigned int cid; struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; -- cgit v1.2.3 From 690252f19f0e486abb8590b3a7a03d4e065d93d4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:31 -0700 Subject: netlink: add support for ext_ack missing attributes There is currently no way to report via extack in a structured way that an attribute is missing. This leads to families resorting to string messages. Add a pair of attributes - @offset and @type for machine-readable way of reporting missing attributes. The @offset points to the nest which should have contained the attribute, @type is the expected nla_type. The offset will be skipped if the attribute is missing at the message level rather than inside a nest. User space should be able to figure out which attribute enum (AKA attribute space AKA attribute set) the nest pointed to by @offset is using. 
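A request handler reports a missing top-level attribute like so (MYFAM_ATTR_PORT is a hypothetical attribute; pass the nest attribute instead of NULL when the miss is inside a nest). The follow-up patch below wraps this exact test-and-report pattern in NL_REQ_ATTR_CHECK():

  if (!tb[MYFAM_ATTR_PORT]) {
          NL_SET_ERR_ATTR_MISS(extack, NULL, MYFAM_ATTR_PORT);
          return -EINVAL;
  }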
Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index bda1c385cffb..1619221c415c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -71,6 +71,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute + * @miss_type: attribute type which was missing + * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ @@ -78,6 +80,8 @@ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; + const struct nlattr *miss_nest; + u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -126,6 +130,15 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) +#define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->miss_nest = (nest); \ + __extack->miss_type = (type); \ + } \ +} while (0) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From 45dca15759643806660e9285e6af8a1ba3c76c82 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:32 -0700 Subject: netlink: add helpers for extack attr presence checking Being able to check attribute presence and set extack if not on one line is handy, add helpers. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1619221c415c..d51e041d2242 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -139,6 +139,17 @@ struct netlink_ext_ack { } \ } while (0) +#define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ + struct nlattr **__tb = (tb); \ + u32 __attr = (type); \ + int __retval; \ + \ + __retval = !__tb[__attr]; \ + if (__retval) \ + NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ + __retval; \ +}) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From 87888fb9ac0c71cdc1edd0779382052475dadb84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:32 +0300 Subject: tty: Remove baudrate dead code & make ktermios params const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the architectures currently in-tree, either: 1) CBAUDEX is zero 2) The earlier BOTHER if check covers cbaud < 1 case 3) All CBAUD bits are covered by the baud_table Thus, the check for cbaud being out-of-range for CBAUDEX case cannot ever be true. The ktermios parameters can now be made const. 
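The const parameter also lets helpers accept read-only termios data; a trivial sketch:

  static speed_t current_baud(const struct tty_struct *tty)
  {
          /* Legal now that tty_termios_baud_rate() takes const. */
          return tty_termios_baud_rate(&tty->termios);
  }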
Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 7b0a5d478ef6..cf5ab26de73d 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -434,7 +434,7 @@ int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); -speed_t tty_termios_baud_rate(struct ktermios *termios); +speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, -- cgit v1.2.3 From d15f89d997d995c9a0bf5be5de4c1c405886f21c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:35 +0300 Subject: tty: Make tty_termios_copy_hw() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-5-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index cf5ab26de73d..ae41893f8653 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -458,7 +458,7 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); -void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); +void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); -- cgit v1.2.3 From 8b7d2d95cf82f8ca034ac65d0f39a2b3359b680f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:36 +0300 Subject: tty: Make ldisc ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-6-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index ede6f2157f32..dcb61ec11424 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -130,7 +130,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * a pointer to wordsize-sensitive structure belongs here, but most of * ldiscs will happily leave it %NULL. * - * @set_termios: [TTY] ``void ()(struct tty_struct *tty, struct ktermios *old)`` + * @set_termios: [TTY] ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This function notifies the line discpline that a change has been made * to the termios structure. 
@@ -227,7 +227,7 @@ struct tty_ldisc_ops { unsigned long arg); int (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); - void (*set_termios)(struct tty_struct *tty, struct ktermios *old); + void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); __poll_t (*poll)(struct tty_struct *tty, struct file *file, struct poll_table_struct *wait); void (*hangup)(struct tty_struct *tty); -- cgit v1.2.3 From bec5b814d46c2a704c3c8148752e62a33e9fa6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:37 +0300 Subject: serial: Make ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-7-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 4 ++-- include/linux/serial_core.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 8c7b793aa4d7..43edd10e8c96 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -32,7 +32,7 @@ struct plat_serial8250_port { void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, - struct ktermios *old); + const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); @@ -157,7 +157,7 @@ extern int early_serial8250_setup(struct earlycon_device *device, extern void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk); extern void serial8250_do_set_termios(struct uart_port *port, - struct ktermios *termios, struct ktermios *old); + struct ktermios *termios, const struct ktermios *old); extern void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios); extern unsigned int serial8250_do_get_mctrl(struct uart_port *port); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aef3145f2032..0e9bc943bfa8 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -379,7 +379,7 @@ struct uart_ops { void (*shutdown)(struct uart_port *); void (*flush_buffer)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, - struct ktermios *old); + const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); @@ -425,7 +425,7 @@ struct uart_port { void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, - struct ktermios *old); + const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); @@ -644,7 +644,7 @@ void uart_write_wakeup(struct uart_port *port); void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud); unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, - struct ktermios *old, unsigned int min, + const struct ktermios *old, unsigned int min, unsigned int max); unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud); -- cgit v1.2.3 From f6d47fe5921a6d3f5a1a3d0b9a3dd34b8e295722 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:38 +0300 Subject: usb: serial: Make ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-8-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/serial.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 8ea319f89e1f..f7bfedb740f5 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -276,8 +276,8 @@ struct usb_serial_driver { unsigned int cmd, unsigned long arg); void (*get_serial)(struct tty_struct *tty, struct serial_struct *ss); int (*set_serial)(struct tty_struct *tty, struct serial_struct *ss); - void (*set_termios)(struct tty_struct *tty, - struct usb_serial_port *port, struct ktermios *old); + void (*set_termios)(struct tty_struct *tty, struct usb_serial_port *port, + const struct ktermios *old); void (*break_ctl)(struct tty_struct *tty, int break_state); unsigned int (*chars_in_buffer)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, long timeout); -- cgit v1.2.3 From a8c11c1520347be74b02312d10ef686b01b525f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:39 +0300 Subject: tty: Make ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-9-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 4841d8069c07..b2456b545ba0 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -141,7 +141,7 @@ struct serial_struct; * * Optional. * - * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)`` + * @set_termios: ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's * termios settings have changed. New settings are in @tty->termios. 
@@ -365,7 +365,7 @@ struct tty_operations { unsigned int cmd, unsigned long arg); long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); - void (*set_termios)(struct tty_struct *tty, struct ktermios * old); + void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); void (*throttle)(struct tty_struct * tty); void (*unthrottle)(struct tty_struct * tty); void (*stop)(struct tty_struct *tty); -- cgit v1.2.3 From 9c6d778800b921bde3bff3cff5003d1650f942d1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 26 Aug 2022 15:31:32 -0400 Subject: USB: core: Prevent nested device-reset calls Automatic kernel fuzzing revealed a recursive locking violation in usb-storage: ============================================ WARNING: possible recursive locking detected 5.18.0 #3 Not tainted -------------------------------------------- kworker/1:3/1205 is trying to acquire lock: ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at: usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 but task is already holding lock: ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at: usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 ... stack backtrace: CPU: 1 PID: 1205 Comm: kworker/1:3 Not tainted 5.18.0 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2988 [inline] check_deadlock kernel/locking/lockdep.c:3031 [inline] validate_chain kernel/locking/lockdep.c:3816 [inline] __lock_acquire.cold+0x152/0x3ca kernel/locking/lockdep.c:5053 lock_acquire kernel/locking/lockdep.c:5665 [inline] lock_acquire+0x1ab/0x520 kernel/locking/lockdep.c:5630 __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0x14f/0x1610 kernel/locking/mutex.c:747 usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 usb_reset_device+0x37d/0x9a0 drivers/usb/core/hub.c:6109 r871xu_dev_remove+0x21a/0x270 drivers/staging/rtl8712/usb_intf.c:622 usb_unbind_interface+0x1bd/0x890 drivers/usb/core/driver.c:458 device_remove drivers/base/dd.c:545 [inline] device_remove+0x11f/0x170 drivers/base/dd.c:537 __device_release_driver drivers/base/dd.c:1222 [inline] device_release_driver_internal+0x1a7/0x2f0 drivers/base/dd.c:1248 usb_driver_release_interface+0x102/0x180 drivers/usb/core/driver.c:627 usb_forced_unbind_intf+0x4d/0xa0 drivers/usb/core/driver.c:1118 usb_reset_device+0x39b/0x9a0 drivers/usb/core/hub.c:6114 This turned out not to be an error in usb-storage but rather a nested device reset attempt. That is, as the rtl8712 driver was being unbound from a composite device in preparation for an unrelated USB reset (that driver does not have pre_reset or post_reset callbacks), its ->remove routine called usb_reset_device() -- thus nesting one reset call within another. Performing a reset as part of disconnect processing is a questionable practice at best. However, the bug report points out that the USB core does not have any protection against nested resets. Adding a reset_in_progress flag and testing it will prevent such errors in the future. 
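The guard itself is simple; a simplified sketch of the logic in usb_reset_device(), not the verbatim hub.c code:

  if (udev->reset_in_progress)
          return -EINVAL;
  udev->reset_in_progress = 1;
  /* ... unbind interfaces, reset the port, rebind ... */
  udev->reset_in_progress = 0;
  return ret;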
Link: https://lore.kernel.org/all/CAB7eexKUpvX-JNiLzhXBDWgfg2T9e9_0Tw4HQ6keN==voRbP0g@mail.gmail.com/ Cc: stable@vger.kernel.org Reported-and-tested-by: Rondreis Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/YwkflDxvg0KWqyZK@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index f7a9914fc97f..9ff1ad4dfad1 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -575,6 +575,7 @@ struct usb3_lpm_parameters { * @devaddr: device address, XHCI: assigned by HW, others: same as devnum * @can_submit: URBs may be submitted * @persist_enabled: USB_PERSIST enabled for this device + * @reset_in_progress: the device is being reset * @have_langid: whether string_langid is valid * @authorized: policy has said we can use it; * (user space) policy determines if we authorize this device to be @@ -662,6 +663,7 @@ struct usb_device { unsigned can_submit:1; unsigned persist_enabled:1; + unsigned reset_in_progress:1; unsigned have_langid:1; unsigned authorized:1; unsigned authenticated:1; -- cgit v1.2.3 From 43a063cab325ee7cc50349967e536b3cd4e57f03 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 23 Aug 2022 00:46:37 +0000 Subject: KVM: x86/mmu: count KVM mmu usage in secondary pagetable stats. Count the pages used by KVM mmu on x86 in memory stats under secondary pagetable stats (e.g. "SecPageTables" in /proc/meminfo) to give better visibility into the memory consumption of KVM mmu in a similar way to how normal user page tables are accounted. Add the inner helper in common KVM, ARM will also use it to count stats in a future commit. Signed-off-by: Yosry Ahmed Reviewed-by: Sean Christopherson Acked-by: Marc Zyngier # generic KVM changes Link: https://lore.kernel.org/r/20220823004639.2387269-3-yosryahmed@google.com Link: https://lore.kernel.org/r/20220823004639.2387269-4-yosryahmed@google.com [sean: squash x86 usage to workaround modpost issues] Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f4519d3689e1..04c7e5f2f727 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2247,6 +2247,19 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +/* + * If more than one page is being (un)accounted, @virt must be the address of + * the first page of a block of pages what were allocated together (i.e + * accounted together). + * + * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state() + * is thread-safe. + */ +static inline void kvm_account_pgtable_pages(void *virt, int nr) +{ + mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr); +} + /* * This defines how many reserved entries we want to keep before we * kick the vcpu to the userspace to avoid dirty ring full. This -- cgit v1.2.3 From 4e508b259ed02f5fa608cdd83b817a7f49c22271 Mon Sep 17 00:00:00 2001 From: "Chengci.Xu" Date: Wed, 17 Aug 2022 20:46:07 +0800 Subject: memory: mtk-smi: Add enable IOMMU SMC command for MM master For concerns about security, the register to enable/disable IOMMU of SMI LARB should only be configured in secure world. Thus, we add some SMC command for multimedia master to enable/disable MM IOMMU in ATF by setting the register of SMI LARB. This function is prepared for MT8188. 
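A kernel-side caller would issue the command through the arm_smccc helpers; a hedged sketch (the argument layout -- a sub-command, LARB id and enable flag -- is an assumption for illustration, as is 'dev'):

  struct arm_smccc_res res;

  arm_smccc_smc(MTK_SIP_KERNEL_IOMMU_CONTROL, cmd, larb_id, enable,
                0, 0, 0, 0, &res);
  if (res.a0)
          dev_err(dev, "IOMMU SMC command failed: %lu\n", res.a0);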
Signed-off-by: Chengci.Xu Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220817124608.10062-4-chengci.xu@mediatek.com --- include/linux/soc/mediatek/mtk_sip_svc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_sip_svc.h b/include/linux/soc/mediatek/mtk_sip_svc.h index 082398e0cfb1..0761128b4354 100644 --- a/include/linux/soc/mediatek/mtk_sip_svc.h +++ b/include/linux/soc/mediatek/mtk_sip_svc.h @@ -22,4 +22,7 @@ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, MTK_SIP_SMC_CONVENTION, \ ARM_SMCCC_OWNER_SIP, fn_id) +/* IOMMU related SMC call */ +#define MTK_SIP_KERNEL_IOMMU_CONTROL MTK_SIP_SMC_CMD(0x514) + #endif -- cgit v1.2.3 From 9d2b2e83ef277b9c7b8852e8717140daa373ccf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 20:54:10 -0700 Subject: Input: adp5588-keys - support gpi key events as 'gpio keys' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change replaces the support for GPIs as key event generators. Instead of reporting the events directly, we add a gpio based irqchip so that these events can be consumed by keys defined in the gpio-keys driver (as it's goal is indeed for keys on GPIOs capable of generating interrupts). With this, the gpio-adp5588 driver can also be dropped. The basic idea is that all the pins that are not being used as part of the keymap matrix can be possibly requested as GPIOs by gpio-keys (it's also fine to use these pins as plain interrupts though that's not really the point). Since the gpiochip now also has irqchip capabilities, we should only remove it after we free the device interrupt (otherwise we could, in theory, be handling GPIs interrupts while the gpiochip is concurrently removed). Thus the call 'adp5588_gpio_add()' is moved and since the setup phase also needs to come before making the gpios visible, we also need to move 'adp5588_setup()'. While at it, always select GPIOLIB so that we don't need to use #ifdef guards. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-2-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/adp5588.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h index 6d3f7d911a92..82170ec8c266 100644 --- a/include/linux/platform_data/adp5588.h +++ b/include/linux/platform_data/adp5588.h @@ -147,8 +147,6 @@ struct adp5588_kpad_platform_data { unsigned en_keylock:1; /* Enable Key Lock feature */ unsigned short unlock_key1; /* Unlock Key 1 */ unsigned short unlock_key2; /* Unlock Key 2 */ - const struct adp5588_gpi_map *gpimap; - unsigned short gpimapsize; const struct adp5588_gpio_platform_data *gpio_data; }; -- cgit v1.2.3 From 6704a86283b7e79ff7ae36d388466428f6672962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:00:14 -0700 Subject: Input: adp5588-keys - add support for fw properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use firmware properties (eg: OF) to get the device specific configuration. This change just replaces the platform data since there was no platform using it and so, it makes no sense having both. Special note to the PULL-UP disable setting that is now supported as part of the gpio subsystem (using 'set_config()' callback). 
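For the pull-up disable path, the set_config() callback would take roughly this shape (adp5588_kpad, pull_dis and adp5588_write are assumptions about the driver's internals; GPIO_PULL1 is the register from the platform header removed below, now private to the driver):

  static int adp5588_gpio_set_config(struct gpio_chip *chip,
                                     unsigned int off, unsigned long config)
  {
          struct adp5588_kpad *kpad = gpiochip_get_data(chip);
          unsigned int bank = off / 8, bit = off % 8;

          /* The device only supports disabling its internal pull-ups. */
          if (pinconf_to_config_param(config) != PIN_CONFIG_BIAS_DISABLE)
                  return -ENOTSUPP;

          kpad->pull_dis[bank] |= BIT(bit);
          return adp5588_write(kpad->client, GPIO_PULL1 + bank,
                               kpad->pull_dis[bank]);
  }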
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-5-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/adp5588.h | 169 ---------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 include/linux/platform_data/adp5588.h (limited to 'include/linux') diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h deleted file mode 100644 index 82170ec8c266..000000000000 --- a/include/linux/platform_data/adp5588.h +++ /dev/null @@ -1,169 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Analog Devices ADP5588 I/O Expander and QWERTY Keypad Controller - * - * Copyright 2009-2010 Analog Devices Inc. - */ - -#ifndef _ADP5588_H -#define _ADP5588_H - -#define DEV_ID 0x00 /* Device ID */ -#define CFG 0x01 /* Configuration Register1 */ -#define INT_STAT 0x02 /* Interrupt Status Register */ -#define KEY_LCK_EC_STAT 0x03 /* Key Lock and Event Counter Register */ -#define Key_EVENTA 0x04 /* Key Event Register A */ -#define Key_EVENTB 0x05 /* Key Event Register B */ -#define Key_EVENTC 0x06 /* Key Event Register C */ -#define Key_EVENTD 0x07 /* Key Event Register D */ -#define Key_EVENTE 0x08 /* Key Event Register E */ -#define Key_EVENTF 0x09 /* Key Event Register F */ -#define Key_EVENTG 0x0A /* Key Event Register G */ -#define Key_EVENTH 0x0B /* Key Event Register H */ -#define Key_EVENTI 0x0C /* Key Event Register I */ -#define Key_EVENTJ 0x0D /* Key Event Register J */ -#define KP_LCK_TMR 0x0E /* Keypad Lock1 to Lock2 Timer */ -#define UNLOCK1 0x0F /* Unlock Key1 */ -#define UNLOCK2 0x10 /* Unlock Key2 */ -#define GPIO_INT_STAT1 0x11 /* GPIO Interrupt Status */ -#define GPIO_INT_STAT2 0x12 /* GPIO Interrupt Status */ -#define GPIO_INT_STAT3 0x13 /* GPIO Interrupt Status */ -#define GPIO_DAT_STAT1 0x14 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_STAT2 0x15 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_STAT3 0x16 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_OUT1 0x17 /* GPIO DATA OUT */ -#define GPIO_DAT_OUT2 0x18 /* GPIO DATA OUT */ -#define GPIO_DAT_OUT3 0x19 /* GPIO DATA OUT */ -#define GPIO_INT_EN1 0x1A /* GPIO Interrupt Enable */ -#define GPIO_INT_EN2 0x1B /* GPIO Interrupt Enable */ -#define GPIO_INT_EN3 0x1C /* GPIO Interrupt Enable */ -#define KP_GPIO1 0x1D /* Keypad or GPIO Selection */ -#define KP_GPIO2 0x1E /* Keypad or GPIO Selection */ -#define KP_GPIO3 0x1F /* Keypad or GPIO Selection */ -#define GPI_EM1 0x20 /* GPI Event Mode 1 */ -#define GPI_EM2 0x21 /* GPI Event Mode 2 */ -#define GPI_EM3 0x22 /* GPI Event Mode 3 */ -#define GPIO_DIR1 0x23 /* GPIO Data Direction */ -#define GPIO_DIR2 0x24 /* GPIO Data Direction */ -#define GPIO_DIR3 0x25 /* GPIO Data Direction */ -#define GPIO_INT_LVL1 0x26 /* GPIO Edge/Level Detect */ -#define GPIO_INT_LVL2 0x27 /* GPIO Edge/Level Detect */ -#define GPIO_INT_LVL3 0x28 /* GPIO Edge/Level Detect */ -#define Debounce_DIS1 0x29 /* Debounce Disable */ -#define Debounce_DIS2 0x2A /* Debounce Disable */ -#define Debounce_DIS3 0x2B /* Debounce Disable */ -#define GPIO_PULL1 0x2C /* GPIO Pull Disable */ -#define GPIO_PULL2 0x2D /* GPIO Pull Disable */ -#define GPIO_PULL3 0x2E /* GPIO Pull Disable */ -#define CMP_CFG_STAT 0x30 /* Comparator Configuration and Status Register */ -#define CMP_CONFG_SENS1 0x31 /* Sensor1 Comparator Configuration Register */ -#define CMP_CONFG_SENS2 0x32 /* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */ -#define 
CMP1_LVL2_TRIP 0x33 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 1 */ -#define CMP1_LVL2_HYS 0x34 /* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */ -#define CMP1_LVL3_TRIP 0x35 /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */ -#define CMP1_LVL3_HYS 0x36 /* Sensor 2 Comparator Configuration Register */ -#define CMP2_LVL2_TRIP 0x37 /* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */ -#define CMP2_LVL2_HYS 0x38 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */ -#define CMP2_LVL3_TRIP 0x39 /* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */ -#define CMP2_LVL3_HYS 0x3A /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */ -#define CMP1_ADC_DAT_R1 0x3B /* Comparator 1 ADC data Register1 */ -#define CMP1_ADC_DAT_R2 0x3C /* Comparator 1 ADC data Register2 */ -#define CMP2_ADC_DAT_R1 0x3D /* Comparator 2 ADC data Register1 */ -#define CMP2_ADC_DAT_R2 0x3E /* Comparator 2 ADC data Register2 */ - -#define ADP5588_DEVICE_ID_MASK 0xF - - /* Configuration Register1 */ -#define ADP5588_AUTO_INC (1 << 7) -#define ADP5588_GPIEM_CFG (1 << 6) -#define ADP5588_OVR_FLOW_M (1 << 5) -#define ADP5588_INT_CFG (1 << 4) -#define ADP5588_OVR_FLOW_IEN (1 << 3) -#define ADP5588_K_LCK_IM (1 << 2) -#define ADP5588_GPI_IEN (1 << 1) -#define ADP5588_KE_IEN (1 << 0) - -/* Interrupt Status Register */ -#define ADP5588_CMP2_INT (1 << 5) -#define ADP5588_CMP1_INT (1 << 4) -#define ADP5588_OVR_FLOW_INT (1 << 3) -#define ADP5588_K_LCK_INT (1 << 2) -#define ADP5588_GPI_INT (1 << 1) -#define ADP5588_KE_INT (1 << 0) - -/* Key Lock and Event Counter Register */ -#define ADP5588_K_LCK_EN (1 << 6) -#define ADP5588_LCK21 0x30 -#define ADP5588_KEC 0xF - -#define ADP5588_MAXGPIO 18 -#define ADP5588_BANK(offs) ((offs) >> 3) -#define ADP5588_BIT(offs) (1u << ((offs) & 0x7)) - -/* Put one of these structures in i2c_board_info platform_data */ - -#define ADP5588_KEYMAPSIZE 80 - -#define GPI_PIN_ROW0 97 -#define GPI_PIN_ROW1 98 -#define GPI_PIN_ROW2 99 -#define GPI_PIN_ROW3 100 -#define GPI_PIN_ROW4 101 -#define GPI_PIN_ROW5 102 -#define GPI_PIN_ROW6 103 -#define GPI_PIN_ROW7 104 -#define GPI_PIN_COL0 105 -#define GPI_PIN_COL1 106 -#define GPI_PIN_COL2 107 -#define GPI_PIN_COL3 108 -#define GPI_PIN_COL4 109 -#define GPI_PIN_COL5 110 -#define GPI_PIN_COL6 111 -#define GPI_PIN_COL7 112 -#define GPI_PIN_COL8 113 -#define GPI_PIN_COL9 114 - -#define GPI_PIN_ROW_BASE GPI_PIN_ROW0 -#define GPI_PIN_ROW_END GPI_PIN_ROW7 -#define GPI_PIN_COL_BASE GPI_PIN_COL0 -#define GPI_PIN_COL_END GPI_PIN_COL9 - -#define GPI_PIN_BASE GPI_PIN_ROW_BASE -#define GPI_PIN_END GPI_PIN_COL_END - -#define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1) - -struct adp5588_gpi_map { - unsigned short pin; - unsigned short sw_evt; -}; - -struct adp5588_kpad_platform_data { - int rows; /* Number of rows */ - int cols; /* Number of columns */ - const unsigned short *keymap; /* Pointer to keymap */ - unsigned short keymapsize; /* Keymap size */ - unsigned repeat:1; /* Enable key repeat */ - unsigned en_keylock:1; /* Enable Key Lock feature */ - unsigned short unlock_key1; /* Unlock Key 1 */ - unsigned short unlock_key2; /* Unlock Key 2 */ - const struct adp5588_gpio_platform_data *gpio_data; -}; - -struct i2c_client; /* forward declaration */ - -struct adp5588_gpio_platform_data { - int gpio_start; /* GPIO Chip base # */ - const char *const *names; - unsigned irq_base; /* interrupt base # */ - unsigned pullup_dis_mask; /* Pull-Up 
Disable Mask */ - int (*setup)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - int (*teardown)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - void *context; -}; - -#endif -- cgit v1.2.3 From 6b70fe0601adb1396ad0b85cdf05d217500b49e7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Aug 2022 14:38:42 +0200 Subject: acl: add vfs_set_acl_prepare() Various filesystems store POSIX ACLs on the backing store in their uapi format. Such filesystems need to translate from the uapi POSIX ACL format into the VFS format during i_op->get_acl(). The VFS provides the posix_acl_from_xattr() helper for this task. But the usage of posix_acl_from_xattr() is currently ambiguous. It is intended to transform from a uapi POSIX ACL to the VFS representation. For example, when retrieving POSIX ACLs for permission checking during lookup or when calling getxattr() to retrieve system.posix_acl_{access,default}. Calling posix_acl_from_xattr() during i_op->get_acl() will map the raw {g,u}id values stored as ACL_{GROUP,USER} entries in the uapi POSIX ACL format into k{g,u}id_t in the filesystem's idmapping and return a struct posix_acl ready to be returned to the VFS for caching and to perform permission checks on. However, posix_acl_from_xattr() is also called during setxattr() for all filesystems that rely on the VFS-provided posix_acl_{access,default}_xattr_handler. The posix_acl_xattr_set() handler, which is used for the ->set() method of posix_acl_{access,default}_xattr_handler, uses posix_acl_from_xattr() to translate from the uapi POSIX ACL format to the VFS format so that it can be passed to the i_op->set_acl() handler of the filesystem or used for direct caching in case no i_op->set_acl() handler is defined. During setxattr() the {g,u}id values stored as ACL_{GROUP,USER} entries in the uapi POSIX ACL format aren't raw {g,u}id values that need to be mapped according to the filesystem's idmapping. Instead they are {g,u}id values in the caller's idmapping which have been generated during posix_acl_fix_xattr_from_user(). In other words, they are k{g,u}id_t which are passed as raw {g,u}id values abusing the uapi POSIX ACL format (Please note that this type safety violation has existed since the introduction of k{g,u}id_t. Please see [1] for more details.). So when posix_acl_from_xattr() is called in posix_acl_xattr_set() the filesystem idmapping is completely irrelevant. Instead, we abuse the initial idmapping to recover the k{g,u}id_t based on the values stored as raw {g,u}ids in ACL_{GROUP,USER} entries in the uapi POSIX ACL format. We need to clearly distinguish between these two operations, as they are really easy for filesystems to confuse, as can be seen in ntfs3. In order to do this, we factor out make_posix_acl() which takes callbacks allowing callers to pass dedicated methods to generate the correct k{g,u}id_t. This is just an internal static helper which is not exposed to any filesystems, but it neatly encapsulates the basic logic of walking through a uapi POSIX ACL and returning an allocated VFS POSIX ACL with the correct k{g,u}id_t values. The posix_acl_from_xattr() helper can then be implemented as a simple call to make_posix_acl() with callbacks that generate the correct k{g,u}id_t from the raw {g,u}id values in ACL_{GROUP,USER} entries in the uapi POSIX ACL format as read from the backing store.
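To make the getxattr()-side behavior described above concrete, here is a minimal sketch of the i_op->get_acl() pattern; it is not taken from any real filesystem, and the backing-store read helper is a hypothetical placeholder:

#include <linux/err.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>

static struct posix_acl *example_get_acl(struct inode *inode, int type)
{
	struct posix_acl *acl;
	void *value = NULL;
	int size;

	/* Placeholder: read the raw system.posix_acl_* blob from disk. */
	size = example_read_raw_acl(inode, type, &value);
	if (size <= 0)
		return size ? ERR_PTR(size) : NULL;

	/* Map raw {g,u}ids to k{g,u}id_t in the filesystem's idmapping. */
	acl = posix_acl_from_xattr(inode->i_sb->s_user_ns, value, size);
	kfree(value);
	return acl;
}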
For setxattr(), we add a new helper vfs_set_acl_prepare() which has callbacks to map the POSIX ACLs from the uapi format, with the k{g,u}id_t values stored in raw {g,u}id format in ACL_{GROUP,USER} entries, into the correct k{g,u}id_t values in the filesystem idmapping. In contrast to posix_acl_from_xattr(), the vfs_set_acl_prepare() helper needs to take the mount idmapping into account. The differences are explained in more detail in the kernel doc for the new functions. In follow-up patches we will remove all abuses of posix_acl_from_xattr() for setxattr() operations and replace it with calls to vfs_set_acl_prepare(). The new vfs_set_acl_prepare() helper allows us to deal with the ambiguity in how the POSIX ACL uapi struct stores {g,u}id values depending on whether this is a getxattr() or setxattr() operation. This also allows us to remove the posix_acl_setxattr_idmapped_mnt() helper, reducing the abuse of the POSIX ACL uapi format to pass values that should really be distinct types as raw {g,u}id values stored in ACL_{GROUP,USER} entries. The removal of posix_acl_setxattr_idmapped_mnt() in turn allows us to re-constify the value parameter of vfs_setxattr(), which in turn allows us to avoid the nasty cast from a const void pointer to a non-const void pointer in ovl_do_setxattr(). Ultimately, the plan is to get rid of the type violations completely and never pass the values from k{g,u}id_t as raw {g,u}ids in ACL_{GROUP,USER} entries in the uapi POSIX ACL format. But that's a longer way to go and this is a preparatory step. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Co-Developed-by: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl_xattr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index b6bd3eac2bcc..47eca15fd842 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -66,6 +66,9 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, size_t size); int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size); +struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const void *value, size_t size); extern const struct xattr_handler posix_acl_access_xattr_handler; extern const struct xattr_handler posix_acl_default_xattr_handler; -- cgit v1.2.3 From 66d1c8021e1d9c97101d58fa09c33c002d96747a Mon Sep 17 00:00:00 2001 From: Piyush Mehta Date: Mon, 22 Aug 2022 11:10:51 +0530 Subject: usb: chipidea: Add support for VBUS control with PHY Some platforms make use of VBUS control through the PHY, which means the controller driver has to access PHY registers to turn the VBUS line on/off. This patch adds support for such platforms in chipidea. The flag 'CI_HDRC_PHY_VBUS_CONTROL' is added to support the VBUS control feature.
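As a hedged illustration of how a platform might opt in to this behavior, glue code could set the new bit in its ci_hdrc_platform_data; the surrounding field values here are assumptions about a typical setup, not taken from a real board file:

#include <linux/usb/chipidea.h>

static struct ci_hdrc_platform_data example_ci_pdata = {
	.name		= "ci_hdrc_example",	/* hypothetical name */
	.capoffset	= DEF_CAPOFFSET,
	.flags		= CI_HDRC_PHY_VBUS_CONTROL,
};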
Acked-by: Peter Chen Signed-off-by: Piyush Mehta Signed-off-by: Piyush Mehta Link: https://lore.kernel.org/r/20220822054051.2941282-1-piyush.mehta@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/chipidea.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h index edf3342507f1..ee38835ed77c 100644 --- a/include/linux/usb/chipidea.h +++ b/include/linux/usb/chipidea.h @@ -62,6 +62,7 @@ struct ci_hdrc_platform_data { #define CI_HDRC_REQUIRES_ALIGNED_DMA BIT(13) #define CI_HDRC_IMX_IS_HSIC BIT(14) #define CI_HDRC_PMQOS BIT(15) +#define CI_HDRC_PHY_VBUS_CONTROL BIT(16) enum usb_dr_mode dr_mode; #define CI_HDRC_CONTROLLER_RESET_EVENT 0 #define CI_HDRC_CONTROLLER_STOPPED_EVENT 1 -- cgit v1.2.3 From 66df18b3bd74107dd7c196e75ce00d64d7553152 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 31 Aug 2022 10:47:50 +0200 Subject: gpio: ucb1400: Use proper header The UCB1400 implements a GPIO driver, so it needs to include the <linux/gpio/driver.h> header, not the legacy <linux/gpio.h> header. Compile tested on pxa_defconfig. Signed-off-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/ucb1400.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h index 22345391350b..2516082cd3a9 100644 --- a/include/linux/ucb1400.h +++ b/include/linux/ucb1400.h @@ -23,7 +23,7 @@ #include #include #include -#include <linux/gpio.h> +#include <linux/gpio/driver.h> /* * UCB1400 AC-link registers -- cgit v1.2.3 From d8f3f5834febb74d18b5b2098f67d9db740f3e30 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Aug 2022 10:36:32 -0700 Subject: rcu: Update rcu_access_pointer() header for rcu_dereference_protected() The rcu_access_pointer() docbook header correctly notes that it may be used during post-grace-period teardown. However, it is usually better to use rcu_dereference_protected() for this purpose. This commit therefore calls out this preferred usage. Reported-by: Maxim Mikityanskiy Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f527f27e6438..61a1a85c720c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -496,13 +496,21 @@ do { \ * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. + * Within an RCU read-side critical section, there is little reason to + * use rcu_access_pointer(). + * + * It is usually best to test the rcu_access_pointer() return value + * directly in order to avoid accidental dereferences being introduced + * by later inattentive changes. In other words, assigning the + * rcu_access_pointer() return value to a local variable results in an + * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side - * access to the pointer was removed at least one grace period ago, as - * is the case in the context of the RCU callback that is freeing up - * the data, or after a synchronize_rcu() returns. This can be useful - * when tearing down multi-linked structures after a grace period - * has elapsed.
+ access to the pointer was removed at least one grace period ago, as is + the case in the context of the RCU callback that is freeing up the data, + or after a synchronize_rcu() returns. This can be useful when tearing + down multi-linked structures after a grace period has elapsed. However, + rcu_dereference_protected() is normally preferred for this use case. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) -- cgit v1.2.3 From 91a967fd6934abc0c7e4b0d26728e38674278707 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jul 2022 15:37:05 -0700 Subject: rcu: Add full-sized polling for get_completed*() and poll_state*() The get_completed_synchronize_rcu() and poll_state_synchronize_rcu() APIs compress the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds the first members of the full-state RCU grace-period polling API, namely the get_completed_synchronize_rcu_full() and poll_state_synchronize_rcu_full() functions. These use up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but are guaranteed not to miss grace periods, at least in situations where the single-CPU grace-period optimization does not apply. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 +++ include/linux/rcutiny.h | 9 +++++++++ include/linux/rcutree.h | 8 ++++++++ 3 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f527f27e6438..faaa174dfb27 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -42,7 +42,10 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); + +struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); #ifdef CONFIG_PREEMPT_RCU diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 62815c0a2dce..1fbff8600d92 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -14,10 +14,19 @@ #include <asm/param.h> /* for HZ */ +struct rcu_gp_oldstate { + unsigned long rgos_norm; +}; + unsigned long get_state_synchronize_rcu(void); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); +static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + return poll_state_synchronize_rcu(rgosp->rgos_norm); +} + static inline void cond_synchronize_rcu(unsigned long oldstate) { might_sleep(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 47eaa4cb0df7..4ccbc3aa9dc2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,11 +40,19 @@ bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); + +struct rcu_gp_oldstate { + unsigned long rgos_norm; + unsigned long rgos_exp; + unsigned long rgos_polled; +}; + unsigned long start_poll_synchronize_rcu_expedited(void); void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); unsigned long
start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); +bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); bool rcu_is_idle_cpu(int cpu); -- cgit v1.2.3 From 3fdefca9b42c8bebe3beea5c1a067c9718ca0fc7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jul 2022 19:58:13 -0700 Subject: rcu: Add full-sized polling for get_state() The get_state_synchronize_rcu() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds the next member of the full-state RCU grace-period polling API, namely the get_state_synchronize_rcu_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 ++++++ include/linux/rcutree.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1fbff8600d92..6e299955c4e9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -19,6 +19,12 @@ struct rcu_gp_oldstate { }; unsigned long get_state_synchronize_rcu(void); + +static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = get_state_synchronize_rcu(); +} + unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 4ccbc3aa9dc2..7b769f1b417a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -50,6 +50,7 @@ struct rcu_gp_oldstate { unsigned long start_poll_synchronize_rcu_expedited(void); void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); +void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); -- cgit v1.2.3 From 76ea364161e72b1878126edf8d507d2a62fb47b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Aug 2022 17:04:54 -0700 Subject: rcu: Add full-sized polling for start_poll() The start_poll_synchronize_rcu() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds the next member of the full-state RCU grace-period polling API, namely the start_poll_synchronize_rcu_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 6 ++++++ include/linux/rcutree.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6e299955c4e9..6bc30e46a819 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -26,6 +26,12 @@ static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) } unsigned long start_poll_synchronize_rcu(void); + +static inline void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = start_poll_synchronize_rcu(); +} + bool poll_state_synchronize_rcu(unsigned long oldstate); static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 7b769f1b417a..8f2e0f0b26f6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -52,6 +52,7 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); +void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); -- cgit v1.2.3 From 6c502b14ba66da0670a59e20354469fa56eab26c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Aug 2022 12:38:51 -0700 Subject: rcu: Add full-sized polling for start_poll_expedited() The start_poll_synchronize_rcu_expedited() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds yet another member of the full-state RCU grace-period polling API, which is the start_poll_synchronize_rcu_expedited_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. [ paulmck: Apply feedback from kernel test robot and Julia Lawall. ] Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6bc30e46a819..653e35777a99 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -49,6 +49,11 @@ static inline unsigned long start_poll_synchronize_rcu_expedited(void) return start_poll_synchronize_rcu(); } +static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = start_poll_synchronize_rcu_expedited(); +} + static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) { cond_synchronize_rcu(oldstate); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 8f2e0f0b26f6..7151fd861736 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -48,6 +48,7 @@ struct rcu_gp_oldstate { }; unsigned long start_poll_synchronize_rcu_expedited(void); +void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); -- cgit v1.2.3 From b6fe4917ae4353b397079902cb024ae01f20dfb2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Aug 2022 13:46:05 -0700 Subject: rcu: Add full-sized polling for cond_sync_full() The cond_synchronize_rcu() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds yet another member of the full-state RCU grace-period polling API, which is the cond_synchronize_rcu_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. [ paulmck: Apply feedback from kernel test robot and Julia Lawall. ] Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 653e35777a99..3bee97f76bf4 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -44,6 +44,11 @@ static inline void cond_synchronize_rcu(unsigned long oldstate) might_sleep(); } +static inline void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + cond_synchronize_rcu(rgosp->rgos_norm); +} + static inline unsigned long start_poll_synchronize_rcu_expedited(void) { return start_poll_synchronize_rcu(); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 7151fd861736..1b44288c027d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -57,6 +57,7 @@ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); +void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool rcu_is_idle_cpu(int cpu); -- cgit v1.2.3 From 8df13f01608ea48712956c0b1afce35bdba5a1c5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 4 Aug 2022 15:23:26 -0700 Subject: rcu: Add full-sized polling for cond_sync_exp_full() The cond_synchronize_rcu_expedited() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds yet another member of the full-state RCU grace-period polling API, which is the cond_synchronize_rcu_expedited_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 3bee97f76bf4..4405e9112cee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -64,6 +64,11 @@ static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) cond_synchronize_rcu(oldstate); } +static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + cond_synchronize_rcu_expedited(rgosp->rgos_norm); +} + extern void rcu_barrier(void); static inline void synchronize_rcu_expedited(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 1b44288c027d..755b082f4ec6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -50,6 +50,7 @@ struct rcu_gp_oldstate { unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); +void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); -- cgit v1.2.3 From 7ecef0871dd9a879038dbe8a681ab48bd0c92988 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Aug 2022 17:54:53 -0700 Subject: rcu: Remove ->rgos_polled field from rcu_gp_oldstate structure Because both normal and expedited grace periods increment their respective counters on their pre-scheduler early boot fastpaths, the rcu_gp_oldstate structure no longer needs its ->rgos_polled field. This commit therefore removes this field, shrinking this structure so that it is the same size as an rcu_head structure. Signed-off-by: Paul E. McKenney --- include/linux/rcutree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 755b082f4ec6..455a03bdce15 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -44,7 +44,6 @@ bool rcu_gp_might_be_stalled(void); struct rcu_gp_oldstate { unsigned long rgos_norm; unsigned long rgos_exp; - unsigned long rgos_polled; }; unsigned long start_poll_synchronize_rcu_expedited(void); -- cgit v1.2.3 From 18538248e5486b0f0e8581083de275176674cd1f Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Wed, 24 Aug 2022 15:39:09 -0700 Subject: rcu: Add functions to compare grace-period state values This commit adds same_state_synchronize_rcu() and same_state_synchronize_rcu_full() functions to compare grace-period state values, for example, those obtained from get_state_synchronize_rcu() and get_state_synchronize_rcu_full(). These functions allow small structures to omit these state values by placing them in list headers for lists containing structures with the same token value. Presumably the per-structure list pointers are the same ones used to link the structures into whatever reader-accessible data structure was used. This commit also adds both NUM_ACTIVE_RCU_POLL_OLDSTATE and NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, which define the maximum number of distinct unsigned long values and rcu_gp_oldstate values, respectively, corresponding to not-yet-completed grace periods. These values can be used to size arrays of the list headers described above. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 21 +++++++++++++++++++++ include/linux/rcutiny.h | 14 ++++++++++++++ include/linux/rcutree.h | 28 ++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index faaa174dfb27..9941d5c3d5e1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,27 @@ struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); +// Maximum number of unsigned long values corresponding to +// not-yet-completed RCU grace periods. +#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 + +/** + * same_state_synchronize_rcu - Are two old-state values identical? + * @oldstate1: First old-state value. + * @oldstate2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or + * get_completed_synchronize_rcu(). Returns @true if the two values are + * identical and @false otherwise. This allows structures whose lifetimes + * are tracked by old-state values to push these values to a list header, + * allowing those structures to be slightly smaller. + */ +static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) +{ + return oldstate1 == oldstate2; +} + #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4405e9112cee..768196a5f39d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -18,6 +18,20 @@ struct rcu_gp_oldstate { unsigned long rgos_norm; }; +// Maximum number of rcu_gp_oldstate values corresponding to +// not-yet-completed RCU grace periods. +#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 2 + +/* + * Are the two oldstate values the same? See the Tree RCU version for + * docbook header. 
+ */ +static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, + struct rcu_gp_oldstate *rgosp2) +{ + return rgosp1->rgos_norm == rgosp2->rgos_norm; +} + unsigned long get_state_synchronize_rcu(void); static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 455a03bdce15..5efb51486e8a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -46,6 +46,34 @@ struct rcu_gp_oldstate { unsigned long rgos_exp; }; +// Maximum number of rcu_gp_oldstate values corresponding to +// not-yet-completed RCU grace periods. +#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4 + +/** + * same_state_synchronize_rcu_full - Are two old-state values identical? + * @rgosp1: First old-state value. + * @rgosp2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or get_completed_synchronize_rcu_full(). Returns @true if the two + * values are identical and @false otherwise. This allows structures + * whose lifetimes are tracked by old-state values to push these values + * to a list header, allowing those structures to be slightly smaller. + * + * Note that equality is judged on a bitwise basis, so that an + * @rcu_gp_oldstate structure with an already-completed state in one field + * will compare not-equal to a structure with an already-completed state + * in the other field. After all, the @rcu_gp_oldstate structure is opaque + * so how did such a situation come to pass in the first place? + */ +static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, + struct rcu_gp_oldstate *rgosp2) +{ + return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp; +} + unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); -- cgit v1.2.3 From d66e4cf974a53c1195f1f5a96387ee5dbad2bdf2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Aug 2022 10:18:23 -0700 Subject: srcu: Add GP and maximum requested GP to Tiny SRCU rcutorture output This commit adds the ->srcu_idx and ->srcu_max_idx fields to the Tiny SRCU rcutorture output for additional diagnostics. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 6cfaa0a9a9b9..4fcec6f5af90 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -82,10 +82,12 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, int idx; idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %hu->%hu\n", tt, tf, idx, data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])), - data_race(READ_ONCE(ssp->srcu_lock_nesting[idx]))); + data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])), + data_race(READ_ONCE(ssp->srcu_idx)), + data_race(READ_ONCE(ssp->srcu_idx_max))); } #endif -- cgit v1.2.3 From 5fe89191e43fb37e874b8e5177fb2a5c72379b06 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 2 Aug 2022 15:32:47 -0700 Subject: srcu: Make Tiny SRCU use full-sized grace-period counters This commit makes Tiny SRCU use full-sized grace-period counters to further avoid counter-wrap issues when using polled grace-period APIs. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4fcec6f5af90..5aa5e0faf6a1 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -15,10 +15,10 @@ struct srcu_struct { short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ - unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ - unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */ u8 srcu_gp_running; /* GP workqueue running? */ u8 srcu_gp_waiting; /* GP waiting for readers? */ + unsigned long srcu_idx; /* Current reader array element in bit 0x2. */ + unsigned long srcu_idx_max; /* Furthest future srcu_idx request. */ struct swait_queue_head srcu_wq; /* Last srcu_read_unlock() wakes GP. */ struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ @@ -82,7 +82,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, int idx; idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %hu->%hu\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %lu->%lu\n", tt, tf, idx, data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])), data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])), -- cgit v1.2.3 From f9cad07b840ec8a8eb54928908d694b6e262631c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 30 Aug 2022 18:32:47 +0300 Subject: thunderbolt: Show link type for XDomain connections too Following what we do for routers already, extend this to XDomain connections as well. This will show in sysfs whether the link is in USB4 or Thunderbolt mode. Signed-off-by: Mika Westerberg Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9f442d73f3df..90cd08ab2f5d 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -187,6 +187,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); * @device_name: Name of the device (or %NULL if not known) * @link_speed: Speed of the link in Gb/s * @link_width: Width of the link (1 or 2) + * @link_usb4: Downstream link is USB4 * @is_unplugged: The XDomain is unplugged * @needs_uuid: If the XDomain does not have @remote_uuid it will be * queried first @@ -234,6 +235,7 @@ struct tb_xdomain { const char *device_name; unsigned int link_speed; unsigned int link_width; + bool link_usb4; bool is_unplugged; bool needs_uuid; struct ida service_ids; -- cgit v1.2.3 From ec1bd37123c607ca6485beb4542a792a4db765aa Mon Sep 17 00:00:00 2001 From: Khalid Masum Date: Thu, 18 Aug 2022 10:07:38 +0600 Subject: fscache: fix misdocumented parameter This patch fixes two warnings generated by make docs. The functions fscache_use_cookie and fscache_unuse_cookie both have a parameter named cookie, but they are documented with the name "object" and an unclear description, which generates the warnings when the docs are created. This commit replaces the misdocumented parameter names with the correct ones and adds proper descriptions.
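Since the corrected kernel-doc now names the @cookie parameter explicitly, a minimal usage sketch of the pair may be helpful; this is an illustrative fragment using the signatures shown in the diff below, not code from the patch:

#include <linux/fscache.h>

static void example_write_to_cache(struct fscache_cookie *cookie,
				   loff_t new_size)
{
	/* Pin the cookie; will_modify == true because we will write. */
	fscache_use_cookie(cookie, true);

	/* ... write data to the cache object here ... */

	/* Drop the use count, passing the revised object size. */
	fscache_unuse_cookie(cookie, NULL, &new_size);
}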
CC: Randy Dunlap Signed-off-by: Khalid Masum Signed-off-by: David Howells Link: https://lore.kernel.org/r/20220521142446.4746-1-khalid.masum.92@gmail.com/ # v1 Link: https://lore.kernel.org/r/20220818040738.12036-1-khalid.masum.92@gmail.com/ # v2 Link: https://lore.kernel.org/r/880d7d25753fb326ee17ac08005952112fcf9bdb.1657360984.git.mchehab@kernel.org/ # Mauro's version --- include/linux/fscache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 720874e6ee94..36e5dd84cf59 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -258,7 +258,7 @@ struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume, /** * fscache_use_cookie - Request usage of cookie attached to an object - * @object: Object description + * @cookie: The cookie representing the cache object * @will_modify: If cache is expected to be modified locally * * Request usage of the cookie attached to an object. The caller should tell @@ -274,7 +274,7 @@ static inline void fscache_use_cookie(struct fscache_cookie *cookie, /** * fscache_unuse_cookie - Cease usage of cookie attached to an object - * @object: Object description + * @cookie: The cookie representing the cache object * @aux_data: Updated auxiliary data (or NULL) * @object_size: Revised size of the object (or NULL) * -- cgit v1.2.3 From 52edb4080eb9606536c34d5d642ccd9d35ad5d08 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Aug 2022 14:38:43 +0200 Subject: acl: move idmapping handling into posix_acl_xattr_set() The uapi POSIX ACL struct passed through the value argument during setxattr() contains {g,u}id values encoded via ACL_{GROUP,USER} entries that should actually be stored in the form of k{g,u}id_t (See [1] for a long explanation of the issue.). In 0c5fd887d2bb ("acl: move idmapped mount fixup into vfs_{g,s}etxattr()") we took the mount's idmapping into account in order to let overlayfs handle POSIX ACLs on idmapped layers correctly. The fixup is currently performed directly in vfs_setxattr(), which piles on top of the earlier hackiness by handling the mount's idmapping and stuffing the vfs{g,u}id_t values into the uapi struct as well. While that is all correct and works fine, it's just ugly. Now that we have introduced vfs_make_posix_acl() earlier, move the handling of idmapped mounts out of vfs_setxattr() and into the POSIX ACL handler where it belongs. Note that we also need to call vfs_make_posix_acl() for EVM, which interprets POSIX ACLs during security_inode_setxattr(). Leave a longer comment there for future reference. All filesystems that support idmapped mounts via FS_ALLOW_IDMAP use the standard POSIX ACL xattr handlers and are covered by this change. This includes overlayfs, which simply calls vfs_{g,s}etxattr(). The following filesystems use custom POSIX ACL xattr handlers: 9p, cifs, ecryptfs, and ntfs3 (and overlayfs, but we've covered that in the paragraph above), and none of them support idmapped mounts yet.
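To visualize where the fixup now lives, here is a simplified sketch of the shape of such a ->set() handler after this change; it is not the kernel's actual posix_acl_xattr_set() and it omits the removal, caching, and permission-checking details:

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>

static int example_posix_acl_xattr_set(const struct xattr_handler *handler,
				       struct user_namespace *mnt_userns,
				       struct dentry *dentry,
				       struct inode *inode, const char *name,
				       const void *value, size_t size,
				       int flags)
{
	struct posix_acl *acl = NULL;
	int ret;

	if (value) {
		/* Mount and filesystem idmappings are handled here now. */
		acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
					  value, size);
		if (IS_ERR(acl))
			return PTR_ERR(acl);
	}

	ret = inode->i_op->set_acl(mnt_userns, inode, acl, handler->flags);
	posix_acl_release(acl);
	return ret;
}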
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org/ [1] Signed-off-by: Christian Brauner (Microsoft) Reviewed-by: Seth Forshee (DigitalOcean) --- include/linux/posix_acl_xattr.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 47eca15fd842..8163dd48c430 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -38,9 +38,6 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size); void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, const struct inode *inode, void *value, size_t size); -void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size); #else static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) { @@ -54,12 +51,6 @@ posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, size_t size) { } -static inline void -posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, void *value, - size_t size) -{ -} #endif struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, -- cgit v1.2.3 From 6344e66970c619a1623f457910e78819076e9104 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Aug 2022 14:38:45 +0200 Subject: xattr: constify value argument in vfs_setxattr() Now that we don't perform translations directly in vfs_setxattr() anymore we can constify the @value argument in vfs_setxattr(). This also allows us to remove the hack to cast from a const in ovl_do_setxattr(). Signed-off-by: Christian Brauner (Microsoft) Reviewed-by: Seth Forshee (DigitalOcean) --- include/linux/xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 979a9d3e5bfb..4c379d23ec6e 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct user_namespace *, struct dentry *, const char *, - void *, size_t, int); + const void *, size_t, int); int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, const char *, struct inode **); -- cgit v1.2.3 From b6df1cbb415e543f2908f9c59a8fb20714b86879 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 30 Aug 2022 18:26:10 +0100 Subject: coresight: Simplify sysfs accessors by using csdev_access abstraction The coresight_device struct is available in the sysfs accessor, and this contains a csdev_access struct which can be used to access registers. Use this instead of passing in the type of each drvdata so that a common function can be shared between all the cs drivers. No functional changes. 
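For a sense of what the shared abstraction buys, a sysfs show routine can now read a LO/HI register pair without knowing the drvdata type; the offsets below are placeholders, not real CoreSight register addresses:

#include <linux/coresight.h>

#define EXAMPLE_CNTR_LO	0x100	/* placeholder offset */
#define EXAMPLE_CNTR_HI	0x104	/* placeholder offset */

static u64 example_read_counter(struct coresight_device *csdev)
{
	/* csdev->access works for both MMIO and system-register devices. */
	return csdev_access_relaxed_read_pair(&csdev->access,
					      EXAMPLE_CNTR_LO,
					      EXAMPLE_CNTR_HI);
}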
Signed-off-by: James Clark Reviewed-by: Mike Leach Link: https://lore.kernel.org/r/20220830172614.340962-3-james.clark@arm.com Signed-off-by: Mathieu Poirier --- include/linux/coresight.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 9f445f09fcfe..a47dd1f62216 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -372,6 +372,24 @@ static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, return csa->read(offset, true, false); } +static inline u64 csdev_access_relaxed_read_pair(struct csdev_access *csa, + s32 lo_offset, s32 hi_offset) +{ + u64 val; + + if (likely(csa->io_mem)) { + val = readl_relaxed(csa->base + lo_offset); + val |= (hi_offset < 0) ? 0 : + (u64)readl_relaxed(csa->base + hi_offset) << 32; + return val; + } + + val = csa->read(lo_offset, true, false); + val |= (hi_offset < 0) ? 0 : + (u64)csa->read(hi_offset, true, false) << 32; + return val; +} + static inline u32 csdev_access_read32(struct csdev_access *csa, u32 offset) { if (likely(csa->io_mem)) -- cgit v1.2.3 From 0a98181f805058773961c5ab3172ecf1bf1ed0e1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 30 Aug 2022 18:26:13 +0100 Subject: coresight: Make new csdev_access offsets unsigned New csdev_access functions were added as part of the previous refactor. In order to make them more consistent with the existing ones, change any signed offset types to be unsigned. Now that they are unsigned, stop using hi_off = -1 to signify a single 32bit access. Instead just call the existing 32bit accessors. This is also applied to other parts of the codebase, and the coresight_{read,write}_reg_pair() functions can be deleted. Signed-off-by: James Clark Reviewed-by: Mike Leach Link: https://lore.kernel.org/r/20220830172614.340962-6-james.clark@arm.com Signed-off-by: Mathieu Poirier --- include/linux/coresight.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a47dd1f62216..1554021231f9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -373,21 +373,26 @@ static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, } static inline u64 csdev_access_relaxed_read_pair(struct csdev_access *csa, - s32 lo_offset, s32 hi_offset) + u32 lo_offset, u32 hi_offset) { - u64 val; - if (likely(csa->io_mem)) { - val = readl_relaxed(csa->base + lo_offset); - val |= (hi_offset < 0) ? 0 : - (u64)readl_relaxed(csa->base + hi_offset) << 32; - return val; + return readl_relaxed(csa->base + lo_offset) | + ((u64)readl_relaxed(csa->base + hi_offset) << 32); } - val = csa->read(lo_offset, true, false); - val |= (hi_offset < 0) ? 
0 : - (u64)csa->read(hi_offset, true, false) << 32; - return val; + return csa->read(lo_offset, true, false) | (csa->read(hi_offset, true, false) << 32); +} + +static inline void csdev_access_relaxed_write_pair(struct csdev_access *csa, u64 val, + u32 lo_offset, u32 hi_offset) +{ + if (likely(csa->io_mem)) { + writel_relaxed((u32)val, csa->base + lo_offset); + writel_relaxed((u32)(val >> 32), csa->base + hi_offset); + } else { + csa->write((u32)val, lo_offset, true, false); + csa->write((u32)(val >> 32), hi_offset, true, false); + } } static inline u32 csdev_access_read32(struct csdev_access *csa, u32 offset) -- cgit v1.2.3 From 92d23c6e94157739b997cacce151586a0d07bb8a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Aug 2022 09:21:16 -0700 Subject: overflow, tracing: Define the is_signed_type() macro once There are two definitions of the is_signed_type() macro: one in <linux/overflow.h> and a second definition in <linux/trace_events.h>. As suggested by Linus Torvalds, move the definition of the is_signed_type() macro into the <linux/compiler.h> header file. Change the definition of the is_signed_type() macro to make sure that it does not trigger any sparse warnings with future versions of sparse for bitwise types. See also: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/ https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/ Cc: Andrew Morton Cc: Arnd Bergmann Cc: Dan Williams Cc: Eric Dumazet Cc: Ingo Molnar Cc: Isabella Basso Cc: "Jason A. Donenfeld" Cc: Josh Poimboeuf Cc: Luc Van Oostenryck Cc: Masami Hiramatsu Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Sander Vanheule Cc: Steven Rostedt Cc: Vlastimil Babka Cc: Yury Norov Signed-off-by: Bart Van Assche Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220826162116.1050972-3-bvanassche@acm.org --- include/linux/compiler.h | 6 ++++++ include/linux/overflow.h | 1 - include/linux/trace_events.h | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 01ce94b58b42..7713d7bcdaea 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * Whether 'type' is a signed type or an unsigned type. Supports scalar types, + * bool and also pointer types. + */ +#define is_signed_type(type) (((type)(-1)) < (__force type)1) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index f1221d11f8e5..0eb3b192f07a 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -30,7 +30,6 @@ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere.
*/ -#define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index b18759a673c6..8401dec93c15 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); -#define is_signed_type(type) (((type)(-1)) < (type)1) - int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, -- cgit v1.2.3 From bd8092def983f567800f764a5d23b2dca98f078c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:56 +0200 Subject: PM: suspend: move from strlcpy() with unused retval to strscpy() Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 70f2921e2e70..23a253df7f6b 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -75,7 +75,7 @@ extern struct suspend_stats suspend_stats; static inline void dpm_save_failed_dev(const char *name) { - strlcpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], + strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], name, sizeof(suspend_stats.failed_devs[0])); suspend_stats.last_failed_dev++; -- cgit v1.2.3 From 21370ecddfe1ff6fb826faedb601cfbb7adcf4ff Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Thu, 1 Sep 2022 01:21:51 +0800 Subject: soc: mediatek: mutex: Add mt8186 mutex mod settings for mdp3 Add mt8186 mutex mod settings for mdp3. Co-developed-by: Xiandong Wang Signed-off-by: Xiandong Wang Signed-off-by: Allen-KH Cheng Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220831172151.10215-3-allen-kh.cheng@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mutex.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mutex.h b/include/linux/soc/mediatek/mtk-mutex.h index a0f4f51a3b45..b335c2837cd8 100644 --- a/include/linux/soc/mediatek/mtk-mutex.h +++ b/include/linux/soc/mediatek/mtk-mutex.h @@ -20,6 +20,8 @@ enum mtk_mutex_mod_index { MUTEX_MOD_IDX_MDP_WDMA, MUTEX_MOD_IDX_MDP_AAL0, MUTEX_MOD_IDX_MDP_CCORR0, + MUTEX_MOD_IDX_MDP_HDR0, + MUTEX_MOD_IDX_MDP_COLOR0, MUTEX_MOD_IDX_MAX /* ALWAYS keep at the end */ }; -- cgit v1.2.3 From 2555283eb40df89945557273121e9393ef9b542b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 31 Aug 2022 19:06:00 +0200 Subject: mm/rmap: Fix anon_vma->degree ambiguity leading to double-reuse anon_vma->degree tracks the combined number of child anon_vmas and VMAs that use the anon_vma as their ->anon_vma. 
anon_vma_clone() then assumes that for any anon_vma attached to src->anon_vma_chain other than src->anon_vma, it is impossible for it to be a leaf node of the VMA tree, so for such VMAs ->degree is elevated by 1 because of a child anon_vma, and therefore a ->degree of 1 implies that there are no VMAs that use the anon_vma as their ->anon_vma. This assumption is wrong because the ->degree optimization leads to leaf nodes being abandoned on anon_vma_clone() - an existing anon_vma is reused and no new parent-child relationship is created. So it is possible to reuse an anon_vma for one VMA while it is still tied to another VMA. This is an issue because is_mergeable_anon_vma() and its callers assume that if two VMAs have the same ->anon_vma, the list of anon_vmas attached to the VMAs is guaranteed to be the same. When this assumption is violated, vma_merge() can merge pages into a VMA that is not attached to the corresponding anon_vma, leading to dangling page->mapping pointers that will be dereferenced during rmap walks. Fix it by separately tracking the number of child anon_vmas and the number of VMAs using the anon_vma as their ->anon_vma. Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Cc: stable@kernel.org Acked-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf80adca980b..b89b4b86951f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -41,12 +41,15 @@ struct anon_vma { atomic_t refcount; /* - * Count of child anon_vmas and VMAs which points to this anon_vma. + * Count of child anon_vmas. Equals to the count of all anon_vmas that + * have ->parent pointing to this one, including itself. * * This counter is used for making decision about reusing anon_vma * instead of forking new one. See comments in function anon_vma_clone. */ - unsigned degree; + unsigned long num_children; + /* Count of VMAs whose ->anon_vma pointer points to this object. */ + unsigned long num_active_vmas; struct anon_vma *parent; /* Parent of this anon_vma */ -- cgit v1.2.3 From 12198d9179aaa53d0a4026318d8b73b146d89729 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 20 Jul 2022 10:29:34 +0200 Subject: clk: davinci: remove PLL and PSC clocks for DaVinci DM644x and DM646x Commit 7dd33764486d ("ARM: davinci: Delete DM644x board files") and commit b4aed01de486 ("ARM: davinci: Delete DM646x board files") remove the support for DaVinci DM644x and DM646x boards. Hence, remove the PLL and PSC clock descriptions for those boards as well.
Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20220720082934.17741-1-lukas.bulwahn@gmail.com Reviewed-by: David Lechner Reviewed-by: Linus Walleij Signed-off-by: Stephen Boyd --- include/linux/clk/davinci.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/davinci.h b/include/linux/clk/davinci.h index 8a7b5cd7eac0..f6ebab6228c2 100644 --- a/include/linux/clk/davinci.h +++ b/include/linux/clk/davinci.h @@ -28,13 +28,5 @@ int dm365_pll1_init(struct device *dev, void __iomem *base, struct regmap *cfgch int dm365_pll2_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); int dm365_psc_init(struct device *dev, void __iomem *base); #endif -#ifdef CONFIG_ARCH_DAVINCI_DM644x -int dm644x_pll1_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); -int dm644x_psc_init(struct device *dev, void __iomem *base); -#endif -#ifdef CONFIG_ARCH_DAVINCI_DM646x -int dm646x_pll1_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); -int dm646x_psc_init(struct device *dev, void __iomem *base); -#endif #endif /* __LINUX_CLK_DAVINCI_PLL_H___ */ -- cgit v1.2.3 From 9a1043d43a9ab75d28b8ec54512ea13ec33bc910 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 22 Aug 2022 22:07:22 +0300 Subject: EDAC/mc: Replace spaces with tabs in memtype flags definition Currently, the memory type macros are partly defined with multiple spaces between the macro name and its definition. Replace the spaces with tabs as the kernel coding style requires. Signed-off-by: Serge Semin Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220822190730.27277-13-Sergey.Semin@baikalelectronics.ru --- include/linux/edac.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index e730b3468719..fa4bda2a70f6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -231,21 +231,21 @@ enum mem_type { #define MEM_FLAG_DDR BIT(MEM_DDR) #define MEM_FLAG_RDDR BIT(MEM_RDDR) #define MEM_FLAG_RMBS BIT(MEM_RMBS) -#define MEM_FLAG_DDR2 BIT(MEM_DDR2) -#define MEM_FLAG_FB_DDR2 BIT(MEM_FB_DDR2) -#define MEM_FLAG_RDDR2 BIT(MEM_RDDR2) -#define MEM_FLAG_XDR BIT(MEM_XDR) -#define MEM_FLAG_DDR3 BIT(MEM_DDR3) -#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) -#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) -#define MEM_FLAG_DDR4 BIT(MEM_DDR4) -#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) -#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) -#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) -#define MEM_FLAG_DDR5 BIT(MEM_DDR5) -#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) -#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) -#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_DDR2 BIT(MEM_DDR2) +#define MEM_FLAG_FB_DDR2 BIT(MEM_FB_DDR2) +#define MEM_FLAG_RDDR2 BIT(MEM_RDDR2) +#define MEM_FLAG_XDR BIT(MEM_XDR) +#define MEM_FLAG_DDR3 BIT(MEM_DDR3) +#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) +#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) +#define MEM_FLAG_DDR4 BIT(MEM_DDR4) +#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) +#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) +#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) +#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) +#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2) -- cgit v1.2.3 From 26a40990ba052e6f553256f9d0f112452b992a38 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:22 +0900 Subject: 
mm/sl[au]b: cleanup kmem_cache_alloc[_node]_trace() Despite its name, kmem_cache_alloc[_node]_trace() is a hook for inlined kmalloc. So rename it to kmalloc[_node]_trace(). Move its implementation to slab_common.c by using __kmem_cache_alloc_node(), but keep CONFIG_TRACING=n variants to save a function call when CONFIG_TRACING=n. Use __assume_kmalloc_alignment for kmalloc[_node]_trace instead of __assume_slab_alignment. Generally kmalloc has larger alignment requirements. Suggested-by: Vlastimil Babka Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4ee5b2fed164..c8e485ce8815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -449,16 +449,16 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assum __malloc; #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __alloc_size(3); - -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment - __alloc_size(4); +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_kmalloc_alignment + __alloc_size(4); #else /* CONFIG_TRACING */ -static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +/* Save a function call when CONFIG_TRACING=n */ +static __always_inline __alloc_size(3) +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -466,8 +466,9 @@ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_ return ret; } -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +static __always_inline __alloc_size(4) +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -550,7 +551,7 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace( + return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); #endif @@ -572,9 +573,9 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace( + return kmalloc_node_trace( kmalloc_caches[kmalloc_type(flags)][index], - flags, node, size); + flags, node, size); } return __kmalloc_node(size, flags, node); } -- cgit v1.2.3 From 7f8170686d1733321841b21da8c2cd09ddb922b7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:36 +0800 Subject: soundwire: intel: cleanup definition of LCOUNT Add the definition in the header file rather than hiding it in the code.
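For illustration, a minimal sketch of how a caller might decode the link count now that the field has a named mask; sdw_intel_link_count() is a hypothetical helper, not part of the patch:

#include <linux/bitfield.h>
#include <linux/io.h>

/* hypothetical helper: read LCAP and extract the 3-bit link count */
static inline int sdw_intel_link_count(void __iomem *shim_base)
{
	u32 lcap = readl(shim_base + SDW_SHIM_LCAP);

	return FIELD_GET(SDW_SHIM_LCAP_LCOUNT_MASK, lcap);
}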
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index ec16ae49e6a4..49a3c265529b 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -15,7 +15,10 @@ #define SDW_LINK_SIZE 0x10000 /* Intel SHIM Registers Definition */ +/* LCAP */ #define SDW_SHIM_LCAP 0x0 +#define SDW_SHIM_LCAP_LCOUNT_MASK GENMASK(2, 0) + #define SDW_SHIM_LCTL 0x4 #define SDW_SHIM_IPPTR 0x8 #define SDW_SHIM_SYNC 0xC -- cgit v1.2.3 From feaa24aa408f117dbf074b580d38131dc819505a Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:37 +0800 Subject: soundwire: intel: regroup definitions for LCTL No functionality change, just regroup offset and bitfield definitions. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 49a3c265529b..d9f51f43e42c 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -19,7 +19,14 @@ #define SDW_SHIM_LCAP 0x0 #define SDW_SHIM_LCAP_LCOUNT_MASK GENMASK(2, 0) +/* LCTL */ #define SDW_SHIM_LCTL 0x4 + +#define SDW_SHIM_LCTL_SPA BIT(0) +#define SDW_SHIM_LCTL_SPA_MASK GENMASK(3, 0) +#define SDW_SHIM_LCTL_CPA BIT(8) +#define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) + #define SDW_SHIM_IPPTR 0x8 #define SDW_SHIM_SYNC 0xC @@ -39,11 +46,6 @@ #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 -#define SDW_SHIM_LCTL_SPA BIT(0) -#define SDW_SHIM_LCTL_SPA_MASK GENMASK(3, 0) -#define SDW_SHIM_LCTL_CPA BIT(8) -#define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) - #define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) #define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) #define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) -- cgit v1.2.3 From c36b610047463a37203e1158aeaf90f0a82f45dc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:38 +0800 Subject: soundwire: intel: remove IPPTR unused definition This read-only register only defines an offset which is known already and a version which isn't used. Remove unused definition. 
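As an aside, a hedged sketch of how the SPA/CPA bits regrouped above are typically used, assuming (as the masks suggest) one bit per link; the helper name and the poll interval/timeout are illustrative only:

#include <linux/io.h>
#include <linux/iopoll.h>

static int sdw_intel_link_power_up(void __iomem *shim, unsigned int link_id)
{
	u32 lctl = readl(shim + SDW_SHIM_LCTL);

	lctl |= SDW_SHIM_LCTL_SPA << link_id;	/* request power-up */
	writel(lctl, shim + SDW_SHIM_LCTL);

	/* wait for the controller to report the link as active */
	return readl_poll_timeout(shim + SDW_SHIM_LCTL, lctl,
				  lctl & (SDW_SHIM_LCTL_CPA << link_id),
				  10, 10000);
}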
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index d9f51f43e42c..581a9ba32f82 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -27,7 +27,6 @@ #define SDW_SHIM_LCTL_CPA BIT(8) #define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) -#define SDW_SHIM_IPPTR 0x8 #define SDW_SHIM_SYNC 0xC #define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) -- cgit v1.2.3 From ca33a58d12d3c75a3b28cc97037563bb6e03be7c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:39 +0800 Subject: soundwire: intel: cleanup SHIM SYNC Regroup offset and bitfields, no functionality change Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 581a9ba32f82..66503cf29f48 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -27,8 +27,17 @@ #define SDW_SHIM_LCTL_CPA BIT(8) #define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) +/* SYNC */ #define SDW_SHIM_SYNC 0xC +#define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) +#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) +#define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) +#define SDW_SHIM_SYNC_SYNCCPU BIT(15) +#define SDW_SHIM_SYNC_CMDSYNC_MASK GENMASK(19, 16) +#define SDW_SHIM_SYNC_CMDSYNC BIT(16) +#define SDW_SHIM_SYNC_SYNCGO BIT(24) + #define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) #define SDW_SHIM_CTLS0CM(x) (0x012 + 0x60 * (x)) #define SDW_SHIM_CTLS1CM(x) (0x014 + 0x60 * (x)) @@ -45,14 +54,6 @@ #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 -#define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) -#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) -#define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) -#define SDW_SHIM_SYNC_SYNCCPU BIT(15) -#define SDW_SHIM_SYNC_CMDSYNC_MASK GENMASK(19, 16) -#define SDW_SHIM_SYNC_CMDSYNC BIT(16) -#define SDW_SHIM_SYNC_SYNCGO BIT(24) - #define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) #define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) #define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) -- cgit v1.2.3 From c27ce5c9dd178f7218de6fd29651ad04bd496d5c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:40 +0800 Subject: soundwire: intel: remove unused PDM capabilities We removed PDM support a long time ago but kept the definitions. 
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-6-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 66503cf29f48..34af5bc010b3 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -47,7 +47,6 @@ #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_PDMSCAP(x) (0x062 + 0x60 * (x)) #define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) #define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) @@ -63,11 +62,6 @@ #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) #define SDW_SHIM_PCMSYCM_DIR BIT(15) -#define SDW_SHIM_PDMSCAP_ISS GENMASK(3, 0) -#define SDW_SHIM_PDMSCAP_OSS GENMASK(7, 4) -#define SDW_SHIM_PDMSCAP_BSS GENMASK(12, 8) -#define SDW_SHIM_PDMSCAP_CPSS GENMASK(15, 13) - #define SDW_SHIM_IOCTL_MIF BIT(0) #define SDW_SHIM_IOCTL_CO BIT(1) #define SDW_SHIM_IOCTL_COE BIT(2) -- cgit v1.2.3 From bd45a65dad8e13ba47f8fb3d6a6c68adecc96db7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:41 +0800 Subject: soundwire: intel: add comment for control stream cap/chmap add comment and newline to mark out control stream capabilities and chmap. These registers are unused by the driver, only dumped in debugfs. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-7-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 34af5bc010b3..7c1d111c5f3f 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -38,11 +38,13 @@ #define SDW_SHIM_SYNC_CMDSYNC BIT(16) #define SDW_SHIM_SYNC_SYNCGO BIT(24) +/* Control stream capabililities and channel mask */ #define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) #define SDW_SHIM_CTLS0CM(x) (0x012 + 0x60 * (x)) #define SDW_SHIM_CTLS1CM(x) (0x014 + 0x60 * (x)) #define SDW_SHIM_CTLS2CM(x) (0x016 + 0x60 * (x)) #define SDW_SHIM_CTLS3CM(x) (0x018 + 0x60 * (x)) + #define SDW_SHIM_PCMSCAP(x) (0x020 + 0x60 * (x)) #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) -- cgit v1.2.3 From 40f7a3ddf4e4eff8868d0b0e4ed072f41ab6a528 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:42 +0800 Subject: soundwire: intel: cleanup PCM stream capabilities Regroup offset and bitfields Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 7c1d111c5f3f..958628e936ea 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -45,8 +45,13 @@ #define SDW_SHIM_CTLS2CM(x) (0x016 + 0x60 * (x)) #define SDW_SHIM_CTLS3CM(x) (0x018 + 0x60 * (x)) +/* PCM Stream capabilities */ #define 
SDW_SHIM_PCMSCAP(x) (0x020 + 0x60 * (x)) +#define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) +#define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) +#define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) + #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) @@ -55,10 +60,6 @@ #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 -#define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) -#define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) -#define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) - #define SDW_SHIM_PCMSYCM_LCHN GENMASK(3, 0) #define SDW_SHIM_PCMSYCM_HCHN GENMASK(7, 4) #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) -- cgit v1.2.3 From 5c0d256201ec0e47e42ca8cafd3bee54f1c923f0 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:43 +0800 Subject: soundwire: intel: cleanup PCM Stream channel map and channel count Regroup offset and bitfield definitions. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-9-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 958628e936ea..a1810701642e 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -52,19 +52,23 @@ #define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) #define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) +/* PCM Stream Channel Map */ #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) -#define SDW_SHIM_WAKEEN 0x190 -#define SDW_SHIM_WAKESTS 0x192 +/* PCM Stream Channel Count */ +#define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_PCMSYCM_LCHN GENMASK(3, 0) #define SDW_SHIM_PCMSYCM_HCHN GENMASK(7, 4) #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) #define SDW_SHIM_PCMSYCM_DIR BIT(15) +#define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + +#define SDW_SHIM_WAKEEN 0x190 +#define SDW_SHIM_WAKESTS 0x192 + #define SDW_SHIM_IOCTL_MIF BIT(0) #define SDW_SHIM_IOCTL_CO BIT(1) #define SDW_SHIM_IOCTL_COE BIT(2) -- cgit v1.2.3 From 3ea29d33651db069080d1e3b12aea5ef36d766d1 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:44 +0800 Subject: soundwire: intel: cleanup IO control Regroup offset and bitfield definitions. 
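A hedged sketch of how the channel-map fields regrouped above fit together when programming one PDI; the helper and its parameters are illustrative, not taken from the driver:

#include <linux/bitfield.h>
#include <linux/io.h>

static void sdw_intel_program_chmap(void __iomem *shim, unsigned int link_id,
				    unsigned int pdi, u8 lchn, u8 hchn,
				    u8 stream_id, bool dir)
{
	u16 map = FIELD_PREP(SDW_SHIM_PCMSYCM_LCHN, lchn) |
		  FIELD_PREP(SDW_SHIM_PCMSYCM_HCHN, hchn) |
		  FIELD_PREP(SDW_SHIM_PCMSYCM_STREAM, stream_id) |
		  FIELD_PREP(SDW_SHIM_PCMSYCM_DIR, dir);

	writew(map, shim + SDW_SHIM_PCMSYCHM(link_id, pdi));
}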
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-10-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index a1810701642e..a2e1927ac8ac 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -63,11 +63,8 @@ #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) #define SDW_SHIM_PCMSYCM_DIR BIT(15) +/* IO control */ #define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) - -#define SDW_SHIM_WAKEEN 0x190 -#define SDW_SHIM_WAKESTS 0x192 #define SDW_SHIM_IOCTL_MIF BIT(0) #define SDW_SHIM_IOCTL_CO BIT(1) @@ -79,6 +76,11 @@ #define SDW_SHIM_IOCTL_CIBD BIT(8) #define SDW_SHIM_IOCTL_DIBD BIT(9) +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + +#define SDW_SHIM_WAKEEN 0x190 +#define SDW_SHIM_WAKESTS 0x192 + #define SDW_SHIM_CTMCTL_DACTQE BIT(0) #define SDW_SHIM_CTMCTL_DODS BIT(1) #define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) -- cgit v1.2.3 From bc7b9595392407549158651bc760c9bda2ab8b5d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:45 +0800 Subject: soundwire: intel: cleanup AC Timing Control Regroup offset and bitfield definitions Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-11-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index a2e1927ac8ac..3a56fd5a6331 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -76,11 +76,12 @@ #define SDW_SHIM_IOCTL_CIBD BIT(8) #define SDW_SHIM_IOCTL_DIBD BIT(9) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) - #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 +/* AC Timing control */ +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + #define SDW_SHIM_CTMCTL_DACTQE BIT(0) #define SDW_SHIM_CTMCTL_DODS BIT(1) #define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) -- cgit v1.2.3 From 279e46bc298629b26436c90bd4b5d104cc1e0fb2 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:46 +0800 Subject: soundwire: intel: cleanup WakeEnable and WakeStatus Regroup offset and bitfield definitions. 
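For context, a sketch of how the wake bits regrouped in the diff below tend to be used; the per-link shift is an assumption based on how the other SHIM bitfields are laid out, and the helper name is made up:

static void sdw_intel_shim_wake(void __iomem *shim, unsigned int link_id,
				bool enable)
{
	u16 wake_en = readw(shim + SDW_SHIM_WAKEEN);

	if (enable)
		wake_en |= SDW_SHIM_WAKEEN_ENABLE << link_id;
	else
		wake_en &= ~(SDW_SHIM_WAKEEN_ENABLE << link_id);

	writew(wake_en, shim + SDW_SHIM_WAKEEN);
}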
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-12-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 3a56fd5a6331..2e9fd91572d4 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -76,9 +76,16 @@ #define SDW_SHIM_IOCTL_CIBD BIT(8) #define SDW_SHIM_IOCTL_DIBD BIT(9) +/* Wake Enable*/ #define SDW_SHIM_WAKEEN 0x190 + +#define SDW_SHIM_WAKEEN_ENABLE BIT(0) + +/* Wake Status */ #define SDW_SHIM_WAKESTS 0x192 +#define SDW_SHIM_WAKESTS_STATUS BIT(0) + /* AC Timing control */ #define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) @@ -86,9 +93,6 @@ #define SDW_SHIM_CTMCTL_DODS BIT(1) #define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) -#define SDW_SHIM_WAKEEN_ENABLE BIT(0) -#define SDW_SHIM_WAKESTS_STATUS BIT(0) - /* Intel ALH Register definitions */ #define SDW_ALH_STRMZCFG(x) (0x000 + (0x4 * (x))) #define SDW_ALH_NUM_STREAMS 64 -- cgit v1.2.3 From 8dfa9d554061873f96335730fb1d403698b2b1b4 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:25 +0900 Subject: mm/slab_common: move declaration of __ksize() to mm/slab.h __ksize() is only called by KASAN. Remove the export symbol and move the declaration to mm/slab.h as we don't want to grow its callers. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index c8e485ce8815..9b592e611cb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -187,7 +187,6 @@ int kmem_cache_shrink(struct kmem_cache *s); void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); -size_t __ksize(const void *objp); size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); -- cgit v1.2.3 From ac56a0b48da86fd1b4389632fb7c4c8a5d86eefa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Aug 2022 15:39:28 +0100 Subject: rxrpc: Fix ICMP/ICMP6 error handling Because rxrpc pretends to be a tunnel on top of a UDP/UDP6 socket, allowing it to siphon off UDP packets early in the handling of received UDP packets thereby avoiding the packet going through the UDP receive queue, it doesn't get ICMP packets through the UDP ->sk_error_report() callback. In fact, it doesn't appear that there's any usable option for getting hold of ICMP packets. Fix this by adding a new UDP encap hook to distribute error messages for UDP tunnels. If the hook is set, then the tunnel driver will be able to see ICMP packets. The hook provides the offset into the packet of the UDP header of the original packet that caused the notification. An alternative would be to call the ->error_handler() hook - but that requires that the skbuff be cloned (as ip_icmp_error() or ipv6_icmp_error() do, though that isn't really necessary or desirable in rxrpc's case, as we want to parse them there and then, not queue them). Changes ======= ver #3) - Fixed an uninitialised variable. ver #2) - Fixed some missing CONFIG_AF_RXRPC_IPV6 conditionals.
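To make the new hook concrete, a sketch of what a tunnel driver might install; the handler name and body are illustrative only, and a real tunnel would normally wire this up during its UDP socket setup alongside the existing encap_rcv assignment:

/* called for ICMP/ICMP6 errors on the encap socket; udp_offset is the
 * offset of the original packet's UDP header within skb */
static void my_encap_err_rcv(struct sock *sk, struct sk_buff *skb,
			     unsigned int udp_offset)
{
	/* parse the ICMP payload and map it back to a tunnel session */
}

and, at socket setup time:

	udp_sk(usk)->encap_err_rcv = my_encap_err_rcv;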
Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Signed-off-by: David Howells --- include/linux/udp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 254a2654400f..e96da4157d04 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,6 +70,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); -- cgit v1.2.3 From 4e55e22d3d9aa50ef1ba059bf3a53aa61109c179 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 31 Aug 2022 15:50:52 +0300 Subject: USB: hcd-pci: Drop the unused id parameter from usb_hcd_pci_probe() Since the HC driver is now passed to the function with its own parameter (it used to be picked from id->driver_data), the id parameter serves no purpose. Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20220831125052.71584-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 67f8713d3fa3..78cd566ee238 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -479,7 +479,6 @@ static inline int ehset_single_step_set_feature(struct usb_hcd *hcd, int port) struct pci_dev; struct pci_device_id; extern int usb_hcd_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id, const struct hc_driver *driver); extern void usb_hcd_pci_remove(struct pci_dev *dev); extern void usb_hcd_pci_shutdown(struct pci_dev *dev); -- cgit v1.2.3 From a97126265dfe10d3321c0fde4708a6cea49b19ed Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Thu, 25 Aug 2022 12:44:20 +0200 Subject: leds: simatic-ipc-leds-gpio: add new model 227G This adds support for the Siemens Simatic IPC227G. Its LEDs are connected to GPIO pins provided by the gpio-f7188x module. We make sure that module gets loaded; if it is not enabled in the kernel config, no LED support will be available.
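For orientation, an illustrative sketch of how the new IDs added below are expected to be tied together by the platform code; SIMATIC_IPC_DEVICE_NONE is assumed here to be the existing fallback value:

static u8 simatic_ipc_devmode_from_station(u8 *dmi_data, int max_len)
{
	u32 station_id = simatic_ipc_get_station_id(dmi_data, max_len);

	switch (station_id) {
	case SIMATIC_IPC_IPC227G:
		return SIMATIC_IPC_DEVICE_227G;
	default:
		return SIMATIC_IPC_DEVICE_NONE;	/* assumed fallback */
	}
}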
Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Henning Schild Link: https://lore.kernel.org/r/20220825104422.14156-6-henning.schild@siemens.com Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/simatic-ipc-base.h | 1 + include/linux/platform_data/x86/simatic-ipc.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/simatic-ipc-base.h b/include/linux/platform_data/x86/simatic-ipc-base.h index 39fefd48cf4d..57d6a10dfc9e 100644 --- a/include/linux/platform_data/x86/simatic-ipc-base.h +++ b/include/linux/platform_data/x86/simatic-ipc-base.h @@ -19,6 +19,7 @@ #define SIMATIC_IPC_DEVICE_427E 2 #define SIMATIC_IPC_DEVICE_127E 3 #define SIMATIC_IPC_DEVICE_227E 4 +#define SIMATIC_IPC_DEVICE_227G 5 struct simatic_ipc_platform { u8 devmode; diff --git a/include/linux/platform_data/x86/simatic-ipc.h b/include/linux/platform_data/x86/simatic-ipc.h index f3b76b39776b..7a2e79f3be0b 100644 --- a/include/linux/platform_data/x86/simatic-ipc.h +++ b/include/linux/platform_data/x86/simatic-ipc.h @@ -31,6 +31,7 @@ enum simatic_ipc_station_ids { SIMATIC_IPC_IPC427E = 0x00000A01, SIMATIC_IPC_IPC477E = 0x00000A02, SIMATIC_IPC_IPC127E = 0x00000D01, + SIMATIC_IPC_IPC227G = 0x00000F01, }; static inline u32 simatic_ipc_get_station_id(u8 *data, int max_len) -- cgit v1.2.3 From 8f5c9858c5db129359b5de2f60f5f034bf5d56c0 Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Thu, 25 Aug 2022 12:44:22 +0200 Subject: platform/x86: simatic-ipc: add new model 427G This adds support for the Siemens Simatic IPC427G, a board which basically works like the 227G added in a previous patch. So all there is to do is to add the station_id and make it take all the 227G branches. Reviewed-by: Andy Shevchenko Signed-off-by: Henning Schild Link: https://lore.kernel.org/r/20220825104422.14156-8-henning.schild@siemens.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/simatic-ipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/simatic-ipc.h b/include/linux/platform_data/x86/simatic-ipc.h index 7a2e79f3be0b..632320ec8f08 100644 --- a/include/linux/platform_data/x86/simatic-ipc.h +++ b/include/linux/platform_data/x86/simatic-ipc.h @@ -32,6 +32,7 @@ enum simatic_ipc_station_ids { SIMATIC_IPC_IPC477E = 0x00000A02, SIMATIC_IPC_IPC127E = 0x00000D01, SIMATIC_IPC_IPC227G = 0x00000F01, + SIMATIC_IPC_IPC427G = 0x00001001, }; static inline u32 simatic_ipc_get_station_id(u8 *data, int max_len) -- cgit v1.2.3 From 0a64ce6e5442bbd96cbe9057d9ba1edab244f25b Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 30 Aug 2022 16:50:04 +0200 Subject: kernel/panic: Drop unblank_screen call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit console_unblank() does this too (called in both places right after), and with a much more confidence-inspiring approach to locking. Reconstructing this story is very strange: In b61312d353da ("oops handling: ensure that any oops is flushed to the mtdoops console") it is claimed that a printk(" "); flushed out the console buffer, which was removed in e3e8a75d2acf ("[PATCH] Extract and use wake_up_klogd()"). In today's kernels this is done way earlier in console_flush_on_panic with some really nasty tricks.
I didn't bother to fully reconstruct this all, not least because the call to bust_spinlocks(0); gets moved every few years, depending upon how the wind blows (or well, who screamed loudest about the various issues each call site caused). Before that commit the only calls to console_unblank() were in s390 arch code. The other side here is the console->unblank callback, which was introduced in 2.1.31 for the vt driver. That predates the console_unblank() function by a lot, which was added (without users) in 2.4.14.3. So it is pretty much impossible to guess at any motivation here. Also afaict the vt driver is the only (and always was the only) console driver implementing the unblank callback, so no idea why a call to console_unblank() was added for the mtdoops driver - the action actually flushing out the console buffers is done from console_unlock() only. Note that as prep for the s390 users the locking was adjusted in 2.5.22 (I couldn't figure out how to properly reference the BK commit from the historical git trees) from a normal semaphore to a trylock. Note that a copy of the direct unblank_screen() call was added to panic() in c7c3f05e341a ("panic: avoid deadlocks in re-entrant console drivers"), which partially inlined the bust_spinlocks(0); call. Long story short, I have no idea why the direct call to unblank_screen survived for so long (the infrastructure to do it properly existed for years), nor why it wasn't removed when the console_unblank() call was finally added. But it makes a ton more sense to finally do that than not - it's just better encapsulation to go through the console functions instead of doing a direct call, so let's dare. Plus it really does not make much sense to call the only unblank implementation there is twice, once without, and once with appropriate locking. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Daniel Vetter Cc: "Ilpo Järvinen" Cc: Tetsuo Handa Cc: Xuezhi Zhang Cc: Yangxi Xiang Cc: nick black Cc: Petr Mladek Cc: Andrew Morton Cc: Luis Chamberlain Cc: "Guilherme G. Piccoli" Cc: Marco Elver Cc: John Ogness Cc: Sebastian Andrzej Siewior Cc: David Gow Cc: tangmeng Cc: Tiezhu Yang Cc: Chris Wilson Reviewed-by: Petr Mladek Acked-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Vetter Signed-off-by: Andrew Morton Link: https://lore.kernel.org/r/20220830145004.430545-1-daniel.vetter@ffwll.ch Signed-off-by: Greg Kroah-Hartman --- include/linux/vt_kern.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index b5ab452fca5b..c1f5aebef170 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -30,7 +30,6 @@ struct vc_data *vc_deallocate(unsigned int console); void reset_palette(struct vc_data *vc); void do_blank_screen(int entering_gfx); void do_unblank_screen(int leaving_gfx); -void unblank_screen(void); void poke_blanked_console(void); int con_font_op(struct vc_data *vc, struct console_font_op *op); int con_set_cmap(unsigned char __user *cmap); -- cgit v1.2.3 From 3954cf4338becb1f140bb6fa4f5e9a42f2529b86 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Aug 2022 08:14:24 +0200 Subject: devres: remove devm_ioremap_np devm_ioremap_np has never been used anywhere since it was added in early 2021, so remove it.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220822061424.151819-1-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- include/linux/io.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 5fc800390fe4..308f4f0cfb93 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -59,8 +59,6 @@ void __iomem *devm_ioremap_uc(struct device *dev, resource_size_t offset, resource_size_t size); void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset, resource_size_t size); -void __iomem *devm_ioremap_np(struct device *dev, resource_size_t offset, - resource_size_t size); void devm_iounmap(struct device *dev, void __iomem *addr); int check_signature(const volatile void __iomem *io_addr, const unsigned char *signature, int length); -- cgit v1.2.3 From c25491747b21536bd56dccb82a109754bbc8d52c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 27 Aug 2022 19:04:37 -1000 Subject: kernfs: Add KERNFS_REMOVING flags KERNFS_ACTIVATED tracks whether a given node has ever been activated. As a node was only deactivated on removal, this was used for:
1. Drain optimization (removed by the previous patch).
2. To hide !activated nodes.
3. To avoid double activations.
4. To reject adding children to a node being removed.
5. To skip activating a node which is being removed.
We want to decouple deactivation from removal so that nodes can be deactivated and hidden dynamically, which makes KERNFS_ACTIVATED useless for all of the above purposes. #1 is already gone. #2 and #3 can instead test whether the node is currently active. A new flag KERNFS_REMOVING is added to explicitly mark nodes which are being removed for #4 and #5. While this leaves KERNFS_ACTIVATED with no users, leave it be as it will be used in a following patch. Cc: Chengming Zhou Tested-by: Chengming Zhou Reviewed-by: Chengming Zhou Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20220828050440.734579-7-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 367044d7708c..b77d257c1f7e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -112,6 +112,7 @@ enum kernfs_node_flag { KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, + KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ -- cgit v1.2.3 From 783bd07d095b722108725af11113795ee046ed0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 27 Aug 2022 19:04:39 -1000 Subject: kernfs: Implement kernfs_show() Currently, kernfs nodes can be created hidden and activated later by calling kernfs_activate() to allow creation of multiple nodes to succeed or fail as a unit. This is a one-way, one-time-only transition. This patch introduces kernfs_show() which can toggle visibility dynamically. As the currently proposed use - toggling the cgroup pressure files - only requires operating on leaf nodes, for the sake of simplicity, restrict it as such for now. Hiding uses the same mechanism as deactivation and likewise guarantees that there are no in-flight operations on completion. KERNFS_ACTIVATED and KERNFS_HIDDEN are used to manage the interactions between activations and show/hide operations. A node is visible iff both activated & !hidden.
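In code, the new toggle is minimal (a sketch; kn is an already-created leaf node):

	/* hide: drains in-flight operations, then makes the node invisible */
	kernfs_show(kn, false);

	/* show again; takes effect only if the node was ever activated */
	kernfs_show(kn, true);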
Cc: Chengming Zhou Cc: Johannes Weiner Tested-by: Chengming Zhou Reviewed-by: Chengming Zhou Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20220828050440.734579-9-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b77d257c1f7e..73f5c120def8 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -108,6 +108,7 @@ enum kernfs_node_flag { KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, + KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, @@ -430,6 +431,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); +void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); -- cgit v1.2.3 From e2691f6b44ed2135bfd005ad5fbabac4f433a7a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 27 Aug 2022 19:04:40 -1000 Subject: cgroup: Implement cgroup_file_show() Add cgroup_file_show() which allows toggling visibility of a cgroup file using the new kernfs_show(). This will be used to hide psi interface files on cgroups where it's disabled. Cc: Chengming Zhou Cc: Johannes Weiner Tested-by: Chengming Zhou Reviewed-by: Chengming Zhou Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20220828050440.734579-10-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..f61dd135efbe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -114,6 +114,7 @@ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); +void cgroup_file_show(struct cgroup_file *cfile, bool show); int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -- cgit v1.2.3 From 16ede66973c84f890c03584f79158dd5b2d725f5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 25 Aug 2022 07:53:12 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220825145312.1217900-1-kbusch@fb.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. 
*/ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From e34a0425b8ef524355811e7408dc1d53d08dc538 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:01 -0300 Subject: vfio/pci: Split linux/vfio_pci_core.h The header in include/linux should have only the exported interface for other vfio_pci modules to use. Internal definitions for vfio_pci.ko should be in a "priv" header alongside the .c files. Move the internal declarations out of vfio_pci_core.h. They either move to vfio_pci_priv.h or to the C file that is the only user. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 134 +----------------------------------------- 1 file changed, 1 insertion(+), 133 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5579ece4347b..9d18b832e61a 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -20,39 +20,10 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 - #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -/* Special capability IDs predefined access */ -#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ -#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ - -/* Cap maximum number of ioeventfds per device (arbitrary) */ -#define VFIO_PCI_IOEVENTFD_MAX 1000 - -struct vfio_pci_ioeventfd { - struct list_head next; - struct vfio_pci_core_device *vdev; - struct virqfd *virqfd; - void __iomem *addr; - uint64_t data; - loff_t pos; - int bar; - int count; - bool test_mem; -}; - -struct vfio_pci_irq_ctx { - struct eventfd_ctx *trigger; - struct virqfd *unmask; - struct virqfd *mask; - char *name; - bool masked; - struct irq_bypass_producer producer; -}; - struct vfio_pci_core_device; struct vfio_pci_region; @@ -78,23 +49,6 @@ struct vfio_pci_region { u32 flags; }; -struct vfio_pci_dummy_resource { - struct resource resource; - int index; - struct list_head res_next; -}; - -struct vfio_pci_vf_token { - struct mutex lock; - uuid_t uuid; - int users; -}; - -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; @@ -141,92 +95,11 @@ struct vfio_pci_core_device { struct rw_semaphore memory_lock; }; -#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) -#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) -#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) -#define irq_is(vdev, type) (vdev->irq_type == type) - -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); -void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); - -int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, - uint32_t flags, unsigned index, - unsigned start, unsigned count, void *data); - -ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); - -ssize_t
vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); - -#ifdef CONFIG_VFIO_PCI_VGA -ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); -#else -static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite) -{ - return -EINVAL; -} -#endif - -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd); - -int vfio_pci_init_perm_bits(void); -void vfio_pci_uninit_perm_bits(void); - -int vfio_config_init(struct vfio_pci_core_device *vdev); -void vfio_config_free(struct vfio_pci_core_device *vdev); - +/* Will be exported for vfio pci drivers usage */ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); - -int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, - pci_power_t state); - -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); -void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); -u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); -void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, - u16 cmd); - -#ifdef CONFIG_VFIO_PCI_IGD -int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) -{ - return -ENODEV; -} -#endif - -#ifdef CONFIG_VFIO_PCI_ZDEV_KVM -int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps); -int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); -void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps) -{ - return -ENODEV; -} - -static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) -{ - return 0; -} - -static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) -{} -#endif - -/* Will be exported for vfio pci drivers usage */ void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); @@ -256,9 +129,4 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -static inline bool vfio_pci_is_vga(struct pci_dev *pdev) -{ - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; -} - #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From 1e979ef5df8b7b604a625343a179b812a7984068 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:02 -0300 Subject: vfio/pci: Rename vfio_pci_register_dev_region() As this is part of the vfio_pci_core component it should be called vfio_pci_core_register_dev_region() like everything else exported from this module. 
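A hedged sketch of a caller under the new name; the vendor ID, subtype, ops and size below are placeholders, while the region type and flag constants are standard vfio uapi:

static int my_variant_setup_region(struct vfio_pci_core_device *vdev)
{
	/* MY_VENDOR_ID, MY_SUBTYPE, my_regops and MY_SIZE are hypothetical */
	return vfio_pci_core_register_dev_region(vdev,
			MY_VENDOR_ID | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			MY_SUBTYPE, &my_regops, MY_SIZE,
			VFIO_REGION_INFO_FLAG_READ, NULL);
}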
Suggested-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/2-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 9d18b832e61a..e5cf0d3313a6 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -96,10 +96,10 @@ struct vfio_pci_core_device { }; /* Will be exported for vfio pci drivers usage */ -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data); +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -- cgit v1.2.3 From 4813724c4b76b62f8e6b60dd5655633d0db1c9a8 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:48 +0530 Subject: vfio/pci: Mask INTx during runtime suspend This patch adds INTx handling during runtime suspend/resume. All the suspend/resume related code for the user to put the device into the low power state will be added in subsequent patches. The INTx lines may be shared among devices. Whenever any INTx interrupt comes for the VFIO devices, then vfio_intx_handler() will be called for each device sharing the interrupt. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx() and checks if the interrupt has been generated for the current device. Now, if the device is already in the D3cold state, then the config space can not be read. An attempt to read config space in the D3cold state can cause system unresponsiveness on a few systems. To prevent this, mask INTx in the runtime suspend callback and unmask it in the runtime resume callback. If INTx has already been masked, then no handling is needed in the runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and vfio_pci_intx_mask() has been updated to return true if the INTx vfio_pci_irq_ctx.masked value is changed inside this function. For the runtime suspend which is triggered when there is no user of the VFIO device, the 'irq_type' will be VFIO_PCI_NUM_IRQS and these callbacks won't do anything. MSI/MSI-X are not shared, so similar handling should not be needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal() without doing any device-specific config access. When the user performs any config access or IOCTL after receiving the eventfd notification, then the device will be moved to the D0 state first before servicing any request. Another option was to check this flag 'pm_intx_masked' inside vfio_intx_handler() instead of masking the interrupts. This flag is being set inside the runtime_suspend callback but the device can be in a non-D3cold state (for example, if the user has disabled D3cold explicitly via sysfs, D3cold is not supported on the platform, etc.). Also, in the D3cold-supported case, the device will be in D0 till the PCI core moves the device into D3cold. In this case, there is a possibility that the device can generate an interrupt.
Adding a check in the IRQ handler will not clear the IRQ status and the interrupt line will still be asserted. This can cause interrupt flooding. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-4-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index e5cf0d3313a6..a0f1f36e42a2 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -78,6 +78,7 @@ struct vfio_pci_core_device { bool needs_reset; bool nointx; bool needs_pm_restore; + bool pm_intx_masked; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- cgit v1.2.3 From cc2742fe3660cc6500021d3da8f937d326392dbd Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:49 +0530 Subject: vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT Currently, if the runtime power management is enabled for vfio-pci based devices in the guest OS, then the guest OS will do the register write for the PCI_PM_CTRL register. This write request will be handled in vfio_pm_config_write() where it will do the actual register write of the PCI_PM_CTRL register. With this, the maximum D3hot state can be achieved for low power. If we can use the runtime PM framework, then we can achieve the D3cold state (on the supported systems), which will help in saving maximum power.
1. The D3cold state can't be achieved by writing PCI standard PM config registers. This patch implements the following newly added low power related device features:
   - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
   - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
   The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the device to make use of low power platform states on the host, while VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent further use of those power states.
2. The vfio-pci driver uses the runtime PM framework for low power entry and exit. On platforms where the D3cold state is supported, the runtime PM framework will put the device into D3cold; otherwise, D3hot or some other power state will be used. There are various cases where the device will not go into the runtime suspended state. For example:
   - The runtime power management is disabled on the host side for the device.
   - The user keeps the device busy after calling LOW_POWER_ENTRY.
   - There are dependent devices that are still in the runtime active state.
   For these cases, the device will be in the same power state that has been configured by the user through the PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example, in a guest Linux OS, if the PCI device ACPI node has _PR3 and _PR0 power resources with _ON/_OFF methods, then the guest Linux OS invokes the _OFF method during the D3cold transition and then _ON during the D0 transition. The hypervisor can tap these virtual ACPI calls and then call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to runtime PM. This flag is protected with the 'memory_lock' semaphore.
5. All the config and other region accesses are wrapped under pm_runtime_resume_and_get() and pm_runtime_put(). So, if any device access happens while the device is in the runtime suspended state, then the device will be resumed first before access. Once the access has been finished, the device will again go into the runtime suspended state.
Memory region access through mmap is not allowed in the low power state. Since __vfio_pci_memory_enabled() is a common function, the 'pm_runtime_engaged' check is added explicitly in vfio_pci_mmap_fault() so that only mmap'ed access is blocked. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a0f1f36e42a2..1025d53fde0b 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -79,6 +79,7 @@ struct vfio_pci_core_device { bool nointx; bool needs_pm_restore; bool pm_intx_masked; + bool pm_runtime_engaged; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- cgit v1.2.3 From 453e6c98fd2bdae0df9adbc86af8d8bf1164edd5 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:50 +0530 Subject: vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP This patch implements the VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device feature. With VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY, any host-side access to the VFIO device moves the device out of the low power state without involving the user's guest driver; once the access has finished, the host can move the device into the low power state again. With low power entry through VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP, the device is not moved back into the low power state, and the user is notified by triggering the wakeup eventfd. vfio_pci_core_pm_entry() is called for both variants of low power entry, so add an extra argument for the wakeup eventfd context and store it locally in 'struct vfio_pci_core_device'. For entry without a wakeup eventfd, all exit related handling is done by the LOW_POWER_EXIT device feature only. When LOW_POWER_EXIT is called, the vfio core layer's vfio_device_pm_runtime_get() increments the usage count and resumes the device. In the driver's runtime_resume callback, 'pm_wake_eventfd_ctx' will be NULL; vfio_pci_core_pm_exit() then calls vfio_pci_runtime_pm_exit() and all exit related handling is done there. For entry with a wakeup eventfd, the eventfd is triggered in the driver's resume callback and all exit related handling is done at that point; when vfio_pci_runtime_pm_exit() is later called by vfio_pci_core_pm_exit(), it returns early. But if runtime suspend has not actually happened on the host side, all exit related handling is done in vfio_pci_core_pm_exit() itself.
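As an editorial aside, here is a hedged userspace sketch of exercising these two entry variants through the VFIO_DEVICE_FEATURE ioctl (buffer layout per the uapi this series adds; device_fd, the eventfd handling, and the lack of error checking are illustrative only, not from the patches):

	/* Enter low power with a wakeup eventfd. */
	struct vfio_device_low_power_entry_with_wakeup entry = {
		.wakeup_eventfd = eventfd(0, EFD_CLOEXEC),
	};
	char buf[sizeof(struct vfio_device_feature) + sizeof(entry)];
	struct vfio_device_feature *feature = (void *)buf;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
	memcpy(feature->data, &entry, sizeof(entry));
	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);

	/* ... a signal on the eventfd means the host woke the device ... */

	/* Leave low power. */
	feature->argsz = sizeof(*feature);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);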
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-6-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 1025d53fde0b..089b603bcfdc 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -85,6 +85,7 @@ struct vfio_pci_core_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; -- cgit v1.2.3 From c8e477c649b40c1a073b7a843d89e51dc0037db7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 30 Jan 2022 19:57:52 -0500 Subject: ->getprocattr(): attribute name is const char *, TYVM... cast of ->d_name.name to char * is completely wrong - nothing is allowed to modify its contents. Reviewed-by: Christian Brauner (Microsoft) Acked-by: Paul Moore Acked-by: Casey Schaufler Signed-off-by: Al Viro --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..03360d27bedf 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -253,7 +253,7 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) -LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, char *name, +LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..93488c01d9bd 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -461,7 +461,7 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); -int security_getprocattr(struct task_struct *p, const char *lsm, char *name, +int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, char **value); int security_setprocattr(const char *lsm, const char *name, void *value, size_t size); @@ -1301,7 +1301,7 @@ static inline void security_d_instantiate(struct dentry *dentry, { } static inline int security_getprocattr(struct task_struct *p, const char *lsm, - char *name, char **value) + const char *name, char **value) { return -EINVAL; } -- cgit v1.2.3 From ea4af4aa03c3966c63231b4191da94497de8f034 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Aug 2022 13:19:18 -0400 Subject: nd_jump_link(): constify path Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- include/linux/namei.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index caeb08a98536..00fee52df842 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -83,7 +83,7 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct 
dentry *); extern void unlock_rename(struct dentry *, struct dentry *); -extern int __must_check nd_jump_link(struct path *path); +extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { -- cgit v1.2.3 From 977f1aa5e4d1942bc8012bf3f0e695008979ff4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 18:44:27 +0000 Subject: net: bql: add more documentation Add some documentation for netdev_tx_sent_queue() and netdev_tx_completed_queue() Stating that netdev_tx_completed_queue() must be called once per TX completion round is apparently not obvious for everybody. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eec35f9b6616..cfe41c053e11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3353,6 +3353,16 @@ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_qu #endif } +/** + * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue + * @dev_queue: network device queue + * @bytes: number of bytes queued to the device queue + * + * Report the number of bytes queued for sending/completion to the network + * device hardware queue. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). + */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { @@ -3398,13 +3408,14 @@ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, } /** - * netdev_sent_queue - report the number of bytes queued to hardware - * @dev: network device - * @bytes: number of bytes queued to the hardware device queue + * netdev_sent_queue - report the number of bytes queued to hardware + * @dev: network device + * @bytes: number of bytes queued to the hardware device queue * - * Report the number of bytes queued for sending/completion to the network - * device hardware queue. @bytes should be a good approximation and should - * exactly match netdev_completed_queue() @bytes + * Report the number of bytes queued for sending/completion to the network + * device hardware queue#0. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { @@ -3419,6 +3430,15 @@ static inline bool __netdev_sent_queue(struct net_device *dev, xmit_more); } +/** + * netdev_tx_completed_queue - report number of packets/bytes at TX completion. + * @dev_queue: network device queue + * @pkts: number of packets (currently ignored) + * @bytes: number of bytes dequeued from the device queue + * + * Must be called at most once per TX completion round (and not per + * individual packet), so that BQL can adjust its limits appropriately. 
+ */ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { -- cgit v1.2.3 From 3261400639463a853ba2b3be8bd009c2a8089775 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 23:38:09 +0000 Subject: tcp: TX zerocopy should not sense pfmemalloc status We got a recent syzbot report [1] showing a possible misuse of pfmemalloc page status in TCP zerocopy paths. Indeed, for pages coming from user space or other layers, using page_is_pfmemalloc() is moot, and possibly could give false positives. There have been attempts to make page_is_pfmemalloc() more robust, but not using it in the first place in this context is probably better, removing cpu cycles. Note to stable teams: you need to backport 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") as a prereq. Race is more probable after commit c07aea3ef4d4 ("mm: add a signature in struct page") because page_is_pfmemalloc() is now using the low order bit from page->lru.next, which can change more often than page->index. The low order bit should never be set for lru.next (when used as an anchor in an LRU list), so the KCSAN report is mostly a false positive. Backporting to older kernel versions does not seem necessary. [1] BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0: __list_add include/linux/list.h:73 [inline] list_add include/linux/list.h:88 [inline] lruvec_add_folio include/linux/mm_inline.h:105 [inline] lru_add_fn+0x440/0x520 mm/swap.c:228 folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246 folio_batch_add_and_move mm/swap.c:263 [inline] folio_add_lru+0xf1/0x140 mm/swap.c:490 filemap_add_folio+0xf8/0x150 mm/filemap.c:948 __filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981 pagecache_get_page+0x26/0x190 mm/folio-compat.c:104 grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116 ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988 generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738 ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270 ext4_file_write_iter+0x2e3/0x1210 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x468/0x760 fs/read_write.c:578 ksys_write+0xe8/0x1a0 fs/read_write.c:631 __do_sys_write fs/read_write.c:643 [inline] __se_sys_write fs/read_write.c:640 [inline] __x64_sys_write+0x3e/0x50 fs/read_write.c:640 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1: page_is_pfmemalloc include/linux/mm.h:1740 [inline] __skb_fill_page_desc include/linux/skbuff.h:2422 [inline] skb_fill_page_desc include/linux/skbuff.h:2443 [inline] tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018 do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075 tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline] tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833 kernel_sendpage+0x184/0x300 net/socket.c:3561 sock_sendpage+0x5a/0x70 net/socket.c:1054 pipe_to_sendpage+0x128/0x160 fs/splice.c:361 splice_from_pipe_feed fs/splice.c:415 [inline] __splice_from_pipe+0x222/0x4d0 fs/splice.c:559 splice_from_pipe fs/splice.c:594 [inline] generic_splice_sendpage+0x89/0xc0 fs/splice.c:743 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:931 splice_direct_to_actor+0x305/0x620 fs/splice.c:886 do_splice_direct+0xfb/0x180 fs/splice.c:974 do_sendfile+0x3bf/0x910 fs/read_write.c:1249
__do_sys_sendfile64 fs/read_write.c:1317 [inline] __se_sys_sendfile64 fs/read_write.c:1303 [inline] __x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0xffffea0004a1d288 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022 Fixes: c07aea3ef4d4 ("mm: add a signature in struct page") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..18e163a3460d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2444,6 +2444,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, skb_shinfo(skb)->nr_frags = i + 1; } +/** + * skb_fill_page_desc_noacc - initialise a paged fragment in an skb + * @skb: buffer containing fragment to be initialised + * @i: paged fragment index to initialise + * @page: the page to use for this fragment + * @off: the offset to the data with @page + * @size: the length of the data + * + * Variant of skb_fill_page_desc() which does not deal with + * pfmemalloc, if page is not owned by us. + */ +static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, + struct page *page, int off, + int size) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + __skb_fill_page_desc_noacc(shinfo, i, page, off, size); + shinfo->nr_frags = i + 1; +} + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); -- cgit v1.2.3 From c3f760ef128789252e7c4f10d3c1721422dceba9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2022 17:00:58 -0700 Subject: net: remove netif_tx_napi_add() All callers are now gone. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cfe41c053e11..f0068c1ff1df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2569,8 +2569,6 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } -#define netif_tx_napi_add netif_napi_add_tx_weight - /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device -- cgit v1.2.3 From b30f7c8eb0780e1479a9882526e838664271f4c9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 1 Sep 2022 13:07:32 +0100 Subject: spi: mux: Fix mux interaction with fast path optimisations The spi-mux driver is rather too clever and attempts to resubmit any message that is submitted to it to the parent controller with some adjusted callbacks. This does not play at all nicely with the fast path, which now sets flags on the message indicating that it's being handled through the fast path: we see async messages flagged as being on the fast path. Ideally the spi-mux code would duplicate the message, but that's rather invasive and a bit fragile in that it relies on the mux knowing which fields in the message to copy.
Instead, teach the core that there are controllers which can't cope with the fast path and have the mux flag itself as such a controller, ensuring that messages going via the mux don't get partially handled via the fast path. This will reduce the performance of any spi-mux connected device, since we'll now always use the thread for both the actual controller and the mux controller instead of just the actual controller, but given that we were always hitting the slow path anyway it's hopefully not too much of an additional cost, and it allows us to keep the fast path. Fixes: ae7d2346dc89 ("spi: Don't use the message queue if possible in spi_sync") Reported-by: Casper Andersson Tested-by: Casper Andersson Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220901120732.49245-1-broonie@kernel.org Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e6c73d5ff1a8..f089ee1ead58 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -469,6 +469,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * SPI_TRANS_FAIL_NO_START. * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. + * @must_async: disable all fast paths in the core * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -690,6 +691,7 @@ struct spi_controller { /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; + bool must_async; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) { -- cgit v1.2.3 From dc84dbbcc97bfb47e0f2b175d816e601b2890c91 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 31 Aug 2022 11:19:06 +0800 Subject: bpf, tnums: Warn against the usage of tnum_in(tnum_range(), ...) Commit a657182a5c51 ("bpf: Don't use tnum_range on array range checking for poke descriptors") has shown that using tnum_range() as an argument to tnum_in() can lead to misleading code that looks like a tight bound check when in fact the actual allowed range is much wider. Document this behavior to warn against such usage in general, and suggest some scenarios where the result can be trusted. Signed-off-by: Shung-Hsi Yu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Link: https://www.openwall.com/lists/oss-security/2022/08/26/1 Link: https://lore.kernel.org/bpf/20220831031907.16133-3-shung-hsi.yu@suse.com Link: https://lore.kernel.org/bpf/20220831031907.16133-2-shung-hsi.yu@suse.com --- include/linux/tnum.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 498dbcedb451..1c3948a1d6ad 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -21,7 +21,12 @@ struct tnum { struct tnum tnum_const(u64 value); /* A completely unknown value */ extern const struct tnum tnum_unknown; -/* A value that's unknown except that @min <= value <= @max */ +/* An unknown value that is a superset of @min <= value <= @max. + * + * Could include values outside the range of [@min, @max]. + * For example tnum_range(0, 2) is represented by {0, 1, 2, *3*}, + * rather than the intended set of {0, 1, 2}.
+ */ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ @@ -73,7 +78,18 @@ static inline bool tnum_is_unknown(struct tnum a) */ bool tnum_is_aligned(struct tnum a, u64 size); -/* Returns true if @b represents a subset of @a. */ +/* Returns true if @b represents a subset of @a. + * + * Note that using tnum_range() as @a requires extra cautions as tnum_in() may + * return true unexpectedly due to tnum limited ability to represent tight + * range, e.g. + * + * tnum_in(tnum_range(0, 2), tnum_const(3)) == true + * + * As a rule of thumb, if @a is explicitly coded rather than coming from + * reg->var_off, it should be in form of tnum_const(), tnum_range(0, 2**n - 1), + * or tnum_range(2**n, 2**(n+1) - 1). + */ bool tnum_in(struct tnum a, struct tnum b); /* Formatting functions. These have snprintf-like semantics: they will write -- cgit v1.2.3 From 4ff09db1b79b98b4a2a7511571c640b76cab3beb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:02 -0700 Subject: bpf: net: Change sk_getsockopt() to take the sockptr_t argument This patch changes sk_getsockopt() to take the sockptr_t argument such that it can be used by bpf_getsockopt(SOL_SOCKET) in a later patch. security_socket_getpeersec_stream() is not changed. It stays with the __user ptr (optval.user and optlen.user) to avoid changes to other security hooks. bpf_getsockopt(SOL_SOCKET) also does not support SO_PEERSEC. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002802.2888419-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +-- include/linux/sockptr.h | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index a5f21dc3c432..527ae1d64e27 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -900,8 +900,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); -int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, - unsigned int len); +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index d45902fb4cad..bae5e2369b4f 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -64,6 +64,11 @@ static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, return 0; } +static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset(dst, 0, src, size); +} + static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); -- cgit v1.2.3 From 728f064cd7ebea8c182e99e6f152c8b4a0a6b071 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:28 -0700 Subject: bpf: net: Change do_ip_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument. This patch also changes do_ip_getsockopt() to take the sockptr_t argument such that a later patch can make bpf_getsockopt(SOL_IP) reuse do_ip_getsockopt(). Note on the change in ip_mc_gsfget(). This function returns an array of sockaddr_storage in optval.
This function is shared between ip_get_mcast_msfilter() and compat_ip_get_mcast_msfilter(). However, the sockaddr_storage is stored at a different offset in the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip_mc_gsfget(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002828.2890585-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/igmp.h | 4 ++-- include/linux/mroute.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 93c262ecbdc9..78890143f079 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -118,9 +118,9 @@ extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen); + sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 6cbbfe94348c..80b8400ab8b2 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -17,7 +17,7 @@ static inline int ip_mroute_opt(int opt) } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); @@ -29,8 +29,8 @@ static inline int ip_mroute_setsockopt(struct sock *sock, int optname, return -ENOPROTOOPT; } -static inline int ip_mroute_getsockopt(struct sock *sock, int optname, - char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sk, int optname, + sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 6dadbe4bac68309eb46ab0f30e8ff47a789df49a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:53 -0700 Subject: bpf: net: Change do_ipv6_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument. This patch also changes do_ipv6_getsockopt() to take the sockptr_t argument such that a later patch can make bpf_getsockopt(SOL_IPV6) reuse do_ipv6_getsockopt(). Note on the change in ip6_mc_msfget(). This function returns an array of sockaddr_storage in optval. This function is shared between ipv6_get_msfilter() and compat_ipv6_get_msfilter(). However, the sockaddr_storage is stored at a different offset in the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip6_mc_msfget.
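To illustrate the sockptr_t pattern this series of conversions converges on, here is a hedged sketch of a getsockopt-style helper that serves both user-space and kernel (e.g. BPF) callers through one body; the option chosen and the function name are made up for illustration:

	/* Minimal sketch, not the kernel's actual sk_getsockopt() body. */
	static int example_getsockopt(struct sock *sk, sockptr_t optval,
				      sockptr_t optlen)
	{
		int len, val = READ_ONCE(sk->sk_rcvbuf);

		if (copy_from_sockptr(&len, optlen, sizeof(len)))
			return -EFAULT;
		len = min_t(unsigned int, len, sizeof(val));
		/* Works for USER_SOCKPTR() and KERNEL_SOCKPTR() alike. */
		if (copy_to_sockptr(optval, &val, len))
			return -EFAULT;
		return 0;
	}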
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002853.2892532-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/mroute6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index bc351a85ce9b..8f2b307fb124 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -27,7 +27,7 @@ struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -extern int ip6_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); @@ -42,7 +42,7 @@ static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, static inline int ip6_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) + int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 2aec909912da55a6e469fd6ee8412080a5433ed2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2022 11:46:38 +0200 Subject: wifi: use struct_group to copy addresses We sometimes copy all the addresses from the 802.11 header for the AAD, which may cause complaints from fortify checks. Use struct_group() to avoid the compiler warnings/errors. Change-Id: Ic3ea389105e7813b22095b295079eecdabde5045 Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..b6e6d5b40774 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -310,9 +310,11 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; - u8 addr1[ETH_ALEN]; - u8 addr2[ETH_ALEN]; - u8 addr3[ETH_ALEN]; + struct_group(addrs, + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + ); __le16 seq_ctrl; u8 addr4[ETH_ALEN]; } __packed __aligned(2); -- cgit v1.2.3 From bce1b56c73826fec8caf6187f0c922ede397a5a8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 4 Sep 2022 06:39:25 -0600 Subject: Revert "sbitmap: fix batched wait_cnt accounting" This reverts commit 16ede66973c84f890c03584f79158dd5b2d725f5. This is causing issues with CPU stalls on my test box, revert it for now until we understand what is going on. It looks like infinite looping off sbitmap_queue_wake_up(), but hard to tell with a lot of CPUs hitting this issue and the console scrolling infinitely. Link: https://lore.kernel.org/linux-block/e742813b-ce5c-0d58-205b-1626f639b1bd@kernel.dk/ Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 4d2d5205ab58..8f5a86e210b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,9 +575,8 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. - * @nr: Number of bits cleared. 
*/ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From 2e9bffc4f713db465177238f6033f7d367d6f151 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Thu, 25 Aug 2022 21:38:34 +0200 Subject: phy: rockchip: Support PCIe v3 Unlike PCIe v2 on the rk3566, which uses the Combphy, PCIe v3 on the RK3568 uses a dedicated PCIe PHY. Add support for this. Initial support by Shawn Lin, modifications by Peter Geis and Frank Wunderlich. Add a data-lanes property for splitting PCIe lanes across controllers. data-lanes is an array where 0 means the lane is disabled and x > 0 means the PHY lane is assigned to controller x. Signed-off-by: Shawn Lin Suggested-by: Peter Geis Signed-off-by: Frank Wunderlich Link: https://lore.kernel.org/r/20220825193836.54262-4-linux@fw-web.de Signed-off-by: Vinod Koul --- include/linux/phy/pcie.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 include/linux/phy/pcie.h (limited to 'include/linux') diff --git a/include/linux/phy/pcie.h b/include/linux/phy/pcie.h new file mode 100644 index 000000000000..e7ac81764576 --- /dev/null +++ b/include/linux/phy/pcie.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Rockchip Electronics Co., Ltd. + */ +#ifndef __PHY_PCIE_H +#define __PHY_PCIE_H + +#define PHY_MODE_PCIE_RC 20 +#define PHY_MODE_PCIE_EP 21 +#define PHY_MODE_PCIE_BIFURCATION 22 + +#endif -- cgit v1.2.3 From 9c06002682ae0cdd01caf011899224acbc6582b2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 Jul 2022 20:22:35 +0300 Subject: dmaengine: hsu: Include headers we are direct user of For the sake of integrity, include the headers we are a direct user of. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220713172235.22611-4-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dma/hsu.h | 6 ++++-- include/linux/platform_data/dma-hsu.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/hsu.h b/include/linux/dma/hsu.h index a6b7bc707356..77ea602c287c 100644 --- a/include/linux/dma/hsu.h +++ b/include/linux/dma/hsu.h @@ -8,11 +8,13 @@ #ifndef _DMA_HSU_H #define _DMA_HSU_H -#include -#include +#include +#include +#include #include +struct device; struct hsu_dma; /** diff --git a/include/linux/platform_data/dma-hsu.h b/include/linux/platform_data/dma-hsu.h index c65b412b2b33..611bae193c1c 100644 --- a/include/linux/platform_data/dma-hsu.h +++ b/include/linux/platform_data/dma-hsu.h @@ -8,7 +8,7 @@ #ifndef _PLATFORM_DATA_DMA_HSU_H #define _PLATFORM_DATA_DMA_HSU_H -#include +struct device; struct hsu_dma_slave { struct device *dma_dev; -- cgit v1.2.3 From 0eadd36d9123745f70e233a9d93951d05ca1916a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 3 Sep 2022 23:18:47 -0700 Subject: gpiolib: make fwnode_get_named_gpiod() static There are no external users of fwnode_get_named_gpiod() anymore, so let's stop exporting it and mark it as static.
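For reference, a hedged sketch of the surviving public accessor, whose signature appears in the diff below (the "reset" con_id and label are made up for illustration):

	struct gpio_desc *desc;

	desc = fwnode_gpiod_get_index(fwnode, "reset", 0, GPIOD_OUT_LOW, "reset");
	if (IS_ERR(desc))
		return PTR_ERR(desc);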
Signed-off-by: Dmitry Torokhov Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index fe0f460d9a3b..36460ced060b 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -174,10 +174,6 @@ int desc_to_gpio(const struct gpio_desc *desc); /* Child properties interface */ struct fwnode_handle; -struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label); struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, @@ -553,15 +549,6 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) /* Child properties interface */ struct fwnode_handle; -static inline -struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label) -{ - return ERR_PTR(-ENOSYS); -} - static inline struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, -- cgit v1.2.3 From 4a502cf4d77e12119e7061a05d5789cd3129d185 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 Sep 2022 10:32:03 +0200 Subject: net: pcs: add new PCS driver for altera TSE PCS The Altera Triple Speed Ethernet has an SGMII/1000BaseX PCS that can be integrated in several ways. It can either be part of the TSE MAC's address space, accessed through 32-bit accesses on the mapped mdio device 0, or through a dedicated 16-bit register set. This driver allows using the TSE PCS outside of the altera TSE driver, since it can be used standalone by other MACs. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/pcs-altera-tse.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/pcs-altera-tse.h (limited to 'include/linux') diff --git a/include/linux/pcs-altera-tse.h b/include/linux/pcs-altera-tse.h new file mode 100644 index 000000000000..92ab9f08e835 --- /dev/null +++ b/include/linux/pcs-altera-tse.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 Bootlin + * + * Maxime Chevallier + */ + +#ifndef __LINUX_PCS_ALTERA_TSE_H +#define __LINUX_PCS_ALTERA_TSE_H + +struct phylink_pcs; +struct net_device; + +struct phylink_pcs *alt_tse_pcs_create(struct net_device *ndev, + void __iomem *pcs_base, int reg_width); + +#endif /* __LINUX_PCS_ALTERA_TSE_H */ -- cgit v1.2.3 From dec9b2f1e0455a151a7293c367da22ab973f713e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 16:59:15 +0200 Subject: debugfs: add debugfs_lookup_and_remove() There is a very common pattern of using debugfs_remove(debugfs_lookup(..)), which leaks the dentry that was looked up. Instead of having to open-code the correct pattern of calling dput() on the dentry, create debugfs_lookup_and_remove() to handle this pattern automatically and properly without any memory leaks.
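A hedged before-and-after sketch of the pattern being fixed ("foo" and the parent dentry are illustrative, not from the patch):

	/* Leaky pattern: the reference returned by debugfs_lookup() is never dropped. */
	debugfs_remove(debugfs_lookup("foo", parent));

	/* Correct open-coded form: drop the looked-up reference with dput(). */
	struct dentry *dentry = debugfs_lookup("foo", parent);

	debugfs_remove(dentry);
	dput(dentry);

	/* New helper: lookup, removal and dput() in one call. */
	debugfs_lookup_and_remove("foo", parent);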
Cc: stable Reported-by: Kuyo Chang Tested-by: Kuyo Chang Link: https://lore.kernel.org/r/YxIaQ8cSinDR881k@kroah.com Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index c869f1e73d75..f60674692d36 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -91,6 +91,8 @@ struct dentry *debugfs_create_automount(const char *name, void debugfs_remove(struct dentry *dentry); #define debugfs_remove_recursive debugfs_remove +void debugfs_lookup_and_remove(const char *name, struct dentry *parent); + const struct file_operations *debugfs_real_fops(const struct file *filp); int debugfs_file_get(struct dentry *dentry); @@ -225,6 +227,10 @@ static inline void debugfs_remove(struct dentry *dentry) static inline void debugfs_remove_recursive(struct dentry *dentry) { } +static inline void debugfs_lookup_and_remove(const char *name, + struct dentry *parent) +{ } + const struct file_operations *debugfs_real_fops(const struct file *filp); static inline int debugfs_file_get(struct dentry *dentry) -- cgit v1.2.3 From 9ca05b0f27de928be121cccf07735819dc9e1ed3 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 29 Aug 2022 12:02:27 +0300 Subject: RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile When the RDMA auxiliary driver probes, it sets its profile based on the devlink driverinit value. The latter might not be in sync with FW yet (in case devlink reload has not been performed), causing a mismatch between the RDMA driver and FW. This results in the following FW syndrome when the RDMA driver tries to adjust the RoCE state, which fails the probe: "0xC1F678 | modify_nic_vport_context: roce_en set on a vport that doesn't support roce" To prevent this, select the PF profile based on the FW RoCE capability instead of relying on the devlink driverinit value. To provide backward compatibility for the RoCE disable feature on older FWs where roce_rw is not set (the FW RoCE capability is read-only), keep the current behavior, i.e., rely on the devlink driverinit value. Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level") Reviewed-by: Shay Drory Reviewed-by: Michael Guralnik Reviewed-by: Saeed Mahameed Signed-off-by: Maher Sanalla Link: https://lore.kernel.org/r/cb34ce9a1df4a24c135cb804db87f7d2418bd6cc.1661763459.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96b16fbe1aa4..d6338fb449c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1279,16 +1279,17 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; -static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev) +bool mlx5_is_roce_on(struct mlx5_core_dev *dev); + +static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) { - struct devlink *devlink = priv_to_devlink(dev); - union devlink_param_value val; - int err; - - err = devlink_param_driverinit_value_get(devlink, - DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, - &val); - return err ?
MLX5_CAP_GEN(dev, roce) : val.vbool; + if (MLX5_CAP_GEN(dev, roce_rw_supported)) + return MLX5_CAP_GEN(dev, roce); + + /* If RoCE cap is read-only in FW, get RoCE state from devlink + * in order to support RoCE enable/disable feature + */ + return mlx5_is_roce_on(dev); } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 8a2dd123f12f69e5373d3103da2c97fc36223e0c Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 29 Aug 2022 12:06:04 +0300 Subject: RDMA/mlx5: Move function mlx5_core_query_ib_ppcnt() to mlx5_ib This patch doesn't change any functionality, but moves one function to mlx5_ib because it is not used by mlx5_core. The actual fix is in the next patch. Reviewed-by: Roi Dayan Signed-off-by: Chris Mi Link: https://lore.kernel.org/r/fd47b9138412bd94ed30f838026cbb4cf3878150.1661763871.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76d7661e3e63..432bd202e2fe 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1079,8 +1079,6 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); -int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, - u8 port_num, void *out, size_t sz); int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 05ad5d4581c3c1cc724fe50d4652833fb9f3037b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 2 Sep 2022 18:02:39 -0400 Subject: net: phy: Add 1000BASE-KX interface mode Add 1000BASE-KX interface mode. This is 1G backplane Ethernet, as described in Clause 70. Clause 73 autonegotiation is mandatory, and only full duplex operation is supported. Although at the PMA level this interface mode is identical to 1000BASE-X, it uses a different form of in-band autonegotiation. This justifies a separate interface mode, since the interface mode (along with the MLO_AN_* autonegotiation mode) sets the type of autonegotiation which will be used on a link. This results in more than just electrical differences between the link modes. With regard to 1000BASE-X, 1000BASE-KX holds a similar position to SGMII: same signaling, but different autonegotiation. PCS drivers (which typically handle in-band autonegotiation) may only support 1000BASE-X, and not 1000BASE-KX. Similarly, the phy mode is used to configure serdes phys with phy_set_mode_ext. Due to the different electrical standards (SFI or XFI vs Clause 70), they will likely want to use a different configuration. Adding a phy interface mode for 1000BASE-KX helps simplify configuration in these areas. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c49ab95441b..337230c135f7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -116,6 +116,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII + * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY.
@@ -154,6 +155,7 @@ typedef enum { /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, + PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -251,6 +253,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; + case PHY_INTERFACE_MODE_1000BASEKX: + return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: -- cgit v1.2.3 From 7c8199e24fa09d2344ae0204527d55d7803e8409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:43 -0700 Subject: bpf: Introduce any context BPF specific memory allocator. Tracing BPF programs can attach to kprobe and fentry. Hence they run in unknown context where calling plain kmalloc() might not be safe. Front-end kmalloc() with minimal per-cpu cache of free elements. Refill this cache asynchronously from irq_work. BPF programs always run with migration disabled. It's safe to allocate from cache of the current cpu with irqs disabled. Free-ing is always done into bucket of the current cpu as well. irq_work trims extra free elements from buckets with kfree and refills them with kmalloc, so global kmalloc logic takes care of freeing objects allocated by one cpu and freed on another. struct bpf_mem_alloc supports two modes: - When size != 0 create kmem_cache and bpf_mem_cache for each cpu. This is typical bpf hash map use case when all elements have equal size. - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on kmalloc/kfree. Max allocation size is 4096 in this case. This is bpf_dynptr and bpf_kptr use case. bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree. bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free. The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/bpf_mem_alloc.h (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 4ab67149f3c6e97c5c506a726f0ebdec38241679 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:52 -0700 Subject: bpf: Add percpu allocation support to bpf_mem_alloc. 
Extend bpf_mem_alloc to cache free list of fixed size per-cpu allocations. Once such cache is created bpf_mem_cache_alloc() will return per-cpu objects. bpf_mem_cache_free() will free them back into global per-cpu pool after observing RCU grace period. per-cpu flavor of bpf_mem_alloc is going to be used by per-cpu hash maps. The free list cache consists of tuples { llist_node, per-cpu pointer } Unlike alloc_percpu() that returns per-cpu pointer the bpf_mem_cache_alloc() returns a pointer to per-cpu pointer and bpf_mem_cache_free() expects to receive it back. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-11-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 804733070f8d..653ed1584a03 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -12,7 +12,7 @@ struct bpf_mem_alloc { struct bpf_mem_cache __percpu *cache; }; -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ -- cgit v1.2.3 From 9f2c6e96c65e6fa1aebef546be0c30a5895fcb37 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:58 -0700 Subject: bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc. User space might be creating and destroying a lot of hash maps. Synchronous rcu_barrier-s in a destruction path of hash map delay freeing of hash buckets and other map memory and may cause artificial OOM situation under stress. Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc: - remove rcu_barrier from hash map, since htab doesn't use call_rcu directly and there are no callback to wait for. - bpf_mem_alloc has call_rcu_in_progress flag that indicates pending callbacks. Use it to avoid barriers in fast path. - When barriers are needed copy bpf_mem_alloc into temp structure and wait for rcu barrier-s in the worker to let the rest of hash map freeing to proceed. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220902211058.60789-17-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 653ed1584a03..3e164b8efaa9 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -3,6 +3,7 @@ #ifndef _BPF_MEM_ALLOC_H #define _BPF_MEM_ALLOC_H #include +#include struct bpf_mem_cache; struct bpf_mem_caches; @@ -10,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct work_struct work; }; int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); -- cgit v1.2.3 From f0b933236ec97de5ee49c60aae57a9c5c4dadc87 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Sun, 4 Sep 2022 12:28:39 +0200 Subject: lib/string_helpers: Introduce parse_int_array_user() Add new helper function to allow for splitting specified user string into a sequence of integers. 
Internally it makes use of get_options() so the returned sequence contains the integers extracted plus an additional element that begins the sequence and specifies the integers count. Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Cezary Rojewski Link: https://lore.kernel.org/r/20220904102840.862395-2-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/linux/string_helpers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 4d72258d42fd..dc2e726fd820 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -21,6 +21,8 @@ enum string_size_units { void string_get_size(u64 size, u64 blk_size, enum string_size_units units, char *buf, int len); +int parse_int_array_user(const char __user *from, size_t count, int **array); + #define UNESCAPE_SPACE BIT(0) #define UNESCAPE_OCTAL BIT(1) #define UNESCAPE_HEX BIT(2) -- cgit v1.2.3 From 8409fe92d88c332923130149fe209d1c882b286e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 27 Aug 2022 15:03:43 +0200 Subject: PCI: Move PCI_VENDOR_ID_MICROSOFT/PCI_DEVICE_ID_HYPERV_VIDEO definitions to pci_ids.h There are already three places in kernel which define PCI_VENDOR_ID_MICROSOFT and two for PCI_DEVICE_ID_HYPERV_VIDEO and there's a need to use these from core VMBus code. Move the defines where they belong. No functional change. Reviewed-by: Michael Kelley Acked-by: Bjorn Helgaas # pci_ids.h Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20220827130345.1320254-2-vkuznets@redhat.com Signed-off-by: Wei Liu --- include/linux/pci_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 6feade66efdb..15b49e655ce3 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2079,6 +2079,9 @@ #define PCI_DEVICE_ID_ICE_1712 0x1712 #define PCI_DEVICE_ID_VT1724 0x1724 +#define PCI_VENDOR_ID_MICROSOFT 0x1414 +#define PCI_DEVICE_ID_HYPERV_VIDEO 0x5353 + #define PCI_VENDOR_ID_OXSEMI 0x1415 #define PCI_DEVICE_ID_OXSEMI_12PCI840 0x8403 #define PCI_DEVICE_ID_OXSEMI_PCIe840 0xC000 -- cgit v1.2.3 From 2bc9cd66eb25d0fefbb081421d6586495e25840e Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 29 Aug 2022 11:18:40 +0200 Subject: iio: Use per-device lockdep class for mlock If an IIO driver uses callbacks from another IIO driver and calls iio_channel_start_all_cb() from one of its buffer setup ops, then lockdep complains due to the lock nesting, as in the below example with lmp91000. Since the locks are being taken on different IIO devices, there is no actual deadlock. Fix the warning by telling lockdep to use a different class for each iio_device. 
============================================ WARNING: possible recursive locking detected -------------------------------------------- python3/23 is trying to acquire lock: (&indio_dev->mlock){+.+.}-{3:3}, at: iio_update_buffers but task is already holding lock: (&indio_dev->mlock){+.+.}-{3:3}, at: enable_store other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&indio_dev->mlock); lock(&indio_dev->mlock); *** DEADLOCK *** May be due to missing lock nesting notation 5 locks held by python3/23: #0: (sb_writers#5){.+.+}-{0:0}, at: ksys_write #1: (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter #2: (kn->active#14){.+.+}-{0:0}, at: kernfs_fop_write_iter #3: (&indio_dev->mlock){+.+.}-{3:3}, at: enable_store #4: (&iio_dev_opaque->info_exist_lock){+.+.}-{3:3}, at: iio_update_buffers Call Trace: __mutex_lock iio_update_buffers iio_channel_start_all_cb lmp91000_buffer_postenable __iio_update_buffers enable_store Fixes: 67e17300dc1d76 ("iio: potentiostat: add LMP91000 support") Signed-off-by: Vincent Whitchurch Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829091840.2791846-1-vincent.whitchurch@axis.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index 6b3586b3f952..d1f8b30a7c8b 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -11,6 +11,7 @@ * checked by device drivers but should be considered * read-only as this is a core internal bit * @driver_module: used to make it harder to undercut users + * @mlock_key: lockdep class for iio_dev lock * @info_exist_lock: lock to prevent use during removal * @trig_readonly: mark the current trigger immutable * @event_interface: event chrdevs associated with interrupt lines @@ -42,6 +43,7 @@ struct iio_dev_opaque { int currentmode; int id; struct module *driver_module; + struct lock_class_key mlock_key; struct mutex info_exist_lock; bool trig_readonly; struct iio_event_interface *event_interface; -- cgit v1.2.3 From 835e699ef82adfc85ac4cc3f1f237c1adfdefd20 Mon Sep 17 00:00:00 2001 From: Jagath Jog J Date: Wed, 31 Aug 2022 12:01:16 +0530 Subject: iio: Add new event type gesture and use direction for single and double tap Add a new event type for taps, called gesture; the direction can be used to differentiate single and double taps. This may be used by accelerometer sensors to express single and double tap events. For directional taps, modifiers like IIO_MOD_(X/Y/Z) can be used along with the singletap and doubletap directions.
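A hedged sketch of how a driver might report such an event, assuming the IIO_EV_TYPE_GESTURE and IIO_EV_DIR_DOUBLETAP enum values introduced alongside this change (the channel number and timestamp handling are illustrative):

	/* Push a double-tap gesture event on the X axis. */
	iio_push_event(indio_dev,
		       IIO_MOD_EVENT_CODE(IIO_ACCEL, 0, IIO_MOD_X,
					  IIO_EV_TYPE_GESTURE,
					  IIO_EV_DIR_DOUBLETAP),
		       iio_get_time_ns(indio_dev));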
Signed-off-by: Jagath Jog J Link: https://lore.kernel.org/r/20220831063117.4141-2-jagathjog1996@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 27143b03909d..82faa98c719a 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -17,6 +17,8 @@ enum iio_event_info { IIO_EV_INFO_HIGH_PASS_FILTER_3DB, IIO_EV_INFO_LOW_PASS_FILTER_3DB, IIO_EV_INFO_TIMEOUT, + IIO_EV_INFO_RESET_TIMEOUT, + IIO_EV_INFO_TAP2_MIN_DELAY, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From ead384d956345681e1ddf97890d5e15ded015f07 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 19 Aug 2022 18:20:37 +0800 Subject: efi/loongarch: Add efistub booting support This patch adds efistub booting support, which is the standard UEFI boot protocol for LoongArch to use. We use generic efistub, which means we can pass boot information (i.e., system table, memory map, kernel command line, initrd) via a light FDT and drop a lot of non-standard code. We use a flat mapping to map the efi runtime in the kernel's address space. In efi, VA = PA; in kernel, VA = PA + PAGE_OFFSET. As a result, flat mapping is not identity mapping, SetVirtualAddressMap() is still needed for the efi runtime. Tested-by: Xi Ruoyao Signed-off-by: Huacai Chen [ardb: change fpic to fpie as suggested by Xi Ruoyao] Signed-off-by: Ard Biesheuvel --- include/linux/pe.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index daf09ffffe38..1d3836ef9d92 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -65,6 +65,8 @@ #define IMAGE_FILE_MACHINE_SH5 0x01a8 #define IMAGE_FILE_MACHINE_THUMB 0x01c2 #define IMAGE_FILE_MACHINE_WCEMIPSV2 0x0169 +#define IMAGE_FILE_MACHINE_LOONGARCH32 0x6232 +#define IMAGE_FILE_MACHINE_LOONGARCH64 0x6264 /* flags */ #define IMAGE_FILE_RELOCS_STRIPPED 0x0001 -- cgit v1.2.3 From 3aac580d5cc3001ca1627725b3b61edb529f341d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:54 -0700 Subject: perf: Add sample_flags to indicate the PMU-filled sample data On some platforms, some data e.g., timestamps, can be retrieved from the PMU driver. Usually, the data from the PMU driver is more accurate. The current perf kernel should output the PMU-filled sample data if it's available. To check the availability of the PMU-filled sample data, the current perf kernel initializes the related fields in the perf_sample_data_init(). When outputting a sample, the perf checks whether the field is updated by the PMU driver. If yes, the updated value will be output. If not, the perf uses an SW way to calculate the value or just outputs the initialized value if an SW way is unavailable either. With more and more data being provided by the PMU driver, more fields has to be initialized in the perf_sample_data_init(). That will increase the number of cache lines touched in perf_sample_data_init() and be harmful to the performance. Add new "sample_flags" to indicate the PMU-filled sample data. The PMU driver should set the corresponding PERF_SAMPLE_ flag when the field is updated. The initialization of the corresponding field is not required anymore. The following patches will make use of it and remove the corresponding fields from the perf_sample_data_init(), which will further minimize the number of cache lines touched. 
Only clear the sample flags that have already been done by the PMU driver in the perf_prepare_sample() for the PERF_RECORD_SAMPLE. For the other PERF_RECORD_ event type, the sample data is not available. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1999408a9cbb..0978165a2d87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1008,6 +1008,7 @@ struct perf_sample_data { * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ + u64 sample_flags; u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -1057,6 +1058,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ + data->sample_flags = 0; data->addr = addr; data->raw = NULL; data->br_stack = NULL; -- cgit v1.2.3 From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- include/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 addr; struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; -- cgit v1.2.3 From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. 
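A sketch of the consumer side this implies (assumed shape, not the literal hunk): when a sample is prepared, the weight is only trusted if the PMU driver flagged it:

	/* sketch: fall back to a zero weight unless the PMU filled it in */
	if (!(data->sample_flags & PERF_SAMPLE_WEIGHT_TYPE))
		data->weight.full = 0;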
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } -- cgit v1.2.3 From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } -- cgit v1.2.3 From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* -- cgit v1.2.3 From 0083d27b21dd2a598df8275b98f89ced532e2e53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Sep 2022 09:38:42 -1000 Subject: cgroup: Improve cftype add/rm error handling Let's track whether a cftype is currently added or not using a new flag __CFTYPE_ADDED so that duplicate operations can be failed safely and consistently allow using empty cftypes. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1283993d7ea8..9fa1652d0493 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -131,6 +131,7 @@ enum { /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ + __CFTYPE_ADDED = (1 << 18), }; /* -- cgit v1.2.3 From 8a693f7766f9e27c390c5fec8a5db1f9d1d8177e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Sep 2022 09:38:55 -1000 Subject: cgroup: Remove CFTYPE_PRESSURE CFTYPE_PRESSURE is used to flag PSI related files so that they are not created if PSI is disabled during boot. It's a bit weird to use a generic flag to mark a specific file type. Let's instead move the PSI files into its own cftypes array and add/rm them conditionally. This is a bit more code but cleaner. No userland visible changes. Signed-off-by: Tejun Heo Cc: Johannes Weiner --- include/linux/cgroup-defs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9fa1652d0493..8f481d1b159a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -126,7 +126,6 @@ enum { CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ -- cgit v1.2.3 From 14db0b3c7b837f4edeb7c1794290c2f345c7f627 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Aug 2022 16:50:51 -0700 Subject: fscrypt: stop using PG_error to track error status As a step towards freeing the PG_error flag for other uses, change ext4 and f2fs to stop using PG_error to track decryption errors. 
Instead, if a decryption error occurs, just mark the whole bio as failed. The coarser granularity isn't really a problem since it isn't any worse than what the block layer provides, and errors from a multi-page readahead aren't reported to applications unless a single-page read fails too. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu # for f2fs part Link: https://lore.kernel.org/r/20220815235052.86545-2-ebiggers@kernel.org --- include/linux/fscrypt.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b95b8601b9c1..488fd8c8f8af 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -351,7 +351,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ -void fscrypt_decrypt_bio(struct bio *bio); +bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); @@ -644,8 +644,9 @@ static inline int fscrypt_d_revalidate(struct dentry *dentry, } /* bio.c */ -static inline void fscrypt_decrypt_bio(struct bio *bio) +static inline bool fscrypt_decrypt_bio(struct bio *bio) { + return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, -- cgit v1.2.3 From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:46 -0700 Subject: bpf: Allow struct argument in trampoline based programs Allow struct argument in trampoline based programs where the struct size should be <= 16 bytes. In such cases, the argument will be put into up to 2 registers for bpf, x86_64 and arm64 architectures. To support arch-specific trampoline manipulation, add arg_flags for additional struct information about arguments in btf_func_model. Such information will be used in arch specific function arch_prepare_bpf_trampoline() to prepare argument access properly in trampoline. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..4d32f125f4af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_REG_ARGS 5 +/* The argument is a structure. */ +#define BTF_FMODEL_STRUCT_ARG BIT(0) + struct btf_func_model { u8 ret_size; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; + u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function -- cgit v1.2.3 From be376df724aa3b7abdf79390eaab60c58a92f4f0 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 27 Aug 2022 04:49:03 +0200 Subject: wifi: brcmfmac: add 43439 SDIO ids and initialization Add HW and SDIO ids for use with the muRata 1YN (Cypress CYW43439). Add the firmware mapping structures for the CYW43439 chipset. The 43439 needs some things setup similar to the 43430 chipset. 
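For context, the new id is typically consumed as an entry in the driver's sdio_device_id table; a hedged sketch (the surrounding table is illustrative, not part of this hunk):

	static const struct sdio_device_id brcmf_sdmmc_ids[] = {
		/* ... existing Broadcom entries elided ... */
		{ SDIO_DEVICE(SDIO_VENDOR_ID_BROADCOM,
			      SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439) },
		{ /* end: all zeroes */ }
	};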
Signed-off-by: Marek Vasut Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220827024903.617294-1-marex@denx.de --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 53f0efa0bccf..74f9d9a6d330 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -74,6 +74,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xa9af #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 -- cgit v1.2.3 From 9fc18f6d56d5b79d527c17a8100a0965d18345cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Aug 2022 16:06:44 +0200 Subject: dma-mapping: mark dma_supported static Now that the remaining users in drivers are gone, this function can be marked static. Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 25a30906289d..0ee20b764000 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -139,7 +139,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); -int dma_supported(struct device *dev, u64 mask); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); @@ -248,10 +247,6 @@ static inline bool dma_can_mmap(struct device *dev) { return false; } -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 0; -} static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; -- cgit v1.2.3 From 283945017cbf685546ba7d065f254ad77eb888b1 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Mon, 15 Aug 2022 01:33:39 +0000 Subject: iommu: Remove comment of dev_has_feat in struct doc dev_has_feat has been removed from iommu_ops in commit 309c56e84602 ("iommu: remove the unused dev_has_feat method"), remove its description in the struct doc. Signed-off-by: Yuan Can Link: https://lore.kernel.org/r/20220815013339.2552-1-yuancan@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ea30f00dc145..a004e87e0e1d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -212,7 +212,7 @@ struct iommu_iotlb_gather { * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) - * @dev_has/enable/disable_feat: per device entries to check/enable/disable + * @dev_enable/disable_feat: per device entries to enable/disable * iommu specific features. * @sva_bind: Bind process address space to device * @sva_unbind: Unbind process address space from device -- cgit v1.2.3 From a1be74c5384c4a47d91ab4af2ed854d3b7eaf763 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Sep 2022 13:58:43 +0300 Subject: net/mlx5: Introduce ifc bits for page tracker Introduce ifc related stuff to enable using page tracker. 
A page tracker is a dirty page tracking object used by the device to report the tracking log. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220905105852.26398-2-yishaih@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 83 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..06eab92b9fb3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -89,6 +89,7 @@ enum { MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c, MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, + MLX5_OBJ_TYPE_PAGE_TRACK = 0x46, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, MLX5_OBJ_TYPE_PSV = 0xff03, @@ -1733,7 +1734,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_geneve_tlv_options[0x8]; u8 reserved_at_568[0x3]; u8 max_geneve_tlv_option_data_len[0x5]; - u8 reserved_at_570[0x10]; + u8 reserved_at_570[0x9]; + u8 adv_virtualization[0x1]; + u8 reserved_at_57a[0x6]; u8 reserved_at_580[0xb]; u8 log_max_dci_stream_channels[0x5]; @@ -11818,4 +11821,82 @@ struct mlx5_ifc_load_vhca_state_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_adv_virtualization_cap_bits { + u8 reserved_at_0[0x3]; + u8 pg_track_log_max_num[0x5]; + u8 pg_track_max_num_range[0x8]; + u8 pg_track_log_min_addr_space[0x8]; + u8 pg_track_log_max_addr_space[0x8]; + + u8 reserved_at_20[0x3]; + u8 pg_track_log_min_msg_size[0x5]; + u8 reserved_at_28[0x3]; + u8 pg_track_log_max_msg_size[0x5]; + u8 reserved_at_30[0x3]; + u8 pg_track_log_min_page_size[0x5]; + u8 reserved_at_38[0x3]; + u8 pg_track_log_max_page_size[0x5]; + + u8 reserved_at_40[0x7c0]; +}; + +struct mlx5_ifc_page_track_report_entry_bits { + u8 dirty_address_high[0x20]; + + u8 dirty_address_low[0x20]; +}; + +enum { + MLX5_PAGE_TRACK_STATE_TRACKING, + MLX5_PAGE_TRACK_STATE_REPORTING, + MLX5_PAGE_TRACK_STATE_ERROR, +}; + +struct mlx5_ifc_page_track_range_bits { + u8 start_address[0x40]; + + u8 length[0x40]; +}; + +struct mlx5_ifc_page_track_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x10]; + u8 vhca_id[0x10]; + + u8 reserved_at_60[0x20]; + + u8 state[0x4]; + u8 track_type[0x4]; + u8 log_addr_space_size[0x8]; + u8 reserved_at_90[0x3]; + u8 log_page_size[0x5]; + u8 reserved_at_98[0x3]; + u8 log_msg_size[0x5]; + + u8 reserved_at_a0[0x8]; + u8 reporting_qpn[0x18]; + + u8 reserved_at_c0[0x18]; + u8 num_ranges[0x8]; + + u8 reserved_at_e0[0x20]; + + u8 range_start_address[0x40]; + + u8 length[0x40]; + + struct mlx5_ifc_page_track_range_bits track_range[0]; +}; + +struct mlx5_ifc_create_page_track_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + +struct mlx5_ifc_modify_page_track_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 939838632b9119614128028eaea3b1d7bf29f16f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Sep 2022 13:58:44 +0300 Subject: net/mlx5: Query ADV_VIRTUALIZATION capabilities Query ADV_VIRTUALIZATION capabilities which provide information for advanced virtualization related features. Current capabilities refer to the page tracker object which is used for tracking the pages that are dirtied by the device. 
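A hedged sketch of how a driver would consume these accessors (caller-side code assumed, not part of this patch):

	/* sketch: gate page-tracker setup on the capability, then read a limit */
	if (!MLX5_CAP_GEN(mdev, adv_virtualization))
		return -EOPNOTSUPP;

	max_ranges = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);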
Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220905105852.26398-3-yishaih@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b5f58fd37a0f..5b41b9fb3d48 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1200,6 +1200,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_SHAMPO = 0x1d, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, + MLX5_CAP_ADV_VIRTUALIZATION = 0x26, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1365,6 +1366,14 @@ enum mlx5_qcam_feature_groups { MLX5_GET(port_selection_cap, \ mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->max, cap) +#define MLX5_CAP_ADV_VIRTUALIZATION(mdev, cap) \ + MLX5_GET(adv_virtualization_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->cur, cap) + +#define MLX5_CAP_ADV_VIRTUALIZATION_MAX(mdev, cap) \ + MLX5_GET(adv_virtualization_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->max, cap) + #define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \ MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap) -- cgit v1.2.3 From 2d4697375dea514be202abf563a9419e74489c25 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Sep 2022 00:27:42 +0300 Subject: swab: Add array operations For now, some simple array operations to swab. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20220831212744.56435-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/swab.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swab.h b/include/linux/swab.h index bcff5149861a..9b804dbb0d79 100644 --- a/include/linux/swab.h +++ b/include/linux/swab.h @@ -20,4 +20,29 @@ # define swab64s __swab64s # define swahw32s __swahw32s # define swahb32s __swahb32s + +static inline void swab16_array(u16 *buf, unsigned int words) +{ + while (words--) { + swab16s(buf); + buf++; + } +} + +static inline void swab32_array(u32 *buf, unsigned int words) +{ + while (words--) { + swab32s(buf); + buf++; + } +} + +static inline void swab64_array(u64 *buf, unsigned int words) +{ + while (words--) { + swab64s(buf); + buf++; + } +} + #endif /* _LINUX_SWAB_H */ -- cgit v1.2.3 From 359ad15763762c713a51300134e784a72eb9cb80 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Aug 2022 16:26:49 +0100 Subject: iommu: Retire iommu_capable() With all callers now converted to the device-specific version, retire the old bus-based interface, and give drivers the chance to indicate accurate per-instance capabilities. 
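A sketch of what a converted callsite looks like (device and capability chosen for illustration):

	/* sketch: query the capability against the device's own IOMMU instance */
	if (device_iommu_capable(&pdev->dev, IOMMU_CAP_CACHE_COHERENCY))
		prot |= IOMMU_CACHE;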
Signed-off-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/d8bd8777d06929ad8f49df7fc80e1b9af32a41b5.1660574547.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a004e87e0e1d..0013a1befe7b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -227,7 +227,7 @@ struct iommu_iotlb_gather { * @owner: Driver module providing these ops */ struct iommu_ops { - bool (*capable)(enum iommu_cap); + bool (*capable)(struct device *dev, enum iommu_cap); /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); @@ -420,7 +420,6 @@ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); -extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); extern struct iommu_group *iommu_group_get_by_id(int id); extern void iommu_domain_free(struct iommu_domain *domain); @@ -697,11 +696,6 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } -static inline bool iommu_capable(struct bus_type *bus, enum iommu_cap cap) -{ - return false; -} - static inline struct iommu_domain *iommu_domain_alloc(struct bus_type *bus) { return NULL; -- cgit v1.2.3 From 29e932295bfaba792d29e66e8be0637ff3994724 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Aug 2022 17:20:17 +0100 Subject: iommu: Clean up bus_set_iommu() Clean up the remaining trivial bus_set_iommu() callsites along with the implementation. Now drivers only have to know and care about iommu_device instances, phew! Reviewed-by: Kevin Tian Tested-by: Matthew Rosato # s390 Tested-by: Niklas Schnelle # s390 Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/ea383d5f4d74ffe200ab61248e5de6e95846180a.1660572783.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0013a1befe7b..35810f87ae74 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -416,7 +416,6 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } -extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); -- cgit v1.2.3 From fa49364cd5e65797014688cba623541083b3e5b0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Aug 2022 18:28:04 +0100 Subject: iommu/dma: Move public interfaces to linux/iommu.h The iommu-dma layer is now mostly encapsulated by iommu_dma_ops, with only a couple more public interfaces left pertaining to MSI integration. Since these depend on the main IOMMU API header anyway, move their declarations there, taking the opportunity to update the half-baked comments to proper kerneldoc along the way. 
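For orientation, a hedged sketch of the MSI flow these interfaces serve in an irqchip driver (doorbell and msg names illustrative):

	/* sketch: map the MSI doorbell when the interrupt is allocated... */
	err = iommu_dma_prepare_msi(desc, doorbell_phys);
	if (err)
		return err;

	/* ...and later rewrite the composed message if it was remapped */
	iommu_dma_compose_msi_msg(desc, &msg);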
Signed-off-by: Robin Murphy Acked-by: Catalin Marinas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/9cd99738f52094e6bed44bfee03fa4f288d20695.1660668998.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 40 ---------------------------------------- include/linux/iommu.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 24607dc3c2ac..e83de4f1f3d6 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -15,27 +15,10 @@ /* Domain management interface for IOMMU drivers */ int iommu_get_dma_cookie(struct iommu_domain *domain); -int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); void iommu_put_dma_cookie(struct iommu_domain *domain); -/* Setup call for arch DMA mapping code */ -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); int iommu_dma_init_fq(struct iommu_domain *domain); -/* The DMA API isn't _quite_ the whole story, though... */ -/* - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU device - * - * The MSI page will be stored in @desc. - * - * Return: 0 on success otherwise an error describing the failure. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - -/* Update the MSI message if required. */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, - struct msi_msg *msg); - void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, @@ -46,15 +29,8 @@ extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; -struct msi_desc; -struct msi_msg; struct device; -static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, - u64 dma_limit) -{ -} - static inline int iommu_dma_init_fq(struct iommu_domain *domain) { return -EINVAL; @@ -65,26 +41,10 @@ static inline int iommu_get_dma_cookie(struct iommu_domain *domain) return -ENODEV; } -static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) -{ - return -ENODEV; -} - static inline void iommu_put_dma_cookie(struct iommu_domain *domain) { } -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, - phys_addr_t msi_addr) -{ - return 0; -} - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, - struct msi_msg *msg) -{ -} - static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 35810f87ae74..a325532aeab5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1063,4 +1063,40 @@ void iommu_debugfs_setup(void); static inline void iommu_debugfs_setup(void) {} #endif +#ifdef CONFIG_IOMMU_DMA +#include + +/* Setup call for arch DMA mapping code */ +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); + +int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); + +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); + +#else /* CONFIG_IOMMU_DMA */ + +struct msi_desc; +struct msi_msg; + +static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit) +{ +} + +static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) +{ + return -ENODEV; +} + +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t 
msi_addr) +{ + return 0; +} + +static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) +{ +} + +#endif /* CONFIG_IOMMU_DMA */ + #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From d1b2234b7fbf94348901bed4ea8d015c8a819d02 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:16 -0700 Subject: net/mlx5: Removed esp_id from struct mlx5_flow_act esp_id is no longer in use. Signed-off-by: Lior Nahmanson Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8e73c377da2c..920cbc9524ad 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -245,7 +245,6 @@ struct mlx5_flow_act { struct mlx5_pkt_reformat *pkt_reformat; union { u32 ipsec_obj_id; - uintptr_t esp_id; }; u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; -- cgit v1.2.3 From e227ee990bf974f655ec3132b496409990f6634b Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:17 -0700 Subject: net/mlx5: Generalize Flow Context for new crypto fields In order to support MACsec offload (and maybe some other crypto features in the future), generalize flow action parameters / defines to be used by crypto offloads other than IPsec. The following changes were made: the ipsec_obj_id field in the flow action context was changed to crypto_obj_id, and a new crypto_type field was introduced, where IPsec is the default zero type for backward compatibility. Action ipsec_decrypt was changed to crypto_decrypt. Action ipsec_encrypt was changed to crypto_encrypt. IPsec offload code was updated accordingly for backward compatibility. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- include/linux/mlx5/fs.h | 7 ++++--- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 920cbc9524ad..e62d50acb6bd 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -243,9 +243,10 @@ struct mlx5_flow_act { u32 action; struct mlx5_modify_hdr *modify_hdr; struct mlx5_pkt_reformat *pkt_reformat; - union { - u32 ipsec_obj_id; - }; + struct mlx5_flow_act_crypto_params { + u8 type; + u32 obj_id; + } crypto; u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..5758218cb3fa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3310,8 +3310,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 = 0x400, MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800, - MLX5_FLOW_CONTEXT_ACTION_IPSEC_DECRYPT = 0x1000, - MLX5_FLOW_CONTEXT_ACTION_IPSEC_ENCRYPT = 0x2000, + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT = 0x1000, + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT = 0x2000, MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO = 0x4000, }; @@ -3321,6 +3321,10 @@ enum { MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT = 0x2, }; +enum { + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, +}; + struct mlx5_ifc_vlan_bits { u8 ethtype[0x10]; u8 prio[0x3]; @@ -3374,7 +3378,7 @@ struct mlx5_ifc_flow_context_bits { u8 extended_destination[0x1]; u8 reserved_at_81[0x1]; u8 flow_source[0x2]; - u8 reserved_at_84[0x4]; + u8 encrypt_decrypt_type[0x4]; u8 destination_list_size[0x18]; u8 reserved_at_a0[0x8]; @@ -3386,7 +3390,7 @@ struct mlx5_ifc_flow_context_bits { struct mlx5_ifc_vlan_bits push_vlan_2; - u8 ipsec_obj_id[0x20]; + u8 encrypt_decrypt_obj_id[0x20]; u8 reserved_at_140[0xc0]; struct mlx5_ifc_fte_match_param_bits match_value; -- cgit v1.2.3 From 8385c51ff5bceb92f7401138e16b0a617ca2ab02 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:18 -0700 Subject: net/mlx5: Introduce MACsec Connect-X offload hardware bits and structures Add MACsec offload related IFC structs, layouts and enumerations. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- include/linux/mlx5/device.h | 4 ++ include/linux/mlx5/mlx5_ifc.h | 99 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b5f58fd37a0f..2927810f172b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1198,6 +1198,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, MLX5_CAP_DEV_SHAMPO = 0x1d, + MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, /* NUM OF CAP Types */ @@ -1446,6 +1447,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_DEV_SHAMPO(mdev, cap)\ MLX5_GET(shampo_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_SHAMPO], cap) +#define MLX5_CAP_MACSEC(mdev, cap)\ + MLX5_GET(macsec_cap, (mdev)->caps.hca[MLX5_CAP_MACSEC]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5758218cb3fa..8decbf9a7bdd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -82,6 +82,7 @@ enum { MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), + MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39), }; enum { @@ -449,7 +450,12 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_60[0x2]; u8 reformat_insert[0x1]; u8 reformat_remove[0x1]; - u8 reserver_at_64[0x14]; + u8 macsec_encrypt[0x1]; + u8 macsec_decrypt[0x1]; + u8 reserved_at_66[0x2]; + u8 reformat_add_macsec[0x1]; + u8 reformat_remove_macsec[0x1]; + u8 reserved_at_6a[0xe]; u8 log_max_ft_num[0x8]; u8 reserved_at_80[0x10]; @@ -611,7 +617,11 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 reserved_at_1a0[0x60]; + u8 reserved_at_1a0[0x8]; + + u8 macsec_syndrome[0x8]; + + u8 reserved_at_1b0[0x50]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -1276,6 +1286,24 @@ struct mlx5_ifc_ipsec_cap_bits { u8 reserved_at_30[0x7d0]; }; +struct mlx5_ifc_macsec_cap_bits { + u8 macsec_epn[0x1]; + u8 reserved_at_1[0x2]; + u8 macsec_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_macsec_offload[0x5]; + u8 reserved_at_10[0x10]; + + u8 min_log_macsec_full_replay_window[0x8]; + u8 max_log_macsec_full_replay_window[0x8]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x7c0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -3295,6 +3323,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; struct mlx5_ifc_shampo_cap_bits shampo_cap; + struct mlx5_ifc_macsec_cap_bits macsec_cap; u8 reserved_at_0[0x8000]; }; @@ -3323,6 +3352,7 @@ enum { enum { MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC = 0x1, }; struct mlx5_ifc_vlan_bits { @@ -6320,6 +6350,8 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, + MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, + MLX5_REFORMAT_TYPE_DEL_MACSEC = 0x12, }; struct mlx5_ifc_alloc_packet_reformat_context_in_bits { @@ -11475,6 +11507,7 @@ enum { MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, 
MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20, MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24, + MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27, }; enum { @@ -11525,6 +11558,67 @@ struct mlx5_ifc_modify_ipsec_obj_in_bits { struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; +struct mlx5_ifc_macsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_1[0x1]; + u8 mode[0x2]; + u8 window_size[0x2]; + u8 soft_lifetime_arm[0x1]; + u8 hard_lifetime_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 epn_event_arm[0x1]; + u8 reserved_at_a[0x16]; + + u8 remove_flow_packet_count[0x20]; + + u8 remove_flow_soft_lifetime[0x20]; + + u8 reserved_at_60[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[8][0x20]; +}; + +struct mlx5_ifc_macsec_offload_obj_bits { + u8 modify_field_select[0x40]; + + u8 confidentiality_en[0x1]; + u8 reserved_at_41[0x1]; + u8 esn_en[0x1]; + u8 esn_overlap[0x1]; + u8 reserved_at_44[0x2]; + u8 confidentiality_offset[0x2]; + u8 reserved_at_48[0x4]; + u8 aso_return_reg[0x4]; + u8 reserved_at_50[0x10]; + + u8 esn_msb[0x20]; + + u8 reserved_at_80[0x8]; + u8 dekn[0x18]; + + u8 reserved_at_a0[0x20]; + + u8 sci[0x40]; + + u8 reserved_at_100[0x8]; + u8 macsec_aso_access_pd[0x18]; + + u8 reserved_at_120[0x60]; + + u8 salt[3][0x20]; + + u8 reserved_at_1e0[0x20]; + + struct mlx5_ifc_macsec_aso_bits macsec_aso; +}; + +struct mlx5_ifc_create_macsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + struct mlx5_ifc_encryption_key_obj_bits { u8 modify_field_select[0x40]; @@ -11642,6 +11736,7 @@ enum { enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_TLS = 0x1, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_IPSEC = 0x2, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_MACSEC = 0x4, }; struct mlx5_ifc_tls_static_params_bits { -- cgit v1.2.3 From ee534d7f81ba9cec028580f91429b3dc29b90c7f Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:20 -0700 Subject: net/mlx5: Add MACsec Tx tables support to fs_core Changed EGRESS_KERNEL namespace to EGRESS_IPSEC and add new namespace for MACsec TX. This namespace should be the last namespace for transmitted packets. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e62d50acb6bd..53d186774206 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -92,7 +92,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, - MLX5_FLOW_NAMESPACE_EGRESS_KERNEL, + MLX5_FLOW_NAMESPACE_EGRESS_IPSEC, + MLX5_FLOW_NAMESPACE_EGRESS_MACSEC, MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, -- cgit v1.2.3 From e467b283ffd50cf15b84c73eef68787e257eaed5 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:21 -0700 Subject: net/mlx5e: Add MACsec TX steering rules Tx flow steering consists of two flow tables (FTs). The first FT (crypto table) has two fixed rules: One default miss rule so non MACsec offloaded packets bypass the MACSec tables, another rule to make sure that MACsec key exchange (MKE) traffic passes unencrypted as expected (matched of ethertype). On each new MACsec offload flow, a new MACsec rule is added. 
This rule is matched on metadata_reg_a (which contains the id of the flow) and invokes the MACsec offload action on match. The second FT (check table) has two fixed rules: One rule for verifying that the previous offload actions were finished successfully and packet need to be transmitted. Another default rule for dropping packets that were failed in the offload actions. The MACsec FTs should be created on demand when the first MACsec rule is added and destroyed when the last MACsec rule is deleted. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 8bda3ba5b109..be640c749d5e 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -252,6 +252,7 @@ enum { enum { MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), + MLX5_ETH_WQE_FT_META_MACSEC = BIT(1), }; struct mlx5_wqe_eth_seg { -- cgit v1.2.3 From 15d187e285b3d5f4fe0328c09566355c1e387ff6 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:24 -0700 Subject: net/mlx5: Add MACsec Rx tables support to fs_core Add new namespace for MACsec RX flows. Encrypted MACsec packets should be first decrypted and stripped from MACsec header and then continues with the kernel's steering pipeline. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 53d186774206..c7a91981cd5a 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -79,6 +79,7 @@ static inline void build_leftovers_ft_param(int *priority, enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, + MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC, MLX5_FLOW_NAMESPACE_LAG, MLX5_FLOW_NAMESPACE_OFFLOADS, MLX5_FLOW_NAMESPACE_ETHTOOL, -- cgit v1.2.3 From aaac38f614871df252aa7459647bf68d42f7c3e7 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 25 Aug 2022 06:39:36 +0000 Subject: iommu/amd: Initial support for AMD IOMMU v2 page table Introduce IO page table framework support for AMD IOMMU v2 page table. This patch implements 4 level page table within iommu amd driver and supports 4K/2M/1G page sizes. 
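A sketch of how a caller selects the new format through the io-pgtable framework (cfg setup abbreviated, illustrative only):

	struct io_pgtable_ops *ops;

	/* sketch: pick the v2 format; cfg carries ias/oas/pgsize_bitmap etc. */
	ops = alloc_io_pgtable_ops(AMD_IOMMU_V2, &pgtbl_cfg, domain);
	if (!ops)
		return -ENOMEM;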
Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20220825063939.8360-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ca98aeadcc80..ffe616db9409 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -16,6 +16,7 @@ enum io_pgtable_fmt { ARM_V7S, ARM_MALI_LPAE, AMD_IOMMU_V1, + AMD_IOMMU_V2, APPLE_DART, IO_PGTABLE_NUM_FMTS, }; @@ -260,6 +261,7 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns; extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns; +extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns; extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns; #endif /* __IO_PGTABLE_H */ -- cgit v1.2.3 From 5e0531f6b90ac096fedaf5bd0eae0bb4e5a39da5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 16:11:25 +0200 Subject: spi: Add capability to perform some transfer with chipselect off Some components require a few clock cycles with chipselect off before and/or after the data transfer done with CS on. Typically, the IDT 801034 QUAD PCM CODEC datasheet states "Note *: CCLK should have one cycle before CS goes low, and two cycles after CS goes high". The cycles "before" are implicitly provided by all previous activity on the SPI bus. But the cycles "after" must be provided in order to terminate the SPI transfer. In order to use that kind of component, add a cs_off flag to spi_transfer struct. When this flag is set, the transfer is performed with chipselect off. This allows a consumer to add a dummy transfer at the end of the transfer list which is performed with chipselect OFF, providing the required additional clock cycles. Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/434165c46f06d802690208a11e7ea2500e8da4c7.1662558898.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e6c73d5ff1a8..6e6c62c59957 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -847,6 +847,7 @@ struct spi_res { * for this transfer. If 0 the default (from @spi_device) is used. * @dummy_data: indicates transfer is dummy bytes transfer. * @cs_change: affects chipselect after this transfer completes + * @cs_off: performs the transfer with chipselect off. * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message * @delay: delay to be introduced after this transfer before @@ -959,6 +960,7 @@ struct spi_transfer { unsigned cs_change:1; unsigned tx_nbits:3; unsigned rx_nbits:3; + unsigned cs_off:1; #define SPI_NBITS_SINGLE 0x01 /* 1bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2bits transfer */ #define SPI_NBITS_QUAD 0x04 /* 4bits transfer */ -- cgit v1.2.3 From 787f51f210ebe5d7d5e4c8101b148f0018e9c409 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 1 Sep 2022 10:16:49 +0200 Subject: USB/ARM: Switch S3C2410 UDC to GPIO descriptors This converts the S3C2410 UDC USB device controller to use GPIO descriptor tables and modern GPIO.
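The board-file half of such a conversion typically supplies a lookup table like this sketch (the chip label, pin number and dev_id are illustrative of the mechanism, not taken from this patch):

	/* sketch: machine code hands the UDC its GPIOs by name, not number */
	static struct gpiod_lookup_table udc_gpio_table = {
		.dev_id = "s3c2410-usbgadget",
		.table = {
			GPIO_LOOKUP("GPIOC", 5, "pullup", GPIO_ACTIVE_HIGH),
			{ }
		},
	};

	gpiod_add_lookup_table(&udc_gpio_table);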
Cc: Krzysztof Kozlowski Cc: Alim Akhtar Cc: linux-arm-kernel@lists.infradead.org Cc: linux-samsung-soc@vger.kernel.org Acked-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220901081649.564348-1-linus.walleij@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/usb-s3c2410_udc.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-s3c2410_udc.h b/include/linux/platform_data/usb-s3c2410_udc.h index 07394819d03b..c0fbe1fe3426 100644 --- a/include/linux/platform_data/usb-s3c2410_udc.h +++ b/include/linux/platform_data/usb-s3c2410_udc.h @@ -22,12 +22,6 @@ enum s3c2410_udc_cmd_e { struct s3c2410_udc_mach_info { void (*udc_command)(enum s3c2410_udc_cmd_e); void (*vbus_draw)(unsigned int ma); - - unsigned int pullup_pin; - unsigned int pullup_pin_inverted; - - unsigned int vbus_pin; - unsigned char vbus_pin_inverted; }; extern void __init s3c24xx_udc_set_platdata(struct s3c2410_udc_mach_info *); -- cgit v1.2.3 From e77cab77f2cb3a1ca2ba8df4af45bb35617ac16d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 1 Sep 2022 17:39:32 +0300 Subject: serial: Create uart_xmit_advance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A very common pattern in the drivers is to advance xmit tail index and do bookkeeping of Tx'ed characters. Create uart_xmit_advance() to handle it. Reviewed-by: Andy Shevchenko Cc: stable Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220901143934.8850-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 6e4f4765d209..1eaea9fe44d8 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -624,6 +624,23 @@ struct uart_state { /* number of characters left in xmit buffer before we ask for more */ #define WAKEUP_CHARS 256 +/** + * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars + * @up: uart_port structure describing the port + * @chars: number of characters sent + * + * This function advances the tail of circular xmit buffer by the number of + * @chars transmitted and handles accounting of transmitted bytes (into + * @up's icount.tx). + */ +static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars) +{ + struct circ_buf *xmit = &up->state->xmit; + + xmit->tail = (xmit->tail + chars) & (UART_XMIT_SIZE - 1); + up->icount.tx += chars; +} + struct module; struct tty_driver; -- cgit v1.2.3 From 85d6b66d31c35158364058ee98fb69ab5bb6a6b1 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:39 -0600 Subject: dyndbg: fix module.dyndbg handling For CONFIG_DYNAMIC_DEBUG=N, the ddebug_dyndbg_module_param_cb() stub-fn is too permissive: bash-5.1# modprobe drm JUNKdyndbg bash-5.1# modprobe drm dyndbgJUNK [ 42.933220] dyndbg param is supported only in CONFIG_DYNAMIC_DEBUG builds [ 42.937484] ACPI: bus type drm_connector registered This caused no ill effects, because unknown parameters are either ignored by default with an "unknown parameter" warning, or ignored because dyndbg allows its no-effect use on non-dyndbg builds. But since the code has an explicit feedback message, it should be issued accurately. Fix with strcmp for exact param-name match. 
Fixes: b48420c1d301 ("dynamic_debug: make dynamic-debug work for module initialization") Reported-by: Rasmus Villemoes Acked-by: Jason Baron Acked-by: Daniel Vetter Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-3-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index dce631e678dd..f30b01aa9fa4 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -201,7 +201,7 @@ static inline int ddebug_remove_module(const char *mod) static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *modname) { - if (strstr(param, "dyndbg")) { + if (!strcmp(param, "dyndbg")) { /* avoid pr_warn(), which wants pr_fmt() fully defined */ printk(KERN_WARNING "dyndbg param is supported only in " "CONFIG_DYNAMIC_DEBUG builds\n"); -- cgit v1.2.3 From e26ef3af964acfea311403126acee8c56c89e26b Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:46 -0600 Subject: dyndbg: drop EXPORTed dynamic_debug_exec_queries This exported fn is unused, and will not be needed. Let's dump it. The export was added to let drm control pr_debugs, as part of using them to avoid drm_debug_enabled overheads. But it's better to just implement the drm.debug bitmap interface; then it's available for everyone. Fixes: a2d375eda771 ("dyndbg: refine export, rename to dynamic_debug_exec_queries()") Fixes: 4c0d77828d4f ("dyndbg: export ddebug_exec_queries") Acked-by: Jason Baron Acked-by: Daniel Vetter Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-10-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index f30b01aa9fa4..8d9eec5f6d8b 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -55,9 +55,6 @@ struct _ddebug { #if defined(CONFIG_DYNAMIC_DEBUG_CORE) -/* exported for module authors to exercise >control */ -int dynamic_debug_exec_queries(const char *query, const char *modname); - int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); @@ -221,12 +218,6 @@ static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, rowsize, groupsize, buf, len, ascii); \ } while (0) -static inline int dynamic_debug_exec_queries(const char *query, const char *modname) -{ - pr_warn("kernel not built with CONFIG_DYNAMIC_DEBUG_CORE\n"); - return 0; -} - #endif /* !CONFIG_DYNAMIC_DEBUG_CORE */ #endif -- cgit v1.2.3 From b7b4eebdba7b6aea6b34dc29691b71c39d1dbd6a Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:48 -0600 Subject: dyndbg: gather __dyndbg[] state into struct _ddebug_info This new struct composes the linker provided (vector,len) section, and provides a place to add other __dyndbg[] state-data later: descs - the vector of descriptors in __dyndbg section. num_descs - length of the data/section. Use it, in several different ways, as follows: In lib/dynamic_debug.c: ddebug_add_module(): Alter params-list, replacing 2 args (array,index) with a struct _ddebug_info * containing them both, with room for expansion.
This helps future-proof the function prototype against the looming addition of class-map info into the dyndbg-state, by providing a place to add more member fields later. NB: later add static struct _ddebug_info builtins_state declaration, not needed yet. ddebug_add_module() is called in 2 contexts: In dynamic_debug_init(), declare, init a struct _ddebug_info di auto-var to use as a cursor. Then iterate over the prdbg blocks of the builtin modules, and update the di cursor before calling _add_module for each. It's called from kernel/module/main.c:load_info() for each loaded module: In internal.h, alter struct load_info, replacing the dyndbg array,len fields with an embedded _ddebug_info containing them both; and populate its members in find_module_sections(). The 2 calling contexts differ in that _init deals with contiguous subranges of __dyndbgs[] section, packed together, while loadable modules are added one at a time. So rename ddebug_add_module() into outer/__inner fns, call __inner from _init, and provide the offset into the builtin __dyndbgs[] where the module's prdbgs reside. The cursor provides start, len of the subrange for each. The offset will be used later to pack the results of builtin __dyndbg_sites[] de-duplication, and is 0 and unneeded for loadable modules. Note: kernel/module/main.c includes the dynamic_debug header for struct _ddebug_info. This might be prone to include loops, since it's also included by printk.h. Nothing has broken in robot-land on this. cc: Luis Chamberlain Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-12-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 8d9eec5f6d8b..6a2001250da1 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -51,12 +51,16 @@ struct _ddebug { #endif } __attribute__((aligned(8))); - +/* encapsulate linker provided built-in (or module) dyndbg data */ +struct _ddebug_info { + struct _ddebug *descs; + unsigned int num_descs; +}; #if defined(CONFIG_DYNAMIC_DEBUG_CORE) -int ddebug_add_module(struct _ddebug *tab, unsigned int n, - const char *modname); +int ddebug_add_module(struct _ddebug_info *dyndbg, const char *modname); + extern int ddebug_remove_module(const char *mod_name); extern __printf(2, 3) void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); @@ -184,8 +188,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #include #include -static inline int ddebug_add_module(struct _ddebug *tab, unsigned int n, - const char *modname) +static inline int ddebug_add_module(struct _ddebug_info *dinfo, const char *modname) { return 0; } -- cgit v1.2.3 From ca90fca7f7b51830dfb95bf655210a1c84588f15 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:49 -0600 Subject: dyndbg: add class_id to pr_debug callsites DRM issues ~10 exclusive categories of debug messages; to represent this directly in dyndbg, add a new 6-bit field: struct _ddebug.class_id. This gives us 64 classes, which should be more than enough. #> echo 0x012345678 > /sys/module/drm/parameters/debug All existing callsites are initialized with _DPRINTK_CLASS_DFLT, which is 2^6-1. This reserves 0-62 for use in new categorized/class'd pr_debugs, which fits perfectly with natural enums (ints: 0..N). That's done by extending the init macro: DEFINE_DYNAMIC_DEBUG_METADATA() with _CLS(cls, ...)
suffix, and redefing old name using extended name. Then extend the factory macro callchain with _cls() versions to provide the callsite.class_id, with old-names passing _DPRINTK_CLASS_DFLT. This sets us up to create class'd prdebug callsites (class'd callsites are those with .class_id != _DPRINTK_CLASS_DFLT). No behavior change. cc: Rasmus Villemoes Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-13-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 71 +++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 6a2001250da1..633f4e463766 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -6,6 +6,8 @@ #include #endif +#include + /* * An instance of this structure is created in a special * ELF section at every dynamic debug callsite. At runtime, @@ -21,6 +23,9 @@ struct _ddebug { const char *filename; const char *format; unsigned int lineno:18; +#define CLS_BITS 6 + unsigned int class_id:CLS_BITS; +#define _DPRINTK_CLASS_DFLT ((1 << CLS_BITS) - 1) /* * The flags field controls the behaviour at the callsite. * The bits here are changed dynamically when the user @@ -88,7 +93,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); -#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ +#define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt) \ static struct _ddebug __aligned(8) \ __section("__dyndbg") name = { \ .modname = KBUILD_MODNAME, \ @@ -97,8 +102,14 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, .format = (fmt), \ .lineno = __LINE__, \ .flags = _DPRINTK_FLAGS_DEFAULT, \ + .class_id = cls, \ _DPRINTK_KEY_INIT \ - } + }; \ + BUILD_BUG_ON_MSG(cls > _DPRINTK_CLASS_DFLT, \ + "classid value overflow") + +#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, _DPRINTK_CLASS_DFLT, fmt) #ifdef CONFIG_JUMP_LABEL @@ -129,17 +140,34 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #endif /* CONFIG_JUMP_LABEL */ -#define __dynamic_func_call(id, fmt, func, ...) do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ - func(&id, ##__VA_ARGS__); \ -} while (0) - -#define __dynamic_func_call_no_desc(id, fmt, func, ...) do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \ +/* + * Factory macros: ($prefix)dynamic_func_call($suffix) + * + * Lower layer (with __ prefix) gets the callsite metadata, and wraps + * the func inside a debug-branch/static-key construct. Upper layer + * (with _ prefix) does the UNIQUE_ID once, so that lower can ref the + * name/label multiple times, and tie the elements together. + * Multiple flavors: + * (|_cls): adds in _DPRINT_CLASS_DFLT as needed + * (|_no_desc): former gets callsite descriptor as 1st arg (for prdbgs) + */ +#define __dynamic_func_call_cls(id, cls, fmt, func, ...) do { \ + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ if (DYNAMIC_DEBUG_BRANCH(id)) \ - func(__VA_ARGS__); \ + func(&id, ##__VA_ARGS__); \ } while (0) +#define __dynamic_func_call(id, fmt, func, ...) \ + __dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt, \ + func, ##__VA_ARGS__) + +#define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) 
do { \ + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ + if (DYNAMIC_DEBUG_BRANCH(id)) \ + func(__VA_ARGS__); \ +} while (0) +#define __dynamic_func_call_no_desc(id, fmt, func, ...) \ + __dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT, \ + fmt, func, ##__VA_ARGS__) /* * "Factory macro" for generating a call to func, guarded by a @@ -149,22 +177,33 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, * the varargs. Note that fmt is repeated in invocations of this * macro. */ +#define _dynamic_func_call_cls(cls, fmt, func, ...) \ + __dynamic_func_call_cls(__UNIQUE_ID(ddebug), cls, fmt, func, ##__VA_ARGS__) #define _dynamic_func_call(fmt, func, ...) \ - __dynamic_func_call(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__) + _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__) + /* * A variant that does the same, except that the descriptor is not * passed as the first argument to the function; it is only called * with precisely the macro's varargs. */ -#define _dynamic_func_call_no_desc(fmt, func, ...) \ - __dynamic_func_call_no_desc(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__) +#define _dynamic_func_call_cls_no_desc(cls, fmt, func, ...) \ + __dynamic_func_call_cls_no_desc(__UNIQUE_ID(ddebug), cls, fmt, \ + func, ##__VA_ARGS__) +#define _dynamic_func_call_no_desc(fmt, func, ...) \ + _dynamic_func_call_cls_no_desc(_DPRINTK_CLASS_DFLT, fmt, \ + func, ##__VA_ARGS__) + +#define dynamic_pr_debug_cls(cls, fmt, ...) \ + _dynamic_func_call_cls(cls, fmt, __dynamic_pr_debug, \ + pr_fmt(fmt), ##__VA_ARGS__) #define dynamic_pr_debug(fmt, ...) \ - _dynamic_func_call(fmt, __dynamic_pr_debug, \ + _dynamic_func_call(fmt, __dynamic_pr_debug, \ pr_fmt(fmt), ##__VA_ARGS__) #define dynamic_dev_dbg(dev, fmt, ...) \ - _dynamic_func_call(fmt,__dynamic_dev_dbg, \ + _dynamic_func_call(fmt, __dynamic_dev_dbg, \ dev, fmt, ##__VA_ARGS__) #define dynamic_netdev_dbg(dev, fmt, ...) \ -- cgit v1.2.3 From 3fc95d80a536e49e38ba4f79ca60cb4e64f99b3b Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:50 -0600 Subject: dyndbg: add __pr_debug_cls for testing For selftest purposes, add __pr_debug_cls(class, fmt, ...). I didn't think we'd need to define this, since DRM effectively has it already in drm_dbg, drm_devdbg. But test_dynamic_debug needs it in order to demonstrate all the moving parts. Note the __ prefix; it's not intended for general use, at least until a need emerges. ISTM the drm.debug model (macro wrappers inserting enum const 1st arg) is the baseline approach. That said, nouveau might want it for easy use in its debug macros. TBD. NB: it does require a builtin-constant class, __pr_debug_cls(i++, ...) is disallowed by the compiler. Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-14-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 633f4e463766..3c9690da44d9 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -221,6 +221,13 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, KERN_DEBUG, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) +/* for test only, generally expect drm.debug style macro wrappers */ +#define __pr_debug_cls(cls, fmt, ...)
do { \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(cls), \ + "expecting constant class int/enum"); \ + dynamic_pr_debug_cls(cls, fmt, ##__VA_ARGS__); \ + } while (0) + #else /* !CONFIG_DYNAMIC_DEBUG_CORE */ #include -- cgit v1.2.3 From aad0214f30264c19044a77fc4776def349b76fc4 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:51 -0600 Subject: dyndbg: add DECLARE_DYNDBG_CLASSMAP macro Using DECLARE_DYNDBG_CLASSMAP, modules can declare up to 31 classnames. By doing so, they authorize dyndbg to manipulate class'd prdbgs (ie: __pr_debug_cls, and soon drm_*dbg), ala:: :#> echo class DRM_UT_KMS +p > /proc/dynamic_debug/control The macro declares and initializes a static struct ddebug_class_map:: - maps approved class-names to class_ids used in the module, by array order. for example: DRM_UT_* - class-name vals allow validation of "class FOO" queries; using the macro is opt-in - enum class_map_type - determines interface, behavior Each module has its own class-type and class_id space, and only known class-names will be authorized for a manipulation. Only DRM modules should know and respond to this: :#> echo class DRM_UT_CORE +p > control # across all modules pr_debugs (with default class_id) are still controllable as before. DECLARE_DYNDBG_CLASSMAP(_var, _maptype, _base, classes...) is:: _var: name of the static struct var. user passes to module_param_cb() if they want a sysfs node. _maptype: this is hard-coded to DD_CLASS_TYPE_DISJOINT_BITS for now. _base: usually 0, it allows splitting 31 classes into subranges, so that multiple classes / sysfs-nodes can share the module's class-id space. classes: list of class_name strings, these are mapped to class-ids starting at _base. This class-names list must have a corresponding ENUM, with SYMBOLS that match the literals, and 1st enum val = _base. enum class_map_type has 4 values, on 2 factors:: - classes are disjoint/independent vs relative/level'd (x<y) - classes take numeric input vs [+-]classname lists Two caveats around the _NAMES handling: 1. The control interface doesn't enforce the LEVELS relationship, so you could confusingly have V3 enabled, but V1 disabled. OTOH, the control iface already allows infinite tweaking of the underlying callsites; sysfs node readback can only tell the user what they previously wrote. 2. All dyndbg >control reduces to a query/command, includes +/-, which is at-root a kernel patching operation with +/- semantics. And the _NAMES handling exposes it to the user, making it API-adjacent. And it's not just >control where +/- gets used (which is settled), the new place is with sysfs-nodes exposing _*_NAMES classes, and here it's subtly different. _DISJOINT_NAMES: is simple, independent _LEVEL_NAMES: masks-on bits 0 .. N-1, N..max off # turn on L3,L2,L1 others off echo +L3 > /sys/module/test_dynamic_debug/parameters/p_level_names # turn on L2,L1 others off echo -L3 > /sys/module/test_dynamic_debug/parameters/p_level_names IOW, the - changes the threshold-on bitpos by 1. Alternatively, we could treat the +/- as half-duplex, where -L3 turns off L>2 (and ignores L1), and +L2 would turn on L<=2 (and ignore others).
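To ground the declaration rules above, a minimal sketch of the enum/classmap pairing; the names are illustrative (modeled on the DRM usage this serves) and not taken from this patch:

  /* 1st enum val must equal _base; symbols must match the string literals */
  enum drm_debug_cat { DRM_UT_CORE, DRM_UT_DRIVER, DRM_UT_KMS };

  DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0,
                          "DRM_UT_CORE", "DRM_UT_DRIVER", "DRM_UT_KMS");

Per the _var description above, the declaring module would hand drm_debug_classes to module_param_cb() if it wants a sysfs node.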
Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-15-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 55 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 3c9690da44d9..98dbf1d49984 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -56,6 +56,61 @@ struct _ddebug { #endif } __attribute__((aligned(8))); +enum class_map_type { + DD_CLASS_TYPE_DISJOINT_BITS, + /** + * DD_CLASS_TYPE_DISJOINT_BITS: classes are independent, one per bit. + * expecting hex input. Built for drm.debug, basis for other types. + */ + DD_CLASS_TYPE_LEVEL_NUM, + /** + * DD_CLASS_TYPE_LEVEL_NUM: input is numeric level, 0-N. + * N turns on just bits N-1 .. 0, so N=0 turns all bits off. + */ + DD_CLASS_TYPE_DISJOINT_NAMES, + /** + * DD_CLASS_TYPE_DISJOINT_NAMES: input is a CSV of [+-]CLASS_NAMES, + * classes are independent, like _DISJOINT_BITS. + */ + DD_CLASS_TYPE_LEVEL_NAMES, + /** + * DD_CLASS_TYPE_LEVEL_NAMES: input is a CSV of [+-]CLASS_NAMES, + * intended for names like: INFO,DEBUG,TRACE, with a module prefix + * avoid EMERG,ALERT,CRIT,ERR,WARNING: they're not debug + */ +}; + +struct ddebug_class_map { + struct list_head link; + struct module *mod; + const char *mod_name; /* needed for builtins */ + const char **class_names; + const int length; + const int base; /* index of 1st .class_id, allows split/shared space */ + enum class_map_type map_type; +}; + +/** + * DECLARE_DYNDBG_CLASSMAP - declare classnames known by a module + * @_var: a struct ddebug_class_map, passed to module_param_cb + * @_type: enum class_map_type, chooses bits/verbose, numeric/symbolic + * @_base: offset of 1st class-name. splits .class_id space + * @classes: class-names used to control class'd prdbgs + */ +#define DECLARE_DYNDBG_CLASSMAP(_var, _maptype, _base, ...) \ + static const char *_var##_classnames[] = { __VA_ARGS__ }; \ + static struct ddebug_class_map __aligned(8) __used \ + __section("__dyndbg_classes") _var = { \ + .mod = THIS_MODULE, \ + .mod_name = KBUILD_MODNAME, \ + .base = _base, \ + .map_type = _maptype, \ + .length = NUM_TYPE_ARGS(char*, __VA_ARGS__), \ + .class_names = _var##_classnames, \ + } +#define NUM_TYPE_ARGS(eltype, ...) \ + (sizeof((eltype[]){__VA_ARGS__}) / sizeof(eltype)) + /* encapsulate linker provided built-in (or module) dyndbg data */ struct _ddebug_info { struct _ddebug *descs; -- cgit v1.2.3 From 66f4006b6ace1a1a1a1dca4225972f79a298e251 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:52 -0600 Subject: kernel/module: add __dyndbg_classes section Add __dyndbg_classes section, using __dyndbg as a model. Use it: vmlinux.lds.h: KEEP the new section, which also silences orphan section warning on loadable modules. Add (__start_/__stop_)__dyndbg_classes linker symbols for the c externs (below). kernel/module/main.c: - fill new fields in find_module_sections(), using section_objs() - extend callchain prototypes to pass classes, length load_module(): pass new info to dynamic_debug_setup() dynamic_debug_setup(): new params, pass through to ddebug_add_module() dynamic_debug.c: - add externs to the linker symbols. ddebug_add_module(): - It currently builds a debug_table, and *will* find and attach classes. dynamic_debug_init(): - add class fields to the _ddebug_info cursor var: di. 
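For reference, a sketch of the consuming side implied above; the extern names follow mechanically from the (__start_/__stop_)__dyndbg_classes linker symbols just described:

  /* dynamic_debug.c: bounds of the built-in classmaps, provided by the linker */
  extern struct ddebug_class_map __start___dyndbg_classes[];
  extern struct ddebug_class_map __stop___dyndbg_classes[];

dynamic_debug_init() can then derive num_classes from the difference of the two, using __dyndbg as the model.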
Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-16-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 98dbf1d49984..9073a43a2039 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -114,7 +114,9 @@ struct ddebug_class_map { /* encapsulate linker provided built-in (or module) dyndbg data */ struct _ddebug_info { struct _ddebug *descs; + struct ddebug_class_map *classes; unsigned int num_descs; + unsigned int num_classes; }; #if defined(CONFIG_DYNAMIC_DEBUG_CORE) -- cgit v1.2.3 From b9400852c0801aebee4fc8b62e6b7cc69c7fcbda Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:57 -0600 Subject: dyndbg: add drm.debug style (drm/parameters/debug) bitmap support Add kernel_param_ops and callbacks to use a class-map to validate and apply input to a sysfs-node, which allows users to control classes defined in that class-map. This supports uses like: echo 0x3 > /sys/module/drm/parameters/debug IE add these: - int param_set_dyndbg_classes() - int param_get_dyndbg_classes() - struct kernel_param_ops param_ops_dyndbg_classes Following the model of kernel/params.c STANDARD_PARAM_DEFS, these are non-static and exported. This might be unnecessary here. get/set use an augmented kernel_param; the arg refs a new struct ddebug_class_param, which contains: - A ptr to user's state-store; a union of &ulong for drm.debug, &int for nouveau level debug. By ref'g the client's bit-state _var, code coordinates with existing code (like drm_debug_enabled) which uses it, so existing/remaining calls can work unchanged. Changing drm.debug to a ulong allows use of BIT() etc. - FLAGS: dyndbg.flags toggled by changes to bitmap. Usually just "p". - MAP: a pointer to struct ddebug_class_map, which maps those class-names to .class_ids 0..N that the module is using. This class-map is declared & initialized by DECLARE_DYNDBG_CLASSMAP. - map-type: 4 enums DD_CLASS_TYPE_* select 2 input forms and 2 meanings. numeric input: DD_CLASS_TYPE_DISJOINT_BITS integer input, independent bits. ie: drm.debug DD_CLASS_TYPE_LEVEL_NUM integer input, 0..N levels classnames-list (comma separated) input: DD_CLASS_TYPE_DISJOINT_NAMES each name affects a bit, others preserved DD_CLASS_TYPE_LEVEL_NAMES names have level meanings, like kern_levels.h _NAMES - comma-separated classnames (with optional +-) _NUM - numeric input, 0-N expected _BITS - numeric input, 0x1F bitmap form expected _DISJOINT - bits are independent _LEVEL - (x<y) relative levels A typical _DISJOINT_NAMES use: echo +DRM_UT_CORE,-DRM_UT_KMS > /sys/module/drm/parameters/debug_catnames A naive _LEVEL_NAMES use, with one class, that sets all in the class-map according to (x<y): echo +L1 > /sys/module/test_dynamic_debug/parameters/p_level_names : problem solved echo -L1 > /sys/module/test_dynamic_debug/parameters/p_level_names Note this artifact: : this is same as prev cmd (due to +/-) echo L0 > /sys/module/test_dynamic_debug/parameters/p_level_names : this is "even-more" off, but same w/o __pr_debug_cls(L0, ".."). echo -L0 > /sys/module/test_dynamic_debug/parameters/p_level_names A stress-test/make-work usage (kid toggling a light switch): echo +L7,L0,L7,L0,L7,L0,L7,L0,L7,L0,L7,L0,L7 \ > /sys/module/test_dynamic_debug/parameters/p_level_names ddebug_apply_class_bitmap(): inside-fn, works on bitmaps, receives new-bits, finds diffs vs client-bitvector holding "current" state, and issues exec_query to commit the adjustment.
param_set_dyndbg_classes(): interface fn, sends _NAMES to param_set_dyndbg_classnames() and returns, falls thru to handle _BITS, _NUM internally, and calls ddebug_apply_class_bitmap(). Finishes by updating state. param_set_dyndbg_classnames(): handles classnames-list in loop, calls ddebug_apply_class_bitmap for each, then updates state. NOTES: _LEVEL_ is overlay on _DISJOINT_; inputs are converted to a bitmask, by the callbacks. IOW this is possible, and possibly confusing: echo class V3 +p > control echo class V1 -p > control IMO that's ok, relative verbosity is an interface property. _LEVEL_NUM maps still need class-names, even though the names are not usable at the sysfs interface (unlike with _NAMES style). The names are the only way to >control the classes. - It must have a "V0" name, something below "V1" to turn "V1" off. __pr_debug_cls(V0,..) is printk, don't do that. - "class names" is required at the >control interface. - relative levels are not enforced at >control _LEVEL_NAMES bear +/- signs, which alter the on-bit-pos by 1. IOW, +L2 means L0,L1,L2, and -L2 means just L0,L1. This kinda spoils the readback fidelity, since the L0 bit gets turned on by any use of any L*, except "-L0". All the interface uncertainty here pertains to the _NAMES features. Nobody has actually asked for this, so it's practical (if a little tedious) to split it out. Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-21-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 9073a43a2039..41682278d2e8 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -119,6 +119,15 @@ struct _ddebug_info { unsigned int num_classes; }; +struct ddebug_class_param { + union { + unsigned long *bits; + unsigned int *lvl; + }; + char flags[8]; + const struct ddebug_class_map *map; +}; + #if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug_info *dyndbg, const char *modname); @@ -278,6 +287,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, KERN_DEBUG, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) +struct kernel_param; +int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp); +int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp); + /* for test only, generally expect drm.debug style macro wrappers */ #define __pr_debug_cls(cls, fmt, ...)
do { \ BUILD_BUG_ON_MSG(!__builtin_constant_p(cls), \ @@ -324,6 +337,14 @@ static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, rowsize, groupsize, buf, len, ascii); \ } while (0) +struct kernel_param; +static inline int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp) +{ return 0; } +static inline int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp) +{ return 0; } + #endif /* !CONFIG_DYNAMIC_DEBUG_CORE */ +extern const struct kernel_param_ops param_ops_dyndbg_classes; + #endif -- cgit v1.2.3 From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:12:58 +0200 Subject: bpf: split btf_check_subprog_arg_match in two btf_check_subprog_arg_match() was used twice in verifier.c: - when checking for the type mismatches between a (sub)prog declaration and BTF - when checking the call of a subprog to see if the provided arguments are correct and valid This is problematic when we check if the first argument of a program (pointer to ctx) is correctly accessed: To be able to ensure we access valid memory in the ctx, the verifier assumes the pointer to context is not null. This has the side effect of marking the program as accessing the entire context, even if the context is never dereferenced. For example, by checking the context access with the current code, the following eBPF program would fail with -EINVAL if the ctx is set to null from userspace: ``` SEC("syscall") int prog(struct my_ctx *args) { return 0; } ``` In that particular case, we do not want to actually check that the memory is correct while checking for the BTF validity, but we just want to ensure that the (sub)prog definition matches the BTF we have. So split btf_check_subprog_arg_match() in two so we can actually check for the memory used when in a call, and ignore that part when not. Note that a further patch is in preparation to disentangle btf_check_func_arg_match() from these two purposes, and so right now we just add a new hack around that by adding a boolean to this function. Signed-off-by: Benjamin Tissoires Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d32f125f4af..3cf161cfd396 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, -- cgit v1.2.3 From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:02 +0200 Subject: bpf/verifier: allow kfunc to return an allocated mem For drivers (outside of network), the incoming data is not statically defined in a struct. Most of the time the data buffer is kzalloc-ed and thus we can not rely on eBPF and BTF to explore the data. This commit allows returning arbitrary memory, previously allocated by the driver.
An interesting extra point is that the kfunc can mark the exported memory region as read only or read/write. So, when a kfunc is not returning a pointer to a struct but to a plain type, we can consider it valid allocated memory assuming that: - one of the arguments is either called rdonly_buf_size or rdwr_buf_size - and this argument is a const from the caller's point of view We can then use this parameter as the size of the allocated memory. The memory is either read-only or read-write based on the name of the size parameter. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++++- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf161cfd396..79883f883ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +struct bpf_kfunc_arg_meta { + u64 r0_size; + bool r0_rdonly; + int ref_obj_id; + u32 flags; +}; + struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); @@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags); + struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fdddbf3546b..8fbc1d05281e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +int mark_chain_precision(struct bpf_verifier_env *env, int regno); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ diff --git a/include/linux/btf.h b/include/linux/btf.h index ad93c2d9cc1c..1fcc833a8690 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt } #endif +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} + #endif -- cgit v1.2.3 From 448325199f574d33824dbf9121efb03558412966 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:14 +0200 Subject: bpf: Add copy_map_value_long to copy to remote percpu memory bpf_long_memcpy is used while copying to remote percpu regions from BPF syscall and helpers, so that the copy is atomic at word size granularity. This might not be possible when you copy from map value hosting kptrs from or to percpu maps, as the alignment or size in disjoint regions may not be a multiple of word size.
Hence, to avoid complicating the copy loop, we only use bpf_long_memcpy when special fields are not present, otherwise use normal memcpy to copy the disjoint regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 52 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79883f883ff3..6a73e94821c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,14 +280,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) } } -/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ -static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + +/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ +static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) { u32 curr_off = 0; int i; if (likely(!map->off_arr)) { - memcpy(dst, src, map->value_size); + if (long_memcpy) + bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + else + memcpy(dst, src, map->value_size); return; } @@ -299,6 +318,17 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } + +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, false); +} + +static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, true); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); @@ -1827,22 +1857,6 @@ int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); -/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and - * forced to use 'long' read/writes to try to atomically copy long counters. - * Best-effort only. No barriers here, since it _will_ race with concurrent - * updates from BPF programs. Called from bpf syscall and mostly used with - * size 8 or 16 bytes, so ask compiler to inline it. 
- */ -static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) -{ - const long *lsrc = src; - long *ldst = dst; - - size /= sizeof(long); - while (size--) - *ldst++ = *lsrc++; -} - /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); -- cgit v1.2.3 From cc48755808c646666436745b35629c3f0d05e165 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:16 +0200 Subject: bpf: Add zero_map_value to zero map value with special fields We need this helper to skip over special fields (bpf_spin_lock, bpf_timer, kptrs) while zeroing a map value. Use the same logic as copy_map_value but memset instead of memcpy. Currently, the code zeroing map value memory does not have to deal with special fields, hence this is a prerequisite for introducing such support. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6a73e94821c4..48ae05099f36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src __copy_map_value(map, dst, src, true); } +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + u32 curr_off = 0; + int i; + + if (likely(!map->off_arr)) { + memset(dst, 0, map->value_size); + return; + } + + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memset(dst + curr_off, 0, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; + } + memset(dst + curr_off, 0, map->value_size - curr_off); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); -- cgit v1.2.3 From 5950e5d574c636a07dd21a872c2f8b41f6d20c55 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:17 +0200 Subject: freezer: Have {,un}lock_system_sleep() save/restore flags Rafael explained that the reason for having both PF_NOFREEZE and PF_FREEZER_SKIP is that {,un}lock_system_sleep() is callable from kthread context that has previously called set_freezable(). In preparation for merging the flags, have {,un}lock_system_sleep() save and restore current->flags. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J.
Wysocki Link: https://lore.kernel.org/r/20220822114648.725003428@infradead.org --- include/linux/suspend.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 70f2921e2e70..34fb72c0db85 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -510,8 +510,8 @@ extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); extern void pm_print_active_wakeup_sources(void); -extern void lock_system_sleep(void); -extern void unlock_system_sleep(void); +extern unsigned int lock_system_sleep(void); +extern void unlock_system_sleep(unsigned int); #else /* !CONFIG_PM_SLEEP */ @@ -534,8 +534,8 @@ static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} -static inline void lock_system_sleep(void) {} -static inline void unlock_system_sleep(void) {} +static inline unsigned int lock_system_sleep(void) { return 0; } +static inline void unlock_system_sleep(unsigned int flags) {} #endif /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From 1fbcaa923ce2d7e6de17abd74fa076dc1e0be1a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:18 +0200 Subject: freezer,umh: Clean up freezer/initrd interaction handle_initrd() marks itself as PF_FREEZER_SKIP in order to ensure that the UMH, which is going to freeze the system, doesn't indefinitely wait for its caller. Rework things by adding UMH_FREEZABLE to indicate the completion is freezable. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220822114648.791019324@infradead.org --- include/linux/umh.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/umh.h b/include/linux/umh.h index 244aff638220..5d1f6129b847 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -11,10 +11,11 @@ struct cred; struct file; -#define UMH_NO_WAIT 0 /* don't wait at all */ -#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ -#define UMH_WAIT_PROC 2 /* wait for the process to complete */ -#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ +#define UMH_NO_WAIT 0x00 /* don't wait at all */ +#define UMH_WAIT_EXEC 0x01 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 0x02 /* wait for the process to complete */ +#define UMH_KILLABLE 0x04 /* wait for EXEC/PROC killable */ +#define UMH_FREEZABLE 0x08 /* wait for EXEC/PROC freezable */ struct subprocess_info { struct work_struct work; -- cgit v1.2.3 From f9fc8cad9728124cefe8844fb53d1814c92c6bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Sep 2022 12:39:55 +0200 Subject: sched: Add TASK_ANY for wait_task_inactive() Now that wait_task_inactive()'s @match_state argument is a mask (like ttwu()) it is possible to replace the special !match_state case with an 'all-states' value such that any blocked state will match.
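For illustration, a call site that previously passed 0 for the "don't care" case could now be written as (a sketch, not taken from this patch):

  /* wait until @p is fully off its CPU, in whatever blocked state */
  wait_task_inactive(p, TASK_ANY);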
Suggested-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YxhkzfuFTvRnpUaH@hirez.programming.kicks-ass.net --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..0facaadbae71 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -101,6 +101,8 @@ struct task_group; #define TASK_RTLOCK_WAIT 0x1000 #define TASK_STATE_MAX 0x2000 +#define TASK_ANY (TASK_STATE_MAX-1) + /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -- cgit v1.2.3 From 929659acea03db6411a32de9037abab9f856f586 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:20 +0200 Subject: sched/completion: Add wait_for_completion_state() Allows waiting with a custom @state. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lore.kernel.org/r/20220822114648.922711674@infradead.org --- include/linux/completion.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index 51d9ab079629..62b32b19e0a8 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -103,6 +103,7 @@ extern void wait_for_completion(struct completion *); extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); +extern int wait_for_completion_state(struct completion *x, unsigned int state); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); extern unsigned long wait_for_completion_io_timeout(struct completion *x, -- cgit v1.2.3 From 3f884a10ab41efe2d495bf8ec8d443d70c500a60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:21 +0200 Subject: sched/wait: Add wait_event_state() Allows waiting with a custom @state. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lore.kernel.org/r/20220822114648.989212021@infradead.org --- include/linux/wait.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 58cfbf81447c..b926eb9f3e26 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -932,6 +932,34 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_state(wq, condition, state) \ + ___wait_event(wq, condition, state, 0, 0, schedule()) + +/** + * wait_event_state - sleep until a condition gets true + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @state: state to sleep in + * + * The process is put to sleep (@state) until the @condition evaluates to true + * or a signal is received (when allowed by @state). The @condition is checked + * each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function will return -ERESTARTSYS if it was interrupted by a signal + * (when allowed by @state) and 0 if @condition evaluated to true.
+ */ +#define wait_event_state(wq_head, condition, state) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_state(wq_head, condition, state); \ + __ret; \ +}) + #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ -- cgit v1.2.3 From 9963e444f71e671bcbc30d61cf23a2c686ac7d05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Sep 2022 13:11:38 +0200 Subject: sched: Widen TASK_state literals In preparation for adding more states, add a few 0s to the literals as we've just about run out. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0facaadbae71..9fc23a2d9813 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -81,25 +81,25 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0x0000 -#define TASK_INTERRUPTIBLE 0x0001 -#define TASK_UNINTERRUPTIBLE 0x0002 -#define __TASK_STOPPED 0x0004 -#define __TASK_TRACED 0x0008 +#define TASK_RUNNING 0x00000000 +#define TASK_INTERRUPTIBLE 0x00000001 +#define TASK_UNINTERRUPTIBLE 0x00000002 +#define __TASK_STOPPED 0x00000004 +#define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 0x0010 -#define EXIT_ZOMBIE 0x0020 +#define EXIT_DEAD 0x00000010 +#define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_PARKED 0x0040 -#define TASK_DEAD 0x0080 -#define TASK_WAKEKILL 0x0100 -#define TASK_WAKING 0x0200 -#define TASK_NOLOAD 0x0400 -#define TASK_NEW 0x0800 +#define TASK_PARKED 0x00000040 +#define TASK_DEAD 0x00000080 +#define TASK_WAKEKILL 0x00000100 +#define TASK_WAKING 0x00000200 +#define TASK_NOLOAD 0x00000400 +#define TASK_NEW 0x00000800 /* RT specific auxilliary flag to mark RT lock waiters */ -#define TASK_RTLOCK_WAIT 0x1000 -#define TASK_STATE_MAX 0x2000 +#define TASK_RTLOCK_WAIT 0x00001000 +#define TASK_STATE_MAX 0x00002000 #define TASK_ANY (TASK_STATE_MAX-1) -- cgit v1.2.3 From f5d39b020809146cc28e6e73369bf8065e0310aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:22 +0200 Subject: freezer,sched: Rewrite core freezer logic Rewrite the core freezer to behave better wrt thawing and be simpler in general. By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is ensured frozen tasks stay frozen until thawed and don't randomly wake up early, as is currently possible. As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up two PF_flags (yay!). Specifically, the current scheme works a little like: freezer_do_not_count(); schedule(); freezer_count(); And either the task is blocked, or it lands in try_to_freeze() through freezer_count(). Now, when it is blocked, the freezer considers it frozen and continues. However, on thawing, once pm_freezing is cleared, freezer_count() stops working, and any random/spurious wakeup will let a task run before its time. That is, thawing tries to thaw things in explicit order; kernel threads and workqueues before bringing SMP back, before userspace, etc. However due to the above mentioned races it is entirely possible for userspace tasks to thaw (by accident) before SMP is back.
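To put the old and the new scheme side by side, a minimal sketch (the surrounding caller is hypothetical; the new form matches the wait.h hunks in this patch):

  /* old: hide from the freezer around a sleep */
  freezer_do_not_count();
  schedule();
  freezer_count();	/* a spurious wakeup here can run the task before SMP is back */

  /* new: fold freezability into the blocked state itself */
  set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
  schedule();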
This can be a fatal problem in asymmetric ISA architectures (eg ARMv9) where the userspace task requires a special CPU to run. As said; replace this with a special task state TASK_FROZEN and add the following state transitions: TASK_FREEZABLE -> TASK_FROZEN __TASK_STOPPED -> TASK_FROZEN __TASK_TRACED -> TASK_FROZEN The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL (IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state is already required to deal with spurious wakeups and the freezer causes one such when thawing the task (since the original state is lost). The special __TASK_{STOPPED,TRACED} states *can* be restored since their canonical state is in ->jobctl. With this, frozen tasks need an explicit TASK_FROZEN wakeup and are free of undue (early / spurious) wakeups. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org --- include/linux/freezer.h | 245 ++----------------------------------------- include/linux/sched.h | 13 ++- include/linux/sunrpc/sched.h | 7 +- include/linux/wait.h | 12 +-- 4 files changed, 26 insertions(+), 251 deletions(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0621c5f86c39..b303472255be 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -8,9 +8,11 @@ #include #include #include +#include #ifdef CONFIG_FREEZER -extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */ +DECLARE_STATIC_KEY_FALSE(freezer_active); + extern bool pm_freezing; /* PM freezing in effect */ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ @@ -22,10 +24,7 @@ extern unsigned int freeze_timeout_msecs; /* * Check if a process has been frozen */ -static inline bool frozen(struct task_struct *p) -{ - return p->flags & PF_FROZEN; -} +extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); @@ -34,9 +33,10 @@ extern bool freezing_slow_path(struct task_struct *p); */ static inline bool freezing(struct task_struct *p) { - if (likely(!atomic_read(&system_freezing_cnt))) - return false; - return freezing_slow_path(p); + if (static_branch_unlikely(&freezer_active)) + return freezing_slow_path(p); + + return false; } /* Takes and releases task alloc lock using task_lock() */ @@ -48,23 +48,14 @@ extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); -/* - * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION - * If try_to_freeze causes a lockdep warning it means the caller may deadlock - */ -static inline bool try_to_freeze_unsafe(void) +static inline bool try_to_freeze(void) { might_sleep(); if (likely(!freezing(current))) return false; - return __refrigerator(false); -} - -static inline bool try_to_freeze(void) -{ if (!(current->flags & PF_NOFREEZE)) debug_check_no_locks_held(); - return try_to_freeze_unsafe(); + return __refrigerator(false); } extern bool freeze_task(struct task_struct *p); @@ -79,195 +70,6 @@ static inline bool cgroup_freezing(struct task_struct *task) } #endif /* !CONFIG_CGROUP_FREEZER */ -/* - * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it - * calls wait_for_completion(&vfork) and reset right after it returns from this - * function. Next, the parent should call try_to_freeze() to freeze itself - * appropriately in case the child has exited before the freezing of tasks is - * complete. 
However, we don't want kernel threads to be frozen in unexpected - * places, so we allow them to block freeze_processes() instead or to set - * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the - * parent won't really block freeze_processes(), since ____call_usermodehelper() - * (the child) does a little before exec/exit and it can't be frozen before - * waking up the parent. - */ - - -/** - * freezer_do_not_count - tell freezer to ignore %current - * - * Tell freezers to ignore the current task when determining whether the - * target frozen state is reached. IOW, the current task will be - * considered frozen enough by freezers. - * - * The caller shouldn't do anything which isn't allowed for a frozen task - * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair - * wrap a scheduling operation and nothing much else. - */ -static inline void freezer_do_not_count(void) -{ - current->flags |= PF_FREEZER_SKIP; -} - -/** - * freezer_count - tell freezer to stop ignoring %current - * - * Undo freezer_do_not_count(). It tells freezers that %current should be - * considered again and tries to freeze if freezing condition is already in - * effect. - */ -static inline void freezer_count(void) -{ - current->flags &= ~PF_FREEZER_SKIP; - /* - * If freezing is in progress, the following paired with smp_mb() - * in freezer_should_skip() ensures that either we see %true - * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP. - */ - smp_mb(); - try_to_freeze(); -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline void freezer_count_unsafe(void) -{ - current->flags &= ~PF_FREEZER_SKIP; - smp_mb(); - try_to_freeze_unsafe(); -} - -/** - * freezer_should_skip - whether to skip a task when determining frozen - * state is reached - * @p: task in quesion - * - * This function is used by freezers after establishing %true freezing() to - * test whether a task should be skipped when determining the target frozen - * state is reached. IOW, if this function returns %true, @p is considered - * frozen enough. - */ -static inline bool freezer_should_skip(struct task_struct *p) -{ - /* - * The following smp_mb() paired with the one in freezer_count() - * ensures that either freezer_count() sees %true freezing() or we - * see cleared %PF_FREEZER_SKIP and return %false. This makes it - * impossible for a task to slip frozen state testing after - * clearing %PF_FREEZER_SKIP. - */ - smp_mb(); - return p->flags & PF_FREEZER_SKIP; -} - -/* - * These functions are intended to be used whenever you want allow a sleeping - * task to be frozen. Note that neither return any clear indication of - * whether a freeze event happened while in this function. - */ - -/* Like schedule(), but should not block the freezer. */ -static inline void freezable_schedule(void) -{ - freezer_do_not_count(); - schedule(); - freezer_count(); -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline void freezable_schedule_unsafe(void) -{ - freezer_do_not_count(); - schedule(); - freezer_count_unsafe(); -} - -/* - * Like schedule_timeout(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline long freezable_schedule_timeout(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout(timeout); - freezer_count(); - return __retval; -} - -/* - * Like schedule_timeout_interruptible(), but should not block the freezer. Do not - * call this with locks held. 
- */ -static inline long freezable_schedule_timeout_interruptible(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_interruptible(timeout); - freezer_count(); - return __retval; -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout) -{ - long __retval; - - freezer_do_not_count(); - __retval = schedule_timeout_interruptible(timeout); - freezer_count_unsafe(); - return __retval; -} - -/* Like schedule_timeout_killable(), but should not block the freezer. */ -static inline long freezable_schedule_timeout_killable(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_killable(timeout); - freezer_count(); - return __retval; -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline long freezable_schedule_timeout_killable_unsafe(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_killable(timeout); - freezer_count_unsafe(); - return __retval; -} - -/* - * Like schedule_hrtimeout_range(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - u64 delta, const enum hrtimer_mode mode) -{ - int __retval; - freezer_do_not_count(); - __retval = schedule_hrtimeout_range(expires, delta, mode); - freezer_count(); - return __retval; -} - -/* - * Freezer-friendly wrappers around wait_event_interruptible(), - * wait_event_killable() and wait_event_interruptible_timeout(), originally - * defined in - */ - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -#define wait_event_freezekillable_unsafe(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_killable(wq, (condition)); \ - freezer_count_unsafe(); \ - __retval; \ -}) - #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } @@ -281,35 +83,8 @@ static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } -static inline void freezer_do_not_count(void) {} -static inline void freezer_count(void) {} -static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} -#define freezable_schedule() schedule() - -#define freezable_schedule_unsafe() schedule() - -#define freezable_schedule_timeout(timeout) schedule_timeout(timeout) - -#define freezable_schedule_timeout_interruptible(timeout) \ - schedule_timeout_interruptible(timeout) - -#define freezable_schedule_timeout_interruptible_unsafe(timeout) \ - schedule_timeout_interruptible(timeout) - -#define freezable_schedule_timeout_killable(timeout) \ - schedule_timeout_killable(timeout) - -#define freezable_schedule_timeout_killable_unsafe(timeout) \ - schedule_timeout_killable(timeout) - -#define freezable_schedule_hrtimeout_range(expires, delta, mode) \ - schedule_hrtimeout_range(expires, delta, mode) - -#define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - #endif /* !CONFIG_FREEZER */ #endif /* FREEZER_H_INCLUDED */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fc23a2d9813..4c91efd6df82 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -97,12 +97,19 @@ struct task_group; #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 -/* RT specific auxilliary flag to 
mark RT lock waiters */ #define TASK_RTLOCK_WAIT 0x00001000 -#define TASK_STATE_MAX 0x00002000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) +/* + * DO NOT ADD ANY NEW USERS ! + */ +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) + /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) @@ -1716,7 +1723,6 @@ extern struct pid *cad_pid; #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ @@ -1727,7 +1733,6 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index acc62647317c..baeca2f564dc 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -252,7 +252,7 @@ int rpc_malloc(struct rpc_task *); void rpc_free(struct rpc_task *); int rpciod_up(void); void rpciod_down(void); -int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); +int rpc_wait_for_completion_task(struct rpc_task *task); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct net; void rpc_show_tasks(struct net *); @@ -264,11 +264,6 @@ extern struct workqueue_struct *xprtiod_workqueue; void rpc_prepare_task(struct rpc_task *task); gfp_t rpc_task_gfp_mask(void); -static inline int rpc_wait_for_completion_task(struct rpc_task *task) -{ - return __rpc_wait_for_completion_task(task, NULL); -} - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) static inline const char * rpc_qname(const struct rpc_wait_queue *q) { diff --git a/include/linux/wait.h b/include/linux/wait.h index b926eb9f3e26..14ad8a0e9fac 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -361,8 +361,8 @@ do { \ } while (0) #define __wait_event_freezable(wq_head, condition) \ - ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ - freezable_schedule()) + ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ + 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true @@ -420,8 +420,8 @@ do { \ #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, timeout, \ - __ret = freezable_schedule_timeout(__ret)) + (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ + __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE 
to avoid @@ -642,8 +642,8 @@ do { \ #define __wait_event_freezable_exclusive(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ - freezable_schedule()) + ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ + schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ -- cgit v1.2.3 From fb04563d1cae6f361892b4a339ad92100b1eb0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Sep 2022 13:27:32 +0200 Subject: sched: Show PF_flag holes Put placeholders in PF_flag so we can see how holey it is. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/YxctoffFFPXONESt@hirez.programming.kicks-ass.net --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c91efd6df82..15e3bd96e4ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1722,7 +1722,9 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ +#define PF__HOLE__00004000 0x00004000 #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ +#define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ @@ -1730,9 +1732,14 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ +#define PF__HOLE__00800000 0x00800000 +#define PF__HOLE__01000000 0x01000000 +#define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ +#define PF__HOLE__20000000 0x20000000 +#define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* -- cgit v1.2.3 From 03b02db93be407103c385814033633364674a6f6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Sep 2022 14:14:14 +0530 Subject: perf: Consolidate branch sample filter helpers Besides the branch type filtering requests, 'event.attr.branch_sample_type' also contains various flags indicating which additional information should be captured, along with the base branch record. These flags help configure the underlying hardware, and capture the branch records appropriately when required e.g after PMU interrupt. But first, this moves an existing helper perf_sample_save_hw_index() into the header before adding some more helpers for other branch sample filter flags. 
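A hedged sketch of the intended call pattern in a PMU driver; the MY_PMU_* config bits are hypothetical, purely for illustration:

  static u64 my_pmu_branch_config(struct perf_event *event)
  {
  	u64 config = 0;

  	/* translate sample-type filter requests into hardware filter bits */
  	if (branch_sample_no_flags(event))
  		config |= MY_PMU_BR_NO_FLAGS;	/* hypothetical hw bit */
  	if (branch_sample_hw_index(event))
  		config |= MY_PMU_BR_HW_INDEX;	/* hypothetical hw bit */
  	return config;
  }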
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220906084414.396220-1-anshuman.khandual@arm.com --- include/linux/perf_event.h | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 581880ddb9ef..a627528e5f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1685,4 +1685,30 @@ static inline void perf_lopwr_cb(bool mode) } #endif +#ifdef CONFIG_PERF_EVENTS +static inline bool branch_sample_no_flags(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; +} + +static inline bool branch_sample_no_cycles(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; +} + +static inline bool branch_sample_type(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; +} + +static inline bool branch_sample_hw_index(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + +static inline bool branch_sample_priv(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; +} +#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 7517f08b9a5eef0fa683b976c97d6178d00e6a3d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:21 +0530 Subject: perf/core: Expand PERF_EVENT_FLAG_ARCH Two hardware event flags on the x86 platform have overshot PERF_EVENT_FLAG_ARCH (0x0000ffff). These flags are PERF_X86_EVENT_PEBS_LAT_HYBRID (0x20000) and PERF_X86_EVENT_AMD_BRS (0x10000). Let's expand the PERF_EVENT_FLAG_ARCH mask to accommodate those flags, and also create room for two more in the future. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-2-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a627528e5f3a..3e3c07512b75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -138,7 +138,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 /** -- cgit v1.2.3 From f67dd218fafd9de9a13d095e775b621db76a058f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:22 +0530 Subject: perf/core: Assert PERF_EVENT_FLAG_ARCH does not overlap with generic flags This just ensures that PERF_EVENT_FLAG_ARCH does not overlap with generic hardware event flags.
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-3-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e3c07512b75..f88cb31eaf75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -141,6 +141,8 @@ struct hw_perf_event_extra { #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); + /** * struct hw_perf_event - performance event hardware details: */ -- cgit v1.2.3 From 91207f62616f9f51b52436364e6d064f002e9112 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:23 +0530 Subject: arm64/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-4-anshuman.khandual@arm.com --- include/linux/perf/arm_pmu.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0407a38b470a..0356cb6a215d 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,11 @@ /* * ARM PMU hw_event flags */ -/* Event uses a 64bit counter */ -#define ARMPMU_EVT_64BIT 1 -/* Event uses a 47bit counter */ -#define ARMPMU_EVT_47BIT 2 +#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ +#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ + +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) PERF_COUNT_HW_CACHE_##_x -- cgit v1.2.3 From f3c0eba287049237b23d1300376768293eb89e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2022 18:48:55 +0200 Subject: perf: Add a few assertions While auditing 6b959ba22d34 ("perf/core: Fix reentry problem in perf_output_read_group()") a few spots were found that wanted assertions. Notably, for_each_sibling_event() relies on exclusion from modification. This would normally be holding either ctx->lock or ctx->mutex, however due to how things are constructed disabling IRQs is a valid and sufficient substitute for ctx->lock. Another possible site to add assertions would be the various pmu::{add,del,read,..}() methods, but that's not trivially expressible in C -- the best option is wrappers, but those are easy enough to forget. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f88cb31eaf75..368bdc4f563f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -634,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified while holding both ctx->lock and ctx->mutex, + * as such iteration must hold either lock.
However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. + */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) || \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) -- cgit v1.2.3 From d219d2a9a92e39aa92799efe8f2aa21259b6dd82 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Aug 2022 13:37:17 -0700 Subject: overflow: Allow mixed type arguments When the check_[op]_overflow() helpers were introduced, all arguments were required to be the same type to make the fallback macros simpler. However, now that the fallback macros have been removed[1], it is fine to allow mixed types, which makes using the helpers much more useful, as they can be used to test for type-based overflows (e.g. adding two large ints but storing into a u8), as would be handy in the drm core[2]. Remove the restriction, and add additional self-tests that exercise some of the mixed-type overflow cases, and double-check for accidental macro side-effects. [1] https://git.kernel.org/linus/4eb6bd55cfb22ffc20652732340c4962f3ac9a91 [2] https://lore.kernel.org/lkml/20220824084514.2261614-2-gwan-gyeong.mun@intel.com Cc: Rasmus Villemoes Cc: Gwan-gyeong Mun Cc: "Gustavo A. R. Silva" Cc: Nick Desaulniers Cc: linux-hardening@vger.kernel.org Reviewed-by: Andrzej Hajda Reviewed-by: Gwan-gyeong Mun Tested-by: Gwan-gyeong Mun Signed-off-by: Kees Cook --- include/linux/overflow.h | 72 +++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 0eb3b192f07a..19dfdd74835e 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -51,40 +51,50 @@ static inline bool __must_check __must_check_overflow(bool overflow) return unlikely(overflow); } -/* - * For simplicity and code hygiene, the fallback code below insists on - * a, b and *d having the same type (similar to the min() and max() - * macros), whereas gcc's type-generic overflow checkers accept - * different types. Hence we don't just make check_add_overflow an - * alias for __builtin_add_overflow, but add type checks similar to - * below. +/** check_add_overflow() - Calculate addition with overflow checking + * + * @a: first addend + * @b: second addend + * @d: pointer to store sum + * + * Returns 0 on success. + * + * *@d holds the results of the attempted addition, but is not considered + * "safe for use" on a non-zero return value, which indicates that the + * sum has overflowed or been truncated. 
*/ -#define check_add_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_add_overflow(__a, __b, __d); \ -})) +#define check_add_overflow(a, b, d) \ + __must_check_overflow(__builtin_add_overflow(a, b, d)) -#define check_sub_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_sub_overflow(__a, __b, __d); \ -})) +/** check_sub_overflow() - Calculate subtraction with overflow checking + * + * @a: minuend; value to subtract from + * @b: subtrahend; value to subtract from @a + * @d: pointer to store difference + * + * Returns 0 on success. + * + * *@d holds the results of the attempted subtraction, but is not considered + * "safe for use" on a non-zero return value, which indicates that the + * difference has underflowed or been truncated. + */ +#define check_sub_overflow(a, b, d) \ + __must_check_overflow(__builtin_sub_overflow(a, b, d)) -#define check_mul_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_mul_overflow(__a, __b, __d); \ -})) +/** check_mul_overflow() - Calculate multiplication with overflow checking + * + * @a: first factor + * @b: second factor + * @d: pointer to store product + * + * Returns 0 on success. + * + * *@d holds the results of the attempted multiplication, but is not + * considered "safe for use" on a non-zero return value, which indicates + * that the product has overflowed or been truncated. + */ +#define check_mul_overflow(a, b, d) \ + __must_check_overflow(__builtin_mul_overflow(a, b, d)) /** check_shl_overflow() - Calculate a left-shifted value and check overflow * -- cgit v1.2.3 From dfbafa70bde26c40615f8c538ce68dac82a64fb4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 26 Aug 2022 11:04:43 -0700 Subject: string: Introduce strtomem() and strtomem_pad() One of the "legitimate" uses of strncpy() is copying a NUL-terminated string into a fixed-size non-NUL-terminated character array. To avoid the weaknesses and ambiguity of intent when using strncpy(), provide replacement functions that explicitly distinguish between trailing padding and not, and require the destination buffer size be discoverable by the compiler. For example: struct obj { int foo; char small[4] __nonstring; char big[8] __nonstring; int bar; }; struct obj p; /* This will truncate to 4 chars with no trailing NUL */ strncpy(p.small, "hello", sizeof(p.small)); /* p.small contains 'h', 'e', 'l', 'l' */ /* This will NUL pad to 8 chars. */ strncpy(p.big, "hello", sizeof(p.big)); /* p.big contains 'h', 'e', 'l', 'l', 'o', '\0', '\0', '\0' */ When the "__nonstring" attributes are missing, the intent of the programmer becomes ambiguous for whether the lack of a trailing NUL in the p.small copy is a bug. Additionally, it's not clear whether the trailing padding in the p.big copy is _needed_. Both cases become unambiguous with: strtomem(p.small, "hello"); strtomem_pad(p.big, "hello", 0); See also https://github.com/KSPP/linux/issues/90 Expand the memcpy KUnit tests to include these functions. 
Cc: Wolfram Sang Cc: Nick Desaulniers Cc: Geert Uytterhoeven Cc: Guenter Roeck Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 32 +++++++++++++++++++++++++++++++ include/linux/string.h | 43 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f374..8e8c2c87b1d5 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -77,6 +77,38 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) +/** + * strncpy - Copy a string to memory with non-guaranteed NUL padding + * + * @p: pointer to destination of copy + * @q: pointer to NUL-terminated source string to copy + * @size: bytes to write at @p + * + * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, + * and @p will NOT be NUL-terminated + * + * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes + * will be written to @p until @size total bytes have been written. + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * over-reads of @q, it cannot defend against writing unterminated + * results to @p. Using strncpy() remains ambiguous and fragile. + * Instead, please choose an alternative, so that the expectation + * of @p's contents is unambiguous: + * + * +--------------------+-----------------+------------+ + * | @p needs to be: | padded to @size | not padded | + * +====================+=================+============+ + * | NUL-terminated | strscpy_pad() | strscpy() | + * +--------------------+-----------------+------------+ + * | not NUL-terminated | strtomem_pad() | strtomem() | + * +--------------------+-----------------+------------+ + * + * Note strscpy*()'s differing return values for detecting truncation, + * and strtomem*()'s expectation that the destination is marked with + * __nonstring when it is a character array. + * + */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { diff --git a/include/linux/string.h b/include/linux/string.h index 61ec7e4f6311..cf7607b32102 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -260,6 +260,49 @@ static inline const char *kbasename(const char *path) void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad); +/** + * strtomem_pad - Copy NUL-terminated string to non-NUL-terminated buffer + * + * @dest: Pointer of destination character array (marked as __nonstring) + * @src: Pointer to NUL-terminated string + * @pad: Padding character to fill any remaining bytes of @dest after copy + * + * This is a replacement for strncpy() uses where the destination is not + * a NUL-terminated string, but with bounds checking on the source size, and + * an explicit padding character. If padding is not required, use strtomem(). + * + * Note that the size of @dest is not an argument, as the length of @dest + * must be discoverable by the compiler. 
+ */ +#define strtomem_pad(dest, src, pad) do { \ + const size_t _dest_len = __builtin_object_size(dest, 1); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ + _dest_len == (size_t)-1); \ + memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \ +} while (0) + +/** + * strtomem - Copy NUL-terminated string to non-NUL-terminated buffer + * + * @dest: Pointer of destination character array (marked as __nonstring) + * @src: Pointer to NUL-terminated string + * + * This is a replacement for strncpy() uses where the destination is not + * a NUL-terminated string, but with bounds checking on the source size, and + * without trailing padding. If padding is required, use strtomem_pad(). + * + * Note that the size of @dest is not an argument, as the length of @dest + * must be discoverable by the compiler. + */ +#define strtomem(dest, src) do { \ + const size_t _dest_len = __builtin_object_size(dest, 1); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ + _dest_len == (size_t)-1); \ + memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len))); \ +} while (0) + /** * memset_after - Set a value after a struct member to the end of a struct * -- cgit v1.2.3 From d07c0acb4f41cc42a0d97530946965b3e4fa68c1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Sep 2022 13:02:26 -0700 Subject: fortify: Fix __compiletime_strlen() under UBSAN_BOUNDS_LOCAL With CONFIG_FORTIFY=y and CONFIG_UBSAN_LOCAL_BOUNDS=y enabled, we observe a runtime panic while running Android's Compatibility Test Suite's (CTS) android.hardware.input.cts.tests. This stems from a strlen() call in hidinput_allocate(). __compiletime_strlen() is implemented in terms of __builtin_object_size(), then does an array access to check for NUL-termination. A quirk of __builtin_object_size() is that for strings whose values are runtime dependent, __builtin_object_size(str, 1 or 0) returns the maximum size of possible values when those sizes are determinable at compile time. Example: static const char *v = "FOO BAR"; static const char *y = "FOO BA"; unsigned long x (int z) { // Returns 8, which is: // max(__builtin_object_size(v, 1), __builtin_object_size(y, 1)) return __builtin_object_size(z ? v : y, 1); } So when FORTIFY_SOURCE is enabled, the current implementation of __compiletime_strlen() will try to access beyond the end of y at runtime using the size of v. Mixed with UBSAN_LOCAL_BOUNDS we get a fault. hidinput_allocate() has a local C string whose value is control flow dependent on a switch statement, so __builtin_object_size(str, 1) evaluates to the maximum string length, making all other cases fault on the last character check. hidinput_allocate() could be cleaned up to avoid runtime calls to strlen() since the local variable can only have literal values, so there's no benefit to trying to fortify the strlen call site there. Perform a __builtin_constant_p() check against index 0 earlier in the macro to filter out the control-flow-dependent case. Add a KUnit test for checking the expected behavioral characteristics of FORTIFY_SOURCE internals.
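To illustrate the pattern that faulted, here is a simplified sketch modeled on hidinput_allocate() (not the actual driver code):

	const char *suffix;
	size_t len;

	switch (type) {
	case T_STYLUS:
		suffix = "Stylus";	/* 7 bytes including NUL */
		break;
	default:
		suffix = "Pen";		/* 4 bytes including NUL */
		break;
	}
	/*
	 * __builtin_object_size(suffix, 1) is 7, the maximum of the two
	 * possible values, but *suffix is not __builtin_constant_p() here,
	 * so the fixed macro now falls back to the runtime strlen()
	 * instead of reading suffix[6], which is past the end of "Pen".
	 */
	len = strlen(suffix);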
Cc: Nathan Chancellor Cc: Tom Rix Cc: Andrew Morton Cc: Vlastimil Babka Cc: "Steven Rostedt (Google)" Cc: David Gow Cc: Yury Norov Cc: Masami Hiramatsu Cc: Sander Vanheule Cc: linux-hardening@vger.kernel.org Cc: llvm@lists.linux.dev Reviewed-by: Nick Desaulniers Tested-by: Android Treehugger Robot Link: https://android-review.googlesource.com/c/kernel/common/+/2206839 Co-developed-by: Nick Desaulniers Signed-off-by: Nick Desaulniers Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 8e8c2c87b1d5..be264091f7a7 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -19,7 +19,8 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" unsigned char *__p = (unsigned char *)(p); \ size_t __ret = (size_t)-1; \ size_t __p_size = __builtin_object_size(p, 1); \ - if (__p_size != (size_t)-1) { \ + if (__p_size != (size_t)-1 && \ + __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ -- cgit v1.2.3 From 311fb40aa0569abacc430b0d66ee41470803111f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Sep 2022 13:23:06 -0700 Subject: fortify: Use SIZE_MAX instead of (size_t)-1 Clean up uses of "(size_t)-1" in favor of SIZE_MAX. Cc: linux-hardening@vger.kernel.org Suggested-by: Nick Desaulniers Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index be264091f7a7..e46af17d23d0 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -3,6 +3,7 @@ #define _LINUX_FORTIFY_STRING_H_ #include +#include #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) @@ -17,9 +18,9 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" #define __compiletime_strlen(p) \ ({ \ unsigned char *__p = (unsigned char *)(p); \ - size_t __ret = (size_t)-1; \ + size_t __ret = SIZE_MAX; \ size_t __p_size = __builtin_object_size(p, 1); \ - if (__p_size != (size_t)-1 && \ + if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ @@ -127,7 +128,7 @@ char *strcat(char * const POS p, const char *q) { size_t p_size = __builtin_object_size(p, 1); - if (p_size == (size_t)-1) + if (p_size == SIZE_MAX) return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); @@ -142,7 +143,7 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size size_t ret; /* We can take compile-time actions when maxlen is const. */ - if (__builtin_constant_p(maxlen) && p_len != (size_t)-1) { + if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; @@ -170,7 +171,7 @@ __kernel_size_t __fortify_strlen(const char * const POS p) size_t p_size = __builtin_object_size(p, 1); /* Give up if we don't know how large p is. 
*/ - if (p_size == (size_t)-1) + if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) @@ -187,7 +188,7 @@ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, si size_t q_len; /* Full count of source string length. */ size_t len; /* Count of characters going into destination. */ - if (p_size == (size_t)-1 && q_size == (size_t)-1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcpy(p, q, size); q_len = strlen(q); len = (q_len >= size) ? size - 1 : q_len; @@ -215,7 +216,7 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s size_t q_size = __builtin_object_size(q, 1); /* If we cannot get size of p and q default to call strscpy. */ - if (p_size == (size_t) -1 && q_size == (size_t) -1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* @@ -260,7 +261,7 @@ char *strncat(char * const POS p, const char * const POS q, __kernel_size_t coun size_t p_size = __builtin_object_size(p, 1); size_t q_size = __builtin_object_size(q, 1); - if (p_size == (size_t)-1 && q_size == (size_t)-1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); @@ -301,10 +302,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. - * (The -1 test is to optimize away checks where the buffer + * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ - if (p_size != (size_t)(-1) && p_size < size) + if (p_size != SIZE_MAX && p_size < size) fortify_panic("memset"); } @@ -395,11 +396,11 @@ __FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size, /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. - * (The -1 test is to optimize away checks where the buffer + * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ - if ((p_size != (size_t)(-1) && p_size < size) || - (q_size != (size_t)(-1) && q_size < size)) + if ((p_size != SIZE_MAX && p_size < size) || + (q_size != SIZE_MAX && q_size < size)) fortify_panic(func); } @@ -498,7 +499,7 @@ char *strcpy(char * const POS p, const char * const POS q) size_t size; /* If neither buffer size is known, immediately give up. */ - if (p_size == (size_t)-1 && q_size == (size_t)-1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ -- cgit v1.2.3 From 54d9469bc515dc5fcbc20eecbe19cea868b70d68 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 24 Jun 2021 15:39:26 -0700 Subject: fortify: Add run-time WARN for cross-field memcpy() Enable run-time checking of dynamic memcpy() and memmove() lengths, issuing a WARN when a write would exceed the size of the target struct member, when built with CONFIG_FORTIFY_SOURCE=y. This would have caught all of the memcpy()-based buffer overflows in the last 3 years, specifically covering all the cases where the destination buffer size is known at compile time. This change ONLY adds a run-time warning. As false positives are currently still expected, this will not block the overflow. 
The new warnings will look like this: memcpy: detected field-spanning write (size N) of single field "var->dest" (size M) WARNING: CPU: n PID: pppp at source/file/path.c:nr function+0xXX/0xXX [module] There may be false positives in the kernel where intentional field-spanning writes are happening. These need to be addressed similarly to how the compile-time cases were addressed: add a struct_group(), split the memcpy(), or some other refactoring (see the sketch after this message). In order to make counting/investigating instances of added runtime checks easier, each instance includes the destination variable name as a WARN argument, prefixed with 'field "'. Therefore, on an x86_64 defconfig build, it is trivial to inspect the build artifacts to find instances. For example, there are 78 new run-time memcpy() bounds checks added: $ for i in vmlinux $(find . -name '*.ko'); do \ strings "$i" | grep '^field "'; done | wc -l 78 Simple cases where a destination buffer is known to be a dynamic size do not generate a WARN. For example: struct normal_flex_array { void *a; int b; u32 c; size_t array_size; u8 flex_array[]; }; struct normal_flex_array *instance; ... /* These will be ignored for run-time bounds checking. */ memcpy(instance, src, len); memcpy(instance->flex_array, src, len); However, one of the dynamic-sized destination cases is irritatingly unable to be detected by the compiler: when using memcpy() to target a composite struct member which contains a trailing flexible array struct. For example: struct wrapper { int foo; char bar; struct normal_flex_array embedded; }; struct wrapper *instance; ... /* This will incorrectly WARN when len > sizeof(instance->embedded) */ memcpy(&instance->embedded, src, len); These cases end up appearing to the compiler to be sized as if the flexible array had 0 elements. :( For more details see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 https://godbolt.org/z/vW6x8vh4P These "composite flexible array structure destination" cases will need to be flushed out and addressed on a case-by-case basis. Regardless, for the general case of using memcpy() on flexible array destinations, future APIs will be created to handle common cases. Those can be used to migrate away from open-coded memcpy() so that proper error handling (instead of trapping) can be used. As mentioned, none of these bounds checks block any overflows currently. For users that have tested their workloads, do not encounter any warnings, and wish to make these checks stop any overflows, they can use a big hammer and set the sysctl panic_on_warn=1.
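To make the struct_group() option above concrete, a sketch (not from this patch) of how an intentional field-spanning write can be expressed:

	struct pkt {
		u8 type;
		/* Writable as one unit via memcpy(&p->hdr, ...) */
		struct_group(hdr,
			u8 src[4];
			u8 dst[4];
		);
		u32 crc;
	};

	/* Targets a field whose compile-time size is 8 bytes: no WARN */
	memcpy(&p->hdr, src_buf, 8);

Without the struct_group(), memcpy(&p->src, src_buf, 8) would be seen as a size 8 write into a 4 byte field and trigger the new warning.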
Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 70 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index e46af17d23d0..ff879efe94ed 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ +#include #include #include @@ -353,7 +354,7 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * V = vulnerable to run-time overflow (will need refactoring to solve) * */ -__FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size, +__FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, @@ -402,16 +403,79 @@ __FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size, if ((p_size != SIZE_MAX && p_size < size) || (q_size != SIZE_MAX && q_size < size)) fortify_panic(func); + + /* + * Warn when writing beyond destination field size. + * + * We must ignore p_size_field == 0 for existing 0-element + * fake flexible arrays, until they are all converted to + * proper flexible arrays. + * + * The implementation of __builtin_object_size() behaves + * like sizeof() when not directly referencing a flexible + * array member, which means there will be many bounds checks + * that will appear at run-time, without a way for them to be + * detected at compile-time (as can be done when the destination + * is specifically the flexible array member). + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 + */ + if (p_size_field != 0 && p_size_field != SIZE_MAX && + p_size != p_size_field && p_size_field < size) + return true; + + return false; } #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ size_t __fortify_size = (size_t)(size); \ - fortify_memcpy_chk(__fortify_size, p_size, q_size, \ - p_size_field, q_size_field, #op); \ + WARN_ONCE(fortify_memcpy_chk(__fortify_size, p_size, q_size, \ + p_size_field, q_size_field, #op), \ + #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ + __fortify_size, \ + "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \ + p_size_field); \ __underlying_##op(p, q, __fortify_size); \ }) +/* + * Notes about compile-time buffer size detection: + * + * With these types... + * + * struct middle { + * u16 a; + * u8 middle_buf[16]; + * int b; + * }; + * struct end { + * u16 a; + * u8 end_buf[16]; + * }; + * struct flex { + * int a; + * u8 flex_buf[]; + * }; + * + * void func(TYPE *ptr) { ... 
} + * Cases where destination size cannot be currently detected: + * - the size of ptr's object (seemingly by design, gcc & clang fail): + * __builtin_object_size(ptr, 1) == SIZE_MAX + * - the size of flexible arrays in ptr's obj (by design, dynamic size): + * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX + * - the size of ANY array at the end of ptr's obj (gcc and clang bug): + * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 + * + * Cases where destination size is currently detected: + * - the size of non-array members within ptr's object: + * __builtin_object_size(ptr->a, 1) == 2 + * - the size of non-flexible-array in the middle of ptr's obj: + * __builtin_object_size(ptr->middle_buf, 1) == 16 + * + */ + /* * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. -- cgit v1.2.3 From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:28 +0200 Subject: bpf: Add helper macro bpf_for_each_reg_in_vstate For a lot of use cases in future patches, we will want to modify the state of registers that are part of the same 'group' (e.g. the same ref_obj_id). It won't just be limited to releasing reference state, but setting a type flag dynamically based on certain actions, etc. Hence, we need a way to easily pass a callback to the function that iterates over all registers in current bpf_verifier_state in all frames up to (and including) the curframe. While in C++ we would be able to easily use a lambda to pass state and the callback together, sadly we aren't using C++ in the kernel. The next best thing to avoid defining a function for each case seems like statement expressions in GNU C. The kernel already uses them heavily, hence they can be passed to the macro in the style of a lambda. The statement expression will then be substituted in the for loop bodies. Variables __state and __reg are set to current bpf_func_state and reg for each invocation of the expression inside the passed in verifier state. Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers, release_reference, find_good_pkt_pointers, find_equal_scalars to use bpf_for_each_reg_in_vstate.
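For example (a sketch modeled on the converted release_reference()), dropping every register that shares a ref_obj_id becomes a single statement expression:

	struct bpf_func_state *state;
	struct bpf_reg_state *reg;

	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
		if (reg->ref_obj_id == ref_obj_id)
			__mark_reg_unknown(env, reg);
	}));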
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8fbc1d05281e..b49a349cc6ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -348,6 +348,27 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame)) +/* Invoke __expr over registers in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_verifier_state *___vstate = __vst; \ + int ___i, ___j; \ + for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ + struct bpf_reg_state *___regs; \ + __state = ___vstate->frame[___i]; \ + ___regs = __state->regs; \ + for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ + __reg = &___regs[___j]; \ + (void)(__expr); \ + } \ + bpf_for_each_spilled_reg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ + } \ + }) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; -- cgit v1.2.3 From d01387fc16421cbbf95d1fda8fe1258195396c64 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:31 +0100 Subject: firmware: arm_ffa: Add pointer to the ffa_dev_ops in struct ffa_dev Currently ffa_dev_ops_get() is the way to fetch the ffa_dev_ops pointer. It checks if the ffa_dev structure pointer is valid before returning the ffa_dev_ops pointer. Instead, the pointer can be made part of the ffa_dev structure and since the core driver is in charge of creating ffa_device for each identified partition, there is no need to check for the validity explicitly if the pointer is embedded in the structure. Add the pointer to the ffa_dev_ops in the ffa_dev structure itself and initialise the same as part of creation of the device.
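A client driver then uses the embedded pointer directly, e.g. (illustrative sketch; my_ffa_probe() is hypothetical):

	static int my_ffa_probe(struct ffa_device *ffa_dev)
	{
		const struct ffa_dev_ops *ops = ffa_dev->ops;
		struct ffa_send_direct_data data = {};

		/* previously: ops = ffa_dev_ops_get(ffa_dev); plus a NULL check */
		return ops->sync_send_receive(ffa_dev, &data);
	}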
Link: https://lore.kernel.org/r/20220907145240.1683088-2-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index e5c76c1ef9ed..91b47e42b73d 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -17,6 +17,7 @@ struct ffa_device { bool mode_32bit; uuid_t uuid; struct device dev; + const struct ffa_dev_ops *ops; }; #define to_ffa_dev(d) container_of(d, struct ffa_device, dev) @@ -47,7 +48,8 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) } #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id); +struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, + const struct ffa_dev_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -57,7 +59,8 @@ const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev); #else static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id) +struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, + const struct ffa_dev_ops *ops) { return NULL; } -- cgit v1.2.3 From 55bf84fd0a76894ae29c69b3552e073fa37818be Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:33 +0100 Subject: firmware: arm_ffa: Remove ffa_dev_ops_get() The only user of this exported ffa_dev_ops_get() was the OP-TEE driver which now uses ffa_dev->ops directly, there are no other users for this. Also, since any ffa driver can use ffa_dev->ops directly, there will be no need for ffa_dev_ops_get(), so just remove ffa_dev_ops_get(). Link: https://lore.kernel.org/r/20220907145240.1683088-4-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 91b47e42b73d..556f50f27fb1 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -55,7 +55,6 @@ int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); -const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev); #else static inline @@ -79,11 +78,6 @@ static inline void ffa_driver_unregister(struct ffa_driver *driver) {} static inline bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } -static inline -const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev) -{ - return NULL; -} #endif /* CONFIG_ARM_FFA_TRANSPORT */ #define ffa_register(driver) \ -- cgit v1.2.3 From 8c3812c8f74f050278d734ec4b90149d84bdbefb Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:36 +0100 Subject: firmware: arm_ffa: Make memory apis ffa_device independent There is a requirement to make memory APIs independent of the ffa_device. One of the use-cases is to have a common memory driver that manages the memory for all the ffa_devices. That common memory driver won't be an ffa_driver and won't have any ffa_device associated with it. So having these memory APIs accessible without a ffa_device is needed and should be possible as most of these are handled by the partition manager (SPM or hypervisor).
Drop the ffa_device argument to the memory APIs and make them ffa_device independent. Link: https://lore.kernel.org/r/20220907145240.1683088-7-sudeep.holla@arm.com Acked-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 556f50f27fb1..eafab07c9f58 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -262,10 +262,8 @@ struct ffa_dev_ops { int (*sync_send_receive)(struct ffa_device *dev, struct ffa_send_direct_data *data); int (*memory_reclaim)(u64 g_handle, u32 flags); - int (*memory_share)(struct ffa_device *dev, - struct ffa_mem_ops_args *args); - int (*memory_lend)(struct ffa_device *dev, - struct ffa_mem_ops_args *args); + int (*memory_share)(struct ffa_mem_ops_args *args); + int (*memory_lend)(struct ffa_mem_ops_args *args); }; #endif /* _LINUX_ARM_FFA_H */ -- cgit v1.2.3 From 7aa7a97989557011f762a4b7c2e4e3b061b638e4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:37 +0100 Subject: firmware: arm_ffa: Rename ffa_dev_ops as ffa_ops Except the message APIs, all other APIs are ffa_device independent and can be used without any associated ffa_device from a non ffa_driver. In order to reflect the same, just rename ffa_dev_ops as ffa_ops to avoid any confusion or to keep it simple. Link: https://lore.kernel.org/r/20220907145240.1683088-8-sudeep.holla@arm.com Suggested-by: Sumit Garg Reviewed-by: Sumit Garg Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index eafab07c9f58..4c4b06783035 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -17,7 +17,7 @@ struct ffa_device { bool mode_32bit; uuid_t uuid; struct device dev; - const struct ffa_dev_ops *ops; + const struct ffa_ops *ops; }; #define to_ffa_dev(d) container_of(d, struct ffa_device, dev) @@ -49,7 +49,7 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_dev_ops *ops); + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -59,7 +59,7 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else static inline struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_dev_ops *ops) + const struct ffa_ops *ops) { return NULL; } @@ -254,7 +254,7 @@ struct ffa_mem_ops_args { struct ffa_mem_region_attributes *attrs; }; -struct ffa_dev_ops { +struct ffa_ops { u32 (*api_version_get)(void); int (*partition_info_get)(const char *uuid_str, struct ffa_partition_info *buffer); -- cgit v1.2.3 From bb1be749850055d88d839eff0962e5915788f228 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:38 +0100 Subject: firmware: arm_ffa: Add v1.1 get_partition_info support FF-A v1.1 adds support to discover the UUIDs of the partitions, which was missing in v1.0 and which the driver works around by using UUIDs supplied by the ffa_drivers. Add the v1.1 get_partition_info support and disable the workaround if the detected FF-A version is greater than v1.0.
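With the v1.1 information available, the core driver can populate each device's UUID itself, along these lines (sketch; drv_info and pbuf mirror the driver's internal naming and are illustrative here):

	if (drv_info->version > FFA_VERSION_1_0)
		import_uuid(&ffa_dev->uuid, (u8 *)pbuf->uuid);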
Link: https://lore.kernel.org/r/20220907145240.1683088-9-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 4c4b06783035..09567ffd1f49 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -107,6 +107,7 @@ struct ffa_partition_info { /* partition can send and receive indirect messages. */ #define FFA_PARTITION_INDIRECT_MSG BIT(2) u32 properties; + u32 uuid[4]; }; /* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP} which pass data via registers */ -- cgit v1.2.3 From 106b11b1ccd5a43432d9517f4a26629a1658cfe6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:39 +0100 Subject: firmware: arm_ffa: Set up 32bit execution mode flag using partition property FF-A v1.1 adds a flag in the partition properties to indicate if the partition runs in the AArch32 or AArch64 execution state. Use the same to set up the 32-bit execution mode flag in the ffa_dev automatically if the detected firmware version is above v1.0 and ignore any requests to do the same from the ffa_driver. Link: https://lore.kernel.org/r/20220907145240.1683088-10-sudeep.holla@arm.com Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 09567ffd1f49..5964b6104996 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -106,6 +106,8 @@ struct ffa_partition_info { #define FFA_PARTITION_DIRECT_SEND BIT(1) /* partition can send and receive indirect messages. */ #define FFA_PARTITION_INDIRECT_MSG BIT(2) +/* partition runs in the AArch64 execution state. */ +#define FFA_PARTITION_AARCH64_EXEC BIT(8) u32 properties; u32 uuid[4]; }; -- cgit v1.2.3 From 5b0c6328e47dccf552996ca711005ca3f44034e9 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:40 +0100 Subject: firmware: arm_ffa: Split up ffa_ops into info, message and memory operations In preparation to make memory operations accessible for a non ffa_driver/device, it is better to split the ffa_ops into different categories of operations: info, message and memory. The info and memory are ffa_device independent and can be used without any associated ffa_device from a non ffa_driver. However, we don't export these info and memory APIs yet, as there are no users; the first users of these APIs can export them.
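At a call site the extra level of indirection looks like this (illustrative sketch):

	const struct ffa_ops *ops = ffa_dev->ops;

	/* message ops still take the device */
	ret = ops->msg_ops->sync_send_receive(ffa_dev, &data);

	/* info and memory ops are ffa_device independent */
	ver = ops->info_ops->api_version_get();
	ret = ops->mem_ops->memory_share(&args);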
Link: https://lore.kernel.org/r/20220907145240.1683088-11-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 5964b6104996..5f02d2e6b9d9 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -257,16 +257,28 @@ struct ffa_mem_ops_args { struct ffa_mem_region_attributes *attrs; }; -struct ffa_ops { +struct ffa_info_ops { u32 (*api_version_get)(void); int (*partition_info_get)(const char *uuid_str, struct ffa_partition_info *buffer); +}; + +struct ffa_msg_ops { void (*mode_32bit_set)(struct ffa_device *dev); int (*sync_send_receive)(struct ffa_device *dev, struct ffa_send_direct_data *data); +}; + +struct ffa_mem_ops { int (*memory_reclaim)(u64 g_handle, u32 flags); int (*memory_share)(struct ffa_mem_ops_args *args); int (*memory_lend)(struct ffa_mem_ops_args *args); }; +struct ffa_ops { + const struct ffa_info_ops *info_ops; + const struct ffa_msg_ops *msg_ops; + const struct ffa_mem_ops *mem_ops; +}; + #endif /* _LINUX_ARM_FFA_H */ -- cgit v1.2.3 From 2f79cdfe58c13949bbbb65ba5926abfe9561d0ec Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 31 Aug 2022 09:46:12 -0700 Subject: fs: only do a memory barrier for the first set_buffer_uptodate() Commit d4252071b97d ("add barriers to buffer_uptodate and set_buffer_uptodate") added proper memory barriers to the buffer head BH_Uptodate bit, so that anybody who tests a buffer for being up-to-date will be guaranteed to actually see initialized state. However, that commit didn't _just_ add the memory barrier, it also ended up dropping the "was it already set" logic that the BUFFER_FNS() macro had. That's conceptually the right thing for a generic "this is a memory barrier" operation, but in the case of the buffer contents, we really only care about the memory barrier for the _first_ time we set the bit, in that the only memory ordering protection we need is to avoid anybody seeing uninitialized memory contents. Any other access ordering wouldn't be about the BH_Uptodate bit anyway, and would require some other proper lock (typically BH_Lock or the folio lock). A reader that races with somebody invalidating the buffer head isn't an issue wrt the memory ordering, it's a serialization issue. Now, you'd think that the buffer head operations don't matter in this day and age (and I certainly thought so), but apparently some loads still end up being heavy users of buffer heads. In particular, the kernel test robot reported that not having this bit access optimization in place caused a noticeable direct IO performance regression on ext4: fxmark.ssd_ext4_no_jnl_DWTL_54_directio.works/sec -26.5% regression although you presumably need a fast disk and a lot of cores to actually notice. 
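In sketch form, the ordering contract being preserved (read_from() stands in for any consumer):

	/* writer, after initializing bh->b_data */
	set_buffer_uptodate(bh);	/* barrier only on the 0->1 transition */

	/* reader */
	if (buffer_uptodate(bh))	/* acquire via smp_load_acquire */
		read_from(bh->b_data);	/* sees fully initialized contents */

Only the first transition to uptodate needs the barrier; once a reader can observe the bit set, the buffer contents were already published.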
Link: https://lore.kernel.org/all/Yw8L7HTZ%2FdE2%2Fo9C@xsang-OptiPlex-9020/ Reported-by: kernel test robot Tested-by: Fengwei Yin Cc: Mikulas Patocka Cc: Matthew Wilcox (Oracle) Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..df518c429667 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -137,6 +137,17 @@ BUFFER_FNS(Defer_Completion, defer_completion) static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { + /* + * If somebody else already set this uptodate, they will + * have done the memory barrier, and a reader will thus + * see *some* valid buffer state. + * + * Any other serialization (with IO errors or whatever that + * might clear the bit) has to come from other state (eg BH_Lock). + */ + if (test_bit(BH_Uptodate, &bh->b_state)) + return; + /* * make it consistent with folio_mark_uptodate * pairs with smp_load_acquire in buffer_uptodate -- cgit v1.2.3 From 86432b7f8f92b784c2e4af5b02766fb44052abf7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Sep 2022 16:05:18 +0300 Subject: spi: Group cs_change and cs_off flags together in struct spi_transfer The commit 5e0531f6b90a ("spi: Add capability to perform some transfer with chipselect off") added a new flag but squeezed it into a wrong group of struct spi_transfer members (note that SPI_NBITS_* are macros for easier interpretation of the tx_nbits and rx_nbits bitfields). Group cs_change and cs_off flags together and their doc strings. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220908130518.32186-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 6e6c62c59957..19acd4c9c7e3 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -846,8 +846,8 @@ struct spi_res { * @bits_per_word: select a bits_per_word other than the device default * for this transfer. If 0 the default (from @spi_device) is used. * @dummy_data: indicates transfer is dummy bytes transfer. - * @cs_change: affects chipselect after this transfer completes * @cs_off: performs the transfer with chipselect off. + * @cs_change: affects chipselect after this transfer completes * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message * @delay: delay to be introduced after this transfer before @@ -957,10 +957,10 @@ struct spi_transfer { struct sg_table rx_sg; unsigned dummy_data:1; + unsigned cs_off:1; unsigned cs_change:1; unsigned tx_nbits:3; unsigned rx_nbits:3; - unsigned cs_off:1; #define SPI_NBITS_SINGLE 0x01 /* 1bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2bits transfer */ #define SPI_NBITS_QUAD 0x04 /* 4bits transfer */ -- cgit v1.2.3 From 58ccf0190d19d9a8a41f8a02b9e06742b58df4a1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 8 Sep 2022 21:34:42 +0300 Subject: vfio: Add an IOVA bitmap support The new facility adds a bunch of wrappers that abstract how an IOVA range is represented in a bitmap that is granulated by a given page_size. 
It does all the lifting of translating user pointers into the corresponding kernel addresses backing said user memory, finally performing the (non-atomic) bitmap ops to change various bits. The formula for the bitmap is: data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) where 64 is the number of bits in an unsigned long (depending on arch). It introduces an IOVA iterator that uses a windowing scheme to minimize the pinning overhead, as opposed to pinning it on demand 4K at a time. Assuming a 4K kernel page and 4K requested page size, we can use a single kernel page to hold 512 page pointers, mapping 2M of bitmap, representing 64G of IOVA space. An example usage of these helpers for a given @base_iova, @page_size, @length and __user @data: bitmap = iova_bitmap_alloc(base_iova, length, page_size, data); if (IS_ERR(bitmap)) return -ENOMEM; ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); iova_bitmap_free(bitmap); Each iteration of the @dirty_reporter_fn is called with a unique @iova and @length argument, indicating the current range available through the iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty areas (@iova_length) within that provided range, as follows: iova_bitmap_set(bitmap, iova, iova_length); The facility is intended to be used for user bitmaps representing dirtied IOVAs by IOMMU (via IOMMUFD) and PCI Devices (via vfio-pci). Signed-off-by: Joao Martins Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/iova_bitmap.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/iova_bitmap.h (limited to 'include/linux') diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h new file mode 100644 index 000000000000..c006cf0a25f3 --- /dev/null +++ b/include/linux/iova_bitmap.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ +#ifndef _IOVA_BITMAP_H_ +#define _IOVA_BITMAP_H_ + +#include + +struct iova_bitmap; + +typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque); + +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, + u64 __user *data); +void iova_bitmap_free(struct iova_bitmap *bitmap); +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn); +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length); + +#endif -- cgit v1.2.3 From 80c4b92a2dc48cce82a0348add48533db7e07314 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:43 +0300 Subject: vfio: Introduce the DMA logging feature support Introduce the DMA logging feature support in the vfio core layer. It includes the processing of the device start/stop/report DMA logging UAPIs and calling the relevant driver 'op' to do the work. Specifically, upon start, the core translates the given input ranges into an interval tree, checks for unexpected overlapping or non-aligned ranges and then passes the translated input to the driver to start tracking the given ranges. Upon report, the core translates the given input user space bitmap and page size into an IOVA kernel bitmap iterator. Then it iterates it and calls the driver to set the corresponding bits for the dirtied pages in a specific IOVA range.
Upon stop, the driver is called to stop the previously started tracking. The next patches from the series will introduce the mlx5 driver implementation for the logging ops. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e05ddc6fe6a5..0e2826559091 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -14,6 +14,7 @@ #include #include #include +#include struct kvm; @@ -33,10 +34,11 @@ struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; /* - * mig_ops is a static property of the vfio_device which must be set - * prior to registering the vfio_device. + * mig_ops/log_ops is a static property of the vfio_device which must + * be set prior to registering the vfio_device. */ const struct vfio_migration_ops *mig_ops; + const struct vfio_log_ops *log_ops; struct vfio_group *group; struct vfio_device_set *dev_set; struct list_head dev_set_list; @@ -108,6 +110,28 @@ struct vfio_migration_ops { enum vfio_device_mig_state *curr_state); }; +/** + * @log_start: Optional callback to ask the device start DMA logging. + * @log_stop: Optional callback to ask the device stop DMA logging. + * @log_read_and_clear: Optional callback to ask the device read + * and clear the dirty DMAs in some given range. + * + * The vfio core implementation of the DEVICE_FEATURE_DMA_LOGGING_ set + * of features does not track logging state relative to the device, + * therefore the device implementation of vfio_log_ops must handle + * arbitrary user requests. This includes rejecting subsequent calls + * to log_start without an intervening log_stop, as well as graceful + * handling of log_stop and log_read_and_clear from invalid states. + */ +struct vfio_log_ops { + int (*log_start)(struct vfio_device *device, + struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); + int (*log_stop)(struct vfio_device *device); + int (*log_read_and_clear)(struct vfio_device *device, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty); +}; + /** * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl * @flags: Arg from the device_feature op -- cgit v1.2.3 From 1537bf26e212ffcf007d0590958025f6bfdd4ac8 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Tue, 30 Aug 2022 18:53:05 +0300 Subject: perf: RISC-V: exclude invalid pmu counters from SBI calls SBI firmware may not provide information for some counters in response to the SBI_EXT_PMU_COUNTER_GET_INFO call. Exclude such counters from the subsequent SBI requests. For this purpose, use a global mask to keep track of fully specified counters.
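With a mask instead of a plain count, callers can then skip the invalid indices, e.g. (sketch; rvpmu and pmu_sbi_update_counter() are illustrative):

	unsigned long cmask = rvpmu->cmask;
	int idx;

	/* only counters the SBI firmware fully described are visited */
	for_each_set_bit(idx, &cmask, RISCV_MAX_COUNTERS)
		pmu_sbi_update_counter(idx);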
Signed-off-by: Sergey Matyukevich Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20220830155306.301714-3-geomatsi@gmail.com Signed-off-by: Palmer Dabbelt --- include/linux/perf/riscv_pmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index bf66fe011fa8..e17e86ad6f3a 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -45,7 +45,7 @@ struct riscv_pmu { irqreturn_t (*handle_irq)(int irq_num, void *dev); - int num_counters; + unsigned long cmask; u64 (*ctr_read)(struct perf_event *event); int (*ctr_get_idx)(struct perf_event *event); int (*ctr_get_width)(int idx); -- cgit v1.2.3 From bb5721f063ba63cd54cc5916881a706c21e6abc6 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Mon, 5 Sep 2022 09:21:25 -0700 Subject: mfd: ocelot: Add helper to get regmap from a resource Several ocelot-related modules are designed for MMIO / regmaps. As such, they often use a combination of devm_platform_get_and_ioremap_resource() and devm_regmap_init_mmio(). Operating in an MFD might be different, in that it could be memory mapped, or it could be SPI, I2C... In these cases a fallback to use IORESOURCE_REG instead of IORESOURCE_MEM becomes necessary. When this happens, there's redundant logic that needs to be implemented in every driver. In order to avoid this redundancy, utilize a single function that, if the MFD scenario is enabled, will perform this fallback logic. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905162132.2943088-2-colin.foster@in-advantage.com --- include/linux/mfd/ocelot.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/linux/mfd/ocelot.h (limited to 'include/linux') diff --git a/include/linux/mfd/ocelot.h b/include/linux/mfd/ocelot.h new file mode 100644 index 000000000000..dd72073d2d4f --- /dev/null +++ b/include/linux/mfd/ocelot.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* Copyright 2022 Innovative Advantage Inc. */ + +#ifndef _LINUX_MFD_OCELOT_H +#define _LINUX_MFD_OCELOT_H + +#include +#include +#include +#include +#include +#include + +struct resource; + +static inline struct regmap * +ocelot_regmap_from_resource_optional(struct platform_device *pdev, + unsigned int index, + const struct regmap_config *config) +{ + struct device *dev = &pdev->dev; + struct resource *res; + void __iomem *regs; + + /* + * Don't use _get_and_ioremap_resource() here, since that will invoke + * prints of "invalid resource" which will simply add confusion. 
+ */ + res = platform_get_resource(pdev, IORESOURCE_MEM, index); + if (res) { + regs = devm_ioremap_resource(dev, res); + if (IS_ERR(regs)) + return ERR_CAST(regs); + return devm_regmap_init_mmio(dev, regs, config); + } + + /* + * Fall back to using REG and getting the resource from the parent + * device, which is possible in an MFD configuration + */ + if (dev->parent) { + res = platform_get_resource(pdev, IORESOURCE_REG, index); + if (!res) + return NULL; + + return dev_get_regmap(dev->parent, res->name); + } + + return NULL; +} + +static inline struct regmap * +ocelot_regmap_from_resource(struct platform_device *pdev, unsigned int index, + const struct regmap_config *config) +{ + struct regmap *map; + + map = ocelot_regmap_from_resource_optional(pdev, index, config); + return map ?: ERR_PTR(-ENOENT); +} + +#endif -- cgit v1.2.3 From 39f7d0832c28a6a31abb5b7363e7b1ba0714fe18 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Mon, 5 Sep 2022 09:21:30 -0700 Subject: resource: add define macro for register address resources DEFINE_RES_ macros have been created for the commonly used resource types, but not IORESOURCE_REG. Add the macro so it can be used in a similar manner to all other resource types. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905162132.2943088-7-colin.foster@in-advantage.com --- include/linux/ioport.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 616b683563a9..8a76dca9deee 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -172,6 +172,11 @@ enum { #define DEFINE_RES_MEM(_start, _size) \ DEFINE_RES_MEM_NAMED((_start), (_size), NULL) +#define DEFINE_RES_REG_NAMED(_start, _size, _name) \ + DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_REG) +#define DEFINE_RES_REG(_start, _size) \ + DEFINE_RES_REG_NAMED((_start), (_size), NULL) + #define DEFINE_RES_IRQ_NAMED(_irq, _name) \ DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ) #define DEFINE_RES_IRQ(_irq) \ -- cgit v1.2.3 From f2042ed21da7f8886c93efefff61f93e6d57e9bd Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Aug 2022 18:28:05 +0100 Subject: iommu/dma: Make header private Now that dma-iommu.h only contains internal interfaces, make it private to the IOMMU subsystem. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b237e06c56a101f77af142a54b629b27aa179d22.1660668998.git.robin.murphy@arm.com [ joro : re-add stub for iommu_dma_get_resv_regions ] Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 53 ----------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 include/linux/dma-iommu.h (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h deleted file mode 100644 index e83de4f1f3d6..000000000000 --- a/include/linux/dma-iommu.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2014-2015 ARM Ltd.
- */ -#ifndef __DMA_IOMMU_H -#define __DMA_IOMMU_H - -#include -#include - -#ifdef CONFIG_IOMMU_DMA -#include -#include -#include - -/* Domain management interface for IOMMU drivers */ -int iommu_get_dma_cookie(struct iommu_domain *domain); -void iommu_put_dma_cookie(struct iommu_domain *domain); - -int iommu_dma_init_fq(struct iommu_domain *domain); - -void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); - -void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain); - -extern bool iommu_dma_forcedac; - -#else /* CONFIG_IOMMU_DMA */ - -struct iommu_domain; -struct device; - -static inline int iommu_dma_init_fq(struct iommu_domain *domain) -{ - return -EINVAL; -} - -static inline int iommu_get_dma_cookie(struct iommu_domain *domain) -{ - return -ENODEV; -} - -static inline void iommu_put_dma_cookie(struct iommu_domain *domain) -{ -} - -static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) -{ -} - -#endif /* CONFIG_IOMMU_DMA */ -#endif /* __DMA_IOMMU_H */ -- cgit v1.2.3 From c9874d3ffeaf8ee215187692ed918b3031d996d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Aug 2018 11:47:53 -0400 Subject: termios: start unifying non-UAPI parts of asm/termios.h * new header (linux/termios_internal.h), pulled by the users of those suckers * defaults for INIT_C_CC and externs for conversion helpers moved over there * remove termios-base.h (empty now) Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDmptU7dNGZ+/Hn@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/linux/termios_internal.h (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h new file mode 100644 index 000000000000..103ca0370948 --- /dev/null +++ b/include/linux/termios_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TERMIOS_CONV_H +#define _LINUX_TERMIOS_CONV_H + +#include +#include + +#ifndef INIT_C_CC /* intr=^C quit=^\ erase=del kill=^U + eof=^D vtime=\0 vmin=\1 sxtc=\0 + start=^Q stop=^S susp=^Z eol=\0 + reprint=^R discard=^U werase=^W lnext=^V + eol2=\0 */ +#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" +#endif + int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); #ifdef TCGETS2 int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); #else /* TCGETS2 */ int user_termios_to_kernel_termios(struct ktermios *, struct termios __user *); int kernel_termios_to_user_termios(struct termios __user *, struct ktermios *); #endif /* TCGETS2 */ #endif /* _LINUX_TERMIOS_CONV_H */ -- cgit v1.2.3 From 38fc315a73f7a1b2d19eb32dd55de089b78b2a54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 17:17:04 -0400 Subject: termios: consolidate values for VDISCARD in INIT_C_CC On old systems it used to be ^O. Linux had never actually used the value, but INIT_C_CC (on i386) did initialize it to ^O; unfortunately, it had a typo in the comment claiming that to be ^U.
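For readers unfamiliar with the notation: ^X is Ctrl-X, i.e. the character value minus 0x40. The two competing values are, in plain C (illustrative arithmetic only, not from the patch):

char discard = 'O' - 0x40;	/* 0x0f, octal \017 -- the historical VDISCARD */
char kill    = 'U' - 0x40;	/* 0x15, octal \025 -- actually VKILL's value */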
Most of the architectures copied the (correct) definition along with the mistaken comment. alpha, powerpc and sparc tried to make the definition match the comment. However, util-linux still resets it to ^O on any architecture; ^O is the historical value; the kernel ignores it anyway; and finally, Linus said "Just change everybody to do the same, nobody cares about VDISCARD". Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDmy//MKzs3ye7l@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 103ca0370948..7eb3598c109d 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -9,7 +9,7 @@ /* intr=^C quit=^\ erase=del kill=^U eof=^D vtime=\0 vmin=\1 sxtc=\0 start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V + reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ #define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" -- cgit v1.2.3 From d04f9915fa44b52d7a91080677381a082238e9c4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Sep 2018 17:57:42 -0400 Subject: make generic INIT_C_CC a bit more generic turn it into an array initializer; then alpha, mips and powerpc variants fold into it. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDm7M6M91gC2RPL@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 7eb3598c109d..8a53141ab44a 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -12,7 +12,20 @@ reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" +#define INIT_C_CC { \ + [VINTR] = 'C'-0x40, \ + [VQUIT] = '\\'-0x40, \ + [VERASE] = '\177', \ + [VKILL] = 'U'-0x40, \ + [VEOF] = 'D'-0x40, \ + [VSTART] = 'Q'-0x40, \ + [VSTOP] = 'S'-0x40, \ + [VSUSP] = 'Z'-0x40, \ + [VREPRINT] = 'R'-0x40, \ + [VDISCARD] = 'O'-0x40, \ + [VWERASE] = 'W'-0x40, \ + [VLNEXT] = 'V'-0x40, \ + [VMIN] = 1 } #endif int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -- cgit v1.2.3 From e7b4c812b9685e22753d6355e53fdeaaa22862dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 19:41:40 -0400 Subject: termios: convert the last (sparc) INIT_C_CC to array Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDnDCR2VRTA3Etp@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 8a53141ab44a..d77f29e5e2b7 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -5,13 +5,19 @@ #include #include -#ifndef INIT_C_CC /* intr=^C quit=^\ erase=del kill=^U eof=^D vtime=\0 vmin=\1 sxtc=\0 start=^Q stop=^S susp=^Z eol=\0 reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ + +#ifdef VDSUSP +#define INIT_C_CC_VDSUSP_EXTRA [VDSUSP] = 'Y'-0x40, +#else +#define INIT_C_CC_VDSUSP_EXTRA +#endif + #define INIT_C_CC { \ [VINTR] = 'C'-0x40, \ [VQUIT] = '\\'-0x40, \ @@ -25,8 +31,8 @@ [VDISCARD] = 'O'-0x40, \ [VWERASE] = 'W'-0x40, \ [VLNEXT] = 'V'-0x40, \ + INIT_C_CC_VDSUSP_EXTRA \ [VMIN] = 1 } -#endif
int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -- cgit v1.2.3 From 89bbeb7e3199e1514729aa6de8057289e6375fe6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 19:41:40 -0400 Subject: termios: get rid of non-UAPI asm/termios.h All non-UAPI asm/termios.h headers consist of an include of the UAPI counterpart and, possibly, an include of linux/uaccess.h. The latter can't be simply removed, even though nothing in linux/termios.h depends upon it anymore - there are several places that rely upon that indirect chain of includes to pull linux/uaccess.h. So the include needs to be lifted out of there - we lift it into tty_driver.h, serdev.h and places that pull asm/termios.h, but none of * linux/uaccess.h (obvious) * net/sock.h (pulls uaccess.h) * linux/{tty,tty_driver,serdev}.h (tty.h pulls tty_driver.h) That leaves us just with the include of UAPI asm/termios.h, which is what asm/termios.h will resolve to if we simply remove the non-UAPI header. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDnKvYCHn/ogBUv@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 1 + include/linux/tty_driver.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 3368c261ab62..66f624fc618c 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b2456b545ba0..f961164a5274 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From d79ddb069c5257a924456eb99b53fc1ea715c0a3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:05 +0800 Subject: sched/psi: Move private helpers to sched/stats.h This patch moves the psi_task_change/psi_task_switch declarations out of the PSI public header, since they are only needed for implementing the PSI stats tracking in sched/stats.h. psi_task_switch is obvious; psi_task_change can't be a public helper since it doesn't check the psi_disabled static key. And there are no users now, so put it in sched/stats.h too. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-5-zhouchengming@bytedance.com --- include/linux/psi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index dd74411ac21d..fffd229fbf19 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -18,10 +18,6 @@ extern struct psi_group psi_system; void psi_init(void); -void psi_task_change(struct task_struct *task, int clear, int set); -void psi_task_switch(struct task_struct *prev, struct task_struct *next, - bool sleep); - void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); -- cgit v1.2.3 From 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Aug 2022 00:41:07 +0800 Subject: sched/psi: Remove NR_ONCPU task accounting We put all fields updated by the scheduler in the first cacheline of struct psi_group_cpu for performance. Since we want to add another PSI_IRQ_FULL to track IRQ/SOFTIRQ pressure, we need to reclaim space first.
This patch removes NR_ONCPU task accounting in struct psi_group_cpu and uses one bit in state_mask to track it instead. Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com --- include/linux/psi_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index c7fe7c089718..54cb74946db4 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -15,13 +15,6 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - /* - * This can't have values other than 0 or 1 and could be - * implemented as a bit flag. But for now we still have room - * in the first cacheline of psi_group_cpu, and this way we - * don't have to special case any state tracking for it. - */ - NR_ONCPU, /* * For IO and CPU stalls the presence of running/oncpu tasks * in the domain means a partial rather than a full stall. @@ -32,16 +25,18 @@ enum psi_task_count { * threads and memstall ones. */ NR_MEMSTALL_RUNNING, - NR_PSI_TASK_COUNTS = 5, + NR_PSI_TASK_COUNTS = 4, }; /* Task state bitmasks */ #define TSK_IOWAIT (1 << NR_IOWAIT) #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) -#define TSK_ONCPU (1 << NR_ONCPU) #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) +/* Only one task can be scheduled, no corresponding task count */ +#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS) + /* Resources that workloads could be stalled on */ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, @@ -68,6 +63,9 @@ enum psi_states { NR_PSI_STATES = 7, }; +/* Use one bit in the state mask to track TSK_ONCPU */ +#define PSI_ONCPU (1 << NR_PSI_STATES) + enum psi_aggregators { PSI_AVGS = 0, PSI_POLL, -- cgit v1.2.3 From 52b1364ba0b105122d6de0e719b36db705011ac1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:08 +0800 Subject: sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure PSI already tracks workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have an obvious impact on some workload productivity, such as web service workloads. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, we can get the IRQ/SOFTIRQ delta time from update_rq_clock_task() and record that delta to the CPU's current task's cgroups as PSI_IRQ_FULL status. Note we don't use PSI_IRQ_SOME, since IRQ/SOFTIRQ time always happens in the current task on the CPU, so nothing productive can run even if it were runnable; hence we only use PSI_IRQ_FULL.
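The wiring implied by the description looks roughly like this (a sketch only; the psi_account_irqtime() name and its call site are inferred from the text, not shown in this hunk):

/* In update_rq_clock_task(), once the IRQ/SOFTIRQ delta is known: */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	if (irq_delta + steal)
		psi_account_irqtime(rq->curr, irq_delta + steal);
#endif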
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com --- include/linux/psi_types.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 54cb74946db4..40c28171cd91 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -42,7 +42,10 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES = 3, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ, +#endif + NR_PSI_RESOURCES, }; /* @@ -58,9 +61,12 @@ enum psi_states { PSI_MEM_FULL, PSI_CPU_SOME, PSI_CPU_FULL, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ_FULL, +#endif /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 7, + NR_PSI_STATES, }; /* Use one bit in the state mask to track TSK_ONCPU */ -- cgit v1.2.3 From 57899a6610e67ba26fa3251ebbef4a5ed21efc5d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:09 +0800 Subject: sched/psi: Consolidate cgroup_psi() cgroup_psi() can't return the psi_group for the root cgroup, so we have many open-coded instances of "psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi". This patch moves the cgroup_psi() definition to linux/psi.h, where we can return psi_system for the root cgroup, so it can handle all cgroups. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-9-zhouchengming@bytedance.com --- include/linux/cgroup.h | 5 ----- include/linux/psi.h | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b0914aa26506..80cb970257be 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -673,11 +673,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return cgrp->psi; -} - bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) diff --git a/include/linux/psi.h b/include/linux/psi.h index fffd229fbf19..362a74ca1d3b 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -7,6 +7,7 @@ #include #include #include +#include struct seq_file; struct css_set; @@ -30,6 +31,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); #ifdef CONFIG_CGROUPS +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; +} + int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); -- cgit v1.2.3 From dc86aba751e2867244411adda1562f6664747019 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:10 +0800 Subject: sched/psi: Cache parent psi_group to speed up group iteration We use iterate_groups() to iterate over each level's psi_group to update PSI stats, which is a very hot path. In the current code, iterate_groups() has to use multiple branches and cgroup_parent() to get the parent psi_group for each level, which is not very efficient. This patch caches the parent psi_group in struct psi_group; we only need to get the psi_group of the task itself first, then just use group->parent to iterate.
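With the cached pointer, the hot-path walk collapses to a plain parent-chain traversal, along these lines (sketch; task_psi_group() is an assumed accessor for the task's leaf group):

	struct psi_group *group;

	for (group = task_psi_group(task); group; group = group->parent)
		psi_group_change(group, cpu, clear, set, now, true);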
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-10-zhouchengming@bytedance.com --- include/linux/psi_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 40c28171cd91..a0b746258c68 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -151,6 +151,8 @@ struct psi_trigger { }; struct psi_group { + struct psi_group *parent; + /* Protects data used by the aggregator */ struct mutex avgs_lock; -- cgit v1.2.3 From 34f26a15611afb03c33df6819359d36f5b382589 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Sep 2022 17:03:32 +0800 Subject: sched/psi: Per-cgroup PSI accounting disable/re-enable interface PSI accounts stalls for each cgroup separately and aggregates them at each level of the hierarchy. This may cause non-negligible overhead for some workloads at deep levels of the hierarchy. Commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") made PSI skip per-cgroup stall accounting and only account system-wide, to avoid this per-level overhead. But for our use case we also want leaf-cgroup PSI stats accounted, for userspace adjustment on that cgroup, in addition to the system-wide adjustment. So this patch introduces a per-cgroup PSI accounting disable/re-enable interface, "cgroup.pressure": a read-write single-value file whose allowed values are "0" and "1"; the default is "1", so per-cgroup PSI stats are enabled by default. Implementation details: It should be relatively straightforward to disable and re-enable state aggregation, time tracking, and averaging on a per-cgroup level, if we can live with losing history from the time it was disabled; i.e. the avgs will restart from 0 and total= will have gaps. But it's hard or complex to stop/restart groupc->tasks[] updates, which is not implemented in this patch. So we always update groupc->tasks[] and the PSI_ONCPU bit in psi_group_change() even when the cgroup's PSI stats are disabled. Suggested-by: Johannes Weiner Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com --- include/linux/cgroup-defs.h | 3 +++ include/linux/psi.h | 2 ++ include/linux/psi_types.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..7df76b318245 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -428,6 +428,9 @@ struct cgroup { struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ + /* handles for "{cpu,memory,io,irq}.pressure" */ + struct cgroup_file psi_files[NR_PSI_RESOURCES]; + /* * The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through diff --git a/include/linux/psi.h b/include/linux/psi.h index 362a74ca1d3b..b029a847def1 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); +void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ @@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } +static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index a0b746258c68..6e4372735068 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -152,6 +152,7 @@ struct psi_trigger { struct psi_group { struct psi_group *parent; + bool enabled; /* Protects data used by the aggregator */ struct mutex avgs_lock; @@ -194,6 +195,8 @@ struct psi_group { #else /* CONFIG_PSI */ +#define NR_PSI_RESOURCES 0 + struct psi_group { }; #endif /* CONFIG_PSI */ -- cgit v1.2.3 From b9be19eef35587b15da0a5d887d6e04c0c6cefab Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 6 Sep 2022 16:35:42 -0400 Subject: drivers/base: Fix unsigned comparison to -1 in CPUMAP_FILE_MAX_BYTES As PAGE_SIZE is unsigned long, -1 > PAGE_SIZE when NR_CPUS <= 3. This leads to very large file sizes: topology$ ls -l total 0 -r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 core_cpus -r--r--r-- 1 root root 4096 Sep 5 11:59 core_cpus_list -r--r--r-- 1 root root 4096 Sep 5 10:58 core_id -r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 core_siblings -r--r--r-- 1 root root 4096 Sep 5 11:59 core_siblings_list -r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 die_cpus -r--r--r-- 1 root root 4096 Sep 5 11:59 die_cpus_list -r--r--r-- 1 root root 4096 Sep 5 11:59 die_id -r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 package_cpus -r--r--r-- 1 root root 4096 Sep 5 11:59 package_cpus_list -r--r--r-- 1 root root 4096 Sep 5 10:58 physical_package_id -r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 thread_siblings -r--r--r-- 1 root root 4096 Sep 5 11:59 thread_siblings_list Adjust the inequality to catch the case when NR_CPUS is configured to a small value. Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Yury Norov Cc: stable@vger.kernel.org Cc: feng xiangjun Fixes: 7ee951acd31a ("drivers/base: fix userspace break from using bin_attributes for cpumap and cpulist") Reported-by: feng xiangjun Signed-off-by: Phil Auld Signed-off-by: Yury Norov --- include/linux/cpumask.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bd047864c7ac..e8ad12b5b9d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1127,9 +1127,10 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * - * Use PAGE_SIZE as a minimum for smaller configurations. + * Use PAGE_SIZE as a minimum for smaller configurations while avoiding + * unsigned comparison to -1. */ -#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \ +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? 
(NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) -- cgit v1.2.3 From 811d59fdf56a17c66742578fe8be8a6a841af448 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 29 Aug 2022 11:29:49 -0500 Subject: ACPI: s2idle: Add a new ->check() callback for platform_s2idle_ops On some platforms it is found that Linux more aggressively enters s2idle than Windows enters Modern Standby and this uncovers some synchronization issues for the platform. To aid in debugging this class of problems in the future, add support for an extra optional callback intended for drivers to emit extra debugging. Signed-off-by: Mario Limonciello Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220829162953.5947-2-mario.limonciello@amd.com Signed-off-by: Hans de Goede --- include/linux/acpi.h | 1 + include/linux/suspend.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..acaa2ddc067d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1075,6 +1075,7 @@ acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); + void (*check)(void); void (*restore)(void); }; int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 70f2921e2e70..03ed42ed2c7f 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -191,6 +191,7 @@ struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); int (*prepare_late)(void); + void (*check)(void); bool (*wake)(void); void (*restore_early)(void); void (*restore)(void); -- cgit v1.2.3 From 6bb057bfd9d509755349cd2a6ca5e5e6e6071304 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 16 Aug 2022 13:16:26 +0300 Subject: ACPI: resource: Add helper function acpi_dev_get_memory_resources() Wrapper function that finds all memory type resources by using acpi_dev_get_resources(). It removes the need for the drivers to check the resource data type separately. Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..ed4aa395cc49 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -506,6 +506,7 @@ int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, void *preproc_data); int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list); +int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); -- cgit v1.2.3 From d4f7bdb2ed7bf320a8772258fdc257655d225afb Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:37 -0600 Subject: bpf: Add stub for btf_struct_access() Add corresponding unimplemented stub for when CONFIG_BPF_SYSCALL=n Signed-off-by: Daniel Xu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/4021398e884433b1fef57a4d28361bb9fcf1bd05.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48ae05099f36..54178b9e9c3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2211,6 +2211,15 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) return ERR_PTR(-ENOTSUPP); } +static inline int btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) +{ + return -EACCES; +} + static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { -- cgit v1.2.3 From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 8 Sep 2022 16:07:16 -0700 Subject: bpf: Add verifier support for custom callback return range Verifier logic to confirm that a callback function returns 0 or 1 was added in commit 69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper"). At the time, callback return value was only used to continue or stop iteration. In order to support callbacks with a broader return value range, such as those added in rbtree series[0] and others, add a callback_ret_range to bpf_func_state. Verifier's helpers which set in_callback_fn will also set the new field, which the verifier will later use to check return value bounds. Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel value as the latter would prevent the valid range (0, U64_MAX) being used. Previous global default tnum_range(0, 1) is explicitly set for extant callback helpers. The change to global default was made after discussion around this patch in rbtree series [1], goal here is to make it more obvious that callback_ret_range should be explicitly set. 
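Concretely, a helper that installs a callback now records both facts, e.g. (sketch of a set_callee_state routine; the field names are the ones added by this patch):

	callee->in_callback_fn = true;
	callee->callback_ret_range = tnum_range(0, 1);	/* extant 0/1 helpers */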
[0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/ [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b49a349cc6ae..e197f8fb27e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,7 @@ struct bpf_func_state { */ u32 async_entry_cnt; bool in_callback_fn; + struct tnum callback_ret_range; bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ -- cgit v1.2.3 From 9cd4f1434479f1ac25c440c421fbf52069079914 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 11 Sep 2022 11:18:45 +0800 Subject: iommu/vt-d: Fix possible recursive locking in intel_iommu_init() The global rwsem dmar_global_lock was introduced by commit 3a5670e8ac932 ("iommu/vt-d: Introduce a rwsem to protect global data structures"). It is used to protect DMAR related global data from DMAR hotplug operations. The dmar_global_lock used in the intel_iommu_init() might cause recursive locking issue, for example, intel_iommu_get_resv_regions() is taking the dmar_global_lock from within a section where intel_iommu_init() already holds it via probe_acpi_namespace_devices(). Using dmar_global_lock in intel_iommu_init() could be relaxed since it is unlikely that any IO board must be hot added before the IOMMU subsystem is initialized. This eliminates the possible recursive locking issue by moving down DMAR hotplug support after the IOMMU is initialized and removing the uses of dmar_global_lock in intel_iommu_init(). Fixes: d5692d4af08cd ("iommu/vt-d: Fix suspicious RCU usage in probe_acpi_namespace_devices()") Reported-by: Robin Murphy Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/894db0ccae854b35c73814485569b634237b5538.1657034828.git.robin.murphy@arm.com Link: https://lore.kernel.org/r/20220718235325.3952426-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index d81a51978d01..8917a32173c4 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -65,6 +65,7 @@ struct dmar_pci_notify_info { extern struct rw_semaphore dmar_global_lock; extern struct list_head dmar_drhd_units; +extern int intel_iommu_enabled; #define for_each_drhd_unit(drhd) \ list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ @@ -88,7 +89,8 @@ extern struct list_head dmar_drhd_units; static inline bool dmar_rcu_check(void) { return rwsem_is_locked(&dmar_global_lock) || - system_state == SYSTEM_BOOTING; + system_state == SYSTEM_BOOTING || + (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); } #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -- cgit v1.2.3 From a1c7c1a40478db4edef8ad0a0c6975c8921088b7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 19 Jul 2022 13:41:31 +0200 Subject: power: supply: Explain maintenance charging In order for everyone to understand clearly why we want to use maintenance charging for batteries, expand the description with two diagrams and some text. 
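As a worked example, the EB425161LA settings quoted in the new kernel-doc could be expressed as a table roughly like this (illustrative; the field names other than charge_current_max_ua are assumed from context rather than shown in this hunk):

static const struct power_supply_maintenance_charge_table eb425161la_maint[] = {
	{
		/* up to 60 hours at 600 mA / 4.15 V */
		.charge_current_max_ua = 600000,
		.charge_voltage_max_uv = 4150000,
		.charge_safety_timer_minutes = 60 * 60,
	},
	{
		/* up to 200 hours at 600 mA / 4.10 V */
		.charge_current_max_ua = 600000,
		.charge_voltage_max_uv = 4100000,
		.charge_safety_timer_minutes = 200 * 60,
	},
};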
Signed-off-by: Linus Walleij Reviewed-by: Matti Vaittinen Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 48 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index cb380c1d9459..aa2c4a7c4826 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -374,9 +374,37 @@ struct power_supply_vbat_ri_table { * These timers should be chosen to align with the typical discharge curve * for the battery. * - * When the main CC/CV charging is complete the battery can optionally be - * maintenance charged at the voltages from this table: a table of settings is - * traversed using a slightly lower current and voltage than what is used for + * Ordinary CC/CV charging will stop charging when the charge current goes + * below charge_term_current_ua, and then restart it (if the device is still + * plugged into the charger) at charge_restart_voltage_uv. This happens in most + * consumer products because the power usage while connected to a charger is + * not zero, and devices are not manufactured to draw power directly from the + * charger: instead they will at all times dissipate the battery a little, like + * the power used in standby mode. This will over time give a charge graph + * such as this: + * + * Energy + * ^ ... ... ... ... ... ... ... + * | . . . . . . . . . . . . . + * | .. . .. . .. . .. . .. . .. . .. + * |. .. .. .. .. .. .. + * +-------------------------------------------------------------------> t + * + * Practically this means that the Li-ions are wandering back and forth in the + * battery and this causes degeneration of the battery anode and cathode. + * To prolong the life of the battery, maintenance charging is applied after + * reaching charge_term_current_ua to hold up the charge in the battery while + * consuming power, thus lowering the wear on the battery: + * + * Energy + * ^ ....................................... + * | . ...................... + * | .. + * |. + * +-------------------------------------------------------------------> t + * + * Maintenance charging uses the voltages from this table: a table of settings + * is traversed using a slightly lower current and voltage than what is used for * CC/CV charging. The maintenance charging will for safety reasons not go on * indefinately: we lower the current and voltage with successive maintenance * settings, then disable charging completely after we reach the last one, @@ -385,14 +413,22 @@ struct power_supply_vbat_ri_table { * ordinary CC/CV charging from there. * * As an example, a Samsung EB425161LA Lithium-Ion battery is CC/CV charged - * at 900mA to 4340mV, then maintenance charged at 600mA and 4150mV for - * 60 hours, then maintenance charged at 600mA and 4100mV for 200 hours. + * at 900mA to 4340mV, then maintenance charged at 600mA and 4150mV for up to + * 60 hours, then maintenance charged at 600mA and 4100mV for up to 200 hours. * After this the charge cycle is restarted waiting for * charge_restart_voltage_uv. * * For most mobile electronics this type of maintenance charging is enough for * the user to disconnect the device and make use of it before both maintenance - * charging cycles are complete. + * charging cycles are complete, if the current and voltage has been chosen + * appropriately. These need to be determined from battery discharge curves + * and expected standby current. 
+ * + * If the voltage anyway drops to charge_restart_voltage_uv during maintenance + * charging, ordinary CC/CV charging is restarted. This can happen if the + * device is e.g. actively used during charging, so more current is drawn than + * the expected stand-by current. Also overvoltage protection will be applied + * as usual. */ struct power_supply_maintenance_charge_table { int charge_current_max_ua; -- cgit v1.2.3 From 65d3440e8d2e9e024ef2357f8ff021611b068c99 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 26 Aug 2022 10:18:07 -0700 Subject: mm/memory-failure: fix detection of memory_failure() handlers Some pagemap types, like MEMORY_DEVICE_GENERIC (device-dax) do not even have pagemap ops which results in crash signatures like this: BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000205073067 P4D 8000000205073067 PUD 2062b3067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 22 PID: 4535 Comm: device-dax Tainted: G OE N 6.0.0-rc2+ #59 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:memory_failure+0x667/0xba0 [..] Call Trace: ? _printk+0x58/0x73 do_madvise.part.0.cold+0xaf/0xc5 Check for ops before checking if the ops have a memory_failure() handler. Link: https://lkml.kernel.org/r/166153428781.2758201.1990616683438224741.stgit@dwillia2-xfh.jf.intel.com Fixes: 33a8f7f2b3a3 ("pagemap,pmem: introduce ->memory_failure()") Signed-off-by: Dan Williams Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Shiyang Ruan Cc: Darrick J. Wong Cc: Al Viro Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/memremap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 19010491a603..c3b4cc84877b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -139,6 +139,11 @@ struct dev_pagemap { }; }; +static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) +{ + return pgmap->ops && pgmap->ops->memory_failure; +} + static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) -- cgit v1.2.3 From 825cf206ed510c4a1758bef8957e2b039253e2e3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:44 -0700 Subject: statx: add direct I/O alignment information Traditionally, the conditions for when DIO (direct I/O) is supported were fairly simple. For both block devices and regular files, DIO had to be aligned to the logical block size of the block device. However, due to filesystem features that have been added over time (e.g. multi-device support, data journalling, inline data, encryption, verity, compression, checkpoint disabling, log-structured mode), the conditions for when DIO is allowed on a regular file have gotten increasingly complex. Whether a particular regular file supports DIO, and with what alignment, can depend on various file attributes and filesystem mount options, as well as which block device(s) the file's data is located on. Moreover, the general rule of DIO needing to be aligned to the block device's logical block size was recently relaxed to allow user buffers (but not file offsets) aligned to the DMA alignment instead. See commit bf8d08532bc1 ("iomap: add support for dma aligned direct-io"). 
XFS has an ioctl XFS_IOC_DIOINFO that exposes DIO alignment information. Uplifting this to the VFS is one possibility. However, as discussed (https://lore.kernel.org/linux-fsdevel/20220120071215.123274-1-ebiggers@kernel.org/T/#u), this ioctl is rarely used and not known to be used outside of XFS-specific code. It was also never intended to indicate when a file doesn't support DIO at all, nor was it intended for block devices. Therefore, let's expose this information via statx(). Add the STATX_DIOALIGN flag and two new statx fields associated with it: * stx_dio_mem_align: the alignment (in bytes) required for user memory buffers for DIO, or 0 if DIO is not supported on the file. * stx_dio_offset_align: the alignment (in bytes) required for file offsets and I/O segment lengths for DIO, or 0 if DIO is not supported on the file. This will only be nonzero if stx_dio_mem_align is nonzero, and vice versa. Note that as with other statx() extensions, if STATX_DIOALIGN isn't set in the returned statx struct, then these new fields won't be filled in. This will happen if the file is neither a regular file nor a block device, or if the file is a regular file and the filesystem doesn't support STATX_DIOALIGN. It might also happen if the caller didn't include STATX_DIOALIGN in the request mask, since statx() isn't required to return unrequested information. This commit only adds the VFS-level plumbing for STATX_DIOALIGN. For regular files, individual filesystems will still need to add code to support it. For block devices, a separate commit will wire it up too. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-2-ebiggers@kernel.org --- include/linux/stat.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stat.h b/include/linux/stat.h index 7df06931f25d..ff277ced50e9 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,6 +50,8 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u32 dio_mem_align; + u32 dio_offset_align; }; #endif -- cgit v1.2.3 From 2d985f8c6b91b5007a16e640bb9c038c5fb2839b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:45 -0700 Subject: vfs: support STATX_DIOALIGN on block devices Add support for STATX_DIOALIGN to block devices, so that direct I/O alignment restrictions are exposed to userspace in a generic way. Note that this breaks the tradition of stat operating only on the block device node, not the block device itself. However, it was felt that doing this is preferable, in order to make the interface useful and avoid needing separate interfaces for regular files and block devices. 
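From userspace the new fields are consumed like any other statx() extension; a minimal query looks like this (assuming kernel/libc headers new enough to declare statx(2), STATX_DIOALIGN and the stx_dio_* fields):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0)
		return 1;
	if (!(stx.stx_mask & STATX_DIOALIGN) || stx.stx_dio_mem_align == 0)
		printf("DIO not supported (or not reported) for %s\n", argv[1]);
	else
		printf("mem align %u, offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}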
Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220827065851.135710-3-ebiggers@kernel.org --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..8038c5fbde40 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); +void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) @@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } +static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +{ +} static inline void printk_all_partitions(void) { } -- cgit v1.2.3 From 53dd3f802a6e269868cb599609287a841e65a996 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:46 -0700 Subject: fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN To prepare for STATX_DIOALIGN support, make two changes to fscrypt_dio_supported(). First, remove the filesystem-block-alignment check and make the filesystems handle it instead. It previously made sense to have it in fs/crypto/; however, to support STATX_DIOALIGN the alignment restriction would have to be returned to filesystems. It ends up being simpler if filesystems handle this part themselves, especially for f2fs which only allows fs-block-aligned DIO in the first place. Second, make fscrypt_dio_supported() work on inodes whose encryption key hasn't been set up yet, by making it set up the key if needed. This is required for statx(), since statx() doesn't require a file descriptor. Reviewed-by: Christoph Hellwig Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-4-ebiggers@kernel.org --- include/linux/fscrypt.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 7d2f1e0f23b1..13598859d5b3 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -768,7 +768,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter); +bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); @@ -801,11 +801,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, return true; } -static inline bool fscrypt_dio_supported(struct kiocb *iocb, - struct iov_iter *iter) +static inline bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - return !fscrypt_needs_contents_encryption(inode); } -- cgit v1.2.3 From a7f4e6e4c47c41869fe5bea17e013b5557c57ed3 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:25 -0700 Subject: mm/thp: add flag to enforce sysfs THP in hugepage_vma_check() MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1]. 
hugepage_vma_check() is the authority on determining if a VMA is eligible for THP allocation/collapse, and currently enforces the sysfs THP settings. Add a flag to disable these checks. For now, only apply this arg to anon and file, which use /sys/kernel/transparent_hugepage/enabled. We can expand this to shmem, which uses /sys/kernel/transparent_hugepage/shmem_enabled, later. Use this flag in collapse_pte_mapped_thp() where previously the VMA flags passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check THP flag in hugepage_vma_check()", this check also didn't check "never" THP mode. As such, this restores the previous behavior of collapse_pte_mapped_thp() where sysfs THP settings are ignored. See comment in code for justification why this is OK. [1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220706235936.2197195-8-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 768e5261fdae..b0e80cc72b0c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf); +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -321,8 +320,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, } static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs) { return false; } -- cgit v1.2.3 From 7d8faaf155454f8798ec56404faca29a82689c77 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:27 -0700 Subject: mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse This idea was introduced by David Rientjes[1]. Introduce a new madvise mode, MADV_COLLAPSE, that allows users to request a synchronous collapse of memory at their own expense. The benefits of this approach are: * CPU is charged to the process that wants to spend the cycles for the THP * Avoid unpredictable timing of khugepaged collapse Semantics This call is independent of the system-wide THP sysfs settings, but will fail for memory marked VM_NOHUGEPAGE. If the ranges provided span multiple VMAs, the semantics of the collapse over each VMA is independent from the others. This implies a hugepage cannot cross a VMA boundary. 
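A minimal invocation from userspace looks like this (sketch; the fallback define uses MADV_COLLAPSE's uapi value for libcs that don't carry it yet):

#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* uapi asm-generic/mman-common.h */
#endif

/* Ask for [addr, addr + len) to be backed by THPs right now. */
int collapse_range(void *addr, size_t len)
{
	return madvise(addr, len, MADV_COLLAPSE);	/* 0 on success */
}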
If collapse of a given hugepage-aligned/sized region fails, the operation may continue to attempt collapsing the remainder of the memory specified. The memory ranges provided must be page-aligned, but are not required to be hugepage-aligned. If the memory ranges are not hugepage-aligned, the start/end of the range will be clamped to the first/last hugepage-aligned address covered by said range. The memory ranges must span at least one hugepage-sized region. All non-resident pages covered by the range will first be swapped/faulted-in, before being internally copied onto a freshly allocated hugepage. Unmapped pages will have their data directly initialized to 0 in the new hugepage. However, for every eligible hugepage-aligned/sized region to be collapsed, at least one page must currently be backed by memory (a PMD covering the address range must already exist). Allocation for the new hugepage may enter direct reclaim and/or compaction, regardless of VMA flags. When the system has multiple NUMA nodes, the hugepage will be allocated from the node providing the most native pages. This operation operates on the current state of the specified process and makes no persistent changes or guarantees on how pages will be mapped, constructed, or faulted in the future. Return Value: If all hugepage-sized/aligned regions covered by the provided range were either successfully collapsed or were already PMD-mapped THPs, the operation is deemed successful. On success, process_madvise(2) returns the number of bytes advised, and madvise(2) returns 0. Else, -1 is returned and errno is set to indicate the error for the most recently attempted hugepage collapse. Note that many failures might have occurred, since the operation may continue to collapse in the event a single hugepage-sized/aligned region fails.
ENOMEM Memory allocation failed or VMA not found
EBUSY Memcg charging failed
EAGAIN Required resource temporarily unavailable. Trying again might succeed.
EINVAL Other error: no PMD found, subpage doesn't have the Present bit set, "Special" page not backed by a struct page, VMA incorrectly sized, address not page-aligned, ...
Most notable here are ENOMEM and EBUSY (new to madvise), which are intended to provide the caller with actionable feedback so they may take an appropriate fallback measure. Use Cases: Immediate users of this new functionality are malloc() implementations that manage memory in hugepage-sized chunks, but sometimes subrelease memory back to the system in native-sized chunks via MADV_DONTNEED, zapping the pmd. Later, when the memory is hot, the implementation could madvise(MADV_COLLAPSE) to re-back the memory with THPs to regain hugepage coverage and dTLB performance. TCMalloc is such an implementation that could benefit from this[2]. Only privately-mapped anon memory is supported for now, but additional support for file, shmem, and HugeTLB high-granularity mappings[2] is expected. File and tmpfs/shmem support would permit: * Backing executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system, which might keep services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevent page sharing and demand paging, both of which increase steady-state memory footprint. With MADV_COLLAPSE, we get the best of both worlds: peak upfront performance and lower RAM footprints.
* Backing guest memory by hugepages after the memory contents have been migrated in native-page-sized chunks to a new host, in a userfaultfd-based live-migration stack. [1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com/ [2] https://github.com/google/tcmalloc/tree/master/tcmalloc [jrdr.linux@gmail.com: avoid possible memory leak in failure path] Link: https://lkml.kernel.org/r/20220713024109.62810-1-jrdr.linux@gmail.com [zokeefe@google.com: add missing kfree() to madvise_collapse()] Link: https://lore.kernel.org/linux-mm/20220713024109.62810-1-jrdr.linux@gmail.com/ Link: https://lkml.kernel.org/r/20220713161851.1879439-1-zokeefe@google.com [zokeefe@google.com: delay computation of hpage boundaries until use] Link: https://lkml.kernel.org/r/20220720140603.1958773-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220706235936.2197195-10-zokeefe@google.com Signed-off-by: Zach O'Keefe Signed-off-by: "Souptick Joarder (HPE)" Suggested-by: David Rientjes Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b0e80cc72b0c..38265f9f782e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -218,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); +int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); @@ -361,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - BUG(); - return 0; + return -EINVAL; } + +static inline int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + return -EINVAL; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From e9c2dbc8bf71a5039604a1dc45b10f24a2098f3b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 8 Aug 2022 00:56:45 +0000 Subject: mm/vmscan: define macros for refaults in struct lruvec The magic numbers 0 and 1 are used in several places in vmscan.c. Define macros for them to improve code readability.
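The readability gain shows up at call sites such as this before/after pair (illustrative, not a hunk from the patch):

	/* before */
	target_lruvec->refaults[0] = refaults;
	/* after */
	target_lruvec->refaults[WORKINGSET_ANON] = refaults;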
Link: https://lkml.kernel.org/r/20220808005644.1721066-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e24b40c52468..8f571dc7c524 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -306,6 +306,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define WORKINGSET_ANON 0 +#define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { -- cgit v1.2.3 From d2226ebd5484afcf9f9b71b394ec1567a7730eb1 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 5 Aug 2022 08:59:03 +0800 Subject: mm/hugetlb: add dedicated func to get 'allowed' nodemask for current process Muchun Song found that after the MPOL_PREFERRED_MANY policy was introduced in commit b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes"), policy_nodemask_current()'s semantics for this new policy changed: it returns 'preferred' nodes instead of 'allowed' nodes. With the changed semantics of policy_nodemask_current(), a task with the MPOL_PREFERRED_MANY policy could fail to get its reservation even though it can fall back to other nodes (either defined by cpusets or all online nodes) for that reservation, failing mmap calls unnecessarily early. The fix is to not consider MPOL_PREFERRED_MANY for reservations at all, because it, unlike MPOL_BIND, does not pose any actual hard constraint. Michal suggested that policy_nodemask_current() is only used by hugetlb and could be moved to the hugetlb code with a more explicit name to enforce the 'allowed' semantics, for which only the MPOL_BIND policy matters. apply_policy_zone() is made extern to be called in hugetlb code and its return value is changed to bool. [1].
https://lore.kernel.org/lkml/20220801084207.39086-1-songmuchun@bytedance.com/t/ Link: https://lkml.kernel.org/r/20220805005903.95563-1-feng.tang@intel.com Fixes: b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes") Signed-off-by: Feng Tang Reported-by: Muchun Song Suggested-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Dave Hansen Cc: Ben Widawsky Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d..d232de7cdc56 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - struct mempolicy *mpol = get_task_policy(current); - - return policy_nodemask(gfp, mpol); -} - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) return (pol->mode == MPOL_PREFERRED_MANY); } +extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else @@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task) { } -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - return NULL; -} - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; -- cgit v1.2.3 From b84e04f1baeebe6872b22a027cfc558621e842d4 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Mon, 15 Aug 2022 05:53:53 +1000 Subject: kfence: add sysfs interface to disable kfence for selected slabs. By default kfence allocation can happen for any slab object whose size is up to PAGE_SIZE, as long as that allocation is the first allocation after expiration of the kfence sample interval. But in certain debugging scenarios we may be interested in debugging corruptions involving some specific slub objects like dentry or ext4_* etc. In such cases, limiting kfence to allocations involving only specific slub objects will increase the probability of catching the issue, since the kfence pool will not be consumed by other slab objects. This patch introduces a sysfs interface '/sys/kernel/slab/<name>/skip_kfence' to disable kfence for specific slabs. Having the interface work in this way does not impact the current/default behavior of kfence and allows us to use kfence for specific slabs (when needed) as well. The decision to skip/use kfence is taken depending on whether kmem_cache.flags has the (newly introduced) SLAB_SKIP_KFENCE flag set or not.
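On the allocation side, the flag check is then a one-line gate. The following is only a sketch (the wrapper name and its placement are illustrative; upstream performs the equivalent check inside the KFENCE allocation path in mm/kfence):

	/*
	 * Never hand out a KFENCE object for caches that opted out via
	 * /sys/kernel/slab/<name>/skip_kfence.
	 */
	static void *kfence_alloc_gated(struct kmem_cache *s, size_t size,
					gfp_t flags)
	{
		if (s->flags & SLAB_SKIP_KFENCE)
			return NULL;	/* take the normal slab path */

		return __kfence_alloc(s, size, flags);
	}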
Link: https://lkml.kernel.org/r/20220814195353.2540848-1-imran.f.khan@oracle.com Signed-off-by: Imran Khan Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/slab.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..352e3f082acc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -119,6 +119,12 @@ */ #define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U) +#ifdef CONFIG_KFENCE +#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U) +#else +#define SLAB_SKIP_KFENCE 0 +#endif + /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) -- cgit v1.2.3 From 736a8ccce99c633b2457996bb9bf2cefff0db43d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Jul 2022 16:01:04 +0800 Subject: hugetlb_cgroup: remove unneeded return value The return values of set_hugetlb_cgroup() and set_hugetlb_cgroup_rsvd() are always ignored. Remove them to clean up the code. Link: https://lkml.kernel.org/r/20220729080106.12752-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 379344828e78..630cd255d0cf 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -90,32 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return __hugetlb_cgroup_from_page(page, true); } -static inline int __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_PAGE(!PageHuge(page), page); if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) - return -1; + return; if (rsvd) set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, (unsigned long)h_cg); else set_page_private(page + SUBPAGE_INDEX_CGROUP, (unsigned long)h_cg); - return 0; } -static inline int set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - return __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(page, h_cg, false); } -static inline int set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - return __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(page, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -199,16 +198,14 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return NULL; } -static inline int set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - return 0; } -static inline int set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - return 0; } static inline bool hugetlb_cgroup_disabled(void) -- cgit v1.2.3 From 4d86d4f7227c6f2acfbbbe0623d49865aa71b756 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 26 Jul 2022 16:02:41 -0700
Subject: mm: add more BUILD_BUG_ONs to gfp_migratetype() gfp_migratetype() also expects GFP_RECLAIMABLE and GFP_MOVABLE|GFP_RECLAIMABLE to be shiftable into MIGRATE_* enum values, so add some more BUILD_BUG_ONs to reflect this assumption. Link: https://linux-review.googlesource.com/id/Iae64e2182f75c3aca776a486b71a72571d66d83e Link: https://lkml.kernel.org/r/20220726230241.3770532-1-pcc@google.com Signed-off-by: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/gfp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f314be58fa77..ea6cb9399152 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -18,6 +18,9 @@ static inline int gfp_migratetype(const gfp_t gfp_flags) VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); + BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); + BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> + GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; -- cgit v1.2.3 From 33024536bafd9129f1d16ade0974671c648700ac Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:51 +0800 Subject: memory tiering: hot page selection with hint page fault latency Patch series "memory tiering: hot page selection", v4. To optimize page placement in a memory tiering system with NUMA balancing, the hot pages in the slow memory nodes need to be identified. Essentially, the original NUMA balancing implementation selects the most recently accessed (MRU) pages to promote. But this isn't a perfect algorithm to identify the hot pages, because pages with quite low access frequency may still be accessed eventually, given that the NUMA balancing page table scanning period can be quite long (e.g. 60 seconds). So in this patchset, we implement a new hot page identification algorithm based on the latency between NUMA balancing page table scanning and the hint page fault, which is a kind of most frequently accessed (MFU) algorithm. In NUMA balancing memory tiering mode, if there are hot pages in the slow memory node and cold pages in the fast memory node, we need to promote/demote hot/cold pages between the fast and slow memory nodes. A choice is to promote/demote as fast as possible, but the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workloads, because of access latency inflation and contention for slow memory bandwidth. A way to resolve this issue is to restrict the max promoting/demoting throughput. It will take longer to finish the promoting/demoting, but the workload latency will be better. This is implemented in this patchset as the page promotion rate limit mechanism. The promotion hot threshold is workload and system configuration dependent, so in this patchset a method to adjust the hot threshold automatically is implemented. The basic idea is to control the number of candidate promotion pages to match the promotion rate limit. We used the pmbench memory accessing benchmark to test the patchset on a 2-socket server system with DRAM and PMEM installed.
The test results are as follows,

	                 pmbench score  promote rate
	                 (accesses/s)   (MB/s)
	                 -------------  ------------
	base             146887704.1    725.6
	hot selection    165695601.2    544.0
	rate limit       162814569.8    165.2
	auto adjustment  170495294.0    136.9

From the results above, with the hot page selection patch [1/3], the pmbench score increases by about 12.8%, and the promote rate (overhead) decreases by about 25.0%, compared with the base kernel. With the rate limit patch [2/3], the pmbench score decreases by about 1.7%, and the promote rate decreases by about 69.6%, compared with the hot page selection patch. With the threshold auto adjustment patch [3/3], the pmbench score increases by about 4.7%, and the promote rate decreases by about 17.1%, compared with the rate limit patch.

Baolin helped to test the patchset with MySQL on a machine which contains 1 DRAM node (30G) and 1 PMEM node (126G).

	sysbench /usr/share/sysbench/oltp_read_write.lua \
	  ...... \
	  --tables=200 \
	  --table-size=1000000 \
	  --report-interval=10 \
	  --threads=16 \
	  --time=120

The TPS can be improved by about 5%.

This patch (of 3): To optimize page placement in a memory tiering system with NUMA balancing, the hot pages in the slow memory node need to be identified. Essentially, the original NUMA balancing implementation selects the most recently accessed (MRU) pages to promote. But this isn't a perfect algorithm to identify the hot pages, because pages with quite low access frequency may still be accessed eventually, given that the NUMA balancing page table scanning period can be quite long (e.g. 60 seconds). The most frequently accessed (MFU) algorithm is better. So, in this patch we implemented a better hot page selection algorithm, which is based on NUMA balancing page table scanning and the hint page fault, as follows:

- When the page tables of the processes are scanned to change PTE/PMD to be PROT_NONE, the current time is recorded in struct page as the scan time.
- When the page is accessed, a hint page fault will occur. The scan time is read back from the struct page, and the hint page fault latency is defined as

	hint page fault time - scan time

The shorter the hint page fault latency of a page is, the higher the probability that its access frequency is high. So the hint page fault latency is a better estimate of whether a page is hot or cold.

It's hard to find some extra space in struct page to hold the scan time. Fortunately, we can reuse some bits used by the original NUMA balancing. NUMA balancing uses some bits in struct page to store the page-accessing CPU and PID (referring to page_cpupid_xchg_last()), which are used by the multi-stage node selection algorithm to avoid migrating pages shared between NUMA nodes back and forth. But for pages in the slow memory node, even if they are accessed by multiple NUMA nodes, as long as the pages are hot, they need to be promoted to the fast memory node. So the accessing CPU and PID information is unnecessary for the slow memory pages, and we can reuse these bits in struct page to record the scan time. For the fast memory pages, these bits are used as before.

For the hot threshold, the default value is 1 second, which works well in our performance test. All pages with a hint page fault latency < hot threshold will be considered hot. It's hard for users to determine the hot threshold, so we don't provide a kernel ABI to set it, just a debugfs interface for advanced users to experiment. We will continue to work on a hot threshold automatic adjustment mechanism. The downside of the above method is that the response time to workload hot spot changes may be much longer.
For example,

- A previously cold memory area becomes hot.
- A hint page fault will be triggered, but the hint page fault latency isn't shorter than the hot threshold, so the pages will not be promoted.
- When the memory area is scanned again, maybe after a scan period, the hint page fault latency measured will be shorter than the hot threshold and the pages will be promoted.

To mitigate this, if there is enough free space in the fast memory node, the hot threshold will not be used and all pages will be promoted upon the hint page fault, for fast response.

Thanks to Zhong Jiang, who reported and tested the fix for a bug when disabling memory tiering mode dynamically.

Link: https://lkml.kernel.org/r/20220713083954.34196-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20220713083954.34196-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Dave Hansen Cc: Yang Shi Cc: Zi Yan Cc: Wei Xu Cc: osalvador Cc: Shakeel Butt Cc: Zhong Jiang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21f8b27bd9fd..27839b158ca4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1255,6 +1255,18 @@ static inline int folio_nid(const struct folio *folio) } #ifdef CONFIG_NUMA_BALANCING +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); @@ -1318,12 +1330,25 @@ static inline void page_cpupid_reset_last(struct page *page) page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ + +static inline int xchg_page_access_time(struct page *page, int time) +{ + int last_time; + + last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return page_to_nid(page); /* XXX */ } +static inline int xchg_page_access_time(struct page *page, int time) +{ + return 0; +} + static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ -- cgit v1.2.3 From c6833e10008f976a173dd5abdf992e492cbc3bcf Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:52 +0800 Subject: memory tiering: rate limit NUMA migration throughput In NUMA balancing memory tiering mode, if there are hot pages in the slow memory node and cold pages in the fast memory node, we need to promote/demote hot/cold pages between the fast and slow memory nodes. A choice is to promote/demote as fast as possible, but the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workloads, because of access latency inflation and contention for slow memory bandwidth. A way to resolve this issue is to restrict the max promoting/demoting throughput. It will take longer to finish the promoting/demoting.
But the workload latency will be better. This is implemented in this patch as the page promotion rate limit mechanism. The number of candidate pages to be promoted to the fast memory node via NUMA balancing is counted; if the count exceeds the limit specified by the users, the NUMA balancing promotion will be stopped until the next second. A new sysctl knob kernel.numa_balancing_promote_rate_limit_MBps is added for the users to specify the limit. Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: osalvador Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Wei Xu Cc: Yang Shi Cc: Zhong Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 +++++++ include/linux/sched/sysctl.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f571dc7c524..a0003eaa751f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,7 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ + PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; @@ -998,6 +999,12 @@ typedef struct pglist_data { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_NUMA_BALANCING + /* start time in ms of current promote rate limit period */ + unsigned int nbp_rl_start; + /* number of promote candidate pages at start time of current rate limit period */ + unsigned long nbp_rl_nr_cand; +#endif /* Fields commonly accessed by the page reclaim scanner */ /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e650946816d0..303ee7dd0c7e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -27,6 +27,7 @@ enum sched_tunable_scaling { #ifdef CONFIG_NUMA_BALANCING extern int sysctl_numa_balancing_mode; +extern unsigned int sysctl_numa_balancing_promote_rate_limit; #else #define sysctl_numa_balancing_mode 0 #endif -- cgit v1.2.3 From c959924b0dc53bf6252793f41480bc01b9792570 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:53 +0800 Subject: memory tiering: adjust hot threshold automatically The promotion hot threshold is workload and system configuration dependent, so in this patch a method to adjust the hot threshold automatically is implemented. The basic idea is to control the number of candidate promotion pages to match the promotion rate limit. If the hint page fault latency of a page is less than the hot threshold, we will try to promote the page, and the page is called a candidate promotion page. If the number of candidate promotion pages in the statistics interval is much more than the promotion rate limit, the hot threshold will be decreased to reduce the number of candidate promotion pages. Otherwise, the hot threshold will be increased to increase the number of candidate promotion pages. To make the above method work, in each statistics interval, the total number of pages to check (on which the hint page faults occur) and the hot/cold distribution need to be stable. Because the page tables are scanned linearly in NUMA balancing, but the hot/cold distribution usually isn't uniform along the address space, the statistics interval should be larger than the NUMA balancing scan period.
So in the patch, the max scan period is used as the statistics interval, and it works well in our tests. Link: https://lkml.kernel.org/r/20220713083954.34196-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: osalvador Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Wei Xu Cc: Yang Shi Cc: Zhong Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a0003eaa751f..025754b0bc09 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1004,6 +1004,15 @@ typedef struct pglist_data { unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; + /* promote threshold in ms */ + unsigned int nbp_threshold; + /* start time in ms of current promote threshold adjustment period */ + unsigned int nbp_th_start; + /* + * number of promote candidate pages at start time of current promote + * threshold adjustment period + */ + unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ -- cgit v1.2.3 From 0192445cb2f7ed1cd7a95a0fc8c7645480baba25 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 15 Aug 2022 10:39:59 -0400 Subject: arch: mm: rename FORCE_MAX_ZONEORDER to ARCH_FORCE_MAX_ORDER This Kconfig option is used by individual architectures to set their desired MAX_ORDER. Rename it to reflect its actual use. Link: https://lkml.kernel.org/r/20220815143959.1511278-1-zi.yan@sent.com Acked-by: Mike Rapoport Signed-off-by: Zi Yan Acked-by: Guo Ren [csky] Acked-by: Arnd Bergmann Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Acked-by: Michael Ellerman [powerpc] Cc: Vineet Gupta Cc: Taichi Sugaya Cc: Neil Armstrong Cc: Qin Jian Cc: Guo Ren Cc: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: Dinh Nguyen Cc: Christophe Leroy Cc: Yoshinori Sato Cc: "David S. Miller" Cc: Chris Zankel Cc: Ley Foon Tan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 025754b0bc09..fd61347b4b1f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -24,10 +24,10 @@ #include /* Free memory management - zoned buddy allocator. */ -#ifndef CONFIG_FORCE_MAX_ZONEORDER +#ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_ORDER 11 #else -#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER +#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) -- cgit v1.2.3 From fb70c4878d6b3001ef40fa39432a38d8cabdcbf7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 15 Aug 2022 19:10:17 +0800 Subject: mm: kill find_min_pfn_with_active_regions() find_min_pfn_with_active_regions() is only called from free_area_init(). Open-code the PHYS_PFN(memblock_start_of_DRAM()) into free_area_init(), and kill find_min_pfn_with_active_regions().
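The resulting change in free_area_init() is then roughly the following; this is only a sketch of the open-coding described above, with the rest of the function elided:

	void __init free_area_init(unsigned long *max_zone_pfn)
	{
		unsigned long start_pfn;

		/* was: start_pfn = find_min_pfn_with_active_regions(); */
		start_pfn = PHYS_PFN(memblock_start_of_DRAM());

		/* ... node/zone setup continues unchanged ... */
	}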
Link: https://lkml.kernel.org/r/20220815111017.39341-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 27839b158ca4..e98ef2cb1176 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2520,7 +2520,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) -- cgit v1.2.3 From b1d5488a252dc9c0d9574100d0b8d807bf154603 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 18 Aug 2022 19:20:00 +0530 Subject: mm: fix use-after free of page_ext after race with memory-offline Below is one path where a race between page_ext access and offlining of the respective memory blocks causes a use-after-free on access of the page_ext structure.

	process1                              process2
	---------                             ---------
	a) doing /proc/page_owner             doing memory offline
	                                      through offline_pages.

	b) PageBuddy check fails, thus
	   proceed to get the page_owner
	   information through page_ext
	   access:
	   page_ext = lookup_page_ext(page);

	                                      migrate_pages();
	                                      .................
	                                      Since all pages are successfully
	                                      migrated as part of the offline
	                                      operation, send the MEM_OFFLINE
	                                      notification, which for page_ext
	                                      calls:
	                                      offline_page_ext() -->
	                                        __free_page_ext() -->
	                                          free_page_ext() -->
	                                            vfree(ms->page_ext)
	                                      mem_section->page_ext = NULL

	c) Checking the PAGE_EXT flags in
	   page_ext->flags now results in
	   the use-after-free (leading to
	   the translation faults).

As mentioned above, there is really no synchronization between page_ext access and its freeing during memory offline. The memory offline steps (roughly) on a memory block are as below:

1) Isolate all the pages.
2) while(1), try to free the pages to the buddy (->free_list[MIGRATE_ISOLATE]).
3) Delete the pages from this buddy list.
4) Then free page_ext. (Note: the struct page is still alive, as it is freed only during hot remove of the memory, which frees the memmap, a step the user might not perform.)

This design leads to a state where the struct page is alive but the struct page_ext is freed, where the latter is ideally part of the former, just representing the page flags (check [3] for why this design was chosen). The above-mentioned race is just one example, but the problem persists in the other paths involving page_ext->flags access too (e.g. page_is_idle()). Fix all the paths where offline races with page_ext access by maintaining synchronization with an RCU lock; this is achieved in 3 steps:

1) Invalidate all the page_ext's of the sections of a memory block by storing a flag in the LSB of mem_section->page_ext.
2) Wait until all existing readers finish working with their ->page_ext's, using synchronize_rcu(). Any parallel process that starts after this call will not get the page_ext, through lookup_page_ext(), for the block on which the parallel offline operation is being performed.
3) Now safely free all the sections' ->page_ext's of the block on which the offline operation is being performed.

Note: if synchronize_rcu() takes time, then optimizations can be done in this path through call_rcu()[2]. Thanks to David Hildenbrand for his views/suggestions on the initial discussion[1] and Pavan Kondeti for various inputs on this patch.
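The reader side of this scheme pairs an RCU read lock with the lookup. Below is a minimal sketch of what page_ext_get()/page_ext_put() could look like under the description above; the real implementation lives in mm/page_ext.c and also has to recognize the invalidation flag stored in the LSB:

	struct page_ext *page_ext_get(struct page *page)
	{
		struct page_ext *page_ext;

		rcu_read_lock();
		page_ext = lookup_page_ext(page);
		if (!page_ext) {
			/* no page_ext, or the section was invalidated */
			rcu_read_unlock();
			return NULL;
		}

		return page_ext;	/* caller holds the RCU read lock */
	}

	void page_ext_put(struct page_ext *page_ext)
	{
		if (unlikely(!page_ext))
			return;

		rcu_read_unlock();
	}

With this pairing, the offline path's synchronize_rcu() in step 2 cannot complete while any reader is between page_ext_get() and page_ext_put(), which is exactly what makes step 3's vfree() safe.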
[1] https://lore.kernel.org/linux-mm/59edde13-4167-8550-86f0-11fc67882107@quicinc.com/ [2] https://lore.kernel.org/all/a26ce299-aed1-b8ad-711e-a49e82bdd180@quicinc.com/T/#u [3] https://lore.kernel.org/all/6fa6b7aa-731e-891c-3efb-a03d6a700efa@redhat.com/ [quic_charante@quicinc.com: rename label `loop' to `ext_put_continue' per David] Link: https://lkml.kernel.org/r/1661496993-11473-1-git-send-email-quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1660830600-9068-1-git-send-email-quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Suggested-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Fernand Sieber Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavan Kondeti Cc: SeongJae Park Cc: Shakeel Butt Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 17 +++++++++++------ include/linux/page_idle.h | 34 ++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index fabb2e1e087f..ed27198cdaf4 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -55,7 +55,8 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(const struct page *page); +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); static inline struct page_ext *page_ext_next(struct page_ext *curr) { @@ -71,11 +72,6 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(const struct page *page) -{ - return NULL; -} - static inline void page_ext_init(void) { } @@ -87,5 +83,14 @@ static inline void page_ext_init_flatmem_late(void) static inline void page_ext_init_flatmem(void) { } + +static inline struct page_ext *page_ext_get(struct page *page) +{ + return NULL; +} + +static inline void page_ext_put(struct page_ext *page_ext) +{ +} #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 4663dfed1293..5cb7bd2078ec 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,65 +13,79 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. 
*/ - static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* !CONFIG_64BIT */ -- cgit v1.2.3 From e2f8f44b7686bf65747f485ac7c9c43de8b8de09 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 22 Aug 2022 15:01:32 +0200 Subject: mm: pagewalk: fix documentation of PTE hole handling Empty PTEs are passed to the pte_entry callback, not to pte_hole. Link: https://lkml.kernel.org/r/3695521.kQq0lBPeGt@devpool047 Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index ac7b38ad5903..f3fafb731ffd 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -15,12 +15,12 @@ struct mm_walk; * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (lowest-level) - * entry + * @pte_entry: if set, called for each PTE (lowest-level) entry, + * including empty ones * @pte_hole: if set, called for each hole at all levels, - * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD - * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal - * to 1) are skipped. + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. + * Any folded depths (where PTRS_PER_P?D is equal to 1) + * are skipped. 
* @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means -- cgit v1.2.3 From 408587baee39753304dd572dacf374f75545d25a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:05 +0000 Subject: mm: page_counter: rearrange struct page_counter fields With memcg v2 enabled, memcg->memory.usage is a very hot member for workloads doing memcg charging on multiple CPUs concurrently, particularly network intensive workloads. In addition, there is false cache sharing between memory.usage and memory.high on the charge path. This patch moves the usage into a separate cacheline and moves all the read-mostly fields into a separate cacheline.

To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level cgroup hierarchy.

	$ netserver -6
	# 36 instances of netperf with following params
	$ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K

Results (average throughput of netperf):

	Without (6.0-rc1)	10482.7 Mbps
	With patch		12413.7 Mbps (18.4% improvement)

With the patch, the throughput improved by 18.4%. One side-effect of this patch is the increase in the size of struct mem_cgroup. For example, with this patch on a 64-bit build, the size of struct mem_cgroup increased from 4032 bytes to 4416 bytes. However, for the performance improvement, this additional size is worth it. In addition, there are opportunities to reduce the size of struct mem_cgroup, like deprecation of the kmem and tcpmem page counters and better packing. Link: https://lkml.kernel.org/r/20220825000506.239406-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Reviewed-by: Feng Tang Acked-by: Soheil Hassas Yeganeh Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 679591301994..78a1c934e416 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -3,15 +3,26 @@ #define _LINUX_PAGE_COUNTER_H #include +#include #include #include +#if defined(CONFIG_SMP) +struct pc_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define PC_PADDING(name) struct pc_padding name +#else +#define PC_PADDING(name) +#endif + struct page_counter { + /* + * Make sure 'usage' does not share cacheline with any other field. The + * memcg->memory.usage is a hot member of struct mem_cgroup. + */ atomic_long_t usage; - unsigned long min; - unsigned long low; - unsigned long high; - unsigned long max; + PC_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -23,18 +34,18 @@ struct page_counter { atomic_long_t low_usage; atomic_long_t children_low_usage; - /* legacy */ unsigned long watermark; unsigned long failcnt; - /* - * 'parent' is placed here to be far from 'usage' to reduce - * cache false sharing, as 'usage' is written mostly while - * parent is frequently read for cgroup's hierarchical - * counting nature. - */ + /* Keep all the read-mostly fields in a separate cacheline.
*/ + PC_PADDING(_pad2_); + + unsigned long min; + unsigned long low; + unsigned long high; + unsigned long max; struct page_counter *parent; -}; +} ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX -- cgit v1.2.3 From 1813e51eece0ad6f4aacaeb738e7cced46feb470 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:06 +0000 Subject: memcg: increase MEMCG_CHARGE_BATCH to 64 For several years, MEMCG_CHARGE_BATCH was kept at 32, but with bigger machines and network intensive workloads requiring throughput in Gbps, 32 is too small and makes the memcg charging path a bottleneck. For now, increase it to 64 for easy acceptance into 6.0. We will need to revisit this in the future for the ever increasing demand for higher performance. Please note that the memcg charge path drains the per-cpu memcg charge stock, so there should not be any oom behavior change. Though it does have an impact on rstat flushing and high limit reclaim backoff.

To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level cgroup hierarchy.

	$ netserver -6
	# 36 instances of netperf with following params
	$ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K

Results (average throughput of netperf):

	Without (6.0-rc1)	10482.7 Mbps
	With patch		17064.7 Mbps (62.7% improvement)

With the patch, the throughput improved by 62.7%. Link: https://lkml.kernel.org/r/20220825000506.239406-4-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Acked-by: Soheil Hassas Yeganeh Reviewed-by: Feng Tang Acked-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6257867fbf95..a2461f9a8738 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,10 +354,11 @@ struct mem_cgroup { }; /* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. + * size of first charge trial. + * TODO: maybe necessary to use big numbers in big irons, or make it + * dynamic based on the workload. */ -#define MEMCG_CHARGE_BATCH 32U +#define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; -- cgit v1.2.3 From c4f20f1479c456d9dd1c1e6d8bf956a25de742dc Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 25 Aug 2022 18:27:14 +0800 Subject: page_ext: introduce boot parameter 'early_page_ext' In commit 2f1ee0913ce5 ("Revert "mm: use early_pfn_to_nid in page_ext_init""), we call page_ext_init() after page_alloc_init_late() to avoid a panic problem. It seems that we cannot track early page allocations in the current kernel even if the page structures have been initialized early. This patch introduces a new boot parameter 'early_page_ext' to resolve this problem. If we pass it to the kernel, page_ext_init() will be moved up and the feature 'deferred initialization of struct pages' will be disabled, so that the page allocator is initialized early enough to prevent the panic problem above. It can help us to catch early page allocations. This is useful especially when we find that the free memory value is not the same right after booting different kernels.
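Wiring up such a boot parameter is a small amount of glue; the following is a sketch of what the parsing side could look like (the patch places this logic in mm/page_ext.c, and per the akpm note below, the flag carries no __meminitdata section annotation):

	/* Set from the kernel command line; queried via early_page_ext_enabled(). */
	bool early_page_ext;

	static int __init setup_early_page_ext(char *str)
	{
		early_page_ext = true;
		return 0;
	}
	early_param("early_page_ext", setup_early_page_ext);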
[akpm@linux-foundation.org: fix section issue by removing __meminitdata] Link: https://lkml.kernel.org/r/20220825102714.669-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jason A. Donenfeld Cc: Jonathan Corbet Cc: Kees Cook Cc: Mark-PK Tsai Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index ed27198cdaf4..22be4582faae 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -36,9 +36,15 @@ struct page_ext { unsigned long flags; }; +extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); +static inline bool early_page_ext_enabled(void) +{ + return early_page_ext; +} + #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { @@ -68,6 +74,11 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; +static inline bool early_page_ext_enabled(void) +{ + return false; +} + static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -- cgit v1.2.3 From 35b471467f88b8d8b52ba5b006a29b9d057d09bf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:17 -0700 Subject: filemap: add filemap_get_folios_contig() Patch series "Convert to filemap_get_folios_contig()", v3. This patch series replaces find_get_pages_contig() with filemap_get_folios_contig(). This patch (of 7): This function is meant to replace find_get_pages_contig(). Unlike find_get_pages_contig(), filemap_get_folios_contig() no longer takes in a target number of pages to find; it returns up to 15 contiguous folios (the capacity of a folio_batch). To be more consistent with filemap_get_folios(), filemap_get_folios_contig() now also updates the start index passed in, and takes an end index. Link: https://lkml.kernel.org/r/20220824004023.77310-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20220824004023.77310-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Ryusuke Konishi Cc: Al Viro Cc: Matthew Wilcox Cc: David Sterba Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..8689c32d628b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -718,6 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, -- cgit v1.2.3 From 48658d8509d2db3391c95aa74308a2b1fc8e8461 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:23 -0700 Subject: filemap: remove find_get_pages_contig() All callers of find_get_pages_contig() have been removed, so it is no longer needed.
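A typical caller-side conversion then iterates a folio_batch rather than a page array. The following is a hedged sketch of the new calling pattern (the function name and the processing body are local to this example; the filemap_get_folios_contig() signature is the one declared in the diff above):

	static void process_contig_range(struct address_space *mapping,
					 pgoff_t start, pgoff_t end)
	{
		struct folio_batch fbatch;
		unsigned int i, nr;

		folio_batch_init(&fbatch);
		do {
			/* 'start' is advanced past the folios returned */
			nr = filemap_get_folios_contig(mapping, &start, end,
						       &fbatch);
			for (i = 0; i < nr; i++) {
				struct folio *folio = fbatch.folios[i];
				/* ... operate on each contiguous folio ... */
			}
			folio_batch_release(&fbatch);	/* drop the refs */
		} while (nr);
	}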
Link: https://lkml.kernel.org/r/20220824004023.77310-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8689c32d628b..09de43e36a64 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -720,8 +720,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); -- cgit v1.2.3 From 639118d1571f70b1157b4bb5ac574b0ab0f38099 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:20:43 +0800 Subject: mm: kill is_memblock_offlined() Directly check the state of struct memory_block; there is no need for a dedicated helper. Link: https://lkml.kernel.org/r/20220827112043.187028-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..54675791bc50 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,7 +11,6 @@ struct page; struct zone; struct pglist_data; struct mem_section; -struct memory_block; struct memory_group; struct resource; struct vmem_altmap; @@ -333,7 +332,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -- cgit v1.2.3 From b4a0215e11dcfe23a48c65c6d6c82c0c2c551a48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:19:59 +0800 Subject: mm: fix null-ptr-deref in kswapd_is_running() kswapd_run/stop() will set pgdat->kswapd to NULL, which could race with kswapd_is_running() in kcompactd():

	kswapd_run/stop()                     kcompactd()
	                                        kswapd_is_running()
	  pgdat->kswapd // error or normal ptr
	                                        verify pgdat->kswapd
	                                          // load non-NULL pgdat->kswapd
	  pgdat->kswapd = NULL
	                                        task_is_running(pgdat->kswapd)
	                                          // NULL pointer dereference

KASAN reports the null-ptr-deref shown below,

	vmscan: Failed to start kswapd on node 0
	...
	BUG: KASAN: null-ptr-deref in kcompactd+0x440/0x504
	Read of size 8 at addr 0000000000000024 by task kcompactd0/37

	CPU: 0 PID: 37 Comm: kcompactd0 Kdump: loaded Tainted: G OE 5.10.60 #1
	Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
	Call trace:
	 dump_backtrace+0x0/0x394
	 show_stack+0x34/0x4c
	 dump_stack+0x158/0x1e4
	 __kasan_report+0x138/0x140
	 kasan_report+0x44/0xdc
	 __asan_load8+0x94/0xd0
	 kcompactd+0x440/0x504
	 kthread+0x1a4/0x1f0
	 ret_from_fork+0x10/0x18

At present kswapd/kcompactd_run() and kswapd/kcompactd_stop() are protected by mem_hotplug_begin/done(), but kcompactd() itself is not. There is no need to involve the memory hotplug lock in kcompactd(), so let's add a new mutex to protect pgdat->kswapd accesses. Also, because the kcompactd task will check the state of the kswapd task, it's better to call kcompactd_stop() before kswapd_stop() to reduce lock conflicts. [akpm@linux-foundation.org: add comments] Link: https://lkml.kernel.org/r/20220827111959.186838-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 20 ++++++++++++++++++++ include/linux/mmzone.h | 6 ++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 54675791bc50..51052969dbfe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -215,6 +215,22 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); +/* See kswapd_is_running() */ +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) +{ + mutex_lock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) +{ + mutex_unlock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) +{ + mutex_init(&pgdat->kswapd_lock); +} + #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -251,6 +267,10 @@ static inline bool movable_node_is_enabled(void) { return false; } + +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} #endif /* ! CONFIG_MEMORY_HOTPLUG */ /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fd61347b4b1f..18cf0fc5ce67 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -956,8 +956,10 @@ typedef struct pglist_data { atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ - struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/done() */ +#ifdef CONFIG_MEMORY_HOTPLUG + struct mutex kswapd_lock; +#endif + struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; -- cgit v1.2.3 From a38c94ed59fc312b51a33d867444ce73c6805ff6 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 29 Aug 2022 17:57:09 +0800 Subject: mm/thp: simplify has_transparent_hugepage by using IS_BUILTIN Simplify the has_transparent_hugepage() define by using IS_BUILTIN(). No functional change.
Link: https://lkml.kernel.org/r/20220829095709.3287462-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 014ee8f0fbaa..5911761487de 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1598,11 +1598,7 @@ typedef unsigned int pgtbl_mod_mask; #endif #ifndef has_transparent_hugepage -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 -#else -#define has_transparent_hugepage() 0 -#endif +#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif /* -- cgit v1.2.3 From bcd0dea5f4fb3d098b03ef3064fe6c8201d7c515 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 29 Aug 2022 17:51:25 +0800 Subject: mm/thp: remove redundant CONFIG_TRANSPARENT_HUGEPAGE Simplify the code by removing a redundant CONFIG_TRANSPARENT_HUGEPAGE check. No functional change. Link: https://lkml.kernel.org/r/20220829095125.3284567-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5911761487de..d13b4f7cc5be 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1276,8 +1276,7 @@ static inline int pgd_devmap(pgd_t pgd) #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ - (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; -- cgit v1.2.3 From 214f8796907b8015b778badf4710a4701472779a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:34:52 +0800 Subject: fs/buffer: remove __breadahead_gfp() Patch series "fs/buffer: remove ll_rw_block()", v2. ll_rw_block() will skip locked buffers before submitting IO; it assumes that a locked buffer means it is under IO. This assumption is not always true, because we cannot guarantee that every buffer lock path would submit IO. After commit 88dbcbb3a484 ("blkdev: avoid migration stalls for blkdev pages"), buffer_migrate_folio_norefs() becomes one exceptional case, and there may be others. So ll_rw_block() is not safe on the sync read path; we could get a false positive EIO return value when a filesystem reads metadata. It seems that it could only be used on the readahead path. Unfortunately, many filesystems misuse ll_rw_block() on the sync read path. This patch set just removes ll_rw_block() and adds new, friendlier helpers, which prevent false positive EIO on the metadata read path. Thanks for the suggestion from Jan; the original discussion is at [1].

patch 1: remove unused helpers in fs/buffer.c
patch 2: add new bh_read_[*] helpers
patch 3-11: remove all ll_rw_block() calls in filesystems
patch 12-14: do some leftover cleanups

[1]. https://lore.kernel.org/linux-mm/20220825080146.2021641-1-chengzhihao1@huawei.com/

This patch (of 14): No one uses __breadahead_gfp() and sb_breadahead_unmovable() any more; remove them.
Link: https://lkml.kernel.org/r/20220901133505.2510834-1-yi.zhang@huawei.com Link: https://lkml.kernel.org/r/20220901133505.2510834-2-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Andreas Gruenbacher Cc: Bob Peterson Cc: Evgeniy Dushistov Cc: Heming Zhao Cc: Jens Axboe Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Theodore Ts'o Cc: Yu Kuai Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..c3863c417b00 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -214,8 +214,6 @@ struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, - gfp_t gfp); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); @@ -340,12 +338,6 @@ sb_breadahead(struct super_block *sb, sector_t block) __breadahead(sb->s_bdev, block, sb->s_blocksize); } -static inline void -sb_breadahead_unmovable(struct super_block *sb, sector_t block) -{ - __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0); -} - static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { -- cgit v1.2.3 From fdee117ee86479fd2644bcd9ac2b2469e55722d1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:34:53 +0800 Subject: fs/buffer: add some new buffer read helpers The current ll_rw_block() helper is fragile, because it assumes that a locked buffer means it is under IO submitted by whoever holds the lock; it skips the buffer if it fails to get the lock, so it is only safe on the readahead path. Unfortunately, most filesystems still use this helper mistakenly on the sync metadata read path. There is no guarantee that the one who holds the buffer lock always submits IO (e.g. buffer_migrate_folio_norefs() after commit 88dbcbb3a484 ("blkdev: avoid migration stalls for blkdev pages")), which could lead to a false positive -EIO when submitting read IO. This patch adds some friendly buffer read helpers to prepare for replacing ll_rw_block() and similar calls. We can only call the bh_readahead_[*] helpers for the readahead paths.
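The following is a hedged sketch of the conversion the later patches in the series perform with these helpers. The "before" half shows the fragile pattern being removed, with error handling abbreviated; the signatures used are the ones visible in this series' diffs:

	/*
	 * Before: not safe on the sync read path -- a locked buffer is
	 * silently skipped, and the later uptodate check can report a
	 * false positive -EIO.
	 */
	ll_rw_block(REQ_OP_READ, 1, &bh);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		return -EIO;

	/*
	 * After: bh_read() locks the buffer itself, submits the read and
	 * waits; it returns 1 if already uptodate, 0 on successful read,
	 * and -EIO only on a real IO error.
	 */
	if (bh_read(bh, 0) < 0)
		return -EIO;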
Link: https://lkml.kernel.org/r/20220901133505.2510834-3-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c3863c417b00..6d09785bed9f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -232,6 +232,9 @@ void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); int bh_submit_read(struct buffer_head *bh); +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock); extern int buffer_heads_over_limit; @@ -399,6 +402,41 @@ static inline struct buffer_head *__getblk(struct block_device *bdev, return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); } +static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (!buffer_uptodate(bh) && trylock_buffer(bh)) { + if (!buffer_uptodate(bh)) + __bh_read(bh, op_flags, false); + else + unlock_buffer(bh); + } +} + +static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (!bh_uptodate_or_lock(bh)) + __bh_read(bh, op_flags, false); +} + +/* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */ +static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (bh_uptodate_or_lock(bh)) + return 1; + return __bh_read(bh, op_flags, true); +} + +static inline void bh_read_batch(int nr, struct buffer_head *bhs[]) +{ + __bh_read_batch(nr, bhs, 0, true); +} + +static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags) +{ + __bh_read_batch(nr, bhs, op_flags, false); +} + /** * __bread() - reads a specified block and returns the bh * @bdev: the block_device to read from -- cgit v1.2.3 From 79f5978420691fb84593f98557ea56f8b32228f2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:35:03 +0800 Subject: fs/buffer: remove ll_rw_block() helper Now that all ll_rw_block() users have been replaced with the new safe helpers, we can just remove it here. Link: https://lkml.kernel.org/r/20220901133505.2510834-13-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6d09785bed9f..b415d8bc2a09 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -223,7 +223,6 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); -void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); -- cgit v1.2.3 From 454552d0145486cf82572e73cca266ab6a56e86b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:35:05 +0800 Subject: fs/buffer: remove bh_submit_read() helper bh_submit_read() has no users anymore; just remove it.
Link: https://lkml.kernel.org/r/20220901133505.2510834-15-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b415d8bc2a09..9b6556d3f110 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -230,7 +230,6 @@ int submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); -int bh_submit_read(struct buffer_head *bh); int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); -- cgit v1.2.3 From 263b899802fc43cd0e6979f819271dfbb93c94af Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 1 Sep 2022 20:00:21 +0800 Subject: hugetlb: make hugetlb_cma_check() static Patch series "A few cleanup patches for hugetlb", v2. This series contains a few cleanup patches to use helper functions to simplify the code, remove an unneeded nid parameter, and so on. More details can be found in the respective changelogs. This patch (of 10): Make hugetlb_cma_check() static as it's only used inside mm/hugetlb.c. Link: https://lkml.kernel.org/r/20220901120030.63318-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220901120030.63318-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3..57e72954a482 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1123,14 +1123,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h, #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); -extern void __init hugetlb_cma_check(void); #else static inline __init void hugetlb_cma_reserve(int order) { } -static inline __init void hugetlb_cma_check(void) -{ -} #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); -- cgit v1.2.3 From 088b8aa537c2c767765f1c19b555f21ffe555786 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 1 Sep 2022 10:35:59 +0200 Subject: mm: fix PageAnonExclusive clearing racing with concurrent RCU GUP-fast commit 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive") made sure that when PageAnonExclusive() has to be cleared during temporary unmapping of a page, the PTE is cleared/invalidated and the TLB is flushed. What we want to achieve in all cases is that we cannot end up with a pin on an anonymous page that may be shared, because such pins would be unreliable and could result in memory corruption when the mapped page and the pin go out of sync due to a write fault. That TLB flush handling was inspired by an outdated comment in mm/ksm.c:write_protect_page(), which similarly required the TLB flush in the past to synchronize with GUP-fast. However, ever since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer sufficient to handle concurrent GUP-fast in all cases -- it only handles traditional IPI-based GUP-fast correctly.
Peter Xu (thankfully) questioned whether that TLB flush is really required. On architectures that send an IPI broadcast on TLB flush, it works as expected. To synchronize with RCU GUP-fast properly, we are conceptually fine; however, we have to enforce a certain memory order and are missing memory barriers. Let's document that, avoid the TLB flush where possible and use proper explicit memory barriers where required. We shouldn't really care about the additional memory barriers here, as we're not on extremely hot paths -- and we're getting rid of some TLB flushes. We use a smp_mb() pair for handling concurrent pinning and a smp_rmb()/smp_wmb() pair for handling the corner case of only temporary PTE changes but permanent PageAnonExclusive changes. One extreme example, whereby GUP-fast takes a R/O pin and KSM wants to convert an exclusive anonymous page to a KSM page, and that page is already mapped write-protected (-> no PTE change) would be:

	Thread 0 (KSM)			Thread 1 (GUP-fast)

					(B1) Read the PTE
					# (B2) skipped without FOLL_WRITE
	(A1) Clear PTE
	smp_mb()
	(A2) Check pinned
					(B3) Pin the mapped page
					smp_mb()
	(A3) Clear PageAnonExclusive
	smp_wmb()
	(A4) Restore PTE
					(B4) Check if the PTE changed
					smp_rmb()
					(B5) Check PageAnonExclusive

Thread 1 will properly detect that PageAnonExclusive was cleared and back off. Note that we don't need a memory barrier between checking if the page is pinned and clearing PageAnonExclusive, because stores are not speculated. The possible issues due to reordering are of a theoretical nature so far, and attempts to reproduce the race failed. Especially the "no PTE change" case isn't the common case, because we'd need an exclusive anonymous page that's mapped R/O and the PTE is clean in KSM code -- and using KSM with page pinning isn't extremely common. Further, the clear+TLB flush we used for now implies a memory barrier. So the problematic missing part should be the missing memory barrier after pinning but before checking if the PTE changed. Link: https://lkml.kernel.org/r/20220901083559.67446-1-david@redhat.com Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive") Signed-off-by: David Hildenbrand Cc: Jason Gunthorpe Cc: John Hubbard Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Peter Xu Cc: Alistair Popple Cc: Nadav Amit Cc: Yang Shi Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mike Kravetz Cc: Andrea Parri Cc: Will Deacon Cc: Peter Zijlstra Cc: "Paul E.
McKenney" Cc: Christoph von Recklinghausen Cc: Don Dutile Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +++++-- include/linux/rmap.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e98ef2cb1176..8a5ad9d050bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2999,8 +2999,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): - * clear/invalidate+flush of the page table entry + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a @@ -3021,6 +3021,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) */ if (!PageAnon(page)) return false; + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf80adca980b..72b2bcc37f73 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -267,7 +267,7 @@ dup: * @page: the exclusive anonymous page to try marking possibly shared * * The caller needs to hold the PT lock and has to have the page table entry - * cleared/invalidated+flushed, to properly sync against GUP-fast. + * cleared/invalidated. * * This is similar to page_try_dup_anon_rmap(), however, not used during fork() * to duplicate a mapping, but instead to prepare for KSM or temporarily @@ -283,12 +283,68 @@ static inline int page_try_share_anon_rmap(struct page *page) { VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); - /* See page_try_dup_anon_rmap(). */ - if (likely(!is_device_private_page(page) && - unlikely(page_maybe_dma_pinned(page)))) - return -EBUSY; + /* device private pages cannot get pinned via GUP. */ + if (unlikely(is_device_private_page(page))) { + ClearPageAnonExclusive(page); + return 0; + } + /* + * We have to make sure that when we clear PageAnonExclusive, that + * the page is not pinned and that concurrent GUP-fast won't succeed in + * concurrently pinning the page. + * + * Conceptually, PageAnonExclusive clearing consists of: + * (A1) Clear PTE + * (A2) Check if the page is pinned; back off if so. + * (A3) Clear PageAnonExclusive + * (A4) Restore PTE (optional, but certainly not writable) + * + * When clearing PageAnonExclusive, we cannot possibly map the page + * writable again, because anon pages that may be shared must never + * be writable. So in any case, if the PTE was writable it cannot + * be writable anymore afterwards and there would be a PTE change. Only + * if the PTE wasn't writable, there might not be a PTE change. + * + * Conceptually, GUP-fast pinning of an anon page consists of: + * (B1) Read the PTE + * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so. + * (B3) Pin the mapped page + * (B4) Check if the PTE changed by re-reading it; back off if so. + * (B5) If the original PTE is not writable, check if + * PageAnonExclusive is not set; back off if so. 
+ * + * If the PTE was writable, we only have to make sure that GUP-fast + * observes a PTE change and properly backs off. + * + * If the PTE was not writable, we have to make sure that GUP-fast either + * detects a (temporary) PTE change or that PageAnonExclusive is cleared + * and properly backs off. + * + * Consequently, when clearing PageAnonExclusive(), we have to make + * sure that (A1), (A2)/(A3) and (A4) happen in the right memory + * order. In GUP-fast pinning code, we have to make sure that (B3),(B4) + * and (B5) happen in the right memory order. + * + * We assume that there might not be a memory barrier after + * clearing/invalidating the PTE (A1) and before restoring the PTE (A4), + * so we use explicit ones here. + */ + + /* Paired with the memory barrier in try_grab_folio(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb(); + + if (unlikely(page_maybe_dma_pinned(page))) + return -EBUSY; ClearPageAnonExclusive(page); + + /* + * This is conceptually a smp_wmb() paired with the smp_rmb() in + * gup_must_unshare(). + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb__after_atomic(); return 0; } -- cgit v1.2.3 From 7bb5da0d490b2d836c5218f5186ee588d2145310 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:57 +0100 Subject: kexec: turn all kexec_mutex acquisitions into trylocks Patch series "kexec, panic: Making crash_kexec() NMI safe", v4. This patch (of 2): Most acquisitions of kexec_mutex are done via mutex_trylock() - those were a direct "translation" from: 8c5a1cf0ad3a ("kexec: use a mutex for locking rather than xchg()"). There have, however, been two additions since then that use mutex_lock(): crash_get_memory_size() and crash_shrink_memory(). A later commit will replace said mutex with an atomic variable, and locking operations will become atomic_cmpxchg(). Rather than having those mutex_lock() become while (atomic_cmpxchg(&lock, 0, 1)), turn them into trylocks that can return -EBUSY on acquisition failure. This does halve the printable size of the crash kernel, but that's still neighbouring 2G for 32bit kernels which should be ample enough. Link: https://lkml.kernel.org/r/20220630223258.4144112-1-vschneid@redhat.com Link: https://lkml.kernel.org/r/20220630223258.4144112-2-vschneid@redhat.com Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Baoquan He Signed-off-by: Andrew Morton --- include/linux/kexec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 13e6c4b58f07..41a686996aaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -427,7 +427,7 @@ extern int kexec_load_disabled; extern bool kexec_in_progress; int crash_shrink_memory(unsigned long new_size); -size_t crash_get_memory_size(void); +ssize_t crash_get_memory_size(void); #ifndef arch_kexec_protect_crashkres /* -- cgit v1.2.3 From 2be9880dc87342dc7ae459c9ea5c9ee2a45b33d8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 09:44:06 +0800 Subject: kernel: exit: cleanup release_thread() Only x86 has its own release_thread(); introduce a new weak release_thread() function to remove the empty definitions in other architectures.
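A minimal sketch of the weak-function pattern this relies on (the actual default lands in kernel/exit.c; shown here only to illustrate how the override works):

	/* Sketch: weak default that an architecture such as x86 overrides
	 * simply by providing its own non-weak release_thread(). */
	void __weak release_thread(struct task_struct *dead_task)
	{
	}
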
Link: https://lkml.kernel.org/r/20220819014406.32266-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Guo Ren [csky] Acked-by: Russell King (Oracle) Acked-by: Geert Uytterhoeven Acked-by: Brian Cain Acked-by: Michael Ellerman [powerpc] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Cc: Alexander Gordeev Cc: Anton Ivanov Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren [csky] Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/sched/task.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 81cab4b01edc..d6c48163c6de 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else -- cgit v1.2.3 From da3f52ba359590d8eb465ae0d8b6e11d6fd9432f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Aug 2022 21:30:11 +0200 Subject: iversion: use atomic64_try_cmpxchg() Use atomic64_try_cmpxchg instead of atomic64_cmpxchg (*ptr, old, new) == old in inode_set_max_iversion_raw, inode_maybe_inc_iversion and inode_query_iversion. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
The loop in inode_maybe_inc_iversion improves from:

	5563:	48 89 ca		mov    %rcx,%rdx
	5566:	48 89 c8		mov    %rcx,%rax
	5569:	48 83 e2 fe		and    $0xfffffffffffffffe,%rdx
	556d:	48 83 c2 02		add    $0x2,%rdx
	5571:	f0 48 0f b1 16		lock cmpxchg %rdx,(%rsi)
	5576:	48 39 c1		cmp    %rax,%rcx
	5579:	0f 84 85 fc ff ff	je     5204 <...>
	557f:	48 89 c1		mov    %rax,%rcx
	5582:	eb df			jmp    5563 <...>

to:

	5563:	48 89 c2		mov    %rax,%rdx
	5566:	48 83 e2 fe		and    $0xfffffffffffffffe,%rdx
	556a:	48 83 c2 02		add    $0x2,%rdx
	556e:	f0 48 0f b1 11		lock cmpxchg %rdx,(%rcx)
	5573:	0f 84 8b fc ff ff	je     5204 <...>
	5579:	eb e8			jmp    5563 <...>

Link: https://lkml.kernel.org/r/20220821193011.88208-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/iversion.h | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 3bfebde5a1a6..eb5a15810169 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -123,17 +123,12 @@ inode_peek_iversion_raw(const struct inode *inode) static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { - u64 cur, old; + u64 cur = inode_peek_iversion_raw(inode); - cur = inode_peek_iversion_raw(inode); - for (;;) { + do { if (cur > val) break; - old = atomic64_cmpxchg(&inode->i_version, cur, val); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** @@ -197,7 +192,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) static inline bool inode_maybe_inc_iversion(struct inode *inode, bool force) { - u64 cur, old, new; + u64 cur, new; /* * The i_version field is not strictly ordered with any other inode @@ -211,19 +206,14 @@ inode_maybe_inc_iversion(struct inode *inode, bool force) */ smp_mb(); cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } @@ -304,10 +294,10 @@ inode_peek_iversion(const struct inode *inode) static inline u64 inode_query_iversion(struct inode *inode) { - u64 cur, old, new; + u64 cur, new; cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* @@ -320,11 +310,7 @@ inode_query_iversion(struct inode *inode) } new = cur | I_VERSION_QUERIED; - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } -- cgit v1.2.3 From e1d7c7609ae0933b59840390c1b207ac0a925c8b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:38:51 +0200 Subject: bitops: use try_cmpxchg in set_mask_bits and bit_clear_unless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in set_mask_bits and bit_clear_unless. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
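The conversion pattern is generic and applies to any cmpxchg() retry loop; a sketch, where compute() stands in for whatever hypothetical update step the loop performs:

	/* Before: open-coded loop comparing the returned old value. */
	old = READ_ONCE(*ptr);
	for (;;) {
		new = compute(old);		/* hypothetical update step */
		cur = cmpxchg(ptr, old, new);
		if (likely(cur == old))
			break;
		old = cur;
	}

	/* After: try_cmpxchg() returns a bool and updates 'old' on failure,
	 * which maps directly onto the CMPXCHG ZF result on x86. */
	old = READ_ONCE(*ptr);
	do {
		new = compute(old);		/* hypothetical update step */
	} while (!try_cmpxchg(ptr, &old, new));
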
Link: https://lkml.kernel.org/r/20220822143851.3290-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/bitops.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8..fb24a513d917 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -328,10 +328,10 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ new__ = (old__ & ~mask__) | bits__; \ - } while (cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) @@ -343,11 +343,12 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ + if (old__ & test__) \ + break; \ new__ = old__ & ~clear__; \ - } while (!(old__ & test__) && \ - cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) -- cgit v1.2.3 From 4acb83417cadfdcbe64215f9d0ddcf3132af808e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Sep 2022 11:40:22 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220909184022.1709476-1-kbusch@fb.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. */ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From 320fb0f91e55ba248d4bad106b408e59099cfa89 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:37 +0800 Subject: blk-throttle: fix that io throttle can only work for single bio Test scripts:

	cd /sys/fs/cgroup/blkio/
	echo "8:0 1024" > blkio.throttle.write_bps_device
	echo $$ > cgroup.procs
	dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &
	dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &

Test result:

	10240 bytes (10 kB, 10 KiB) copied, 10.0134 s, 1.0 kB/s
	10240 bytes (10 kB, 10 KiB) copied, 10.0135 s, 1.0 kB/s

The problem is that the second bio is finished after 10s instead of 20s. Root cause: 1) second bio will be flagged:

__blk_throtl_bio
 while (true) {
  ...
  if (sq->nr_queued[rw]) -> some bio is throttled already
   break
 };
 bio_set_flag(bio, BIO_THROTTLED); -> flag the bio

2) flagged bio will be dispatched without waiting:

throtl_dispatch_tg
 tg_may_dispatch
  tg_with_in_bps_limit
   if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED))
    *wait = 0; -> wait time is zero
    return true;

Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") added support for counting split bios toward the iops limit, and thus it added a flagged-bio check in tg_with_in_bps_limit() so that split bios only count once toward the bps limit. However, it introduced a new problem: io throttle won't work if multiple bios are throttled. In order to fix the problem, handle the iops and bps limits in different ways: 1) for the iops limit, there is no flag to record whether the bio was throttled, and the iops limit is always applied; 2) for the bps limit, the original bio is flagged with BIO_BPS_THROTTLED, and io throttle ignores bios with the flag. Note that this patch also removes the code that sets the flag in __bio_clone(); it was introduced in commit 111be8839817 ("block-throttle: avoid double charge"), whose author thought a split bio could be resubmitted and throttled again, which is wrong because a split bio continues to be dispatched from the caller. Fixes: 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") Cc: Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blk_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ca22b06700a9..2c5806997bbf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -509,7 +509,7 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) - bio_clear_flag(bio, BIO_THROTTLED); + bio_clear_flag(bio, BIO_BPS_THROTTLED); bio->bi_bdev = bdev; bio_associate_blkg(bio); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ef99790f6ed..41afb4cfb9b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -325,7 +325,7 @@ enum { BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ - BIO_THROTTLED, /* This bio has already been subjected to + BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ -- cgit v1.2.3 From fd72cb1bb5c71d2738a17cbdee6280365a451706 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 8 Sep 2022 00:50:59 +0300 Subject: mei: add kdoc for struct mei_aux_device struct mei_aux_device is an interface structure and requires proper documentation.
Signed-off-by: Tomas Winkler Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patchwork.freedesktop.org/patch/msgid/20220907215113.1596567-3-tomas.winkler@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Joonas Lahtinen --- include/linux/mei_aux.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_aux.h b/include/linux/mei_aux.h index 587f25128848..a0cb587006d5 100644 --- a/include/linux/mei_aux.h +++ b/include/linux/mei_aux.h @@ -7,6 +7,12 @@ #include +/** + * struct mei_aux_device - mei auxiliary device + * @aux_dev: - auxiliary device object + * @irq: interrupt driving the mei auxiliary device + * @bar: mmio resource bar reserved to mei auxiliary device + */ struct mei_aux_device { struct auxiliary_device aux_dev; int irq; -- cgit v1.2.3 From ed57967ab64f1dcdb9efb78a3f4a950dfdccf544 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 8 Sep 2022 00:51:00 +0300 Subject: mei: add slow_firmware flag to the mei auxiliary device Add slow_firmware flag to the mei auxiliary device info to inform the mei driver about slow underlying firmware. Such firmware will require to use larger operation timeouts. Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patchwork.freedesktop.org/patch/msgid/20220907215113.1596567-4-tomas.winkler@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Joonas Lahtinen --- include/linux/mei_aux.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_aux.h b/include/linux/mei_aux.h index a0cb587006d5..4894d8bf4159 100644 --- a/include/linux/mei_aux.h +++ b/include/linux/mei_aux.h @@ -12,11 +12,14 @@ * @aux_dev: - auxiliary device object * @irq: interrupt driving the mei auxiliary device * @bar: mmio resource bar reserved to mei auxiliary device + * @slow_firmware: The device has slow underlying firmware. + * Such firmware will require to use larger operation timeouts. */ struct mei_aux_device { struct auxiliary_device aux_dev; int irq; struct resource bar; + bool slow_firmware; }; #define auxiliary_dev_to_mei_aux_dev(auxiliary_dev) \ -- cgit v1.2.3 From 342e4c7e2d38e421f99898b629b5d82e5ae90323 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 8 Sep 2022 00:51:08 +0300 Subject: mei: gsc: setup gsc extended operational memory 1. Retrieve extended operational memory physical pointers from the auxiliary device info. 2. Setup memory registers. 3. Notify firmware that the memory is ready by sending the memory ready command. 4. Disable PXP device if GSC is not in PXP mode. 
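For illustration, a consumer driver bound via the auxiliary bus might pick up these resources roughly as follows (a sketch only; the probe function, handler and setup_ext_op_mem() are hypothetical, and error handling is elided):

	static int foo_gsc_probe(struct auxiliary_device *aux_dev,
				 const struct auxiliary_device_id *aux_dev_id)
	{
		struct mei_aux_device *adev = auxiliary_dev_to_mei_aux_dev(aux_dev);
		void __iomem *regs;

		/* MMIO registers and interrupt reserved for the mei device */
		regs = ioremap(adev->bar.start, resource_size(&adev->bar));
		if (!regs)
			return -ENOMEM;

		/* extended operational memory, when running in graphics PXP mode */
		if (adev->ext_op_mem.start)
			setup_ext_op_mem(adev->ext_op_mem.start,	/* hypothetical */
					 resource_size(&adev->ext_op_mem));

		return request_irq(adev->irq, foo_gsc_irq_handler,	/* hypothetical */
				   IRQF_SHARED, "foo-gsc", adev);
	}
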
CC: Daniele Ceraolo Spurio Signed-off-by: Tomas Winkler Signed-off-by: Alexander Usyskin Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patchwork.freedesktop.org/patch/msgid/20220907215113.1596567-12-tomas.winkler@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Joonas Lahtinen --- include/linux/mei_aux.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_aux.h b/include/linux/mei_aux.h index 4894d8bf4159..506912ad363b 100644 --- a/include/linux/mei_aux.h +++ b/include/linux/mei_aux.h @@ -12,6 +12,8 @@ * @aux_dev: - auxiliary device object * @irq: interrupt driving the mei auxiliary device * @bar: mmio resource bar reserved to mei auxiliary device + * @ext_op_mem: resource for extend operational memory + * used in graphics PXP mode. * @slow_firmware: The device has slow underlying firmware. * Such firmware will require to use larger operation timeouts. */ @@ -19,6 +21,7 @@ struct mei_aux_device { struct auxiliary_device aux_dev; int irq; struct resource bar; + struct resource ext_op_mem; bool slow_firmware; }; -- cgit v1.2.3 From a47126ec29f538e1197862919f94d3b6668144a4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:24:57 -0500 Subject: PCI/PTM: Cache PTM Capability offset Cache the PTM Capability offset instead of searching for it every time we enable/disable PTM or save/restore PTM state. No functional change intended. Link: https://lore.kernel.org/r/20220909202505.314195-2-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Mika Westerberg --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd..54be939023a3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -475,6 +475,7 @@ struct pci_dev { unsigned int broken_cmd_compl:1; /* No compl for some cmds */ #endif #ifdef CONFIG_PCIE_PTM + u16 ptm_cap; /* PTM Capability */ unsigned int ptm_root:1; unsigned int ptm_enabled:1; u8 ptm_granularity; -- cgit v1.2.3 From e8bdc5ea481638e0a4fd5639050d2b170417f493 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:00 -0500 Subject: PCI/PTM: Add pci_suspend_ptm() and pci_resume_ptm() We disable PTM during suspend because that allows some Root Ports to enter lower-power PM states, which means we also need to disable PTM for all downstream devices. Add pci_suspend_ptm() and pci_resume_ptm() for this purpose. pci_enable_ptm() and pci_disable_ptm() are for drivers to use to enable or disable PTM. They use dev->ptm_enabled to keep track of whether PTM should be enabled. pci_suspend_ptm() and pci_resume_ptm() are PCI core-internal functions to temporarily disable PTM during suspend and (depending on dev->ptm_enabled) re-enable PTM during resume. Enable/disable/suspend/resume all use internal __pci_enable_ptm() and __pci_disable_ptm() functions that only update the PTM Control register.
Outline:

	pci_enable_ptm(struct pci_dev *dev)
	{
		__pci_enable_ptm(dev);
		dev->ptm_enabled = 1;
		pci_ptm_info(dev);
	}

	pci_disable_ptm(struct pci_dev *dev)
	{
		if (dev->ptm_enabled) {
			__pci_disable_ptm(dev);
			dev->ptm_enabled = 0;
		}
	}

	pci_suspend_ptm(struct pci_dev *dev)
	{
		if (dev->ptm_enabled)
			__pci_disable_ptm(dev);
	}

	pci_resume_ptm(struct pci_dev *dev)
	{
		if (dev->ptm_enabled)
			__pci_enable_ptm(dev);
	}

Nothing currently calls pci_resume_ptm(); the suspend path saves the PTM state before disabling PTM, so the PTM state restore in the resume path implicitly re-enables it. A future change will use pci_resume_ptm() to fix some problems with this approach. Link: https://lore.kernel.org/r/20220909202505.314195-5-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 54be939023a3..cb5f796e3319 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1678,10 +1678,12 @@ bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +void pci_disable_ptm(struct pci_dev *dev); bool pcie_ptm_enabled(struct pci_dev *dev); #else static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { return -EINVAL; } +static inline void pci_disable_ptm(struct pci_dev *dev) { } static inline bool pcie_ptm_enabled(struct pci_dev *dev) { return false; } #endif -- cgit v1.2.3 From 4b9d1bc7911c9d9159c4881455c584cde99fbb19 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Tue, 6 Sep 2022 13:49:01 +0800 Subject: Input: hp_sdc: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Helge Deller --- include/linux/hp_sdc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hp_sdc.h b/include/linux/hp_sdc.h index 6f1dee7e67e0..9be8704e2d38 100644 --- a/include/linux/hp_sdc.h +++ b/include/linux/hp_sdc.h @@ -180,7 +180,7 @@ switch (val) { \ #define HP_SDC_CMD_SET_IM 0x40 /* 010xxxxx == set irq mask */ -/* The documents provided do not explicitly state that all registers betweem +/* The documents provided do not explicitly state that all registers between * 0x01 and 0x1f inclusive can be read by sending their register index as a * command, but this is implied and appears to be the case. */ -- cgit v1.2.3 From 6f7a71f804287a7566314ab1a73d8ca2c18ca0d7 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 13 Sep 2022 14:34:54 +0200 Subject: regulator: Add driver for MT6331 PMIC regulators Add a driver for the regulators found in the MT6331 PMIC. This PMIC features six buck and 21 Low DropOut (LDO) regulators. Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220913123456.384513-3-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6331-regulator.h | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/regulator/mt6331-regulator.h (limited to 'include/linux') diff --git a/include/linux/regulator/mt6331-regulator.h b/include/linux/regulator/mt6331-regulator.h new file mode 100644 index 000000000000..2801a9879c14 --- /dev/null +++ b/include/linux/regulator/mt6331-regulator.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Collabora Ltd.
+ * Author: AngeloGioacchino Del Regno + */ + +#ifndef __LINUX_REGULATOR_MT6331_H +#define __LINUX_REGULATOR_MT6331_H + +enum { + /* BUCK */ + MT6331_ID_VDVFS11 = 0, + MT6331_ID_VDVFS12, + MT6331_ID_VDVFS13, + MT6331_ID_VDVFS14, + MT6331_ID_VCORE2, + MT6331_ID_VIO18, + /* LDO */ + MT6331_ID_VTCXO1, + MT6331_ID_VTCXO2, + MT6331_ID_AVDD32_AUD, + MT6331_ID_VAUXA32, + MT6331_ID_VCAMA, + MT6331_ID_VIO28, + MT6331_ID_VCAM_AF, + MT6331_ID_VMC, + MT6331_ID_VMCH, + MT6331_ID_VEMC33, + MT6331_ID_VGP1, + MT6331_ID_VSIM1, + MT6331_ID_VSIM2, + MT6331_ID_VMIPI, + MT6331_ID_VIBR, + MT6331_ID_VGP4, + MT6331_ID_VCAMD, + MT6331_ID_VUSB10, + MT6331_ID_VCAM_IO, + MT6331_ID_VSRAM_DVFS1, + MT6331_ID_VGP2, + MT6331_ID_VGP3, + MT6331_ID_VRTC, + MT6331_ID_VDIG18, + MT6331_ID_VREG_MAX +}; + +#endif /* __LINUX_REGULATOR_MT6331_H */ -- cgit v1.2.3 From 1cc5a52e873a4f9725eafe5aa9cd213b7b58e29e Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 13 Sep 2022 14:34:56 +0200 Subject: regulator: Add driver for MT6332 PMIC regulators Add a driver for the regulators found in the MT6332 PMICs, including six buck and four LDO regulators. Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220913123456.384513-5-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6332-regulator.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/regulator/mt6332-regulator.h (limited to 'include/linux') diff --git a/include/linux/regulator/mt6332-regulator.h b/include/linux/regulator/mt6332-regulator.h new file mode 100644 index 000000000000..af5e3ed31029 --- /dev/null +++ b/include/linux/regulator/mt6332-regulator.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Collabora Ltd. + * Author: AngeloGioacchino Del Regno + */ + +#ifndef __LINUX_REGULATOR_MT6332_H +#define __LINUX_REGULATOR_MT6332_H + +enum { + /* BUCK */ + MT6332_ID_VDRAM = 0, + MT6332_ID_VDVFS2, + MT6332_ID_VPA, + MT6332_ID_VRF1, + MT6332_ID_VRF2, + MT6332_ID_VSBST, + /* LDO */ + MT6332_ID_VAUXB32, + MT6332_ID_VBIF28, + MT6332_ID_VDIG18, + MT6332_ID_VSRAM_DVFS2, + MT6332_ID_VUSB33, + MT6332_ID_VREG_MAX +}; + +#endif /* __LINUX_REGULATOR_MT6332_H */ -- cgit v1.2.3 From 1dd611a9c55f6657328429b988e302323691c3dc Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 19 Aug 2022 23:26:17 +0200 Subject: mmc: core: Switch to basic workqueue API for sdio_irq_work The delay parameter isn't set by any user, therefore simplify the code and switch to the basic workqueue API w/o delay support. This also reduces the size of struct mmc_host. 
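The conversion follows the usual pattern for dropping an unused delay; a generic sketch (not the driver diff itself; sdio_irq_work_fn stands in for the actual work handler):

	/* Before: delayed work that was only ever queued with a 0 delay. */
	INIT_DELAYED_WORK(&host->sdio_irq_work, sdio_irq_work_fn);
	queue_delayed_work(system_wq, &host->sdio_irq_work, 0);

	/* After: a plain work item; struct work_struct also drops the
	 * embedded timer that struct delayed_work carries, shrinking
	 * struct mmc_host. */
	INIT_WORK(&host->sdio_irq_work, sdio_irq_work_fn);
	queue_work(system_wq, &host->sdio_irq_work);
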
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/13d8200a-e2a8-d907-38ce-a16fc5ce14aa@gmail.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index eb8bc5b9b0b7..8fdd3cf971a3 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -476,7 +476,7 @@ struct mmc_host { unsigned int sdio_irqs; struct task_struct *sdio_irq_thread; - struct delayed_work sdio_irq_work; + struct work_struct sdio_irq_work; bool sdio_irq_pending; atomic_t sdio_irq_thread_abort; -- cgit v1.2.3 From 96a7457a14d9cf98cf58de1e26c03180e0f28141 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:19 +0200 Subject: can: skb: unify skb CAN frame identification helpers Replace open coded checks for sk_buffs containing Classical CAN and CAN FD frame structures as a preparation for CAN XL support. With the added length check, the unintended processing of CAN XL frames (which have the CANXL_XLF bit set) can be suppressed even when skb->len matches a non-CAN-XL frame. The CAN_RAW socket needs a rework to use these helpers. Therefore the use of these helpers is postponed to the CAN_RAW CAN XL integration. The J1939 protocol gets a check for Classical CAN frames too. Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 182749e858b3..27ebfc28510f 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -97,10 +97,20 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) return nskb; } +static inline bool can_is_can_skb(const struct sk_buff *skb) +{ + struct can_frame *cf = (struct can_frame *)skb->data; + + /* the CAN specific type of skb is identified by its data length */ + return (skb->len == CAN_MTU && cf->len <= CAN_MAX_DLEN); +} + static inline bool can_is_canfd_skb(const struct sk_buff *skb) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + /* the CAN specific type of skb is identified by its data length */ - return skb->len == CANFD_MTU; + return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 467ef4c7b9d1c22ee64342804bf92549d765df14 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:20 +0200 Subject: can: skb: add skb CAN frame data length helpers Add two helpers to retrieve the data length from CAN sk_buffs and prepare the length information to be a uint16 value for the CAN XL support.
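A typical caller, e.g. a driver TX-done path updating statistics, might use the helper like this (a sketch; everything except the helper itself is made up):

	/* Sketch: account transmitted payload bytes, RTR-aware. */
	static void foo_can_tx_done(struct net_device *dev, struct sk_buff *skb)
	{
		dev->stats.tx_packets++;
		/* 0 for RTR frames, the cf->len / cfd->len value otherwise */
		dev->stats.tx_bytes += can_skb_get_data_len(skb);
	}
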
Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-3-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 27ebfc28510f..ddffc2fc008c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -20,7 +20,8 @@ void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, - u8 *len_ptr, unsigned int *frame_len_ptr); + unsigned int *len_ptr, + unsigned int *frame_len_ptr); unsigned int __must_check can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); @@ -113,4 +114,25 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } +/* get length element value from can[fd]_frame structure */ +static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) +{ + const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + + return cfd->len; +} + +/* get needed data length inside CAN frame for all frame types (RTR aware) */ +static inline unsigned int can_skb_get_data_len(struct sk_buff *skb) +{ + unsigned int len = can_skb_get_len_val(skb); + const struct can_frame *cf = (struct can_frame *)skb->data; + + /* RTR frames have an actual length of zero */ + if (can_is_can_skb(skb) && cf->can_id & CAN_RTR_FLAG) + return 0; + + return len; +} + #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From fb08cba12b52cba4366e858932307649dc5304e2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:23 +0200 Subject: can: canxl: update CAN infrastructure for CAN XL frames

- add new ETH_P_CANXL ethernet protocol type
- update skb checks for CAN XL
- add alloc_canxl_skb() which now needs a data length parameter
- introduce init_can_skb_reserve() to reduce code duplication

Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-6-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index ddffc2fc008c..1abc25a8d144 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -30,6 +30,9 @@ void can_flush_echo_skb(struct net_device *dev); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); +struct sk_buff *alloc_canxl_skb(struct net_device *dev, + struct canxl_frame **cxl, + unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); @@ -114,11 +117,29 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } -/* get length element value from can[fd]_frame structure */ +static inline bool can_is_canxl_skb(const struct sk_buff *skb) +{ + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; + + if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU)
return false; + + /* this also checks valid CAN XL data length boundaries */ + if (skb->len != CANXL_HDR_SIZE + cxl->len) + return false; + + return cxl->flags & CANXL_XLF; +} + +/* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + if (can_is_canxl_skb(skb)) + return cxl->len; + return cfd->len; } -- cgit v1.2.3 From ebf87fc728502244550eaf8819fc785e2014b2ad Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:24 +0200 Subject: can: dev: add CAN XL support to virtual CAN Make use of the new can_skb_get_data_len() helper. Add support for variable CANXL MTU using the new can_is_canxl_dev_mtu(). Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-7-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c3e50e537e39..58f5431a5559 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -147,6 +147,11 @@ static inline u32 can_get_static_ctrlmode(struct can_priv *priv) return priv->ctrlmode & ~priv->ctrlmode_supported; } +static inline bool can_is_canxl_dev_mtu(unsigned int mtu) +{ + return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU); +} + void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, -- cgit v1.2.3 From 85ebe0afd3f8377a9b506321314f4b8344caa8ec Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 18 Aug 2022 12:28:10 -0400 Subject: isa: Introduce the module_isa_driver_with_irq helper macro Several ISA drivers feature IRQ support that can be configured via an "irq" array module parameter. This array typically matches directly with the respective "base" array module parameter. To reduce code repetition, a module_isa_driver_with_irq helper macro is introduced to provide a check ensuring that the number of "irq" passed to the module matches the respective number of "base".
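Usage in a driver then collapses to a single line; a sketch with made-up names (foo_probe and FOO_MAX_DEV are hypothetical):

	static unsigned int foo_base[FOO_MAX_DEV];	/* "base" module parameter */
	static unsigned int foo_num;			/* entries given for "base" */
	static unsigned int foo_irq[FOO_MAX_DEV];	/* "irq" module parameter */
	static unsigned int foo_num_irq;		/* entries given for "irq" */

	static struct isa_driver foo_driver = {
		.probe	= foo_probe,
		.driver	= {
			.name = "foo",
		},
	};

	/* Registers init/exit and fails with -EINVAL on a base/irq count mismatch */
	module_isa_driver_with_irq(foo_driver, foo_num, foo_num_irq);
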
Signed-off-by: William Breathitt Gray Acked-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- include/linux/isa.h | 52 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/isa.h b/include/linux/isa.h index e30963190968..4fbbf5e36e08 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -38,6 +38,32 @@ static inline void isa_unregister_driver(struct isa_driver *d) } #endif +#define module_isa_driver_init(__isa_driver, __num_isa_dev) \ +static int __init __isa_driver##_init(void) \ +{ \ + return isa_register_driver(&(__isa_driver), __num_isa_dev); \ +} \ +module_init(__isa_driver##_init) + +#define module_isa_driver_with_irq_init(__isa_driver, __num_isa_dev, __num_irq) \ +static int __init __isa_driver##_init(void) \ +{ \ + if (__num_irq != __num_isa_dev) { \ + pr_err("%s: Number of irq (%u) does not match number of base (%u)\n", \ + __isa_driver.driver.name, __num_irq, __num_isa_dev); \ + return -EINVAL; \ + } \ + return isa_register_driver(&(__isa_driver), __num_isa_dev); \ +} \ +module_init(__isa_driver##_init) + +#define module_isa_driver_exit(__isa_driver) \ +static void __exit __isa_driver##_exit(void) \ +{ \ + isa_unregister_driver(&(__isa_driver)); \ +} \ +module_exit(__isa_driver##_exit) + /** * module_isa_driver() - Helper macro for registering a ISA driver * @__isa_driver: isa_driver struct @@ -48,16 +74,22 @@ static inline void isa_unregister_driver(struct isa_driver *d) * use this macro once, and calling it replaces module_init and module_exit. */ #define module_isa_driver(__isa_driver, __num_isa_dev) \ -static int __init __isa_driver##_init(void) \ -{ \ - return isa_register_driver(&(__isa_driver), __num_isa_dev); \ -} \ -module_init(__isa_driver##_init); \ -static void __exit __isa_driver##_exit(void) \ -{ \ - isa_unregister_driver(&(__isa_driver)); \ -} \ -module_exit(__isa_driver##_exit); +module_isa_driver_init(__isa_driver, __num_isa_dev); \ +module_isa_driver_exit(__isa_driver) + +/** + * module_isa_driver_with_irq() - Helper macro for registering an ISA driver with irq + * @__isa_driver: isa_driver struct + * @__num_isa_dev: number of devices to register + * @__num_irq: number of IRQ to register + * + * Helper macro for ISA drivers with irq that do not do anything special in + * module init/exit. Each module may only use this macro once, and calling it + * replaces module_init and module_exit. + */ +#define module_isa_driver_with_irq(__isa_driver, __num_isa_dev, __num_irq) \ +module_isa_driver_with_irq_init(__isa_driver, __num_isa_dev, __num_irq); \ +module_isa_driver_exit(__isa_driver) /** * max_num_isa_dev() - Maximum possible number registered of an ISA device -- cgit v1.2.3 From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 12 Sep 2022 08:45:44 -0700 Subject: bpf: Add verifier check for BPF_PTR_POISON retval and arg BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing referenced kptr in map") to denote a bpf_func_proto btf_id which the verifier will replace with a dynamically-determined btf_id at verification time. This patch adds verifier 'poison' functionality to BPF_PTR_POISON in order to prepare for expanded use of the value to poison ret- and arg-btf_id in ongoing work, namely rbtree and linked list patchsets [0, 1]. 
Specifically, when the verifier checks helper calls, it assumes that a BPF_PTR_POISON'ed ret type will be replaced with a valid type before - or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id. If a poisoned btf_id reaches the default handling block for either, consider this a verifier internal error and fail verification. Otherwise a helper w/ poisoned btf_id but no verifier logic replacing the type will cause a crash as the invalid pointer is dereferenced. Also move BPF_PTR_POISON to the existing include/linux/poison.h header and remove an unnecessary shift. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com [1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com Signed-off-by: Dave Marchevsky Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/poison.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a6b4e9..2d3249eb0e62 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,7 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + #endif -- cgit v1.2.3 From 2747b93ebbede2af2d7bb088b9ddae3193ceede8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 18:08:56 +0200 Subject: locking: Detect includes rwlock.h outside of spinlock.h From: Michael S. Tsirkin The check for __LINUX_SPINLOCK_H within rwlock.h (and other files) detects the direct include of the header file if it is at the very beginning of the include section. If it is listed later then chances are high that spinlock.h was already included (including rwlock.h) and the additional listing of rwlock.h will not cause any failure. On PREEMPT_RT this additional rwlock.h will lead to compile failures since it uses a different rwlock implementation. Add __LINUX_INSIDE_SPINLOCK_H to spinlock.h and check for this instead of __LINUX_SPINLOCK_H to detect wrong includes. This will help detect direct includes of rwlock.h with or without PREEMPT_RT enabled. [ bigeasy: add remaining __LINUX_SPINLOCK_H user and rewrite commit description. ] Signed-off-by: Michael S.
Tsirkin Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YweemHxJx7O8rjBx@linutronix.de --- include/linux/rwlock.h | 2 +- include/linux/spinlock.h | 2 ++ include/linux/spinlock_api_smp.h | 2 +- include/linux/spinlock_api_up.h | 2 +- include/linux/spinlock_rt.h | 2 +- include/linux/spinlock_up.h | 2 +- 6 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 8f416c5e929e..c0ef596f340b 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 5c0c5174155d..1341f7d62da4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H +#define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations @@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, void free_bucket_spinlocks(spinlock_t *locks); +#undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51fa0dab68c4..89eb6f4c659c 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index b8ba00ccccde..819aeba1c87e 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 835aedaf68ac..61c49b16f69a 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H #error Do not include directly. Use spinlock.h #endif diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 16521074b6f7..c87204247592 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif -- cgit v1.2.3 From f24a0b1c22c2e90abb4ee1f7a3b0f0d8fc2ede5f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:09 +0200 Subject: clk: Mention that .recalc_rate can return 0 on error Multiple platforms (amlogic, imx8) return 0 when the clock rate cannot be determined properly by the recalc_rate hook. Mention in the documentation that the framework is ok with that. 
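A provider following this convention might look like the sketch below (the register layout, wrapper struct and FOO_DIV_MASK are invented):

	static unsigned long foo_clk_recalc_rate(struct clk_hw *hw,
						 unsigned long parent_rate)
	{
		struct foo_clk *foo = to_foo_clk(hw);	/* hypothetical wrapper */
		u32 div = readl(foo->reg) & FOO_DIV_MASK;

		/* The rate cannot be determined from the hardware: return 0. */
		if (!div)
			return 0;

		return parent_rate / div;
	}
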
Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-5-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..9a14cfa0d201 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -118,8 +118,9 @@ struct clk_duty { * * @recalc_rate Recalculate the rate of this clock, by querying hardware. The * parent rate is an input parameter. It is up to the caller to - * ensure that the prepare_mutex is held across this call. - * Returns the calculated rate. Optional, but recommended - if + * ensure that the prepare_mutex is held across this call. If the + * driver cannot figure out a rate for this clock, it must return + * 0. Returns the calculated rate. Optional, but recommended - if * this op is not set then clock rate will be initialized to 0. * * @round_rate: Given a target rate as input, returns the closest rate actually -- cgit v1.2.3 From c35e84b0977617f96fb1dbef3fb8d71a861325c0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:21 +0200 Subject: clk: Introduce clk_hw_init_rate_request() clk-divider instantiates clk_rate_request internally for its round_rate implementations to share the code with its determine_rate implementations. However, it's missing a few fields (min_rate, max_rate) that would be initialized properly if it were using clk_core_init_rate_req(). Let's create the clk_hw_init_rate_request() function for clock providers to be able to share the code that instantiates clk_rate_requests with the framework. This will also be useful for some tests introduced in later patches. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-17-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 9a14cfa0d201..d857717aa386 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -42,6 +42,8 @@ struct dentry; * struct clk_rate_request - Structure encoding the clk constraints that * a clock user might require. * + * Should be initialized by calling clk_hw_init_rate_request(). + * * @rate: Requested clock rate. This field will be adjusted by * clock drivers according to hardware capabilities. * @min_rate: Minimum rate imposed by clk users. @@ -60,6 +62,10 @@ struct clk_rate_request { struct clk_hw *best_parent_hw; }; +void clk_hw_init_rate_request(const struct clk_hw *hw, + struct clk_rate_request *req, + unsigned long rate); + /** * struct clk_duty - Struture encoding the duty cycle ratio of a clock * -- cgit v1.2.3 From 22fb0e284fbc3c1b85d24c5a1df8ea3ac82ab1d1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:25 +0200 Subject: clk: Constify clk_has_parent() clk_has_parent() doesn't modify the clocks being passed, so let's make it const.
Suggested-by: Stephen Boyd Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-21-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index c13061cabdfc..1ef013324237 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -799,7 +799,7 @@ int clk_set_rate_exclusive(struct clk *clk, unsigned long rate); * * Returns true if @parent is a possible parent for @clk, false otherwise. */ -bool clk_has_parent(struct clk *clk, struct clk *parent); +bool clk_has_parent(const struct clk *clk, const struct clk *parent); /** * clk_set_rate_range - set a rate range for a clock source -- cgit v1.2.3 From 262ca38f4b6eb418b20b8e1d6d8495c6a98727c1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:26 +0200 Subject: clk: Stop forwarding clk_rate_requests to the parent If the clock cannot modify its rate and has CLK_SET_RATE_PARENT, clk_mux_determine_rate_flags(), clk_core_round_rate_nolock() and a number of drivers will forward the clk_rate_request to the parent clock. clk_core_round_rate_nolock() will pass the pointer directly, which means that we pass a clk_rate_request to the parent that has the rate, min_rate and max_rate of the child, and the best_parent_rate and best_parent_hw fields will be relative to the child as well, so will point to our current clock and its rate. The most common case for CLK_SET_RATE_PARENT is that the child and parent clock rates will be equal, so the rate field isn't a worry, but the other fields are. Similarly, if the parent clock driver ever modifies the best_parent_rate or best_parent_hw, this will be applied to the child once the call to clk_core_round_rate_nolock() is done. best_parent_hw is probably not going to be a valid parent, and best_parent_rate might lead to a parent rate change different to the one that was initially computed. clk_mux_determine_rate_flags() and the affected drivers will copy the request before forwarding it to the parents, so they won't be affected by the latter issue, but the former is still going to be there and will lead to erroneous data and context being passed to the various clock drivers in the same sub-tree. Let's create two new functions, clk_core_forward_rate_req() and clk_hw_forward_rate_request() for the framework and the clock providers that will copy a request from a child clock and update the context to match the parent's. We also update the relevant call sites in the framework and drivers to use that new function. Let's also add a test to make sure we avoid regressions there. 
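As a rough provider-side sketch of the new forwarding helper (assuming a 1:1 pass-through clock with CLK_SET_RATE_PARENT; foo_clk_determine_rate is hypothetical, and the helper signature is the one added in the hunk below):

#include <linux/clk-provider.h>

static int foo_clk_determine_rate(struct clk_hw *hw,
				  struct clk_rate_request *req)
{
	struct clk_hw *parent = clk_hw_get_parent(hw);
	struct clk_rate_request parent_req;
	int ret;

	if (!parent)
		return -EINVAL;

	/* Copy the request, rewriting its context to match the parent. */
	clk_hw_forward_rate_request(hw, req, parent, &parent_req, req->rate);

	ret = __clk_determine_rate(parent, &parent_req);
	if (ret)
		return ret;

	/* Only write back the fields the child actually owns. */
	req->best_parent_rate = parent_req.rate;
	req->rate = parent_req.rate;

	return 0;
}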
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-22-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d857717aa386..8bce6c524f29 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -65,6 +65,11 @@ struct clk_rate_request { void clk_hw_init_rate_request(const struct clk_hw *hw, struct clk_rate_request *req, unsigned long rate); +void clk_hw_forward_rate_request(const struct clk_hw *core, + const struct clk_rate_request *old_req, + const struct clk_hw *parent, + struct clk_rate_request *req, + unsigned long parent_rate); /** * struct clk_duty - Struture encoding the duty cycle ratio of a clock -- cgit v1.2.3 From 253993253466ba7187730b196174146d5247e97b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:28 +0200 Subject: clk: Introduce the clk_hw_get_rate_range function Some clock providers are hand-crafting their clk_rate_request, and need to figure out the current boundaries of their clk_hw to fill it properly. Let's create such a function for clock providers. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-24-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 8bce6c524f29..8724a3547a79 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1267,6 +1267,8 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req, unsigned long flags); void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent); +void clk_hw_get_rate_range(struct clk_hw *hw, unsigned long *min_rate, + unsigned long *max_rate); void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate, unsigned long max_rate); -- cgit v1.2.3 From b404cb45990bf24d41c29fe856aafb0746a7b81f Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Wed, 14 Sep 2022 21:21:00 +0800 Subject: soc: mediatek: Add mmsys func to adapt to dpi output for MT8186 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add mmsys func to manipulate dpi output format config for MT8186. Co-developed-by: Jitao Shi Signed-off-by: Jitao Shi Signed-off-by: Xinlei Lee Reviewed-by: Nís F. R. A. 
Prado Link: https://lore.kernel.org/all/1663161662-1598-2-git-send-email-xinlei.lee@mediatek.com/ Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mmsys.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 59117d970daf..d2b02bb43768 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -65,4 +65,6 @@ void mtk_mmsys_ddp_disconnect(struct device *dev, enum mtk_ddp_comp_id cur, enum mtk_ddp_comp_id next); +void mtk_mmsys_ddp_dpi_fmt_config(struct device *dev, u32 val); + #endif /* __MTK_MMSYS_H */ -- cgit v1.2.3 From 7187440dd7c45fd8b6dd4f3ff56b03ca4aa1bbd2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Sep 2022 15:20:23 +0300 Subject: iov_iter: use "maxpages" parameter This was intended to be "maxpages" instead of INT_MAX. There is only one caller and it passes INT_MAX so this does not affect runtime. Fixes: b93235e68921 ("tls: cap the output scatter list to something reasonable") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 5896af36199c..2e3134b14ffd 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -298,7 +298,7 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } - npages = iov_iter_npages(i, INT_MAX); + npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); -- cgit v1.2.3 From 82f00b24f532557fb0e15a6a2747859e4b70c4bd Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 9 Sep 2022 17:46:55 +0800 Subject: crypto: hisilicon/qm - get hardware features from hardware registers Before hardware V3, the hardware does not provide feature registers, so the driver resolves hardware differences based on the hardware version. As a result, the driver does not support new hardware. Hardware V3 and later versions support obtaining hardware features, such as power-gating management and doorbell isolation, through hardware registers. To be compatible with later hardware versions, the features of the current device are obtained by reading the hardware registers instead of relying on the hardware version.
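A sketch of how a driver could consume this interface (everything here is illustrative: the foo_* names and the table's offsets, shifts, masks and per-version defaults are made up, and using cap_ver as the is_read argument is an assumption, not a documented convention):

#include <linux/bitops.h>
#include <linux/hisi_acc_qm.h>
#include <linux/kernel.h>
#include <linux/pci.h>

/* Hypothetical capability table; the values do not describe real hardware. */
static const struct hisi_qm_cap_info foo_qm_cap_info[] = {
	{ QM_SUPPORT_DB_ISOLATION, 0x30, 0, 0x1, 0x0, 0x0, 0x1 },
	{ QM_SUPPORT_STOP_QP, 0x30, 1, 0x1, 0x0, 0x0, 0x1 },
};

static void foo_qm_get_hw_caps(struct hisi_qm *qm)
{
	u32 i, val;

	for (i = 0; i < ARRAY_SIZE(foo_qm_cap_info); i++) {
		/* Read the register where supported, else use the vN default. */
		val = hisi_qm_get_hw_info(qm, foo_qm_cap_info, i, qm->cap_ver);
		if (val)
			set_bit(foo_qm_cap_info[i].type, &qm->caps);
	}

	/* Feature checks then become simple bit tests. */
	if (test_bit(QM_SUPPORT_DB_ISOLATION, &qm->caps))
		dev_info(&qm->pdev->dev, "doorbell isolation supported\n");
}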
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 116e8bd68c99..851c962ba473 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -168,6 +168,15 @@ enum qm_vf_state { QM_NOT_READY, }; +enum qm_cap_bits { + QM_SUPPORT_DB_ISOLATION = 0x0, + QM_SUPPORT_FUNC_QOS, + QM_SUPPORT_STOP_QP, + QM_SUPPORT_MB_COMMAND, + QM_SUPPORT_SVA_PREFETCH, + QM_SUPPORT_RPM, +}; + struct dfx_diff_registers { u32 *regs; u32 reg_offset; @@ -258,6 +267,18 @@ struct hisi_qm_err_ini { void (*err_info_init)(struct hisi_qm *qm); }; +struct hisi_qm_cap_info { + u32 type; + /* Register offset */ + u32 offset; + /* Bit offset in register */ + u32 shift; + u32 mask; + u32 v1_val; + u32 v2_val; + u32 v3_val; +}; + struct hisi_qm_list { struct mutex lock; struct list_head list; @@ -278,6 +299,9 @@ struct hisi_qm { struct pci_dev *pdev; void __iomem *io_base; void __iomem *db_io_base; + + /* Capbility version, 0: not supports */ + u32 cap_ver; u32 sqe_size; u32 qp_base; u32 qp_num; @@ -304,6 +328,8 @@ struct hisi_qm { struct hisi_qm_err_info err_info; struct hisi_qm_err_status err_status; unsigned long misc_ctl; /* driver removing and reset sched */ + /* Device capability bit */ + unsigned long caps; struct rw_semaphore qps_lock; struct idr qp_idr; @@ -326,8 +352,6 @@ struct hisi_qm { bool use_sva; bool is_frozen; - /* doorbell isolation enable */ - bool use_db_isolation; resource_size_t phys_base; resource_size_t db_phys_base; struct uacce_device *uacce; @@ -501,6 +525,9 @@ void hisi_qm_pm_init(struct hisi_qm *qm); int hisi_qm_get_dfx_access(struct hisi_qm *qm); void hisi_qm_put_dfx_access(struct hisi_qm *qm); void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset); +u32 hisi_qm_get_hw_info(struct hisi_qm *qm, + const struct hisi_qm_cap_info *info_table, + u32 index, bool is_read); /* Used by VFIO ACC live migration driver */ struct pci_driver *hisi_sec_get_pf_driver(void); -- cgit v1.2.3 From 129a9f340172b4f3857260a7a7bb9d7b3496ba50 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 9 Sep 2022 17:46:56 +0800 Subject: crypto: hisilicon/qm - get qp num and depth from hardware registers Hardware V3 and later versions can obtain the qp num and depth supported by the hardware from registers. To be compatible with later hardware versions, get the qp num and depth from registers instead of fixed macros.
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 851c962ba473..371c0e7ffd27 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -109,7 +109,6 @@ QM_MAILBOX_TIMEOUT | QM_FLR_TIMEOUT) #define QM_BASE_CE QM_ECC_1BIT -#define QM_Q_DEPTH 1024 #define QM_MIN_QNUM 2 #define HISI_ACC_SGL_SGE_NR_MAX 255 #define QM_SHAPER_CFG 0x100164 @@ -310,6 +309,8 @@ struct hisi_qm { u32 max_qp_num; u32 vfs_num; u32 db_interval; + u16 eq_depth; + u16 aeq_depth; struct list_head list; struct hisi_qm_list *qm_list; @@ -375,6 +376,8 @@ struct hisi_qp_ops { struct hisi_qp { u32 qp_id; + u16 sq_depth; + u16 cq_depth; u8 alg_type; u8 req_type; -- cgit v1.2.3 From d90fab0deb8e580aa001f6876e4436c21e944f27 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 9 Sep 2022 17:46:58 +0800 Subject: crypto: hisilicon/qm - get error type from hardware registers Hardware V3 and later versions support getting the error type from registers. To be compatible with later hardware versions, get the error type from registers instead of fixed macros. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 371c0e7ffd27..e230c7c46110 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -87,28 +87,6 @@ #define PEH_AXUSER_CFG 0x401001 #define PEH_AXUSER_CFG_ENABLE 0xffffffff -#define QM_AXI_RRESP BIT(0) -#define QM_AXI_BRESP BIT(1) -#define QM_ECC_MBIT BIT(2) -#define QM_ECC_1BIT BIT(3) -#define QM_ACC_GET_TASK_TIMEOUT BIT(4) -#define QM_ACC_DO_TASK_TIMEOUT BIT(5) -#define QM_ACC_WB_NOT_READY_TIMEOUT BIT(6) -#define QM_SQ_CQ_VF_INVALID BIT(7) -#define QM_CQ_VF_INVALID BIT(8) -#define QM_SQ_VF_INVALID BIT(9) -#define QM_DB_TIMEOUT BIT(10) -#define QM_OF_FIFO_OF BIT(11) -#define QM_DB_RANDOM_INVALID BIT(12) -#define QM_MAILBOX_TIMEOUT BIT(13) -#define QM_FLR_TIMEOUT BIT(14) -#define QM_BASE_NFE (QM_AXI_RRESP | QM_AXI_BRESP | QM_ECC_MBIT | \ - QM_ACC_GET_TASK_TIMEOUT | QM_DB_TIMEOUT | \ - QM_OF_FIFO_OF | QM_DB_RANDOM_INVALID | \ - QM_MAILBOX_TIMEOUT | QM_FLR_TIMEOUT) -#define QM_BASE_CE QM_ECC_1BIT - #define QM_MIN_QNUM 2 #define HISI_ACC_SGL_SGE_NR_MAX 255 #define QM_SHAPER_CFG 0x100164 @@ -240,7 +218,10 @@ struct hisi_qm_err_info { char *acpi_rst; u32 msi_wr_port; u32 ecc_2bits_mask; - u32 dev_ce_mask; + u32 qm_shutdown_mask; + u32 dev_shutdown_mask; + u32 qm_reset_mask; + u32 dev_reset_mask; u32 ce; u32 nfe; u32 fe; -- cgit v1.2.3 From 882597aff2d442a52ca173c293939232c2e1ebea Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:24 -0700 Subject: Input: auo-pixcir-ts - drop support for platform data Currently there are no users of auo_pixcir_ts_platdata in the mainline, and having it (with legacy gpio numbers) prevents us from converting the driver to the gpiod API, so let's drop it. If, in the future, someone wants to use this driver on a non-device-tree, non-ACPI system, they should use static device properties instead of platform data.
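As a rough illustration of the static-device-properties route suggested above, board code could describe the device via a software node instead of platform data. This is a sketch only: the I2C address, node contents and use of the generic touchscreen property names are assumptions, as is reusing the "auo_pixcir_ts" device name.

#include <linux/i2c.h>
#include <linux/property.h>

static const struct property_entry foo_ts_props[] = {
	PROPERTY_ENTRY_U32("touchscreen-size-x", 1024),
	PROPERTY_ENTRY_U32("touchscreen-size-y", 600),
	{ }
};

static const struct software_node foo_ts_node = {
	.properties = foo_ts_props,
};

static const struct i2c_board_info foo_ts_info = {
	I2C_BOARD_INFO("auo_pixcir_ts", 0x5c),
	.swnode = &foo_ts_node,
};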
Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/input/auo-pixcir-ts.h | 44 ------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 include/linux/input/auo-pixcir-ts.h (limited to 'include/linux') diff --git a/include/linux/input/auo-pixcir-ts.h b/include/linux/input/auo-pixcir-ts.h deleted file mode 100644 index ed0776997a7a..000000000000 --- a/include/linux/input/auo-pixcir-ts.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Driver for AUO in-cell touchscreens - * - * Copyright (c) 2011 Heiko Stuebner - * - * based on auo_touch.h from Dell Streak kernel - * - * Copyright (c) 2008 QUALCOMM Incorporated. - * Copyright (c) 2008 QUALCOMM USA, INC. - */ - -#ifndef __AUO_PIXCIR_TS_H__ -#define __AUO_PIXCIR_TS_H__ - -/* - * Interrupt modes: - * periodical: interrupt is asserted periodicaly - * compare coordinates: interrupt is asserted when coordinates change - * indicate touch: interrupt is asserted during touch - */ -#define AUO_PIXCIR_INT_PERIODICAL 0x00 -#define AUO_PIXCIR_INT_COMP_COORD 0x01 -#define AUO_PIXCIR_INT_TOUCH_IND 0x02 - -/* - * @gpio_int interrupt gpio - * @int_setting one of AUO_PIXCIR_INT_* - * @init_hw hardwarespecific init - * @exit_hw hardwarespecific shutdown - * @x_max x-resolution - * @y_max y-resolution - */ -struct auo_pixcir_ts_platdata { - int gpio_int; - int gpio_rst; - - int int_setting; - - unsigned int x_max; - unsigned int y_max; -}; - -#endif -- cgit v1.2.3 From f67f12ff57bcfcd7d64280f748787793217faeaf Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 9 Sep 2022 22:36:08 +0300 Subject: ata: libahci_platform: Introduce reset assertion/deassertion methods Currently the AHCI-platform library supports only asserting and deasserting reset signals and ignores platforms with self-deasserting reset lines. That makes platforms with a self-deasserting reset method prone to misbehaviour when resuming from a sleep state after the clocks have been fully disabled. For such cases the controller needs to be fully reset all over again after the reference clocks are enabled and stable, otherwise the controller state machine might be in an undetermined state. The best solution would be to auto-detect which reset method is supported by the particular platform and use it implicitly in the framework of the ahci_platform_enable_resources()/ahci_platform_disable_resources() methods. Alas, it can't be implemented because the AHCI-platform library already supports shared reset control lines. As [1] says, in such a case we have to use only one of the following methods: + reset_control_assert()/reset_control_deassert(); + reset_control_reset()/reset_control_rearm(). If the driver had exclusive control over the reset lines, we would have been able to manipulate the lines with few limitations and simply use a combination of the methods above to cover all the possible reset-control cases. Since the shared reset control has already been advertised and can't be changed without risking breakage of the platforms relying on it, we have no choice but to let the platform drivers determine which reset methods the platform reset system supports.
In order to implement both types of reset control support, we suggest introducing a new AHCI-platform flag: AHCI_PLATFORM_RST_TRIGGER, which, when passed to the ahci_platform_get_resources() method together with the AHCI_PLATFORM_GET_RESETS flag, will indicate that the reset lines are self-deasserting, and thus reset_control_reset()/reset_control_rearm() will be used to control the reset state. Otherwise the reset_control_deassert()/reset_control_assert() methods will be utilized. [1] Documentation/driver-api/reset.rst Signed-off-by: Serge Semin Reviewed-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/ahci_platform.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 49e5383d4222..6d7dd472d370 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -23,6 +23,8 @@ int ahci_platform_enable_phys(struct ahci_host_priv *hpriv); void ahci_platform_disable_phys(struct ahci_host_priv *hpriv); int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); +int ahci_platform_deassert_rsts(struct ahci_host_priv *hpriv); +int ahci_platform_assert_rsts(struct ahci_host_priv *hpriv); int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv); void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); @@ -41,6 +43,7 @@ int ahci_platform_resume_host(struct device *dev); int ahci_platform_suspend(struct device *dev); int ahci_platform_resume(struct device *dev); -#define AHCI_PLATFORM_GET_RESETS 0x01 +#define AHCI_PLATFORM_GET_RESETS BIT(0) +#define AHCI_PLATFORM_RST_TRIGGER BIT(1) #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 6ce73f3a6fc0ad6af56ba4c14ab7a410876f8286 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 9 Sep 2022 22:36:16 +0300 Subject: ata: libahci_platform: Add function returning a clock-handle by id Since all the clocks are retrieved by the ahci_platform_get_resources() method, there is no need for the LLD (glue) drivers to look a particular one of them up in the kernel clocks table again. Instead, we suggest adding a simple method returning a device-specific clock with the passed connection ID if one can be found. Otherwise the function will return NULL. Thus the glue drivers won't need to either manually touch the hpriv->clks array or call the clk_get() friends. The AHCI platform drivers will be able to use the new function right after the ahci_platform_get_resources() method invocation and up to the device removal. Note the method is left unused here, but will be utilized in the framework of the DWC AHCI SATA driver being added in the next commit.
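A minimal sketch of the intended usage pattern (foo_ahci_probe and the "ref" connection ID are hypothetical):

#include <linux/ahci_platform.h>
#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int foo_ahci_probe(struct platform_device *pdev)
{
	struct ahci_host_priv *hpriv;
	struct clk *ref_clk;

	hpriv = ahci_platform_get_resources(pdev, AHCI_PLATFORM_GET_RESETS);
	if (IS_ERR(hpriv))
		return PTR_ERR(hpriv);

	/* Look one of the already retrieved clocks up by connection ID. */
	ref_clk = ahci_platform_find_clk(hpriv, "ref");
	if (!ref_clk)
		return -ENOENT;

	/* ... e.g. use clk_get_rate(ref_clk) to configure the controller ... */

	return 0;
}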
Signed-off-by: Serge Semin Signed-off-by: Damien Le Moal --- include/linux/ahci_platform.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 6d7dd472d370..17fa26215292 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -13,6 +13,7 @@ #include +struct clk; struct device; struct ata_port_info; struct ahci_host_priv; @@ -21,6 +22,8 @@ struct scsi_host_template; int ahci_platform_enable_phys(struct ahci_host_priv *hpriv); void ahci_platform_disable_phys(struct ahci_host_priv *hpriv); +struct clk *ahci_platform_find_clk(struct ahci_host_priv *hpriv, + const char *con_id); int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); int ahci_platform_deassert_rsts(struct ahci_host_priv *hpriv); -- cgit v1.2.3 From bfeb7e399bacae4ee46ad978f5fce3e47f0978d6 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Mon, 5 Sep 2022 12:01:49 +0300 Subject: bpf: Use bpf_capable() instead of CAP_SYS_ADMIN for blinding decision The full CAP_SYS_ADMIN requirement for blinding looks too strict nowadays. Given that unprivileged BPF is disabled by default these days, the main users of constant blinding come from unprivileged contexts, in particular via the cBPF -> eBPF migration (e.g. old-style socket filters). Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831090655.156434-1-ykaliuta@redhat.com Link: https://lore.kernel.org/bpf/20220905090149.61221-1-ykaliuta@redhat.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 527ae1d64e27..75335432fcbc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1099,7 +1099,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; -- cgit v1.2.3 From ceea991a019c57a1fb0edd12a5f836a0fa431aee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 3 Sep 2022 15:11:54 +0200 Subject: bpf: Move bpf_dispatcher function out of ftrace locations The dispatcher function is attached to/detached from the trampoline by the dispatcher update function. At the same time it's available as an ftrace-attachable function. After discussion [1] the proposed solution is to use compiler attributes to alter the bpf_dispatcher_##name##_func function: - remove it from being instrumented with the __no_instrument_function__ attribute, so ftrace does not track it - but still generate 5 nop instructions with the patchable_function_entry(5) attribute, which are expected by bpf_arch_text_poke as used by the dispatcher update function Enable the HAVE_DYNAMIC_FTRACE_NO_PATCHABLE option for x86, so __patchable_function_entries functions are not part of the ftrace/mcount locations. Add the attributes to the bpf_dispatcher_XXX function on x86_64 so it's kept out of the ftrace locations and has a 5-byte nop generated at entry. These attributes need to be arch-specific, as pointed out by Ilya Leoshkevich in [2]. The dispatcher image is generated only for the x86_64 arch, so the code can stay as is for other archs.
[1] https://lore.kernel.org/bpf/20220722110811.124515-1-jolsa@kernel.org/ [2] https://lore.kernel.org/bpf/969a14281a7791c334d476825863ee449964dd0c.camel@linux.ibm.com/ Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220903131154.420467-3-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54178b9e9c3a..e0dbe0c0a17e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -977,7 +977,14 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); }, \ } +#ifdef CONFIG_X86_64 +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#else +#define BPF_DISPATCHER_ATTRIBUTES +#endif + #define DEFINE_BPF_DISPATCHER(name) \ + notrace BPF_DISPATCHER_ATTRIBUTES \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ -- cgit v1.2.3 From c2f2e2c3aecdbabf822272a4b6e7d91537633cd9 Mon Sep 17 00:00:00 2001 From: ChiaEn Wu Date: Thu, 15 Sep 2022 17:47:32 +0800 Subject: lib: add linear range index macro Add the LINEAR_RANGE() and LINEAR_RANGE_IDX() macros to simplify declaring struct linear_range entries. Reviewed-by: Matti Vaittinen Signed-off-by: ChiaEn Wu Signed-off-by: Sebastian Reichel --- include/linux/linear_range.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/linear_range.h b/include/linux/linear_range.h index fd3d0b358f22..2e4f4c3539c0 100644 --- a/include/linux/linear_range.h +++ b/include/linux/linear_range.h @@ -26,6 +26,17 @@ struct linear_range { unsigned int step; }; +#define LINEAR_RANGE(_min, _min_sel, _max_sel, _step) \ + { \ + .min = _min, \ + .min_sel = _min_sel, \ + .max_sel = _max_sel, \ + .step = _step, \ + } + +#define LINEAR_RANGE_IDX(_idx, _min, _min_sel, _max_sel, _step) \ + [_idx] = LINEAR_RANGE(_min, _min_sel, _max_sel, _step) + unsigned int linear_range_values_in_range(const struct linear_range *r); unsigned int linear_range_values_in_range_array(const struct linear_range *r, int ranges); -- cgit v1.2.3 From c7007d9f19527b47992ff78a088e8697a9e9d5f5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 May 2022 00:55:49 +0200 Subject: efi/libstub: add some missing EFI prototypes Define the correct prototypes for the load_image, start_image and unload_image boot service pointers so we can call them from the EFI zboot code. Also add some prototypes related to the installation and uninstallation of protocols in the EFI protocol database, including some definitions related to device paths.
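To make the device path definitions below concrete, a loader could describe a blob embedded in its own image as a relative-offset node terminated by an end-of-path node, roughly as follows. This is a sketch only: the offsets are invented, and EFI_DEV_MEDIA and EFI_DEV_END_ENTIRE are assumed to be the type/sub-type constants already defined elsewhere in efi.h.

#include <linux/efi.h>

static const struct {
	struct efi_rel_offset_dev_path	node;
	struct efi_generic_dev_path	end;
} __packed foo_blob_devpath = {
	.node = {
		.header = {
			.type		= EFI_DEV_MEDIA,
			.sub_type	= EFI_DEV_MEDIA_REL_OFFSET,
			.length		= sizeof(struct efi_rel_offset_dev_path),
		},
		.starting_offset	= 0x1000,	/* hypothetical */
		.ending_offset		= 0x8fff,	/* hypothetical */
	},
	.end = {
		.type		= EFI_DEV_END_PATH,
		.sub_type	= EFI_DEV_END_ENTIRE,
		.length		= sizeof(struct efi_generic_dev_path),
	},
};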
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index d2b84c2fec39..af90f7989f80 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -368,6 +368,9 @@ void efi_native_runtime_setup(void); #define UV_SYSTEM_TABLE_GUID EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93) #define LINUX_EFI_CRASH_GUID EFI_GUID(0xcfc8fc79, 0xbe2e, 0x4ddc, 0x97, 0xf0, 0x9f, 0x98, 0xbf, 0xe2, 0x98, 0xa0) #define LOADED_IMAGE_PROTOCOL_GUID EFI_GUID(0x5b1b31a1, 0x9562, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) +#define LOADED_IMAGE_DEVICE_PATH_PROTOCOL_GUID EFI_GUID(0xbc62157e, 0x3e33, 0x4fec, 0x99, 0x20, 0x2d, 0x3b, 0x36, 0xd7, 0x50, 0xdf) +#define EFI_DEVICE_PATH_PROTOCOL_GUID EFI_GUID(0x09576e91, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) +#define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c) #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a) #define EFI_UGA_PROTOCOL_GUID EFI_GUID(0x982c298b, 0xf4fa, 0x41cb, 0xb8, 0x38, 0x77, 0xaa, 0x68, 0x8f, 0xb8, 0x39) #define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a) @@ -952,6 +955,7 @@ extern int efi_status_to_err(efi_status_t status); #define EFI_DEV_MEDIA_VENDOR 3 #define EFI_DEV_MEDIA_FILE 4 #define EFI_DEV_MEDIA_PROTOCOL 5 +#define EFI_DEV_MEDIA_REL_OFFSET 8 #define EFI_DEV_BIOS_BOOT 0x05 #define EFI_DEV_END_PATH 0x7F #define EFI_DEV_END_PATH2 0xFF @@ -982,12 +986,20 @@ struct efi_vendor_dev_path { u8 vendordata[]; } __packed; +struct efi_rel_offset_dev_path { + struct efi_generic_dev_path header; + u32 reserved; + u64 starting_offset; + u64 ending_offset; +} __packed; + struct efi_dev_path { union { struct efi_generic_dev_path header; struct efi_acpi_dev_path acpi; struct efi_pci_dev_path pci; struct efi_vendor_dev_path vendor; + struct efi_rel_offset_dev_path rel_offset; }; } __packed; -- cgit v1.2.3 From f5a5e83379b537f6252526bb4d285b771f6f0b89 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 14 Sep 2022 09:34:31 +0100 Subject: soc: apple: rtkit: Add apple_rtkit_poll This allows a client to receive messages in atomic context, by polling. Signed-off-by: Hector Martin Signed-off-by: Russell King (Oracle) Reviewed-by: Sven Peter Reviewed-by: Eric Curtin Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/soc/apple/rtkit.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index 88eb832eac7b..c9cabb679cd1 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -152,4 +152,16 @@ int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message, int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message, unsigned long timeout, bool atomic); +/* + * Process incoming messages in atomic context. + * This only guarantees that messages arrive as far as the recv_message_early + * callback; drivers expecting to handle incoming messages synchronously + * by calling this function must do it that way. + * Will return 1 if some data was processed, 0 if none was, or a + * negative error code on failure. 
+ * + * @rtk: RTKit reference + */ +int apple_rtkit_poll(struct apple_rtkit *rtk); + #endif /* _LINUX_APPLE_RTKIT_H_ */ -- cgit v1.2.3 From 460d9cb62f7fe82cc835d6b3924a8d06fd2d510a Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 14 Aug 2022 23:12:44 -0500 Subject: soc: sunxi: sram: Return void from the release function There is no point in returning an error here, as the caller can do nothing about it. In fact, all callers already ignore the return value. Acked-by: Jernej Skrabec Signed-off-by: Samuel Holland Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220815041248.53268-8-samuel@sholland.org Signed-off-by: Jernej Skrabec --- include/linux/soc/sunxi/sunxi_sram.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/sunxi/sunxi_sram.h b/include/linux/soc/sunxi/sunxi_sram.h index c5f663bba9c2..60e274d1b821 100644 --- a/include/linux/soc/sunxi/sunxi_sram.h +++ b/include/linux/soc/sunxi/sunxi_sram.h @@ -14,6 +14,6 @@ #define _SUNXI_SRAM_H_ int sunxi_sram_claim(struct device *dev); -int sunxi_sram_release(struct device *dev); +void sunxi_sram_release(struct device *dev); #endif /* _SUNXI_SRAM_H_ */ -- cgit v1.2.3 From e63efbcaba7d6f6846b1c2ec5cf259249b9ed88f Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 16 Sep 2022 17:02:46 +0100 Subject: wifi: brcmfmac: pcie: Read Apple OTP information On Apple platforms, the One Time Programmable ROM in the Broadcom chips contains information about the specific board design (module, vendor, version) that is required to select the correct NVRAM file. Parse this OTP ROM and extract the required strings. Note that the user OTP offset/size is per-chip. This patch does not add any chips yet. Reviewed-by: Arend van Spriel Signed-off-by: Hector Martin Signed-off-by: Russell King (Oracle) Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/E1oZDni-0077aM-I6@rmk-PC.armlinux.org.uk --- include/linux/bcma/bcma_driver_chipcommon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index e3314f746bfa..2d94c30ed439 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -271,6 +271,7 @@ #define BCMA_CC_SROM_CONTROL_OP_WRDIS 0x40000000 #define BCMA_CC_SROM_CONTROL_OP_WREN 0x60000000 #define BCMA_CC_SROM_CONTROL_OTPSEL 0x00000010 +#define BCMA_CC_SROM_CONTROL_OTP_PRESENT 0x00000020 #define BCMA_CC_SROM_CONTROL_LOCK 0x00000008 #define BCMA_CC_SROM_CONTROL_SIZE_MASK 0x00000006 #define BCMA_CC_SROM_CONTROL_SIZE_1K 0x00000000 -- cgit v1.2.3 From 555bb4ccd1dd78d0263eae31629fe1fdd65c1fb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:24 +0200 Subject: preempt: Provide preempt_[dis|en]able_nested() On PREEMPT_RT enabled kernels, spinlocks and rwlocks are neither disabling preemption nor interrupts. Though there are a few places which depend on the implicit preemption/interrupt disable of those locks, e.g. seqcount write sections, per CPU statistics updates etc. To avoid sprinkling CONFIG_PREEMPT_RT conditionals all over the place, add preempt_disable_nested() and preempt_enable_nested() which should be descriptive enough. Add a lockdep assertion for the !PREEMPT_RT case to catch callers which do not have preemption disabled. 
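A sketch of the intended use, modelled on the per-CPU RMW case named in the kernel-doc below (the foo_* structures are hypothetical):

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

struct foo_stats {
	u64 packets;
};

struct foo_dev {
	spinlock_t lock;
	struct foo_stats __percpu *stats;
};

static void foo_account_packet(struct foo_dev *fd)
{
	spin_lock(&fd->lock);
	/*
	 * On !PREEMPT_RT the spinlock already disabled preemption, so this
	 * merely asserts it; on PREEMPT_RT it actually disables preemption
	 * so the non-atomic per-CPU RMW below stays CPU-local.
	 */
	preempt_disable_nested();
	__this_cpu_inc(fd->stats->packets);
	preempt_enable_nested();
	spin_unlock(&fd->lock);
}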
Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-2-bigeasy@linutronix.de --- include/linux/preempt.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b4381f255a5c..0df425bf9bd7 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -421,4 +421,46 @@ static inline void migrate_enable(void) { } #endif /* CONFIG_SMP */ +/** + * preempt_disable_nested - Disable preemption inside a normally preempt disabled section + * + * Use for code which requires preemption protection inside a critical + * section which has preemption disabled implicitly on non-PREEMPT_RT + * enabled kernels, by e.g.: + * - holding a spinlock/rwlock + * - soft interrupt context + * - regular interrupt handlers + * + * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft + * interrupt context and regular interrupt handlers are preemptible and + * only prevent migration. preempt_disable_nested() ensures that preemption + * is disabled for cases which require CPU local serialization even on + * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. + * + * The use cases are code sequences which are not serialized by a + * particular lock instance, e.g.: + * - seqcount write side critical sections where the seqcount is not + * associated to a particular lock and therefore the automatic + * protection mechanism does not work. This prevents a live lock + * against a preempting high priority reader. + * - RMW per CPU variable updates like vmstat. + */ +/* Macro to avoid header recursion hell vs. lockdep */ +#define preempt_disable_nested() \ +do { \ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ + preempt_disable(); \ + else \ + lockdep_assert_preemption_disabled(); \ +} while (0) + +/** + * preempt_enable_nested - Undo the effect of preempt_disable_nested() + */ +static __always_inline void preempt_enable_nested(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} + #endif /* __LINUX_PREEMPT_H */ -- cgit v1.2.3 From a738e9bad6030d4fd33bfd7db3399a250b7e94d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:27 +0200 Subject: mm/debug: Provide VM_WARN_ON_IRQS_ENABLED() Some places in the VM code expect interrupts disabled, which is a valid expectation on non-PREEMPT_RT kernels, but does not hold on RT kernels in some places because the RT spinlock substitution does not disable interrupts. To avoid sprinkling CONFIG_PREEMPT_RT conditionals into those places, provide VM_WARN_ON_IRQS_ENABLED() which is only enabled when VM_DEBUG=y and PREEMPT_RT=n. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Link: https://lore.kernel.org/r/20220825164131.402717-5-bigeasy@linutronix.de --- include/linux/mmdebug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 15ae78cd2853..b8728d11c949 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif +#ifdef CONFIG_DEBUG_VM_IRQSOFF +#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) +#else +#define VM_WARN_ON_IRQS_ENABLED() do { } while (0) +#endif + #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -- cgit v1.2.3 From 44b0c2957adc62b86fcd51adeaf8e993171bc319 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:31 +0200 Subject: u64_stats: Streamline the implementation The u64 stats code handles 3 different cases: - 32bit UP - 32bit SMP - 64bit with an unreadable #ifdef maze, which was recently expanded with PREEMPT_RT conditionals. Reduce it to two cases (32bit and 64bit) and drop the optimization for 32bit UP as suggested by Linus. Use the new preempt_disable/enable_nested() helpers to get rid of the CONFIG_PREEMPT_RT conditionals. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-9-bigeasy@linutronix.de --- include/linux/u64_stats_sync.h | 145 ++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 6ad4e9032d53..46040d66334a 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -8,7 +8,7 @@ * * Key points : * - * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. + * - Use a seqcount on 32-bit * - The whole thing is a no-op on 64-bit architectures. * * Usage constraints: @@ -20,7 +20,8 @@ * writer and also spin forever. * * 3) Write side must use the _irqsave() variant if other writers, or a reader, - * can be invoked from an IRQ context. + * can be invoked from an IRQ context. On 64bit systems this variant does not + * disable interrupts. * * 4) If reader fetches several counters, there is no guarantee the whole values * are consistent w.r.t. each other (remember point #2: seqcounts are not @@ -29,11 +30,6 @@ * 5) Readers are allowed to sleep or be preempted/interrupted: they perform * pure reads. * - * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats - * might be updated from a hardirq or softirq context (remember point #1: - * seqcounts are not used for UP kernels). 32-bit UP stat readers could read - * corrupted 64-bit values otherwise. 
- * * Usage : * * Stats producer (writer) should use following template granted it already got @@ -66,7 +62,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +#if BITS_PER_LONG == 32 seqcount_t seq; #endif }; @@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) local64_inc(&p->v); } -#else +static inline void u64_stats_init(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { } +static inline unsigned long __u64_stats_irqsave(void) { return 0; } +static inline void __u64_stats_irqrestore(unsigned long flags) { } +static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ + return 0; +} +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return false; +} + +#else /* 64 bit */ typedef struct { u64 v; @@ -123,123 +134,95 @@ static inline void u64_stats_inc(u64_stats_t *p) { p->v++; } -#endif -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) -#else static inline void u64_stats_init(struct u64_stats_sync *syncp) { + seqcount_init(&syncp->seq); } -#endif -static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); write_seqcount_begin(&syncp->seq); -#endif } -static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif + preempt_enable_nested(); } -static inline unsigned long -u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +static inline unsigned long __u64_stats_irqsave(void) { - unsigned long flags = 0; + unsigned long flags; -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); - else - local_irq_save(flags); - write_seqcount_begin(&syncp->seq); -#endif + local_irq_save(flags); return flags; } -static inline void -u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, - unsigned long flags) +static inline void __u64_stats_irqrestore(unsigned long flags) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); - else - local_irq_restore(flags); -#endif + local_irq_restore(flags); } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); -#else - return 0; -#endif } -static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_disable(); -#endif - return __u64_stats_fetch_begin(syncp); + return 
read_seqcount_retry(&syncp->seq, start); } +#endif /* !64 bit */ -static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - return read_seqcount_retry(&syncp->seq, start); -#else - return false; -#endif + __u64_stats_update_begin(syncp); +} + +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +{ + __u64_stats_update_end(syncp); +} + +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); + + __u64_stats_update_begin(syncp); + return flags; +} + +static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); +} + +static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ + return __u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif return __u64_stats_fetch_retry(syncp, start); } -/* - * In case irq handlers can update u64 counters, readers can use following helpers - * - SMP 32bit arches use seqcount protection, irq safe. - * - UP 32bit must disable irqs. - * - 64bit have no problem atomically reading u64 values, irq safe. - */ +/* Obsolete interfaces */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif - return __u64_stats_fetch_begin(syncp); + return u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_enable(); -#endif - return __u64_stats_fetch_retry(syncp, start); + return u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */ -- cgit v1.2.3 From 6a164c646999847b843e651f71c53dfaceb2c2b4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 9 May 2022 16:04:08 +0200 Subject: genirq: Provide generic_handle_domain_irq_safe(). commit 509853f9e1e7b ("genirq: Provide generic_handle_irq_safe()") addressed the problem of demultiplexing interrupt handlers which are force threaded on PREEMPT_RT enabled kernels. This means that the demultiplexed handler is invoked with interrupts enabled, which triggers a lockdep warning due to a non-irq-safe lock acquisition. The same problem exists for the irq domain based interrupt handling via generic_handle_domain_irq(), which has been reported against the AMD pin-ctrl driver. Provide generic_handle_domain_irq_safe(), which can be used from any context.
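For instance, a demultiplexing handler that may run force-threaded with interrupts enabled can stay context-agnostic along these lines (a sketch; the foo_* chip, register layout and interrupt count are hypothetical):

#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

#define FOO_IRQ_STATUS	0x00
#define FOO_NR_IRQS	32

struct foo_chip {
	void __iomem *base;
	struct irq_domain *domain;
};

static irqreturn_t foo_demux_irq(int irq, void *data)
{
	struct foo_chip *chip = data;
	unsigned long pending = readl(chip->base + FOO_IRQ_STATUS);
	unsigned int hwirq;

	/* Safe whether we run in hard-irq context or force-threaded. */
	for_each_set_bit(hwirq, &pending, FOO_NR_IRQS)
		generic_handle_domain_irq_safe(chip->domain, hwirq);

	return pending ? IRQ_HANDLED : IRQ_NONE;
}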
[ tglx: Split the usage sites out and massaged changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=215954 --- include/linux/irqdesc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 1cd4e36890fb..844a8e30e6de 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); +int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq); int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif -- cgit v1.2.3 From b88c48bfdd85820b19f0c0295a88d1596876e7c8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Aug 2022 20:26:41 +0300 Subject: pwm: core: Get rid of unused devm_of_pwm_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_of_pwm_get() has recently lost its single user, so drop the dead API as well. Note, new code should use either plain pwm_get() or the managed devm_pwm_get() or devm_fwnode_pwm_get() APIs. Signed-off-by: Andy Shevchenko Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220826172642.16404-2-andriy.shevchenko@linux.intel.com Signed-off-by: Guenter Roeck --- include/linux/pwm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 9429930c5566..572ba92e4206 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -408,8 +408,6 @@ struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, void pwm_put(struct pwm_device *pwm); struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id); -struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np, - const char *con_id); struct pwm_device *devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, const char *con_id); @@ -517,14 +515,6 @@ static inline struct pwm_device *devm_pwm_get(struct device *dev, return ERR_PTR(-ENODEV); } -static inline struct pwm_device *devm_of_pwm_get(struct device *dev, - struct device_node *np, - const char *con_id) -{ - might_sleep(); - return ERR_PTR(-ENODEV); -} - static inline struct pwm_device * devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, const char *con_id) -- cgit v1.2.3 From b5ae0ad56455dfb277b165b9270382f0e1c1d943 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Aug 2022 20:26:42 +0300 Subject: pwm: core: Make of_pwm_get() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of of_pwm_get() outside of the PWM core. Make it static.
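Picking up the replacement note from the devm_of_pwm_get() removal above, a short sketch of the managed lookup in a probe path (foo_probe is hypothetical):

#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>

static int foo_probe(struct platform_device *pdev)
{
	struct pwm_device *pwm;

	/* Resolved via the device's firmware node; released automatically. */
	pwm = devm_pwm_get(&pdev->dev, NULL);
	if (IS_ERR(pwm))
		return dev_err_probe(&pdev->dev, PTR_ERR(pwm),
				     "failed to get PWM\n");

	return 0;
}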
Signed-off-by: Andy Shevchenko Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220826172642.16404-3-andriy.shevchenko@linux.intel.com Signed-off-by: Guenter Roeck --- include/linux/pwm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 572ba92e4206..d70c6e5a839d 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -403,8 +403,6 @@ struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc, const struct of_phandle_args *args); struct pwm_device *pwm_get(struct device *dev, const char *con_id); -struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, - const char *con_id); void pwm_put(struct pwm_device *pwm); struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id); @@ -495,14 +493,6 @@ static inline struct pwm_device *pwm_get(struct device *dev, return ERR_PTR(-ENODEV); } -static inline struct pwm_device *of_pwm_get(struct device *dev, - struct device_node *np, - const char *con_id) -{ - might_sleep(); - return ERR_PTR(-ENODEV); -} - static inline void pwm_put(struct pwm_device *pwm) { might_sleep(); -- cgit v1.2.3 From 41929b72eb5dab26280dfc1e7c1523f0f4853dde Mon Sep 17 00:00:00 2001 From: Michael Shych Date: Wed, 10 Aug 2022 20:15:50 +0300 Subject: platform_data/emc2305: define platform data for EMC2305 driver Introduce a platform data structure for the EMC2305 driver to allow configuring device PWMs and thermal zones by passing the required platform data to the driver. If no platform data is provided, the driver is supposed to work with default settings. Signed-off-by: Michael Shych Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20220810171552.56417-2-michaelsh@nvidia.com Signed-off-by: Guenter Roeck --- include/linux/platform_data/emc2305.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/linux/platform_data/emc2305.h (limited to 'include/linux') diff --git a/include/linux/platform_data/emc2305.h b/include/linux/platform_data/emc2305.h new file mode 100644 index 000000000000..54d672dd6f7d --- /dev/null +++ b/include/linux/platform_data/emc2305.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_PLATFORM_DATA_EMC2305__ +#define __LINUX_PLATFORM_DATA_EMC2305__ + +#define EMC2305_PWM_MAX 5 + +/** + * struct emc2305_platform_data - EMC2305 driver platform data + * @max_state: maximum cooling state of the cooling device; + * @pwm_num: number of active channels; + * @pwm_separate: separate PWM settings for every channel; + * @pwm_min: array of minimum PWM per channel; + */ +struct emc2305_platform_data { + u8 max_state; + u8 pwm_num; + bool pwm_separate; + u8 pwm_min[EMC2305_PWM_MAX]; +}; + +#endif -- cgit v1.2.3 From 5db72fdb74983a1e81331aadf99ae2305f277562 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 13 Sep 2022 19:31:40 +0300 Subject: ACPI: utils: Add acpi_dev_uid_to_integer() helper to get _UID as integer Some users interpret _UID only as an integer, and for them it's easier to have an integer representation of _UID. Add a respective helper for that. Signed-off-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..9434db02cb60 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -798,6 +798,11 @@ acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *u return false; } +static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) +{ + return -ENODEV; +} + static inline struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) { -- cgit v1.2.3 From 38bef8e57f2149acd2c910a98f57dd6291d2e0ec Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:17 -0700 Subject: smp: add set_nr_cpu_ids() In preparation for supporting a compile-time nr_cpu_ids, add a setter for the variable. This is a no-op for all arches. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e8ad12b5b9d2..24763997894d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -39,6 +39,11 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; #define nr_cpu_ids 1U #else extern unsigned int nr_cpu_ids; + +static inline void set_nr_cpu_ids(unsigned int nr) +{ + nr_cpu_ids = nr; +} #endif #ifdef CONFIG_CPUMASK_OFFSTACK -- cgit v1.2.3 From 7102b3bb070fdf4580a05cbfc5ad3c0691dc4bf9 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:18 -0700 Subject: lib/cpumask: delete misleading comment The comment says that the HOTPLUG config option enables all cpus in cpu_possible_mask up to NR_CPUS. This is wrong. Even if HOTPLUG is enabled, the mask is populated on boot with respect to ACPI/DT records. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 24763997894d..9d2f0e3e927e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -72,10 +72,6 @@ static inline void set_nr_cpu_ids(unsigned int nr) * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * - * If HOTPLUG is enabled, then cpu_possible_mask is forced to have - * all NR_CPUS bits set, otherwise it is just the set of CPUs that - * ACPI reports present at boot. - * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. -- cgit v1.2.3 From aa47a7c215e79a2ade6916f163c5a17b561bce4f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:19 -0700 Subject: lib/cpumask: deprecate nr_cpumask_bits The cpumask code is written under the assumption that when CONFIG_CPUMASK_OFFSTACK is enabled, all cpumasks have a boot-time defined size, otherwise the size is always NR_CPUS. The latter is wrong because the number of possible cpus is always calculated on boot, and it may be less than NR_CPUS. On my 4-cpu arm64 VM the nr_cpu_ids is 4, as expected, and nr_cpumask_bits is 256, which corresponds to NR_CPUS. This not only leads to useless traversal of cpumask bits greater than 4, it also makes some cpumask routines fail. For example, cpumask_full(0b1111000..000) would erroneously return false in the example above because the tail bits in the mask are all unset.
This patch deprecates nr_cpumask_bits and wires it to nr_cpu_ids unconditionally, so that cpumask routines will not waste time traversing unused part of cpu masks. It also fixes cpumask_full() and similar routines. As a side effect, because now a length of cpumasks is defined at run-time even if CPUMASK_OFFSTACK is disabled, compiler can't optimize corresponding functions. It increases kernel size by ~2.5KB if OFFSTACK is off. This is addressed in the following patch. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9d2f0e3e927e..2f6622cead1f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -46,13 +46,8 @@ static inline void set_nr_cpu_ids(unsigned int nr) } #endif -#ifdef CONFIG_CPUMASK_OFFSTACK -/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, - * not all bits may be allocated. */ +/* Deprecated. Always use nr_cpu_ids. */ #define nr_cpumask_bits nr_cpu_ids -#else -#define nr_cpumask_bits ((unsigned int)NR_CPUS) -#endif /* * The following particular system cpumasks and operations manage -- cgit v1.2.3 From a050910972bb25152b42ad2e544652117c5ad915 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 May 2022 01:08:16 +0200 Subject: efi/libstub: implement generic EFI zboot Implement a minimal EFI app that decompresses the real kernel image and launches it using the firmware's LoadImage and StartImage boot services. This removes the need for any arch-specific hacks. Note that on systems that have UEFI secure boot policies enabled, LoadImage/StartImage require images to be signed, or their hashes known a priori, in order to be permitted to boot. There are various possible strategies to work around this requirement, but they all rely either on overriding internal PI/DXE protocols (which are not part of the EFI spec) or omitting the firmware provided LoadImage() and StartImage() boot services, which is also undesirable, given that they encapsulate platform specific policies related to secure boot and measured boot, but also related to memory permissions (whether or not and which types of heap allocations have both write and execute permissions.) The only generic and truly portable way around this is to simply sign both the inner and the outer image with the same key/cert pair, so this is what is implemented here. 
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index af90f7989f80..5efc3105f8e0 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -411,6 +411,7 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_TPM_FINAL_LOG_GUID EFI_GUID(0x1e2ed096, 0x30e2, 0x4254, 0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25) #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) +#define LINUX_EFI_ZBOOT_MEDIA_GUID EFI_GUID(0xe565a30d, 0x47da, 0x4dbd, 0xb3, 0x54, 0x9b, 0xb5, 0xc8, 0x4f, 0x8b, 0xe2) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) -- cgit v1.2.3 From db01868bf2e919a10551066d54cf4bef5dd5a01e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:06:57 +0300 Subject: net: introduce iterators over synced hw addresses Some network drivers use __dev_mc_sync()/__dev_uc_sync() and therefore program the hardware only with addresses with a non-zero sync_cnt. Some of the above drivers also need to save/restore the address filtering lists when certain events happen, and they need to walk through the struct net_device :: uc and struct net_device :: mc lists. But these lists contain unsynced addresses too. To keep the appearance of an elementary form of data encapsulation, provide iterators through these lists that only look at entries with a non-zero sync_cnt, instead of filtering entries out from device drivers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f0068c1ff1df..9f42fc871c3b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -253,11 +253,17 @@ struct netdev_hw_addr_list { #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) +#define netdev_for_each_synced_uc_addr(_ha, _dev) \ + netdev_for_each_uc_addr((_ha), (_dev)) \ + if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) +#define netdev_for_each_synced_mc_addr(_ha, _dev) \ + netdev_for_each_mc_addr((_ha), (_dev)) \ + if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; -- cgit v1.2.3 From 1e839143d674603b0bbbc4c513bca35404967dbc Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:23 +0200 Subject: HID: core: store the unique system identifier in hid_device This unique identifier is currently used only for ensuring uniqueness in sysfs. However, it could be handy for userspace to refer to a specific hid_device by this id. Two use cases come to mind: LEDs (and their naming convention), and HID-BPF.
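As a sketch of the intent (illustrative, not the exact patch; the IDA and helper name here are assumptions, while the "BUS:VID:PID.ID" naming scheme is the existing sysfs convention):

static DEFINE_IDA(hid_ida);

/* Assign the system-unique id once at allocation time and reuse it
 * for the sysfs device name; error handling omitted for brevity. */
static void hid_assign_unique_id(struct hid_device *hdev)
{
	hdev->id = ida_alloc(&hid_ida, GFP_KERNEL);
	dev_set_name(&hdev->dev, "%04X:%04X:%04X.%04X",
		     hdev->bus, hdev->vendor, hdev->product, hdev->id);
}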
Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-9-benjamin.tissoires@redhat.com --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 4363a63b9775..a43dd17bc78f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -658,6 +658,8 @@ struct hid_device { /* device report descriptor */ struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; + + unsigned int id; /* system unique id */ }; #define to_hid_device(pdev) \ -- cgit v1.2.3 From ead77b65aef430d3bfe63524c243a60a29eb8d90 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:24 +0200 Subject: HID: export hid_report_type to uapi When we are dealing with eBPF, we need to have access to the report type. Currently our implementation differs from the USB standard, making it impossible for users to know the exact value besides hardcoding it themselves. So instead of bare defines, convert it into an enum. Note that we also need to make changes in the ll_driver API, but given that this will have a wider impact outside of this tree, we leave this as a TODO for the future. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-10-benjamin.tissoires@redhat.com --- include/linux/hid.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index a43dd17bc78f..b1a33dbbc78e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -314,15 +314,6 @@ struct hid_item { #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 -/* - * HID report types --- Ouch! HID spec says 1 2 3!
- */ - -#define HID_INPUT_REPORT 0 -#define HID_OUTPUT_REPORT 1 -#define HID_FEATURE_REPORT 2 - -#define HID_REPORT_TYPES 3 /* * HID connect requests @@ -509,7 +500,7 @@ struct hid_report { struct list_head hidinput_list; struct list_head field_entry_list; /* ordered list of input fields */ unsigned int id; /* id of this report */ - unsigned int type; /* report type */ + enum hid_report_type type; /* report type */ unsigned int application; /* application usage for this report */ struct hid_field *field[HID_MAX_FIELDS]; /* fields of the report */ struct hid_field_entry *field_entries; /* allocated memory of input field_entry */ @@ -926,7 +917,8 @@ extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); -int hid_input_report(struct hid_device *, int type, u8 *, u32, int); +int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); @@ -935,11 +927,11 @@ int __hid_request(struct hid_device *hid, struct hid_report *rep, int reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int application); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts); @@ -1111,7 +1103,7 @@ void hid_hw_request(struct hid_device *hdev, struct hid_report *report, int reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, unsigned char rtype, int reqtype); + size_t len, enum hid_report_type rtype, int reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** @@ -1184,8 +1176,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); -- cgit v1.2.3 From 735e1bb1b8067e209941a6bdfde23214696ff47e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:25 +0200 Subject: HID: convert defines of HID class requests into a proper enum This allows exporting the type in BTF, and hence in the automatically generated vmlinux.h. It will also add some static checks on the users when we change the ll driver API (see the note below). Note that we also need to make changes in the ll_driver API, but given that this will have a wider impact outside of this tree, we leave this as a TODO for the future.
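The defines being replaced carry the class request codes from the USB HID specification, so the resulting enum can be expected to look roughly like this (a sketch based on the spec values, not copied from the patch):

enum hid_class_request {
	HID_REQ_GET_REPORT   = 0x01,
	HID_REQ_GET_IDLE     = 0x02,
	HID_REQ_GET_PROTOCOL = 0x03,
	HID_REQ_SET_REPORT   = 0x09,
	HID_REQ_SET_IDLE     = 0x0A,
	HID_REQ_SET_PROTOCOL = 0x0B,
};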
Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-11-benjamin.tissoires@redhat.com --- include/linux/hid.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index b1a33dbbc78e..8677ae38599e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -923,7 +923,7 @@ struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); void hid_output_report(struct hid_report *report, __u8 *data); -int __hid_request(struct hid_device *hid, struct hid_report *rep, int reqtype); +int __hid_request(struct hid_device *hid, struct hid_report *rep, enum hid_class_request reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, @@ -1100,10 +1100,11 @@ void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, - struct hid_report *report, int reqtype); + struct hid_report *report, enum hid_class_request reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, enum hid_report_type rtype, int reqtype); + size_t len, enum hid_report_type rtype, + enum hid_class_request reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** @@ -1131,7 +1132,7 @@ static inline int hid_hw_power(struct hid_device *hdev, int level) * @reqtype: hid request type */ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, - int reqtype) + enum hid_class_request reqtype) { if (hdev->ll_driver->idle) return hdev->ll_driver->idle(hdev, report, idle, reqtype); -- cgit v1.2.3 From 176042404ee6a96ba7e9054e1bda6220360a26ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:56 +0100 Subject: mm: add PSI accounting around ->read_folio and ->readahead calls PSI tries to account for the cost of bringing back in pages discarded by the MM LRU management. Currently the prime place for that is hooked into the bio submission path, which is a rather bad place: - it does not actually account I/O for non-block file systems, of which we have many - it adds overhead and a layering violation to the block layer Add the accounting into the two places in the core MM code that read pages into an address space by calling into ->read_folio and ->readahead so that the entire file system operations are covered, to broaden the coverage and allow removing the accounting in the block layer going forward. As psi_memstall_enter can deal with nested calls this will not lead to double accounting even while the bio annotations are still present. 
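The resulting pattern in the MM callers is roughly the following (a sketch of the approach, not the exact hunks from the patch):

/* Sketch: charge a PSI memstall only when the folio being read back
 * was previously part of the workingset. */
static int read_folio_accounted(struct file *file,
				struct address_space *mapping,
				struct folio *folio)
{
	unsigned long pflags;
	bool workingset = folio_test_workingset(folio);
	int ret;

	if (workingset)
		psi_memstall_enter(&pflags);

	ret = mapping->a_ops->read_folio(file, folio);

	if (workingset)
		psi_memstall_leave(&pflags);

	return ret;
}

For readahead, the new _workingset and _pflags fields added to struct readahead_control carry the same state across the ->readahead call.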
Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..201dc7281640 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1173,6 +1173,8 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool _workingset; + unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ -- cgit v1.2.3 From 118f3663fbc658e9ad6165e129076981c7b685c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:42:00 +0100 Subject: block: remove PSI accounting from the bio layer PSI accounting is now done by the VM code, where it should have been since the beginning. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 41afb4cfb9b0..e0b098089ef2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -321,7 +321,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ -- cgit v1.2.3 From 256dea9134c38fcc409ec7ea6a1eb67829b563bc Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Wed, 14 Sep 2022 18:03:15 +0530 Subject: firmware: xilinx: add support for sd/gem config Add new APIs in firmware to configure SD/GEM registers. Internally it calls PM IOCTL for below SD/GEM register configuration: - SD/EMMC select - SD slot type - SD base clock - SD 8 bit support - SD fixed config - GEM SGMII Mode - GEM fixed config Signed-off-by: Ronak Jain Signed-off-by: Radhey Shyam Pandey Reviewed-by: Claudiu Beznea Signed-off-by: Jakub Kicinski --- include/linux/firmware/xlnx-zynqmp.h | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 9f50dacbf7d6..76d2b3ebad84 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -153,6 +153,9 @@ enum pm_ioctl_id { /* Runtime feature configuration */ IOCTL_SET_FEATURE_CONFIG = 26, IOCTL_GET_FEATURE_CONFIG = 27, + /* Dynamic SD/GEM configuration */ + IOCTL_SET_SD_CONFIG = 30, + IOCTL_SET_GEM_CONFIG = 31, }; enum pm_query_id { @@ -399,6 +402,30 @@ enum pm_feature_config_id { PM_FEATURE_EXTWDT_VALUE = 4, }; +/** + * enum pm_sd_config_type - PM SD configuration. + * @SD_CONFIG_EMMC_SEL: To set SD_EMMC_SEL in CTRL_REG_SD and SD_SLOTTYPE + * @SD_CONFIG_BASECLK: To set SD_BASECLK in SD_CONFIG_REG1 + * @SD_CONFIG_8BIT: To set SD_8BIT in SD_CONFIG_REG2 + * @SD_CONFIG_FIXED: To set fixed config registers + */ +enum pm_sd_config_type { + SD_CONFIG_EMMC_SEL = 1, + SD_CONFIG_BASECLK = 2, + SD_CONFIG_8BIT = 3, + SD_CONFIG_FIXED = 4, +}; + +/** + * enum pm_gem_config_type - PM GEM configuration. 
+ * @GEM_CONFIG_SGMII_MODE: To set GEM_SGMII_MODE in GEM_CLK_CTRL register + * @GEM_CONFIG_FIXED: To set fixed config registers + */ +enum pm_gem_config_type { + GEM_CONFIG_SGMII_MODE = 1, + GEM_CONFIG_FIXED = 2, +}; + /** * struct zynqmp_pm_query_data - PM query data * @qid: query ID @@ -475,6 +502,9 @@ int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); +int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); +int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, + u32 value); #else static inline int zynqmp_pm_get_api_version(u32 *version) { @@ -745,6 +775,21 @@ static inline int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset) { return -ENODEV; } + +static inline int zynqmp_pm_set_sd_config(u32 node, + enum pm_sd_config_type config, + u32 value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_gem_config(u32 node, + enum pm_gem_config_type config, + u32 value) +{ + return -ENODEV; +} + #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 17df341d35265c8e14d71a48886fc7dcf92ef8a1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Sep 2022 13:39:01 +0200 Subject: headers: Remove some left-over license text Remove a left-over from commit 2874c5fd2842 ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152") There is no need for an empty "License:". Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/0e5ff727626b748238f4b78932f81572143d8f0b.1662896317.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/if_pppol2tp.h | 2 -- include/linux/if_pppox.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index 96d40942e5a3..c87efd333faa 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -4,8 +4,6 @@ * * This file supplies definitions required by the PPP over L2TP driver * (l2tp_ppp.c). All version information wrt this file is located in l2tp_ppp.c - * - * License: */ #ifndef __LINUX_IF_PPPOL2TP_H #define __LINUX_IF_PPPOL2TP_H diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 69e813bcb947..ff3beda1312c 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -5,8 +5,6 @@ * * This file supplies definitions required by the PPP over Ethernet driver * (pppox.c). All version information wrt this file is located in pppox.c - * - * License: */ #ifndef __LINUX_IF_PPPOX_H #define __LINUX_IF_PPPOX_H -- cgit v1.2.3 From fdf214978a71b2749d26f6da2b1d51d9ac23831d Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:24 -0600 Subject: bpf: Move nf_conn extern declarations to filter.h We're seeing the following new warnings on netdev/build_32bit and netdev/build_allmodconfig_warn CI jobs: ../net/core/filter.c:8608:1: warning: symbol 'nf_conn_btf_access_lock' was not declared. Should it be static? ../net/core/filter.c:8611:5: warning: symbol 'nfct_bsa' was not declared. Should it be static? Fix by ensuring extern declaration is present while compiling filter.o. 
Fixes: 864b656f82cc ("bpf: Add support for writing to nf_conn:mark") Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/2bd2e0283df36d8a4119605878edb1838d144174.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/linux/filter.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 75335432fcbc..98e28126c24b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -567,6 +567,12 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +extern struct mutex nf_conn_btf_access_lock; +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); + typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, -- cgit v1.2.3 From 6f9c07be9d020489326098801f0661f754c7c865 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:20 -0700 Subject: lib/cpumask: add FORCE_NR_CPUS config option The size of cpumasks is hard-limited by the compile-time parameter NR_CPUS, but defined at boot time when the kernel parses ACPI/DT tables, and stored in nr_cpu_ids. In many practical cases, the number of CPUs for a target is known at compile time and can be provided with NR_CPUS. In that case, the compiler may be instructed to treat NR_CPUS as the actual number of CPUs, not just an upper limit. This allows optimizing many cpumask routines and significantly shrinking the size of the kernel image. This patch adds the FORCE_NR_CPUS option to teach the compiler to rely on NR_CPUS and enable the corresponding optimizations. If FORCE_NR_CPUS=y, the kernel will not set nr_cpu_ids at boot, but only check that the actual number of possible CPUs is equal to NR_CPUS, and WARN if that doesn't hold. The new option is especially useful in embedded applications because kernel configurations are unique for each SoC, the number of CPUs is constant and well known, and memory limits are typically tighter. For my 4-CPU ARM64 build with NR_CPUS=4, FORCE_NR_CPUS=y saves 46KB: add/remove: 3/4 grow/shrink: 46/729 up/down: 652/-46952 (-46300) Signed-off-by: Yury Norov --- include/linux/cpumask.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2f6622cead1f..1b442fb2001f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -35,16 +35,20 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) -#if NR_CPUS == 1 -#define nr_cpu_ids 1U +#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) +#define nr_cpu_ids ((unsigned int)NR_CPUS) #else extern unsigned int nr_cpu_ids; +#endif static inline void set_nr_cpu_ids(unsigned int nr) { +#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) + WARN_ON(nr != nr_cpu_ids); +#else nr_cpu_ids = nr; -} #endif +} /* Deprecated. Always use nr_cpu_ids.
*/ #define nr_cpumask_bits nr_cpu_ids -- cgit v1.2.3 From 690aa8c3ae308bc696ec8b1b357b995193927083 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:32 +0200 Subject: ata: fix ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting() ACS-5 section 7.13.6.41 Words 85..87, 120: Commands and feature sets supported or enabled states that: If bit 15 of word 86 is set to one, bit 14 of word 119 is set to one, and bit 15 of word 119 is cleared to zero, then word 119 is valid. If bit 15 of word 86 is set to one, bit 14 of word 120 is set to one, and bit 15 of word 120 is cleared to zero, then word 120 is valid. (This text also exists in really old ACS standards, e.g. ACS-3.) Currently, ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting() both check bit 15 of word 86, but neither of them check that bit 14 of word 119 is set to one, or that bit 15 of word 119 is cleared to zero. Additionally, make ata_id_sense_reporting_enabled() return false if !ata_id_has_sense_reporting(), similar to how e.g. ata_id_flush_ext_enabled() returns false if !ata_id_has_flush_ext(). Fixes: e87fd28cf9a2 ("libata: Implement support for sense data reporting") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 21292b5bbb55..868bfd503aee 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -771,16 +771,21 @@ static inline bool ata_id_has_read_log_dma_ext(const u16 *id) static inline bool ata_id_has_sense_reporting(const u16 *id) { - if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) + if (!(id[ATA_ID_CFS_ENABLE_2] & BIT(15))) + return false; + if ((id[ATA_ID_COMMAND_SET_3] & (BIT(15) | BIT(14))) != BIT(14)) return false; - return id[ATA_ID_COMMAND_SET_3] & (1 << 6); + return id[ATA_ID_COMMAND_SET_3] & BIT(6); } static inline bool ata_id_sense_reporting_enabled(const u16 *id) { - if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) + if (!ata_id_has_sense_reporting(id)) + return false; + /* ata_id_has_sense_reporting() == true, word 86 must have bit 15 set */ + if ((id[ATA_ID_COMMAND_SET_4] & (BIT(15) | BIT(14))) != BIT(14)) return false; - return id[ATA_ID_COMMAND_SET_4] & (1 << 6); + return id[ATA_ID_COMMAND_SET_4] & BIT(6); } /** -- cgit v1.2.3 From 9c6e09a434e1317e09b78b3b69cd384022ec9a03 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:33 +0200 Subject: ata: fix ata_id_has_devslp() ACS-5 section 7.13.6.36 Word 78: Serial ATA features supported states that: If word 76 is not 0000h or FFFFh, word 78 reports the features supported by the device. If this word is not supported, the word shall be cleared to zero. (This text also exists in really old ACS standards, e.g. ACS-3.) Additionally, move the macro to the other ATA_ID_FEATURE_SUPP macros (which already have this check), thus making it more likely that the next ATA_ID_FEATURE_SUPP macro that is added will include this check. 
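The validity rule that each of these ATA_ID_FEATURE_SUPP macros open-codes could equally be written as a shared helper (an illustrative equivalent, not part of the patch):

/* Word 76 (SATA capabilities) must be neither 0000h nor FFFFh for
 * word 78 (SATA features supported) to be valid. */
static inline bool ata_id_sata_feature_words_valid(const u16 *id)
{
	return id[ATA_ID_SATA_CAPABILITY] != 0x0000 &&
	       id[ATA_ID_SATA_CAPABILITY] != 0xffff;
}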
Fixes: 65fe1f0f66a5 ("ahci: implement aggressive SATA device sleep support") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 868bfd503aee..bc136a43689f 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -566,6 +566,10 @@ struct ata_bmdma_prd { ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 2))) +#define ata_id_has_devslp(id) \ + ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ + ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8))) #define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) #define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) #define ata_id_u32(id,n) \ @@ -578,7 +582,6 @@ struct ata_bmdma_prd { #define ata_id_cdb_intr(id) (((id)[ATA_ID_CONFIG] & 0x60) == 0x20) #define ata_id_has_da(id) ((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4)) -#define ata_id_has_devslp(id) ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8)) #define ata_id_has_ncq_autosense(id) \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7)) -- cgit v1.2.3 From a5fb6bf853148974dbde092ec1bde553bea5e49f Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:34 +0200 Subject: ata: fix ata_id_has_ncq_autosense() ACS-5 section 7.13.6.36 Word 78: Serial ATA features supported states that: If word 76 is not 0000h or FFFFh, word 78 reports the features supported by the device. If this word is not supported, the word shall be cleared to zero. (This text also exists in really old ACS standards, e.g. ACS-3.) Additionally, move the macro to the other ATA_ID_FEATURE_SUPP macros (which already have this check), thus making it more likely that the next ATA_ID_FEATURE_SUPP macro that is added will include this check. Fixes: 5b01e4b9efa0 ("libata: Implement NCQ autosense") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index bc136a43689f..4845443e0f08 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -570,6 +570,10 @@ struct ata_bmdma_prd { ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8))) +#define ata_id_has_ncq_autosense(id) \ + ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ + ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7))) #define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) #define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) #define ata_id_u32(id,n) \ @@ -582,8 +586,6 @@ struct ata_bmdma_prd { #define ata_id_cdb_intr(id) (((id)[ATA_ID_CONFIG] & 0x60) == 0x20) #define ata_id_has_da(id) ((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4)) -#define ata_id_has_ncq_autosense(id) \ - ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7)) static inline bool ata_id_has_hipm(const u16 *id) { -- cgit v1.2.3 From 630624cb1b5826d753ac8e01a0e42de43d66dedf Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:35 +0200 Subject: ata: fix ata_id_has_dipm() ACS-5 section 7.13.6.36 Word 78: Serial ATA features supported states that: If word 76 is not 0000h or FFFFh, word 78 reports the features supported by the device. If this word is not supported, the word shall be cleared to zero. 
(This text also exists in really old ACS standards, e.g. ACS-3.) The problem with ata_id_has_dipm() is that while it performs a check against 0 and 0xffff, it performs the check against ATA_ID_FEATURE_SUPP (word 78), the same word where the feature bit is stored. Fix this by performing the check against ATA_ID_SATA_CAPABILITY (word 76), as required by the spec. The feature bit check itself is of course still performed against ATA_ID_FEATURE_SUPP (word 78). Additionally, move the macro to the other ATA_ID_FEATURE_SUPP macros (which already have this check), thus making it more likely that the next ATA_ID_FEATURE_SUPP macro that is added will include this check. Fixes: ca77329fb713 ("[libata] Link power management infrastructure") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 4845443e0f08..e3050e153a71 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -574,6 +574,10 @@ struct ata_bmdma_prd { ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7))) +#define ata_id_has_dipm(id) \ + ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ + ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 3))) #define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) #define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) #define ata_id_u32(id,n) \ @@ -597,17 +601,6 @@ static inline bool ata_id_has_hipm(const u16 *id) return val & (1 << 9); } -static inline bool ata_id_has_dipm(const u16 *id) -{ - u16 val = id[ATA_ID_FEATURE_SUPP]; - - if (val == 0 || val == 0xffff) - return false; - - return val & (1 << 3); -} - - static inline bool ata_id_has_fua(const u16 *id) { if ((id[ATA_ID_CFSSE] & 0xC000) != 0x4000) -- cgit v1.2.3 From 7ebb5f8e001037efbdf18afabbbebf71bd98825e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 21 Sep 2022 10:40:53 +0800 Subject: Revert "iommu/vt-d: Fix possible recursive locking in intel_iommu_init()" This reverts commit 9cd4f1434479f1ac25c440c421fbf52069079914. Some issues were reported on the original commit. Some thunderbolt devices don't work anymore due to the following DMA fault. DMAR: DRHD: handling fault status reg 2 DMAR: [INTR-REMAP] Request device [09:00.0] fault index 0x8080 [fault reason 0x25] Blocked a compatibility format interrupt request Bring it back for now to avoid functional regression.
Fixes: 9cd4f1434479f ("iommu/vt-d: Fix possible recursive locking in intel_iommu_init()") Link: https://lore.kernel.org/linux-iommu/485A6EA5-6D58-42EA-B298-8571E97422DE@getmailspring.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216497 Cc: Mika Westerberg Cc: # 5.19.x Reported-and-tested-by: George Hilliard Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220920081701.3453504-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 8917a32173c4..d81a51978d01 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -65,7 +65,6 @@ struct dmar_pci_notify_info { extern struct rw_semaphore dmar_global_lock; extern struct list_head dmar_drhd_units; -extern int intel_iommu_enabled; #define for_each_drhd_unit(drhd) \ list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ @@ -89,8 +88,7 @@ extern int intel_iommu_enabled; static inline bool dmar_rcu_check(void) { return rwsem_is_locked(&dmar_global_lock) || - system_state == SYSTEM_BOOTING || - (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); + system_state == SYSTEM_BOOTING; } #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -- cgit v1.2.3 From 65394169bdae073bfb2c6816f5bf095bd7d53e61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= Date: Wed, 29 Jun 2022 14:57:34 +0200 Subject: mtd: track maximum number of bitflips for each read request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mtd_read_oob() callers are currently oblivious to the details of ECC errors detected during the read operation - they only learn (through the return value) whether any corrected bitflips or uncorrectable errors occurred. More detailed ECC information can be useful to user-space applications for making better-informed choices about moving data around. Extend struct mtd_oob_ops with a pointer to a newly-introduced struct mtd_req_stats and set its 'max_bitflips' field to the maximum number of bitflips found in a single ECC step during the read operation performed by mtd_read_oob(). This is a prerequisite for ultimately passing that value back to user space. 
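A sketch of how a caller opts in to the new statistics (illustrative; error handling trimmed):

/* Pass a stats buffer through mtd_oob_ops; a NULL .stats pointer
 * keeps the old behaviour. */
int read_with_stats(struct mtd_info *mtd, loff_t from, size_t len, u8 *buf)
{
	struct mtd_req_stats stats = { };
	struct mtd_oob_ops ops = {
		.mode	= MTD_OPS_PLACE_OOB,
		.len	= len,
		.datbuf	= buf,
		.stats	= &stats,
	};
	int ret = mtd_read_oob(mtd, from, &ops);

	pr_info("max bitflips in a single ECC step: %u\n",
		stats.max_bitflips);
	return ret;
}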
Suggested-by: Boris Brezillon Signed-off-by: Michał Kępień Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-2-kernel@kempniu.pl --- include/linux/mtd/mtd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 955aee14b0f7..fccad1766458 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -40,6 +40,10 @@ struct mtd_erase_region_info { unsigned long *lockmap; /* If keeping bitmap of locks */ }; +struct mtd_req_stats { + unsigned int max_bitflips; +}; + /** * struct mtd_oob_ops - oob operation operands * @mode: operation mode @@ -70,6 +74,7 @@ struct mtd_oob_ops { uint32_t ooboffs; uint8_t *datbuf; uint8_t *oobbuf; + struct mtd_req_stats *stats; }; /** -- cgit v1.2.3 From 7bea6056927727f98f4efdd338f112f7517f05b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= Date: Wed, 29 Jun 2022 14:57:36 +0200 Subject: mtd: add ECC error accounting for each read request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend struct mtd_req_stats with two new fields holding the number of corrected bitflips and uncorrectable errors detected during a read operation. This is a prerequisite for ultimately passing those counters to user space, where they can be useful to applications for making better-informed choices about moving data around. Unlike 'max_bitflips' (which is set - in a common code path - to the return value of a function called while the MTD device's mutex is held), these counters have to be maintained in each MTD driver which defines the '_read_oob' callback because the statistics need to be calculated while the MTD device's mutex is held. Suggested-by: Boris Brezillon Signed-off-by: Michał Kępień Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-4-kernel@kempniu.pl --- include/linux/mtd/mtd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index fccad1766458..c12a5930f32c 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -41,6 +41,8 @@ struct mtd_erase_region_info { }; struct mtd_req_stats { + unsigned int uncorrectable_errors; + unsigned int corrected_bitflips; unsigned int max_bitflips; }; -- cgit v1.2.3 From b2bed51a5261f4266ecb857bba680a7f668d3ddf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Sep 2022 13:06:26 -0700 Subject: block: Fix the enum blk_eh_timer_return documentation The documentation of the blk_eh_timer_return enumeration values does not reflect correctly how e.g. the SCSI core uses these values. Fix the documentation. 
Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Fixes: 88b0cfad2888 ("block: document the blk_eh_timer_return values") Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220920200626.3422296-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 74b99d716b0b..e21ff85751cf 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,9 +268,16 @@ static inline void rq_list_move(struct request **src, struct request **dst, rq_list_add(dst, rq); } +/** + * enum blk_eh_timer_return - How the timeout handler should proceed + * @BLK_EH_DONE: The block driver completed the command or will complete it at + * a later time. + * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the + * request to complete. + */ enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ + BLK_EH_DONE, + BLK_EH_RESET_TIMER, }; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -- cgit v1.2.3 From 9f0deaa12d832f488500a5afe9b912e9b3cfc432 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 16 Aug 2022 06:59:59 -0700 Subject: eventfd: guard wake_up in eventfd fs calls as well Guard wakeups that the user can trigger, and that may end up triggering a call back into eventfd_signal. This is in addition to the current approach that only guards in eventfd_signal. Rename in_eventfd_signal -> in_eventfd at the same time to reflect this. Without this there would be a deadlock in the following code using libaio: int main() { struct io_context *ctx = NULL; struct iocb iocb; struct iocb *iocbs[] = { &iocb }; int evfd; uint64_t val = 1; evfd = eventfd(0, EFD_CLOEXEC); assert(!io_setup(2, &ctx)); io_prep_poll(&iocb, evfd, POLLIN); io_set_eventfd(&iocb, evfd); assert(1 == io_submit(ctx, 1, iocbs)); write(evfd, &val, 8); } Signed-off-by: Dylan Yudaken Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20220816135959.1490641-1-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/eventfd.h | 2 +- include/linux/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 305d5f19093b..30eb30d6909b 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -46,7 +46,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); static inline bool eventfd_signal_allowed(void) { - return !current->in_eventfd_signal; + return !current->in_eventfd; } #else /* CONFIG_EVENTFD */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..8d82d6d32670 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -936,7 +936,7 @@ struct task_struct { #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ - unsigned in_eventfd_signal:1; + unsigned in_eventfd:1; #endif #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; -- cgit v1.2.3 From c0e0d6ba25f180ab76d3c18f8b360a119dffa634 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:10 -0700 Subject: io_uring: add IORING_SETUP_DEFER_TASKRUN Allow deferring async tasks until the user calls io_uring_enter(2) with the IORING_ENTER_GETEVENTS flag. Enable this mode with a flag at io_uring_setup time. 
This functionality requires that the later io_uring_enter call is made from the same submission task; therefore, restrict this flag to work only when IORING_SETUP_SINGLE_ISSUER is also set. Being able to hand-pick when task work is run prevents the problem where task work runs even though there is current work to be done. For example, a common workload would obtain a batch of CQEs, and process each one. Interrupting this with additional task work would add latency but gain nothing. If instead task work is deferred to just before more CQEs are obtained then no additional latency is added. The way this is implemented is by trying to keep task work local to an io_ring_ctx, rather than to the submission task. This is required, as the application will want to wake up only a single io_ring_ctx at a time to process work, and so the lists of work have to be kept separate. This has some other benefits like not having to check the task continually in handle_tw_list (and potentially unlocking/locking those), and reducing locks in the submit & process completions path. There are networking cases where using this option can reduce request latency by 50%. For example, a contrived benchmark using [1], where the client sends 2k of data and receives the same data back while doing some system calls (to trigger task work), shows this reduction. The reason ends up being that if sending responses is delayed by processing task work, then the client side sits idle. Whereas reordering the sends first means that the client runs its workload in parallel with the local task work. [1]: Using https://github.com/DylanZA/netbench/tree/defer_run Client: ./netbench --client_only 1 --control_port 10000 --host --tx "epoll --threads 16 --per_thread 1 --size 2048 --resp 2048 --workload 1000" Server: ./netbench --server_only 1 --control_port 10000 --rx "io_uring --defer_taskrun 0 --workload 100" --rx "io_uring --defer_taskrun 1 --workload 100" Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-5-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 677a25d44d7f..d56ff2185168 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -301,6 +301,8 @@ struct io_ring_ctx { struct io_hash_table cancel_table; bool poll_multi_queue; + struct llist_head work_llist; + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 21a091b970cdbcf3e8ff829234b51be6f9192766 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:12 -0700 Subject: io_uring: signal registered eventfd to process deferred task work Some workloads rely on a registered eventfd (via io_uring_register_eventfd(3)) in order to wake up and process the io_uring. In the case of a ring setup with IORING_SETUP_DEFER_TASKRUN, that eventfd also needs to be signalled when there are tasks to run. This changes an old behaviour which assumed 1 eventfd signal implied at least 1 CQE, though only when this new flag is set (so old users will not notice). This should be expected with the IORING_SETUP_DEFER_TASKRUN flag as it is not guaranteed that every task will result in a CQE.
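From userspace, the two features combine roughly as follows (a liburing-based sketch; exact flag availability depends on the kernel and liburing versions in use):

#include <liburing.h>

/* Single-issuer ring whose task work is only run when the submitting
 * thread enters the kernel to reap completions. */
int setup_deferred_ring(struct io_uring *ring)
{
	int ret = io_uring_queue_init(64, ring,
				      IORING_SETUP_SINGLE_ISSUER |
				      IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0)
		return ret;

	/* Submissions go here; the deferred task work is processed when
	 * completions are waited for (IORING_ENTER_GETEVENTS), and a
	 * registered eventfd, if any, is signalled when work is queued. */
	return 0;
}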
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-7-dylany@fb.com [axboe: fold in call_rcu() serialization fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d56ff2185168..aa4d90a53866 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -184,6 +184,8 @@ struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; unsigned int eventfd_async: 1; struct rcu_head rcu; + atomic_t refs; + atomic_t ops; }; struct io_alloc_cache { -- cgit v1.2.3 From de27e18e86173b704beaa19f0ee376f3305c4794 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:40 +0530 Subject: fs: add file_operations->uring_cmd_iopoll io_uring will invoke this to do completion polling on uring-cmd operations. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220823161443.49436-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..d6badd19784f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2132,6 +2132,7 @@ struct file_operations { loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); + int (*uring_cmd_iopoll)(struct io_uring_cmd *ioucmd); } __randomize_layout; struct inode_operations { -- cgit v1.2.3 From 5756a3a7e713bcab705a5f0c810a2b1f7f4ecfaa Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:41 +0530 Subject: io_uring: add iopoll infrastructure for io_uring_cmd Put this up in the same way as iopoll is done for regular read/write IO. Make place for storing a cookie into struct io_uring_cmd on submission. Perform the completion using the ->uring_cmd_iopoll handler. Signed-off-by: Kanchan Joshi Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220823161443.49436-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 4a2f6cc5a492..58676c0a398f 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -20,8 +20,12 @@ enum io_uring_cmd_flags { struct io_uring_cmd { struct file *file; const void *cmd; - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd); + union { + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + /* used for polled completion */ + void *cookie; + }; u32 cmd_op; u32 pad; u8 pdu[32]; /* available inline for free use */ -- cgit v1.2.3 From c6e99ea482e2a9e1fef2488891242f9749584225 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:42 +0530 Subject: block: export blk_rq_is_poll This is in preparation to support iopoll for nvme passthrough. 
Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220823161443.49436-4-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92294a5fb083..de384f5d2c37 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -980,6 +980,7 @@ int blk_rq_map_kern(struct request_queue *, struct request *, void *, int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); +bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; -- cgit v1.2.3 From de97fcb30316410a2c46be102f074a454ecc6cf1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Sep 2022 15:18:05 -0600 Subject: fs: add batch and poll flags to the uring_cmd_iopoll() handler We need the poll_flags to know how to poll for the IO, and we should have the batch structure in preparation for supporting batched completions with iopoll. Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d6badd19784f..01681d061a6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2132,7 +2132,8 @@ struct file_operations { loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); - int (*uring_cmd_iopoll)(struct io_uring_cmd *ioucmd); + int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, + unsigned int poll_flags); } __randomize_layout; struct inode_operations { -- cgit v1.2.3 From 1d7b61c06dc310421911dac7c5d2d15b754c8b63 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Wed, 21 Sep 2022 15:50:44 +0200 Subject: remoteproc: virtio: Create platform device for the remoteproc_virtio Define a platform driver to manage the remoteproc virtio device as a platform device. The platform device allows passing rproc_vdev_data platform data to specify properties that are stored in the rproc_vdev structure. This approach preserves legacy remoteproc virtio device creation while also allowing the device to be probed via the device tree mechanism. remoteproc_virtio.c update: - Add the rproc_virtio_driver platform driver. The probe ops replaces the rproc_rvdev_add_device function. - All references to rvdev->dev have been updated to rvdev->pdev->dev. - rproc_rvdev_release is removed, as it was associated with the rvdev device. - The use of the rvdev->kref counter is replaced by get/put_device on the remoteproc virtio platform device. - The vdev device no longer increments the rproc device counter. Increment/decrement is done in the rproc_virtio_probe/rproc_virtio_remove functions in charge of the vring allocation/free. remoteproc_core.c update: Migrate from the rvdev device to the rvdev platform device. From this patch, when a vdev resource is found in the resource table, the remoteproc core registers a platform device.
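The core-side registration then amounts to something like the following (a sketch; the rproc_vdev_data fields shown are assumptions for illustration, not the exact structure from this series):

/* Sketch: hand the parsed vdev resource to the "rproc-virtio"
 * platform driver instead of open-coding a bare struct device. */
struct rproc_vdev_data vdev_data = {
	.id    = rsc->id,	/* assumed field */
	.index = index,		/* assumed field */
};
struct platform_device *pdev;

pdev = platform_device_register_data(&rproc->dev, "rproc-virtio", index,
				     &vdev_data, sizeof(vdev_data));
if (IS_ERR(pdev))
	return PTR_ERR(pdev);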
Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20220921135044.917140-5-arnaud.pouliquen@foss.st.com Signed-off-by: Mathieu Poirier --- include/linux/remoteproc.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index aea79c77db0f..1abf56ad02da 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -616,9 +616,8 @@ struct rproc_vring { /** * struct rproc_vdev - remoteproc state for a supported virtio device - * @refcount: reference counter for the vdev and vring allocations * @subdev: handle for registering the vdev as a rproc subdevice - * @dev: device struct used for reference count semantics + * @pdev: remoteproc virtio platform device * @id: virtio device id (as in virtio_ids.h) * @node: list node * @rproc: the rproc handle @@ -627,10 +626,9 @@ struct rproc_vring { * @index: vdev position versus other vdev declared in resource table */ struct rproc_vdev { - struct kref refcount; struct rproc_subdev subdev; - struct device dev; + struct platform_device *pdev; unsigned int id; struct list_head node; -- cgit v1.2.3 From 14a99e130f2758bc826a7e7a8bdf6f7400b54f0f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:28 -0700 Subject: lib/find_bit: create find_first_zero_bit_le() find_first_zero_bit_le() is an alias to find_next_zero_bit_le(), despite that 'next' is known to be slower than 'first' version. Now that we have common FIND_FIRST_BIT() macro helper, it's trivial to implement find_first_zero_bit_le() as a real function. Reviewed-by: Valentin Schneider Signed-off-by: Yury Norov --- include/linux/find.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 424ef67d4a42..2464bff5de04 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -17,6 +17,10 @@ extern unsigned long _find_first_and_bit(const unsigned long *addr1, extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); +#ifdef __BIG_ENDIAN +unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); +#endif + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -251,6 +255,20 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned } #endif +#ifndef find_first_zero_bit_le +static inline +unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); + + return val == ~0UL ? 
size : ffz(val); + } + + return _find_first_zero_bit_le(addr, size); +} +#endif + #ifndef find_next_bit_le static inline unsigned long find_next_bit_le(const void *addr, unsigned @@ -270,11 +288,6 @@ unsigned long find_next_bit_le(const void *addr, unsigned } #endif -#ifndef find_first_zero_bit_le -#define find_first_zero_bit_le(addr, size) \ - find_next_zero_bit_le((addr), (size), 0) -#endif - #else #error "Please fix " #endif -- cgit v1.2.3 From e79864f3164c573afce09ec4b72b75ebe061c14d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:29 -0700 Subject: lib/find_bit: optimize find_next_bit() functions Over the past couple years, the function _find_next_bit() was extended with parameters that modify its behavior to implement and- zero- and le- flavors. The parameters are passed at compile time, but current design prevents a compiler from optimizing out the conditionals. As find_next_bit() API grows, I expect that more parameters will be added. Current design would require more conditional code in _find_next_bit(), which would bloat the helper even more and make it barely readable. This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds a set of wrappers, so that the compile-time optimizations become possible. The common logic is moved to the new macro, and all flavors may be generated by providing a FETCH macro parameter, like in this example: #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ... find_next_xornot_and_bit(addr1, addr2, addr3, size, start) { return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx], /* nop */, size, start); } The FETCH may be of any complexity, as soon as it only refers the bitmap(s) and an iterator idx. MUNGE is here to support _le code generation for BE builds. May be empty. I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top of 6.0-rc2 + this series. The results for kvm/x86_64 are: v6.0-rc2 Optimized Difference Z-score Random dense bitmap ns ns ns % find_next_bit: 787735 670546 117189 14.9 3.97 find_next_zero_bit: 777492 664208 113284 14.6 10.51 find_last_bit: 830925 687573 143352 17.3 2.35 find_first_bit: 3874366 3306635 567731 14.7 1.84 find_first_and_bit: 40677125 37739887 2937238 7.2 1.36 find_next_and_bit: 347865 304456 43409 12.5 1.35 Random sparse bitmap find_next_bit: 19816 14021 5795 29.2 6.10 find_next_zero_bit: 1318901 1223794 95107 7.2 1.41 find_last_bit: 14573 13514 1059 7.3 6.92 find_first_bit: 1313321 1249024 64297 4.9 1.53 find_first_and_bit: 8921 8098 823 9.2 4.56 find_next_and_bit: 9796 7176 2620 26.7 5.39 Where the statistics is significant (z-score > 3), the improvement is ~15%. 
According to the bloat-o-meter, the Image size is 10-11K less: x86_64/defconfig: add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177) arm64/defconfig: add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948) Suggested-by: Linus Torvalds Signed-off-by: Yury Norov --- include/linux/find.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 2464bff5de04..dead6f53a97b 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -8,9 +8,12 @@ #include -extern unsigned long _find_next_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert, unsigned long le); +unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, + unsigned long start); +unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start); +unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, + unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); @@ -19,6 +22,10 @@ extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long siz #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); +unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned + long size, unsigned long offset); +unsigned long _find_next_bit_le(const unsigned long *addr, unsigned + long size, unsigned long offset); #endif #ifndef find_next_bit @@ -45,7 +52,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, return val ? __ffs(val) : size; } - return _find_next_bit(addr, NULL, size, offset, 0UL, 0); + return _find_next_bit(addr, size, offset); } #endif @@ -75,7 +82,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, return val ? __ffs(val) : size; } - return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); + return _find_next_and_bit(addr1, addr2, size, offset); } #endif @@ -103,7 +110,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, return val == ~0UL ? size : ffz(val); } - return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); + return _find_next_zero_bit(addr, size, offset); } #endif @@ -251,7 +258,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned return val == ~0UL ? size : ffz(val); } - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); + return _find_next_zero_bit_le(addr, size, offset); } #endif @@ -284,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned return val ? __ffs(val) : size; } - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); + return _find_next_bit_le(addr, size, offset); } #endif -- cgit v1.2.3 From cb9ff3f3b84c95867856c3be42de73972feb1249 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:47 +0800 Subject: vfio: Add helpers for unifying vfio_device life cycle The idea is to let the vfio core manage the vfio_device life cycle instead of duplicating the logic across drivers. This is also a preparatory step for adding struct device into vfio_device.
New pair of helpers together with a kref in vfio_device: - vfio_alloc_device() - vfio_put_device() Drivers can register @init/@release callbacks to manage any private state wrapping the vfio_device. However, vfio-ccw doesn't fit this model due to a life cycle mess: its private structure mixes both parent and mdev info and hence must be allocated/freed outside of the life cycle of the vfio device. Per prior discussions, this won't be fixed in the short term by the IBM folks. Instead of waiting for those modifications, introduce another helper vfio_init_device() so ccw can call it to initialize a pre-allocated vfio_device. A further implication of the ccw trick is that vfio_device cannot be freed uniformly in the vfio core. Instead, require *EVERY* driver to implement @release and free the vfio_device inside it. Then ccw can choose to delay the free at its own discretion. Another trick down the road is that kvzalloc() is used to accommodate the need of gvt, which uses vzalloc(), while all others use kzalloc(). So drivers should call a helper vfio_free_device() to free the vfio_device instead of assuming that kfree() or vfree() is applicable. Later, once the ccw mess is fixed, we can remove those tricks and fully handle structure alloc/free in the vfio core. Existing vfio_{un}init_group_dev() will be deprecated after all existing usages are converted to the new model. Suggested-by: Jason Gunthorpe Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-2-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0e2826559091..f67cac700e6f 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - refcount_t refcount; + struct kref kref; /* object life cycle */ + refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; struct list_head group_next; @@ -57,6 +58,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @init: initialize private fields in device structure + * @release: Reclaim private fields in device structure + * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -74,6 +77,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*init)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -161,6 +166,24 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops, return 1; } +struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, + const struct vfio_device_ops *ops); +#define vfio_alloc_device(dev_struct, member, dev, ops) \ + container_of(_vfio_alloc_device(sizeof(struct dev_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct dev_struct, member)), \ + dev, ops), \ + struct dev_struct, member) + +int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); +void vfio_free_device(struct
vfio_device *device); +void vfio_device_release(struct kref *kref); +static inline void vfio_put_device(struct vfio_device *device) +{ + kref_put(&device->kref, vfio_device_release); +} + void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); -- cgit v1.2.3 From 63d7c77989de98d3e92611dbb858028b74dca377 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:48 +0800 Subject: vfio/pci: Use the new device life cycle helpers Also introduce two pci core helpers as @init/@release for pci drivers: - vfio_pci_core_init_dev() - vfio_pci_core_release_dev() Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-3-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 089b603bcfdc..0499ea836058 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -109,6 +109,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev); void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_init_dev(struct vfio_device *core_vdev); +void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); -- cgit v1.2.3 From 27aeb915595b87165a3004aab05b2b837d01e6ed Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:50 +0800 Subject: vfio/hisi_acc: Use the new device life cycle helpers Tidy up @probe so all migration specific initialization logic is moved to migration specific @init callback. Remove vfio_pci_core_{un}init_device() given no user now. 
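Taken together, the helpers above suggest the following driver-side pattern. This is a hedged sketch only: struct my_vdev, its callbacks, and the probe flow are hypothetical, with just vfio_alloc_device()/vfio_free_device()/vfio_put_device() and the @init/@release ops taken from the patches shown above.

	#include <linux/vfio.h>

	struct my_vdev {
		struct vfio_device vdev;	/* must be the first member */
		void *priv;			/* driver private state */
	};

	static int my_init(struct vfio_device *core)
	{
		struct my_vdev *mv = container_of(core, struct my_vdev, vdev);

		mv->priv = NULL;		/* init private fields */
		return 0;
	}

	static void my_release(struct vfio_device *core)
	{
		/* Every driver must free the vfio_device in @release. */
		vfio_free_device(core);
	}

	static const struct vfio_device_ops my_ops = {
		.name		= "my-vfio-drv",
		.init		= my_init,
		.release	= my_release,
	};

	static int my_probe(struct device *dev)
	{
		struct my_vdev *mv;

		mv = vfio_alloc_device(my_vdev, vdev, dev, &my_ops);
		if (IS_ERR(mv))
			return PTR_ERR(mv);
		/* ... register; vfio_put_device(&mv->vdev) on teardown ... */
		return 0;
	}

The BUILD_BUG_ON_ZERO(offsetof(...)) in the vfio_alloc_device() macro enforces at build time that the embedded vfio_device sits at offset zero, which is what makes the container_of() and the IS_ERR() check above well-defined.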
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220921104401.38898-5-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 0499ea836058..367fd79226a3 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -106,13 +106,9 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops); int vfio_pci_core_init_dev(struct vfio_device *core_vdev); void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); extern const struct pci_error_handlers vfio_pci_core_err_handlers; int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, -- cgit v1.2.3 From ebb72b765fb49685c4603d2bff47a4ab5d2580a9 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:59 +0800 Subject: vfio/ccw: Use the new device life cycle helpers ccw is the only exception which cannot use vfio_alloc_device(), because its private device structure is designed to serve both mdev and parent. The life cycle of the parent is managed by css_driver, so vfio_ccw_private must be allocated/freed in the css_driver probe/remove path instead of conforming to the vfio core life cycle for mdev. Given that, use a wait/completion scheme so the mdev remove path waits after vfio_put_device() until receiving a completion notification from @release. The completion indicates that all active references on the vfio_device have been released. After that point, although the free of vfio_ccw_private is delayed to css_driver, it is at least guaranteed that no other code path holds a parallel reference to the released vfio device part. The memset() in @probe is removed. vfio_device is either already cleared when probed for the first time or cleared in @release from the last probe. The right fix is to introduce separate structures for mdev and parent, but this won't happen in the short term per prior discussions. Remove vfio_init/uninit_group_dev() as they have no users now.
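A hedged sketch of the wait/completion scheme just described; the field and function names are illustrative, not the actual vfio-ccw code.

	#include <linux/completion.h>
	#include <linux/mdev.h>
	#include <linux/vfio.h>

	struct vfio_ccw_private {
		struct vfio_device vdev;	/* set up via vfio_init_device() */
		struct completion release_comp;
		/* ... mixed parent and mdev state ... */
	};

	static void vfio_ccw_release(struct vfio_device *vdev)
	{
		struct vfio_ccw_private *private =
			container_of(vdev, struct vfio_ccw_private, vdev);

		/* Do not free here: the structure lives until css_driver remove. */
		complete(&private->release_comp);
	}

	static void vfio_ccw_mdev_remove(struct mdev_device *mdev)
	{
		struct vfio_ccw_private *private =
			dev_get_drvdata(mdev->dev.parent);	/* assumed lookup */

		vfio_unregister_group_dev(&private->vdev);
		vfio_put_device(&private->vdev);
		/* Wait until @release signals that all references are gone. */
		wait_for_completion(&private->release_comp);
		/* Freeing vfio_ccw_private itself is deferred to css_driver. */
	}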
Suggested-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220921104401.38898-14-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f67cac700e6f..3cf857b1eec7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -184,9 +184,6 @@ static inline void vfio_put_device(struct vfio_device *device) kref_put(&device->kref, vfio_device_release); } -void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); -void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -- cgit v1.2.3 From 3c28a76124b25882411f005924be73795b6ef078 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:44:01 +0800 Subject: vfio: Add struct device to vfio_device and replace kref. With it a 'vfio-dev/vfioX' node is created under the sysfs path of the parent, indicating the device is bound to a vfio driver, e.g.: /sys/devices/pci0000\:6f/0000\:6f\:01.0/vfio-dev/vfio0 It is also a preparatory step toward adding cdev for supporting future device-oriented uAPI. Add Documentation/ABI/testing/sysfs-devices-vfio-dev. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-16-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 3cf857b1eec7..ee399a768070 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - struct kref kref; /* object life cycle */ + unsigned int index; + struct device device; /* device.kref covers object life cycle */ refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; @@ -178,10 +179,9 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, int vfio_init_device(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); -void vfio_device_release(struct kref *kref); static inline void vfio_put_device(struct vfio_device *device) { - kref_put(&device->kref, vfio_device_release); + put_device(&device->device); } int vfio_register_group_dev(struct vfio_device *device); -- cgit v1.2.3 From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer.
This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we add one or more BPF helpers. A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) -- cgit v1.2.3 From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_user_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs, which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL. Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained.
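To make the helper's shape concrete, here is a hedged BPF-side sketch of a consumer; the map name, ring size, attach point, and callback body are assumptions for illustration, while the BPF_MAP_TYPE_USER_RINGBUF map type, the bpf_user_ringbuf_drain() signature, and the callback signature come from the commit above.

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
		__uint(max_entries, 256 * 1024);	/* ring size in bytes */
	} user_rb SEC(".maps");

	static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
	{
		long *drained = ctx;

		(*drained)++;
		/* bpf_dynptr_read() could copy the sample payload out here. */
		return 0;	/* 0 continues draining, non-zero stops */
	}

	SEC("kprobe/do_nanosleep")	/* arbitrary attach point for the sketch */
	int drain_user_rb(void *ctx)
	{
		long drained = 0;

		/* Runs handle_sample() on each published sample; flags = 0
		 * keeps the default wakeup behavior described above.
		 */
		bpf_user_ringbuf_drain(&user_rb, handle_sample, &drained, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";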
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- include/linux/bpf.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain(). + */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; -- cgit v1.2.3 From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:42 +0200 Subject: btf: Allow dynamic pointer parameters in kfuncs Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as parameters in kfuncs. Also, ensure that dynamic pointers passed as arguments are valid and initialized, are pointers to the stack, and are of the local type. More dynamic pointer types can be supported in the future. To properly detect whether a parameter is of the desired type, introduce the stringify_struct() macro to compare the returned structure name with the desired name. In addition, protect against structure renames by halting the build with BUILD_BUG_ON(), so that developers have to revisit the code. To check if a dynamic pointer passed to the kfunc is valid and initialized, and if its type is local, export the existing functions is_dynptr_reg_valid_init() and is_dynptr_type_expected().
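A hedged sketch of the convention this enables: a kfunc that takes a dynptr parameter typed struct bpf_dynptr_kern *, registered through the BTF kfunc ID-set API of this era. The kfunc name and the registration set are hypothetical; bpf_dynptr_get_size() is the accessor exported by the following patch in this series.

	#include <linux/bpf.h>
	#include <linux/btf.h>
	#include <linux/btf_ids.h>
	#include <linux/init.h>
	#include <linux/module.h>

	/* The verifier checks the argument is a valid, initialized, local
	 * (on-stack) dynptr before this function is ever called.
	 */
	noinline u32 bpf_demo_dynptr_size(struct bpf_dynptr_kern *ptr)
	{
		return bpf_dynptr_get_size(ptr);
	}

	BTF_SET8_START(demo_kfunc_ids)
	BTF_ID_FLAGS(func, bpf_demo_dynptr_size)
	BTF_SET8_END(demo_kfunc_ids)

	static const struct btf_kfunc_id_set demo_kfunc_set = {
		.owner	= THIS_MODULE,
		.set	= &demo_kfunc_ids,
	};

	static int __init demo_kfunc_init(void)
	{
		return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
						 &demo_kfunc_set);
	}
	late_initcall(demo_kfunc_init);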
Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ include/linux/btf.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e197f8fb27e2..9e1e6965f407 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1fcc833a8690..f9aababc5d78 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -52,6 +52,15 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +/* + * Return the name of the passed struct, if exists, or halt the build if for + * example the structure gets renamed. In this way, developers have to revisit + * the code using that structure name, and update it accordingly. + */ +#define stringify_struct(x) \ + ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ + __stringify(x); }) + struct btf; struct btf_member; struct btf_type; -- cgit v1.2.3 From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:43 +0200 Subject: bpf: Export bpf_dynptr_get_size() Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic pointers can obtain the real size of data carried by this data structure. Signed-off-by: Roberto Sassu Reviewed-by: Joanne Koong Acked-by: KP Singh Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33e543b86e1a..6535fb1e21b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); -- cgit v1.2.3 From 90fd8f26edd47942203639bf3a5dde8fa1931a0e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:44 +0200 Subject: KEYS: Move KEY_LOOKUP_ to include/linux/key.h and define KEY_LOOKUP_ALL In preparation for the patch that introduces the bpf_lookup_user_key() eBPF kfunc, move KEY_LOOKUP_ definitions to include/linux/key.h, to be able to validate the kfunc parameters. Add them to enum key_lookup_flag, so that all the current ones and the ones defined in the future are automatically exported through BTF and available to eBPF programs. 
Also, add KEY_LOOKUP_ALL to the enum, with the logical OR of the currently defined flags as its value, to facilitate checking whether a variable contains only those flags. Signed-off-by: Roberto Sassu Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20220920075951.929132-7-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/key.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 7febc4881363..d27477faf00d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -88,6 +88,12 @@ enum key_need_perm { KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; +enum key_lookup_flag { + KEY_LOOKUP_CREATE = 0x01, + KEY_LOOKUP_PARTIAL = 0x02, + KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), +}; + struct seq_file; struct user_struct; struct signal_struct; -- cgit v1.2.3 From f3cf4134c5c6c47b9b5c7aa3cb2d67e107887a7b Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:45 +0200 Subject: bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs Add the bpf_lookup_user_key(), bpf_lookup_system_key() and bpf_key_put() kfuncs, to respectively search a key with a given key handle serial number and flags, obtain a key from a pre-determined ID defined in include/linux/verification.h, and clean up. Introduce system_keyring_id_check() to validate the keyring ID parameter of bpf_lookup_system_key(). Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Acked-by: Song Liu Link: https://lore.kernel.org/r/20220920075951.929132-8-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ include/linux/verification.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6535fb1e21b9..a1435b019aca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2664,4 +2664,12 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ +struct key; + +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ #endif /* _LINUX_BPF_H */ diff --git a/include/linux/verification.h b/include/linux/verification.h index a655923335ae..f34e50ebcf60 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -17,6 +17,14 @@ #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL) #define VERIFY_USE_PLATFORM_KEYRING ((struct key *)2UL) +static inline int system_keyring_id_check(u64 id) +{ + if (id > (unsigned long)VERIFY_USE_PLATFORM_KEYRING) + return -EINVAL; + + return 0; +} + /* * The use to which an asymmetric key is being put. */ -- cgit v1.2.3 From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Sep 2022 09:19:14 +0200 Subject: bpf: Prevent bpf program recursion for raw tracepoint probes We got a report from syzbot [1] about warnings that were caused by a bpf program attached to the contention_begin raw tracepoint triggering the same tracepoint by using the bpf_trace_printk helper, which takes the trace_printk_lock lock. Call Trace:
? trace_event_raw_event_bpf_trace_printk+0x5f/0x90
bpf_trace_printk+0x2b/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
bpf_trace_printk+0x3f/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
bpf_trace_printk+0x3f/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
bpf_trace_printk+0x3f/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
__unfreeze_partials+0x5b/0x160
...

This can be reproduced by attaching a bpf program as a raw tracepoint on the contention_begin tracepoint. The bpf prog calls the bpf_trace_printk helper. Then, by running perf bench, the spin lock code is forced to take the slow path and call the contention_begin tracepoint. Fix this by skipping execution of the bpf program if it's already running, using the bpf prog 'active' field, which is currently used by trampoline programs for the same reason. Move bpf_prog_inc_misses_counter() to syscall.c because trampoline.c is compiled in just for the CONFIG_BPF_JIT option. Reviewed-by: Stanislav Fomichev Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com [1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1435b019aca..edd43edb27d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } + +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void) { return false; } + +static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -- cgit v1.2.3 From d7e7b9af104c7b389a0c21eb26532511bce4b510 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Sep 2022 12:32:06 -0700 Subject: fscrypt: stop using keyrings subsystem for fscrypt_master_key The approach of fs/crypto/ internally managing the fscrypt_master_key structs as the payloads of "struct key" objects contained in a "struct key" keyring has outlived its usefulness. The original idea was to simplify the code by reusing code from the keyrings subsystem. However, several issues have arisen that can't easily be resolved: - When a master key struct is destroyed, blk_crypto_evict_key() must be called on any per-mode keys embedded in it. (This started being the case when inline encryption support was added.) Yet, the keyrings subsystem can arbitrarily delay the destruction of keys, even past the time the filesystem was unmounted. Therefore, currently there is no easy way to call blk_crypto_evict_key() when a master key is destroyed. Currently, this is worked around by holding an extra reference to the filesystem's request_queue(s).
But it was overlooked that the request_queue reference is *not* guaranteed to pin the corresponding blk_crypto_profile too; for device-mapper devices that support inline crypto, it doesn't. This can cause a use-after-free. - When the last inode that was using an incompletely-removed master key is evicted, the master key removal is completed by removing the key struct from the keyring. Currently this is done via key_invalidate(). Yet, key_invalidate() takes the key semaphore. This can deadlock when called from the shrinker, since in fscrypt_ioctl_add_key(), memory is allocated with GFP_KERNEL under the same semaphore. - More generally, the fact that the keyrings subsystem can arbitrarily delay the destruction of keys (via garbage collection delay, or via random processes getting temporary key references) is undesirable, as it means we can't strictly guarantee that all secrets are ever wiped. - Doing the master key lookups via the keyrings subsystem results in the key_permission LSM hook being called. fscrypt doesn't want this, as all access control for encrypted files is designed to happen via the files themselves, like any other files. The workaround which SELinux users are using is to change their SELinux policy to grant key search access to all domains. This works, but it is an odd extra step that shouldn't really have to be done. The fix for all these issues is to change the implementation to what I should have done originally: don't use the keyrings subsystem to keep track of the filesystem's fscrypt_master_key structs. Instead, just store them in a regular kernel data structure, and rework the reference counting, locking, and lifetime accordingly. Retain support for RCU-mode key lookups by using a hash table. Replace fscrypt_sb_free() with fscrypt_sb_delete(), which releases the keys synchronously and runs a bit earlier during unmount, so that block devices are still available. A side effect of this patch is that neither the master keys themselves nor the filesystem keyrings will be listed in /proc/keys anymore. ("Master key users" and the master key users keyrings will still be listed.) However, this was mostly an implementation detail, and it was intended just for debugging purposes. I don't know of anyone using it. This patch does *not* change how "master key users" (->mk_users) works; that still uses the keyrings subsystem. That is still needed for key quotas, and changing that isn't necessary to solve the issues listed above. If we decide to change that too, it would be a separate patch. I've marked this as fixing the original commit that added the fscrypt keyring, but as noted above the most important issue that this patch fixes wasn't introduced until the addition of inline encryption support. 
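A hedged sketch of the replacement described above: fscrypt_master_key structs kept in a plain hash table with RCU-mode lookup, instead of keyring payloads. All names except the sb->s_master_keys field (whose new type appears in the diff below) are illustrative, not the actual fs/crypto implementation.

	#include <linux/fs.h>
	#include <linux/rculist.h>
	#include <linux/refcount.h>

	struct fscrypt_keyring {
		spinlock_t lock;			/* protects adds/removes */
		struct hlist_head key_hashtable[128];	/* buckets of master keys */
	};

	static struct fscrypt_master_key *
	fscrypt_find_master_key(struct super_block *sb,
				const struct fscrypt_key_specifier *mk_spec)
	{
		struct fscrypt_keyring *keyring = sb->s_master_keys;
		struct fscrypt_master_key *mk;
		struct hlist_head *bucket;

		bucket = &keyring->key_hashtable[fscrypt_mk_hash(mk_spec)]; /* assumed */
		rcu_read_lock();
		hlist_for_each_entry_rcu(mk, bucket, mk_node) {
			if (fscrypt_mk_matches(mk, mk_spec) &&		/* assumed */
			    refcount_inc_not_zero(&mk->mk_active_refs))
				break;	/* got a stable reference */
		}
		rcu_read_unlock();
		return mk;	/* NULL if the loop completed without a match */
	}

Because the filesystem owns this structure outright, it can release all keys synchronously at unmount (the fscrypt_sb_delete() rework below), which is what makes the timely blk_crypto_evict_key() calls possible.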
Fixes: 22d94f493bfb ("fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220901193208.138056-2-ebiggers@kernel.org --- include/linux/fs.h | 2 +- include/linux/fscrypt.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..0830486f47ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1472,7 +1472,7 @@ struct super_block { const struct xattr_handler **s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; - struct key *s_master_keys; /* master crypto keys in use */ + struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 488fd8c8f8af..db5bb5650bf2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -310,7 +310,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) } /* keyring.c */ -void fscrypt_sb_free(struct super_block *sb); +void fscrypt_sb_delete(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_add_test_dummy_key(struct super_block *sb, const struct fscrypt_dummy_policy *dummy_policy); @@ -524,7 +524,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) } /* keyring.c */ -static inline void fscrypt_sb_free(struct super_block *sb) +static inline void fscrypt_sb_delete(struct super_block *sb) { } -- cgit v1.2.3 From 0e91fc1e0f5c70ce575451103ec66c2ec21f1a6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Sep 2022 12:32:08 -0700 Subject: fscrypt: work on block_devices instead of request_queues request_queues are a block layer implementation detail that should not leak into file systems. Change the fscrypt inline crypto code to retrieve block devices instead of request_queues from the file system. As part of that, clean up the interaction with multi-device file systems by returning both the number of devices and the actual device array in a single method call. Signed-off-by: Christoph Hellwig [ebiggers: bug fixes and minor tweaks] Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220901193208.138056-4-ebiggers@kernel.org --- include/linux/fscrypt.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index db5bb5650bf2..1f12ebb4a69d 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -161,24 +161,21 @@ struct fscrypt_operations { int *ino_bits_ret, int *lblk_bits_ret); /* - * Return the number of block devices to which the filesystem may write - * encrypted file contents. + * Return an array of pointers to the block devices to which the + * filesystem may write encrypted file contents, NULL if the filesystem + * only has a single such block device, or an ERR_PTR() on error. + * + * On successful non-NULL return, *num_devs is set to the number of + * devices in the returned array. The caller must free the returned + * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. 
*/ - int (*get_num_devices)(struct super_block *sb); - - /* - * If ->get_num_devices() returns a value greater than 1, then this - * function is called to get the array of request_queues that the - * filesystem is using -- one per block device. (There may be duplicate - * entries in this array, as block devices can share a request_queue.) - */ - void (*get_devices)(struct super_block *sb, - struct request_queue **devs); + struct block_device **(*get_devices)(struct super_block *sb, + unsigned int *num_devs); }; static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) -- cgit v1.2.3 From d7f06bdd6ee87fbefa05af5f57361d85e7715b11 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 6 Sep 2022 16:35:42 -0400 Subject: drivers/base: Fix unsigned comparison to -1 in CPUMAP_FILE_MAX_BYTES As PAGE_SIZE is unsigned long, -1 > PAGE_SIZE when NR_CPUS <= 3. This leads to very large file sizes:

topology$ ls -l
total 0
-r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 core_cpus
-r--r--r-- 1 root root 4096 Sep 5 11:59 core_cpus_list
-r--r--r-- 1 root root 4096 Sep 5 10:58 core_id
-r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 core_siblings
-r--r--r-- 1 root root 4096 Sep 5 11:59 core_siblings_list
-r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 die_cpus
-r--r--r-- 1 root root 4096 Sep 5 11:59 die_cpus_list
-r--r--r-- 1 root root 4096 Sep 5 11:59 die_id
-r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 package_cpus
-r--r--r-- 1 root root 4096 Sep 5 11:59 package_cpus_list
-r--r--r-- 1 root root 4096 Sep 5 10:58 physical_package_id
-r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 thread_siblings
-r--r--r-- 1 root root 4096 Sep 5 11:59 thread_siblings_list

Adjust the inequality to catch the case when NR_CPUS is configured to a small value. Fixes: 7ee951acd31a ("drivers/base: fix userspace break from using bin_attributes for cpumap and cpulist") Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Yury Norov Cc: stable@vger.kernel.org Cc: feng xiangjun Reported-by: feng xiangjun Signed-off-by: Phil Auld Signed-off-by: Yury Norov Link: https://lore.kernel.org/r/20220906203542.1796629-1-pauld@redhat.com Signed-off-by: Greg Kroah-Hartman --- include/linux/cpumask.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bd047864c7ac..e8ad12b5b9d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1127,9 +1127,10 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * - * Use PAGE_SIZE as a minimum for smaller configurations. + * Use PAGE_SIZE as a minimum for smaller configurations while avoiding + * unsigned comparison to -1. */ -#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \ +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) -- cgit v1.2.3 From cca1fd41ab2862465d75443822d751e4f9a112ee Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 22 Sep 2022 07:20:57 -0400 Subject: counter: Realign counter_comp comment block to 80 characters The member documentation comment lines for struct counter_comp extend past the 80-characters column boundary due to extra indentation at the start of each section.
This patch realigns the comment block within the 80-characters boundary by removing these superfluous indents. Reviewed-by: Yanteng Si Link: https://lore.kernel.org/r/20220902120839.4260-1-william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/8294b04153c33602e9c3dd21ac90c1e99bd0fdaf.1663844776.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 1fe17f5adb09..a81234bc8ea8 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -38,64 +38,64 @@ enum counter_comp_type { * @type: Counter component data type * @name: device-specific component name * @priv: component-relevant data - * @action_read: Synapse action mode read callback. The read value of the + * @action_read: Synapse action mode read callback. The read value of the * respective Synapse action mode should be passed back via * the action parameter. - * @device_u8_read: Device u8 component read callback. The read value of the + * @device_u8_read: Device u8 component read callback. The read value of the * respective Device u8 component should be passed back via * the val parameter. - * @count_u8_read: Count u8 component read callback. The read value of the + * @count_u8_read: Count u8 component read callback. The read value of the * respective Count u8 component should be passed back via * the val parameter. - * @signal_u8_read: Signal u8 component read callback. The read value of the + * @signal_u8_read: Signal u8 component read callback. The read value of the * respective Signal u8 component should be passed back via * the val parameter. - * @device_u32_read: Device u32 component read callback. The read value of + * @device_u32_read: Device u32 component read callback. The read value of * the respective Device u32 component should be passed * back via the val parameter. - * @count_u32_read: Count u32 component read callback. The read value of the + * @count_u32_read: Count u32 component read callback. The read value of the * respective Count u32 component should be passed back via * the val parameter. - * @signal_u32_read: Signal u32 component read callback. The read value of + * @signal_u32_read: Signal u32 component read callback. The read value of * the respective Signal u32 component should be passed * back via the val parameter. - * @device_u64_read: Device u64 component read callback. The read value of + * @device_u64_read: Device u64 component read callback. The read value of * the respective Device u64 component should be passed * back via the val parameter. - * @count_u64_read: Count u64 component read callback. The read value of the + * @count_u64_read: Count u64 component read callback. The read value of the * respective Count u64 component should be passed back via * the val parameter. - * @signal_u64_read: Signal u64 component read callback. The read value of + * @signal_u64_read: Signal u64 component read callback. The read value of * the respective Signal u64 component should be passed * back via the val parameter. - * @action_write: Synapse action mode write callback. The write value of + * @action_write: Synapse action mode write callback. The write value of * the respective Synapse action mode is passed via the * action parameter. - * @device_u8_write: Device u8 component write callback. 
The write value of + the respective Device u8 component is passed via the val * parameter. - * @count_u8_write: Count u8 component write callback. The write value of + * @count_u8_write: Count u8 component write callback. The write value of * the respective Count u8 component is passed via the val * parameter. - * @signal_u8_write: Signal u8 component write callback. The write value of + * @signal_u8_write: Signal u8 component write callback. The write value of * the respective Signal u8 component is passed via the val * parameter. - * @device_u32_write: Device u32 component write callback. The write value of + * @device_u32_write: Device u32 component write callback. The write value of * the respective Device u32 component is passed via the * val parameter. - * @count_u32_write: Count u32 component write callback. The write value of + * @count_u32_write: Count u32 component write callback. The write value of * the respective Count u32 component is passed via the val * parameter. - * @signal_u32_write: Signal u32 component write callback. The write value of + * @signal_u32_write: Signal u32 component write callback. The write value of * the respective Signal u32 component is passed via the * val parameter. - * @device_u64_write: Device u64 component write callback. The write value of + * @device_u64_write: Device u64 component write callback. The write value of * the respective Device u64 component is passed via the * val parameter. - * @count_u64_write: Count u64 component write callback. The write value of + * @count_u64_write: Count u64 component write callback. The write value of * the respective Count u64 component is passed via the val * parameter. - * @signal_u64_write: Signal u64 component write callback. The write value of + * @signal_u64_write: Signal u64 component write callback. The write value of * the respective Signal u64 component is passed via the * val parameter. */ -- cgit v1.2.3 From 4d269ed485298e8a09485a664e7b35b370ab4ada Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:09 +0000 Subject: x86/resctrl: Kill off alloc_enabled rdt_resources_all[] used to have extra entries for L2CODE/L2DATA. These were hidden from resctrl by the alloc_enabled value. Now that the L2/L2CODE/L2DATA resources have been merged together, alloc_enabled doesn't mean anything: it always has the same value as alloc_capable, which indicates allocation is supported by this resource. Remove alloc_enabled and its helpers.
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-2-james.morse@arm.com --- include/linux/resctrl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 21deb5212bbd..386ab3a41500 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -130,7 +130,6 @@ struct resctrl_schema; /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource - * @alloc_enabled: Is allocation enabled on this machine * @mon_enabled: Is monitoring enabled for this feature * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine @@ -150,7 +149,6 @@ struct resctrl_schema; */ struct rdt_resource { int rid; - bool alloc_enabled; bool mon_enabled; bool alloc_capable; bool mon_capable; -- cgit v1.2.3 From bab6ee736873becc0216ba5fd159394e272d01b2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:10 +0000 Subject: x86/resctrl: Merge mon_capable and mon_enabled mon_enabled and mon_capable are always set as a pair by rdt_get_mon_l3_config(). There is no point having two values. Merge them together. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-3-james.morse@arm.com --- include/linux/resctrl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 386ab3a41500..8180c539800d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -130,7 +130,6 @@ struct resctrl_schema; /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource - * @mon_enabled: Is monitoring enabled for this feature * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine * @num_rmid: Number of RMIDs available @@ -149,7 +148,6 @@ struct resctrl_schema; */ struct rdt_resource { int rid; - bool mon_enabled; bool alloc_capable; bool mon_capable; int num_rmid; -- cgit v1.2.3 From de84a090d99a3b991bd89cd86a94b65d15bd1bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Sep 2022 12:11:21 +0200 Subject: net: ethernet: mtk_eth_wed: add wed support for mt7986 chipset Introduce Wireless Ethernet Dispatcher support on the transmission side for the mt7986 chipset. Tested-by: Daniel Golle Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 7e00cca06709..592221a7149b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -14,6 +14,7 @@ struct mtk_wdma_desc; struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; + u32 desc_size; int size; u32 reg_base; @@ -45,10 +46,17 @@ struct mtk_wed_device { struct pci_dev *pci_dev; u32 wpdma_phys; + u32 wpdma_int; + u32 wpdma_mask; + u32
wpdma_tx; + u32 wpdma_txfree; u16 token_start; unsigned int nbuf; + u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 txfree_tbit; + u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); -- cgit v1.2.3 From 2b2ba3ecb2411c5e2a0d670be5e9ded2c93351e9 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Sep 2022 12:11:22 +0200 Subject: net: ethernet: mtk_eth_wed: add axi bus support In addition to the PCIe bus, introduce support for the AXI bus in the mtk wed driver. The AXI bus is used to connect the mt7986-wmac SoC chip available on mt7986 devices. Tested-by: Daniel Golle Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 592221a7149b..4450c8b7a1cb 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,11 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_bus_tye { + MTK_WED_BUS_PCIE, + MTK_WED_BUS_AXI, +}; + struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; @@ -43,7 +48,11 @@ struct mtk_wed_device { /* filled by driver: */ struct { - struct pci_dev *pci_dev; + union { + struct platform_device *platform_dev; + struct pci_dev *pci_dev; + }; + enum mtk_wed_bus_tye bus_type; u32 wpdma_phys; u32 wpdma_int; -- cgit v1.2.3 From 3a7232cdf19e39e7f24c493117b373788b348af2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:11 +0000 Subject: x86/resctrl: Add domain online callback for resctrl work Because domains are exposed to user-space via resctrl, the filesystem must update its state when CPU hotplug callbacks are triggered. Some of this work is common to any architecture that would support resctrl, but the work is tied up with the architecture code to allocate the memory. Move domain_setup_mon_state(), the monitor subdir creation call and the mbm/limbo workers into a new resctrl_online_domain() call. These bits are not specific to the architecture. Grouping them in one function allows that code to be moved to /fs/ and re-used by another architecture.
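A hedged sketch of how an architecture's CPU-online path might hand a freshly set-up domain to the filesystem hook above; the surrounding function, resource list, and domain allocator are illustrative assumptions, with only resctrl_online_domain() coming from the patch.

	#include <linux/resctrl.h>

	static int resctrl_arch_online_cpu(unsigned int cpu)
	{
		struct rdt_resource *r;
		struct rdt_domain *d;
		int err;

		list_for_each_entry(r, &rdt_all_resources, list) { /* assumed list */
			/* Arch side: allocate/extend the domain's memory. */
			d = arch_find_or_alloc_domain(r, cpu);	/* hypothetical */
			if (IS_ERR(d))
				return PTR_ERR(d);

			/* Filesystem side: mon subdir, mbm/limbo workers. */
			err = resctrl_online_domain(r, d);
			if (err)
				return err;
		}
		return 0;
	}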
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-4-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 8180c539800d..d512455b4c3a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -192,5 +192,6 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); #endif /* _RESCTRL_H */ -- cgit v1.2.3 From 798fd4b9ac37fec571f55fb8592497b0dd5f7a73 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:13 +0000 Subject: x86/resctrl: Add domain offline callback for resctrl work Because domains are exposed to user-space via resctrl, the filesystem must update its state when CPU hotplug callbacks are triggered. Some of this work is common to any architecture that would support resctrl, but the work is tied up with the architecture code to free the memory. Move the monitor subdir removal and the cancelling of the mbm/limbo works into a new resctrl_offline_domain() call. These bits are not specific to the architecture. Grouping them in one function allows that code to be moved to /fs/ and re-used by another architecture. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-6-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d512455b4c3a..5d283bdd6162 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -193,5 +193,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); #endif /* _RESCTRL_H */ -- cgit v1.2.3 From 7a4e0d2c7fb8e28bb8ce0687925c9cf91d65f2a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Fri, 16 Sep 2022 03:54:59 +0200 Subject: tty: remove TTY_MAGIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Greg, in the context of magic numbers as defined in magic-number.rst, "the tty layer should not need this and I'll gladly take patches" Acked-by: Jiri Slaby Signed-off-by: Ahelenia Ziemiańska Ref: https://lore.kernel.org/linux-doc/YyMlovoskUcHLEb7@kroah.com/ Link: https://lore.kernel.org/r/476d024cd6b04160a5de381ea2b9856b60088cbd.1663288066.git.nabijaczleweli@nabijaczleweli.xyz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index ae41893f8653..730c3301d710 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -122,8 +122,6 @@ struct 
tty_operations; /** * struct tty_struct - state associated with a tty while open * - * @magic: magic value set early in @alloc_tty_struct to %TTY_MAGIC, for - * debugging purposes * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) @@ -193,7 +191,6 @@ struct tty_operations; * &struct tty_port. */ struct tty_struct { - int magic; struct kref kref; struct device *dev; struct tty_driver *driver; @@ -260,9 +257,6 @@ struct tty_file_private { struct list_head list; }; -/* tty magic number */ -#define TTY_MAGIC 0x5401 - /** * DOC: TTY Struct Flags * -- cgit v1.2.3 From 5052df99d3bc3cd281222bbcba44323b2d0937d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Fri, 16 Sep 2022 03:55:05 +0200 Subject: tty: remove TTY_DRIVER_MAGIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Greg, in the context of magic numbers as defined in magic-number.rst, "the tty layer should not need this and I'll gladly take patches" Acked-by: Jiri Slaby Signed-off-by: Ahelenia Ziemiańska Ref: https://lore.kernel.org/linux-doc/YyMlovoskUcHLEb7@kroah.com/ Link: https://lore.kernel.org/r/723478a270a3858f27843cbec621df4d5d44efcc.1663288066.git.nabijaczleweli@nabijaczleweli.xyz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index f961164a5274..e00034118c7b 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -397,7 +397,6 @@ struct tty_operations { /** * struct tty_driver -- driver for TTY devices * - * @magic: set to %TTY_DRIVER_MAGIC in __tty_alloc_driver() * @kref: reference counting. Reaching zero frees all the internals and the * driver. * @cdevs: allocated/registered character /dev devices @@ -433,7 +432,6 @@ struct tty_operations { * @driver_name, @name, @type, @subtype, @init_termios, and @ops. */ struct tty_driver { - int magic; struct kref kref; struct cdev **cdevs; struct module *owner; @@ -490,9 +488,6 @@ static inline void tty_set_operations(struct tty_driver *driver, driver->ops = op; } -/* tty driver magic number */ -#define TTY_DRIVER_MAGIC 0x5402 - /** * DOC: TTY Driver Flags * -- cgit v1.2.3 From 9906890c89e4dbd900ed87ad3040080339a7f411 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Sep 2022 00:35:32 +0100 Subject: serial: 8250: Let drivers request full 16550A feature probing A SERIAL_8250_16550A_VARIANTS configuration option has been recently defined that lets one request the 8250 driver not to probe for 16550A device features so as to reduce the driver's device startup time in virtual machines. Some actual hardware devices require these features to have been fully determined however for their driver to work correctly, so define a flag to let drivers request full 16550A feature probing on a device-by-device basis if required regardless of the SERIAL_8250_16550A_VARIANTS option setting chosen. Fixes: dc56ecb81a0a ("serial: 8250: Support disabling mdelay-filled probes of 16550A variants") Cc: stable@vger.kernel.org # v5.6+ Reported-by: Anders Blomdell Signed-off-by: Maciej W. 
Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209202357520.41633@angie.orcam.me.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 02a4299b7d42..2ae182f0f1de 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -422,7 +422,7 @@ struct uart_icount { __u32 buf_overrun; }; -typedef unsigned int __bitwise upf_t; +typedef u64 __bitwise upf_t; typedef unsigned int __bitwise upstat_t; struct uart_port { @@ -530,6 +530,7 @@ struct uart_port { #define UPF_FIXED_PORT ((__force upf_t) (1 << 29)) #define UPF_DEAD ((__force upf_t) (1 << 30)) #define UPF_IOREMAP ((__force upf_t) (1 << 31)) +#define UPF_FULL_PROBE ((__force upf_t) (1ULL << 32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) -- cgit v1.2.3 From 46a8973c4d9d7b12e0e4dd9f589d08d420fb6c0d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Sep 2022 00:35:42 +0100 Subject: serial: 8250: Switch UART port flags to using BIT_ULL Use BIT_ULL rather than encoding bits explicitly where applicable with UART port flags. This makes a (__force upf_t) cast redundant, but keep it for visual consistency with the flags defined in terms of userspace macros. Signed-off-by: Maciej W. Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209210007030.41633@angie.orcam.me.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 2ae182f0f1de..9e0a9d379390 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -513,24 +513,24 @@ struct uart_port { #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) -#define UPF_NO_THRE_TEST ((__force upf_t) (1 << 19)) +#define UPF_NO_THRE_TEST ((__force upf_t) BIT_ULL(19)) /* Port has hardware-assisted h/w flow control */ -#define UPF_AUTO_CTS ((__force upf_t) (1 << 20)) -#define UPF_AUTO_RTS ((__force upf_t) (1 << 21)) +#define UPF_AUTO_CTS ((__force upf_t) BIT_ULL(20)) +#define UPF_AUTO_RTS ((__force upf_t) BIT_ULL(21)) #define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ -#define UPF_SOFT_FLOW ((__force upf_t) (1 << 22)) -#define UPF_CONS_FLOW ((__force upf_t) (1 << 23)) -#define UPF_SHARE_IRQ ((__force upf_t) (1 << 24)) -#define UPF_EXAR_EFR ((__force upf_t) (1 << 25)) -#define UPF_BUG_THRE ((__force upf_t) (1 << 26)) +#define UPF_SOFT_FLOW ((__force upf_t) BIT_ULL(22)) +#define UPF_CONS_FLOW ((__force upf_t) BIT_ULL(23)) +#define UPF_SHARE_IRQ ((__force upf_t) BIT_ULL(24)) +#define UPF_EXAR_EFR ((__force upf_t) BIT_ULL(25)) +#define UPF_BUG_THRE ((__force upf_t) BIT_ULL(26)) /* The exact UART type is known and should not be probed. 
*/ -#define UPF_FIXED_TYPE ((__force upf_t) (1 << 27)) -#define UPF_BOOT_AUTOCONF ((__force upf_t) (1 << 28)) -#define UPF_FIXED_PORT ((__force upf_t) (1 << 29)) -#define UPF_DEAD ((__force upf_t) (1 << 30)) -#define UPF_IOREMAP ((__force upf_t) (1 << 31)) -#define UPF_FULL_PROBE ((__force upf_t) (1ULL << 32)) +#define UPF_FIXED_TYPE ((__force upf_t) BIT_ULL(27)) +#define UPF_BOOT_AUTOCONF ((__force upf_t) BIT_ULL(28)) +#define UPF_FIXED_PORT ((__force upf_t) BIT_ULL(29)) +#define UPF_DEAD ((__force upf_t) BIT_ULL(30)) +#define UPF_IOREMAP ((__force upf_t) BIT_ULL(31)) +#define UPF_FULL_PROBE ((__force upf_t) BIT_ULL(32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) -- cgit v1.2.3 From 039d4926379b1d1c17b51cf21c500a5eed86899e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 22 Sep 2022 10:00:05 +0300 Subject: serial: 8250: Toggle IER bits on only after irq has been set up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invoking TIOCVHANGUP on an 8250_mid port on Ice Lake-D and then reopening the port triggers these faults during serial8250_do_startup(): DMAR: DRHD: handling fault status reg 3 DMAR: [DMA Write NO_PASID] Request device [00:1a.0] fault addr 0x0 [fault reason 0x05] PTE Write access is not set If the IRQ hasn't been set up yet, the UART will have zeroes in its MSI address/data registers. Disabling the IRQ at the interrupt controller won't stop the UART from performing a DMA write to the address programmed in its MSI address register (zero) when it wants to signal an interrupt. The UARTs (in Ice Lake-D) implement PCI 2.1 style MSI without masking capability, so there is no way to mask the interrupt at the source PCI function level, except disabling the MSI capability entirely, but that would cause it to fall back to INTx# assertion, and the PCI specification prohibits disabling the MSI capability as a way to mask a function's interrupt service request. The MSI address register is zeroed by the hangup as the irq is freed. The interrupt is signalled during serial8250_do_startup() performing a THRE test that temporarily toggles THRI in IER. The THRE test currently occurs before the UART's irq (and MSI address) is properly set up. Refactor serial8250_do_startup() such that the irq is set up before the THRE test. The current irq setup code is intermixed with the timer setup code. As the THRE test must be performed prior to the timer setup, extract the timer setup into its own function and call it only after the THRE test. The ->setup_timer() needs to be part of struct uart_8250_ops in order not to create a circular dependency between the 8250 and 8250_base modules.
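As a rough sketch of the startup ordering this describes (only setup_irq() and setup_timer() come from struct uart_8250_ops; the wrapper and the THRE-test helper below are illustrative placeholders, not the driver's actual code):

static int startup_order_sketch(struct uart_8250_port *up)
{
	int retval;

	/* Program the irq first, so the MSI address/data are valid */
	retval = up->ops->setup_irq(up);
	if (retval)
		return retval;

	/*
	 * Only now is the THRE test safe: an interrupt signalled while
	 * THRI is toggled no longer becomes a DMA write to address zero.
	 */
	thre_test_sketch(up);

	/* Backup timer setup, split out of the irq setup, runs last */
	if (up->ops->setup_timer)
		up->ops->setup_timer(up);

	return 0;
}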
Fixes: 40b36daad0ac ("[PATCH] 8250 UART backup timer") Reported-by: Lennert Buytenhek Tested-by: Lennert Buytenhek Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220922070005.2965-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 43edd10e8c96..19376bee9667 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -74,6 +74,7 @@ struct uart_8250_port; struct uart_8250_ops { int (*setup_irq)(struct uart_8250_port *); void (*release_irq)(struct uart_8250_port *); + void (*setup_timer)(struct uart_8250_port *); }; struct uart_8250_em485 { -- cgit v1.2.3 From 781096d971dfe3c5f9401a300bdb0b148a600584 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:16 +0000 Subject: x86/resctrl: Create mba_sc configuration in the rdt_domain To support resctrl's MBA software controller, the architecture must provide a second configuration array to hold the mbps_val[] from user-space. This complicates the interface between the architecture specific code and the filesystem portions of resctrl that will move to /fs/, to allow multiple architectures to support resctrl. Make the filesystem parts of resctrl create an array for the mba_sc values. The software controller can be changed to use this, allowing the architecture code to only consider the values configured in hardware. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-9-james.morse@arm.com --- include/linux/resctrl.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5d283bdd6162..93dfe553b364 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -15,6 +15,9 @@ int proc_resctrl_show(struct seq_file *m, #endif +/* max value for struct rdt_domain's mbps_val */ +#define MBA_MAX_MBPS U32_MAX + /** * enum resctrl_conf_type - The type of configuration. * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. @@ -53,6 +56,9 @@ struct resctrl_staged_config { * @cqm_work_cpu: worker CPU for CQM h/w counters * @plr: pseudo-locked region (if any) associated with domain * @staged_config: parsed configuration to be applied + * @mbps_val: When mba_sc is enabled, this holds the array of user + * specified control values for mba_sc in MBps, indexed + * by closid */ struct rdt_domain { struct list_head list; @@ -67,6 +73,7 @@ struct rdt_domain { int cqm_work_cpu; struct pseudo_lock_region *plr; struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; + u32 *mbps_val; }; /** -- cgit v1.2.3 From ff6357bb50023af2a1dc8f113930082c5252c753 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:19 +0000 Subject: x86/resctrl: Allow update_mba_bw() to update controls directly update_mba_bw() calculates a new control value for the MBA resource based on the user provided mbps_val and the current measured bandwidth. Some control values need remapping by delay_bw_map(). It does this by calling wrmsrl() directly. This needs splitting up to be done by an architecture specific helper, so that the remainder can eventually be moved to /fs/. 
Add resctrl_arch_update_one() to apply one configuration value to the provided resource and domain. This avoids the staging and cross-calling that is only needed with changes made by user-space. delay_bw_map() moves to be part of the arch code, to maintain the 'percentage control' view of MBA resources in resctrl. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-12-james.morse@arm.com --- include/linux/resctrl.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 93dfe553b364..f4c9101df461 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -197,6 +197,14 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); + +/* + * Update the ctrl_val and apply this config right now. + * Must be called on one of the domain's CPUs. + */ +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val); + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); -- cgit v1.2.3 From 21803630c4ffb433bab2951fbe0ee7b8dbcc8bcb Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:46 -0700 Subject: net/mlx5: Fix fields name prefix in MACsec Fix the ifc field names to be consistent with the device spec document. Fixes: 8385c51ff5bc ("net/mlx5: Introduce MACsec Connect-X offload hardware bits and structures") Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8decbf9a7bdd..da0ed11fcebd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11585,15 +11585,15 @@ struct mlx5_ifc_macsec_offload_obj_bits { u8 confidentiality_en[0x1]; u8 reserved_at_41[0x1]; - u8 esn_en[0x1]; - u8 esn_overlap[0x1]; + u8 epn_en[0x1]; + u8 epn_overlap[0x1]; u8 reserved_at_44[0x2]; u8 confidentiality_offset[0x2]; u8 reserved_at_48[0x4]; u8 aso_return_reg[0x4]; u8 reserved_at_50[0x10]; - u8 esn_msb[0x20]; + u8 epn_msb[0x20]; u8 reserved_at_80[0x8]; u8 dekn[0x18]; -- cgit v1.2.3 From 23cc83c6ca87c9a4da62b9ddf4d0fc09f3054f81 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:49 -0700 Subject: net/mlx5: Add ifc bits for MACsec extended packet number (EPN) and replay protection Add ifc bits related to advanced steering operations (ASO) and general object modify for macsec to use as part of offloading EPN and replay protection features.
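As a hedged sketch of how a driver might program the (now epn-prefixed) fields with the standard mlx5 ifc accessors (the helper is illustrative; obj is assumed to point at a struct mlx5_ifc_macsec_offload_obj_bits layout inside a command buffer):

static void macsec_obj_set_epn_sketch(void *obj, u32 epn_msb, bool overlap)
{
	MLX5_SET(macsec_offload_obj, obj, epn_en, 1);
	MLX5_SET(macsec_offload_obj, obj, epn_overlap, overlap);
	MLX5_SET(macsec_offload_obj, obj, epn_msb, epn_msb);
}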
Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index da0ed11fcebd..bd577b99b146 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11558,6 +11558,20 @@ struct mlx5_ifc_modify_ipsec_obj_in_bits { struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; +enum { + MLX5_MACSEC_ASO_REPLAY_PROTECTION = 0x1, +}; + +enum { + MLX5_MACSEC_ASO_REPLAY_WIN_32BIT = 0x0, + MLX5_MACSEC_ASO_REPLAY_WIN_64BIT = 0x1, + MLX5_MACSEC_ASO_REPLAY_WIN_128BIT = 0x2, + MLX5_MACSEC_ASO_REPLAY_WIN_256BIT = 0x3, +}; + +#define MLX5_MACSEC_ASO_INC_SN 0x2 +#define MLX5_MACSEC_ASO_REG_C_4_5 0x2 + struct mlx5_ifc_macsec_aso_bits { u8 valid[0x1]; u8 reserved_at_1[0x1]; @@ -11619,6 +11633,21 @@ struct mlx5_ifc_create_macsec_obj_in_bits { struct mlx5_ifc_macsec_offload_obj_bits macsec_object; }; +struct mlx5_ifc_modify_macsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + +enum { + MLX5_MODIFY_MACSEC_BITMASK_EPN_OVERLAP = BIT(0), + MLX5_MODIFY_MACSEC_BITMASK_EPN_MSB = BIT(1), +}; + +struct mlx5_ifc_query_macsec_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + struct mlx5_ifc_encryption_key_obj_bits { u8 modify_field_select[0x40]; -- cgit v1.2.3 From 4411a6c0abd3e55b4a4fb9432b3a0553f12337c2 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:53 -0700 Subject: net/mlx5e: Support MACsec offload extended packet number (EPN) MACsec EPN splits the packet number (PN) into two 32-bit fields, epn_lsb (the 32 least significant bits (LSBs) of the PN) and epn_msb (the 32 most significant bits (MSBs) of the PN). The epn_msb bits are managed by SW; for that, HW is required to send an object change event of type EPN notifying SW to update the epn_msb. In addition, once epn_msb is updated, SW updates HW with the new epn_msb value so that HW can perform replay protection. To prevent HW from stopping while handling the event, SW manages another bit for HW called epn_overlap; HW uses the latter to get an indication of how to read the epn_msb value correctly while still receiving packets. Add epn event handling that updates the epn_overlap and epn_msb for every 2^31 packets according to the following logic: if epn_lsb crosses 2^31 (half sequence number wraparound) upon the relevant HW event, SW updates the epn_overlap value to OLD (value = 1). When epn_lsb crosses 2^32 (full sequence number wraparound) upon the relevant HW event, SW updates epn_overlap to NEW (value = 0) and increments epn_msb. When using MACsec EPN, a salt and a short secure channel id (ssci) need to be provided by the user; when offloading EPN, this salt and ssci must be passed to the HW to be used in the initial vector (IV) calculations.
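The per-event update condenses to a small helper; a minimal sketch, assuming the event handler has already extracted the current epn_lsb (the struct and function below are illustrative, not the driver's code):

struct epn_state_sketch {
	u32 epn_msb;
	u8 epn_overlap;
};

/* Called on each HW EPN event, i.e. every 2^31 packets */
static void epn_event_sketch(struct epn_state_sketch *st, u32 epn_lsb)
{
	if (epn_lsb & BIT(31)) {
		/* crossed 2^31: current epn_msb now describes the OLD window */
		st->epn_overlap = 1;
	} else {
		/* wrapped past 2^32: start the NEW window */
		st->epn_overlap = 0;
		st->epn_msb++;
	}
}

The overlap flag is what lets HW keep reading epn_msb correctly while packets from both sides of the wraparound are still in flight.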
Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Reported-by: kernel test robot Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2927810f172b..dcd60fb9e6b4 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -325,6 +325,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, MLX5_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, MLX5_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27, MLX5_EVENT_TYPE_INTERNAL_ERROR = 0x08, MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, @@ -699,6 +700,12 @@ struct mlx5_eqe_temp_warning { __be64 sensor_warning_lsb; } __packed; +struct mlx5_eqe_obj_change { + u8 rsvd0[2]; + __be16 obj_type; + __be32 obj_id; +} __packed; + #define SYNC_RST_STATE_MASK 0xf enum sync_rst_state_type { @@ -737,6 +744,7 @@ union ev_data { struct mlx5_eqe_xrq_err xrq_err; struct mlx5_eqe_sync_fw_update sync_fw_update; struct mlx5_eqe_vhca_state vhca_state; + struct mlx5_eqe_obj_change obj_change; } __packed; struct mlx5_eqe { -- cgit v1.2.3 From 6edf2576a6cc46460c164831517a36064eb8109c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 13 Sep 2022 14:54:20 +0800 Subject: mm/slub: enable debugging memory wasting of kmalloc kmalloc's API family is critical for mm, with one nature that it will round up the request size to a fixed one (mostly a power of 2). Say when a user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes could be allocated, so in the worst case there is around 50% memory space waste. The wastage is not a big issue for requests that get allocated/freed quickly, but may cause problems for objects with a longer lifetime. We've met a kernel boot OOM panic (v5.10), and from the dumped slab info: [ 26.062145] kmalloc-2k 814056KB 814056KB From debug we found there was a huge number of 'struct iova_magazine' objects, whose size is 1032 bytes (1024 + 8), so each allocation wastes 1016 bytes. Though the issue was solved by giving the right (bigger) size of RAM, it is still nice to optimize the size (either use a kmalloc friendly size or create a dedicated slab for it). And from the lkml archive, there was another crash kernel OOM case [1] back in 2019, which seems to be related to a similar slab waste situation, as the log is similar: [ 4.332648] iommu: Adding device 0000:20:02.0 to group 16 [ 4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0 ... [ 4.857565] kmalloc-2048 59164KB 59164KB The crash kernel only has 256M memory, and 59M is pretty big here. (Note: the related code has been changed and optimised in recent kernels [2]; these logs are just picked to demo the problem, and a patch changing its size to 1024 bytes has been merged) So add a way to track each kmalloc's memory waste info, and leverage the existing SLUB debug framework (specifically SLUB_STORE_USER) to show the call stack of the original allocation, so that users can evaluate the waste situation, identify some hot spots and optimize accordingly, for better utilization of memory.
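A concrete userspace illustration of the rounding waste described above (the power-of-two rounding here approximates kmalloc's size classes for sizes above 512 bytes; it is not the allocator's actual code):

#include <stdio.h>

static unsigned long rounded_size(unsigned long size)
{
	unsigned long n = 512;

	while (n < size)
		n <<= 1;
	return n;
}

int main(void)
{
	unsigned long req = 1032;	/* the iova_magazine case above */
	unsigned long got = rounded_size(req);

	/* prints: requested 1032, allocated 2048, wasted 1016 */
	printf("requested %lu, allocated %lu, wasted %lu\n",
	       req, got, got - req);
	return 0;
}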
The waste info is integrated into the existing interface: '/sys/kernel/debug/slab/kmalloc-xx/alloc_traces'; one example of 'kmalloc-4k' after boot is: 126 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] waste=233856/1856 age=280763/281414/282065 pid=1330 cpus=32 nodes=1 __kmem_cache_alloc_node+0x11f/0x4e0 __kmalloc_node+0x4e/0x140 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] ixgbe_init_interrupt_scheme+0x2ae/0xc90 [ixgbe] ixgbe_probe+0x165f/0x1d20 [ixgbe] local_pci_probe+0x78/0xc0 work_for_cpu_fn+0x26/0x40 ... which means that in the 'kmalloc-4k' slab, there are 126 requests of 2240 bytes which got a 4KB space (wasting 1856 bytes each and 233856 bytes in total), from ixgbe_alloc_q_vector(). And when the system starts some real workload like multiple docker instances, the waste could be more severe. [1]. https://lkml.org/lkml/2019/8/12/266 [2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/ [Thanks Hyeonggon for pointing out several bugs about sorting/format] [Thanks Vlastimil for suggesting a way to reduce memory usage of orig_size and keep it only for kmalloc objects] Signed-off-by: Feng Tang Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Robin Murphy Cc: John Garry Cc: Kefeng Wang Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..a713b0e5bbcd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -29,6 +29,8 @@ #define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ #define SLAB_POISON ((slab_flags_t __force)0x00000800U) +/* Indicate a kmalloc slab */ +#define SLAB_KMALLOC ((slab_flags_t __force)0x00001000U) /* Align objs on cache lines */ #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ -- cgit v1.2.3 From fea62d370d7a1ba288d71d0cae7ad47c2a02b839 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:22 +0000 Subject: x86/resctrl: Allow per-rmid arch private storage to be reset To abstract the rmid counters into a helper that returns the number of bytes counted, architecture specific per-rmid state is needed. It needs to be possible to reset this hidden state, as the values may outlive the life of an rmid, or the mount time of the filesystem. mon_event_read() is called with first = true when an rmid is first allocated in mkdir_mondata_subdir(). Add resctrl_arch_reset_rmid() and call it from __mon_event_count()'s rr->first check. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-15-james.morse@arm.com --- include/linux/resctrl.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f4c9101df461..818456770176 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -32,6 +32,16 @@ enum resctrl_conf_type { #define CDP_NUM_TYPES (CDP_DATA + 1) +/* + * Event IDs, the values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems.
+ */ +enum resctrl_event_id { + QOS_L3_OCCUP_EVENT_ID = 0x01, + QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, + QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +}; + /** * struct resctrl_staged_config - parsed configuration to be applied * @new_ctrl: new ctrl value to be loaded @@ -210,4 +220,17 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +/** + * resctrl_arch_reset_rmid() - Reset any private state associated with rmid + * and eventid. + * @r: The domain's resource. + * @d: The rmid's domain. + * @rmid: The rmid whose counter values should be reset. + * @eventid: The eventid whose counter values should be reset. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid); + #endif /* _RESCTRL_H */ -- cgit v1.2.3 From 72bc36956f73ac54f19a7ac7302fb274069bec18 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:28 -0400 Subject: net: phylink: Document MAC_(A)SYM_PAUSE This documents the possible MLO_PAUSE_* settings which can result from different combinations of MAC_(A)SYM_PAUSE. Special note is paid to settings which can result from user configuration (MLO_PAUSE_AN). The autonegotiation results are more-or-less a direct consequence of IEEE 802.3 Table 28B-2. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 6d06896fc20d..1f997e14bf80 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -21,6 +21,35 @@ enum { MLO_AN_FIXED, /* Fixed-link mode */ MLO_AN_INBAND, /* In-band protocol */ + /* MAC_SYM_PAUSE and MAC_ASYM_PAUSE are used when configuring our + * autonegotiation advertisement. They correspond to the PAUSE and + * ASM_DIR bits defined by 802.3, respectively. + * + * The following table lists the values of tx_pause and rx_pause which + * might be requested in mac_link_up. The exact values depend on either + * the results of autonegotiation (if MLO_PAUSE_AN is set) or user + * configuration (if MLO_PAUSE_AN is not set). + * + * MAC_SYM_PAUSE MAC_ASYM_PAUSE MLO_PAUSE_AN tx_pause/rx_pause + * ============= ============== ============ ================== + * 0 0 0 0/0 + * 0 0 1 0/0 + * 0 1 0 0/0, 0/1, 1/0, 1/1 + * 0 1 1 0/0, 1/0 + * 1 0 0 0/0, 1/1 + * 1 0 1 0/0, 1/1 + * 1 1 0 0/0, 0/1, 1/0, 1/1 + * 1 1 1 0/0, 0/1, 1/1 + * + * If you set MAC_ASYM_PAUSE, the user may request any combination of + * tx_pause and rx_pause. You do not have to support these + * combinations. + * + * However, you should support combinations of tx_pause and rx_pause + * which might be the result of autonegotiation. For example, don't set + * MAC_SYM_PAUSE unless your device can support tx_pause and rx_pause + * at the same time. + */ MAC_SYM_PAUSE = BIT(0), MAC_ASYM_PAUSE = BIT(1), MAC_10HD = BIT(2), -- cgit v1.2.3 From 606116529ab2d12e93bf751f74ed50a621b46846 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:29 -0400 Subject: net: phylink: Export phylink_caps_to_linkmodes This function is convenient for MAC drivers. They can use it to add or remove particular link modes based on capabilities (such as if half duplex is not supported for a particular interface mode).
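A hedged sketch of the consumer pattern this enables; the foo_ names are illustrative and the chosen MAC_* capability bits are just an example set from phylink.h:

static void foo_mac_validate(struct phylink_config *config,
			     unsigned long *supported,
			     struct phylink_link_state *state)
{
	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
	/* full-duplex only: the half-duplex MAC_*HD bits are left out */
	unsigned long caps = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
			     MAC_10FD | MAC_100FD | MAC_1000FD;

	phylink_caps_to_linkmodes(mask, caps);
	linkmode_and(supported, supported, mask);
}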
Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1f997e14bf80..7cf26d7a522d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -547,6 +547,7 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); #endif +void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, unsigned long mac_capabilities); void phylink_generic_validate(struct phylink_config *config, -- cgit v1.2.3 From 3e6eab8f3ef93cd78cd4b67f497ef6035eb073ad Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:30 -0400 Subject: net: phylink: Generate caps and convert to linkmodes separately If we call phylink_caps_to_linkmodes directly from phylink_get_linkmodes, it is difficult to re-use this functionality in MAC drivers. This is because MAC drivers must then work with an ethtool linkmode bitmap, instead of with mac capabilities. Instead, let the caller of phylink_get_linkmodes do the conversion. To reflect this change, rename the function to phylink_get_capabilities. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 7cf26d7a522d..cc039ae7e80c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -548,8 +548,8 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, #endif void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); -void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, - unsigned long mac_capabilities); +unsigned long phylink_get_capabilities(phy_interface_t interface, + unsigned long mac_capabilities); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 0c3e10cb44232833a50cb8e3e784c432906a60c1 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:31 -0400 Subject: net: phy: Add support for rate matching This adds support for rate matching (also known as rate adaptation) to the phy subsystem. The general idea is that the phy interface runs at one speed, and the MAC throttles the rate at which it sends packets to the link speed. There's a good overview of several techniques for achieving this at [1]. This patch adds support for three: pause-frame based (such as in Aquantia phys), CRS-based (such as in 10PASS-TS and 2BASE-TL), and open-loop-based (such as in 10GBASE-W). This patch makes a few assumptions and a few non assumptions about the types of rate matching available. First, it assumes that different phys may use different forms of rate matching. Second, it assumes that phys can use rate matching for any of their supported link speeds (e.g. if a phy supports 10BASE-T and XGMII, then it can adapt XGMII to 10BASE-T). Third, it does not assume that all interface modes will use the same form of rate matching. Fourth, it does not assume that all phy devices will support rate matching (even if some do). Relaxing or strengthening these (non-)assumptions could result in a different API. 
For example, if all interface modes were assumed to use the same form of rate matching, then a bitmask of interface modes supporting rate matching would suffice. For better visibility into the process, the current rate matching mode is exposed as part of the ethtool ksettings. For the moment, only read access is supported. I'm not sure what userspace might want to configure yet (disable it altogether, disable just one mode, specify the mode to use, etc.). For the moment, since only pause-based rate adaptation support is added in the next few commits, rate matching can be disabled altogether by adjusting the advertisement. 802.3 calls this feature "rate adaptation" in clause 49 (10GBASE-R) and "rate matching" in clause 61 (10PASS-TS and 2BASE-TL). Aquantia also calls this feature "rate adaptation". I chose "rate matching" because it is shorter, and because Russell doesn't think "adaptation" is correct in this context. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 337230c135f7..9c66f357f489 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -280,7 +280,6 @@ static inline const char *phy_modes(phy_interface_t interface) } } - #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 @@ -574,6 +573,7 @@ struct macsec_ops; * @lp_advertising: Current link partner advertised linkmodes * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used + * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover @@ -641,6 +641,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + int rate_matching; + enum phy_state state; u32 dev_flags; @@ -805,6 +807,21 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @get_rate_matching: Get the supported type of rate matching for a + * particular phy interface. This is used by phy consumers to determine + * whether to advertise lower-speed modes for that interface. It is + * assumed that if a rate matching mode is supported on an interface, + * then that interface's rate can be adapted to all slower link speeds + * supported by the phy. If iface is %PHY_INTERFACE_MODE_NA, and the phy + * supports any kind of rate matching for any interface, then it must + * return that rate matching mode (preferring %RATE_MATCH_PAUSE to + * %RATE_MATCH_CRS). If the interface is not supported, this should + * return %RATE_MATCH_NONE.
+ */ + int (*get_rate_matching)(struct phy_device *phydev, + phy_interface_t iface); + /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); @@ -971,6 +988,7 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); @@ -1687,6 +1705,8 @@ int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); +int phy_get_rate_matching(struct phy_device *phydev, + phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); -- cgit v1.2.3 From ae0e4bb2a0e0e434e9f98fb4994093ec2ee71997 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:32 -0400 Subject: net: phylink: Adjust link settings based on rate matching If the phy is configured to use pause-based rate matching, ensure that the link is full duplex with pause frame reception enabled. As suggested, if pause-based rate matching is enabled by the phy, then pause reception is unconditionally enabled. The interface duplex is determined based on the rate matching type. When rate matching is enabled, so is the speed: we assume the maximum interface speed is used. This is only relevant for MLO_AN_PHY. For MLO_AN_INBAND, the MAC/PCS's view of the interface speed will be used. Although there are no RATE_MATCH_CRS phys in-tree, it has been added for comparison (and the implementation is quite simple). Co-developed-by: Russell King Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index cc039ae7e80c..5c99c21e42b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -88,6 +88,10 @@ static inline bool phylink_autoneg_inband(unsigned int mode) * @speed: link speed, one of the SPEED_* constants. * @duplex: link duplex mode, one of DUPLEX_* constants. * @pause: link pause state, described by MLO_PAUSE_* constants. + * @rate_matching: rate matching being performed, one of the RATE_MATCH_* + * constants. If rate matching is taking place, then the speed/duplex of + * the medium link mode (@speed and @duplex) and the speed/duplex of the phy + * interface mode (@interface) are different. * @link: true if the link is up. * @an_enabled: true if autonegotiation is enabled/desired. * @an_complete: true if autonegotiation has completed. @@ -99,6 +103,7 @@ struct phylink_link_state { int speed; int duplex; int pause; + int rate_matching; unsigned int link:1; unsigned int an_enabled:1; unsigned int an_complete:1; -- cgit v1.2.3 From b7e9294885b610791fcebc799bf2a9e219446fd6 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:33 -0400 Subject: net: phylink: Adjust advertisement based on rate matching This adds support for adjusting the advertisement for pause-based rate matching. This may result in a lossy link, since the final link settings are not adjusted. Asymmetric pause support is necessary.
It would be possible for a MAC supporting only symmetric pause to use pause-based rate adaptation, but only if pause reception was enabled as well. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5c99c21e42b5..664dd409feb9 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -554,7 +554,8 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, - unsigned long mac_capabilities); + unsigned long mac_capabilities, + int rate_matching); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 4d044c521a63b2cd394ea6e3547012032145e47e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:23 +0000 Subject: x86/resctrl: Abstract __rmid_read() __rmid_read() selects the specified eventid and returns the counter value from the MSR. The error handling is architecture specific, and handled by the callers, rdtgroup_mondata_show() and __mon_event_count(). Error handling should be handled by architecture specific code, as a different architecture may have different requirements. MPAM's counters can report that they are 'not ready', requiring a second read after a short delay. This should be hidden from resctrl. Make __rmid_read() the architecture specific function for reading a counter. Rename it resctrl_arch_rmid_read() and move the error handling into it. A read from a counter that hardware supports but resctrl does not now returns -EINVAL instead of -EIO from the default case in __mon_event_count(). It isn't possible for user-space to see this change as resctrl doesn't expose counters it doesn't support. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-16-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 818456770176..efe60dd7fd21 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -219,6 +219,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *res); /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid -- cgit v1.2.3 From 8286618aca331bf17323ff3023ca831ac6e4b86f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:24 +0000 Subject: x86/resctrl: Pass the required parameters into resctrl_arch_rmid_read() resctrl_arch_rmid_read() is intended as the function that an architecture agnostic resctrl filesystem driver can use to read a value in bytes from a hardware register. Currently the function returns the MBM values in chunks directly from hardware. To convert this to bytes, some correction and overflow calculations are needed. 
These depend on the resource and domain structures. Overflow detection requires the old chunks value. None of this is available to resctrl_arch_rmid_read(). MPAM requires the resource and domain structures to find the MMIO device that holds the registers. Pass the resource and domain to resctrl_arch_rmid_read(). This makes rmid_dirty() too big. Instead merge it with its only caller, and the name is kept as a local variable. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-17-james.morse@arm.com --- include/linux/resctrl.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index efe60dd7fd21..7ccfa0d1bb34 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -219,7 +219,23 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); -int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *res); + +/** + * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid + * for this resource and domain. + * @r: resource that the counter should be read from. + * @d: domain that the counter should be read from. + * @rmid: rmid of the counter to read. + * @eventid: eventid to read, e.g. L3 occupancy. + * @val: result of the counter read in chunks. + * + * Call from process context on a CPU that belongs to domain @d. + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val); /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid -- cgit v1.2.3 From ae2328b52962531c2d7c6b531022a3eb2d680f17 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:27 +0000 Subject: x86/resctrl: Rename and change the units of resctrl_cqm_threshold resctrl_cqm_threshold is stored in a hardware specific chunk size, but exposed to user-space as bytes. This means the filesystem parts of resctrl need to know how the hardware counts, to convert the user provided byte value to chunks. The interface between the architecture's resctrl code and the filesystem ought to treat everything as bytes. Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read() still returns its value in chunks, so this needs converting to bytes. As all the users have been touched, rename the variable to resctrl_rmid_realloc_threshold, which describes what the value is for. Neither r->num_rmid nor hw_res->mon_scale are guaranteed to be a power of 2, so the existing code introduces a rounding error from resctrl's theoretical fraction of the cache usage. This behaviour is kept as it ensures the user visible value matches the value read from hardware when the rmid will be reallocated. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-20-james.morse@arm.com --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 7ccfa0d1bb34..9995d043650a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -250,4 +250,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, u32 rmid, enum resctrl_event_id eventid); +extern unsigned int resctrl_rmid_realloc_threshold; + #endif /* _RESCTRL_H */ -- cgit v1.2.3 From d80975e264c8f01518890f3d91ab5bada8fa7f5e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:28 +0000 Subject: x86/resctrl: Add resctrl_rmid_realloc_limit to abstract x86's boot_cpu_data resctrl_rmid_realloc_threshold can be set by user-space. The maximum value is specified by the architecture. Currently max_threshold_occ_write() reads the maximum value from boot_cpu_data.x86_cache_size, which is not portable to another architecture. Add resctrl_rmid_realloc_limit to describe the maximum size in bytes that user-space can set the threshold to. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-21-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9995d043650a..cb857f753322 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -251,5 +251,6 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, u32 rmid, enum resctrl_event_id eventid); extern unsigned int resctrl_rmid_realloc_threshold; +extern unsigned int resctrl_rmid_realloc_limit; #endif /* _RESCTRL_H */ -- cgit v1.2.3 From f7b1843eca6fe295ba0c71fc02a3291954078f2b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:29 +0000 Subject: x86/resctrl: Make resctrl_arch_rmid_read() return values in bytes resctrl_arch_rmid_read() returns a value in chunks, as read from the hardware. This needs scaling to bytes by mon_scale, as provided by the architecture code. Now that resctrl_arch_rmid_read() performs the overflow and corrections itself, it may as well return a value in bytes directly. This allows the accesses to the architecture specific 'hw' structure to be removed. Move the mon_scale conversion into resctrl_arch_rmid_read(). mbm_bw_count() is updated to calculate bandwidth from bytes. 
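In outline, the conversion that now lives behind resctrl_arch_rmid_read() is a multiply by the architecture's chunk size; a simplified sketch (the real x86 code also applies MBM overflow correction first, and mon_scale is assumed here to be the x86 rdt_hw_resource field holding bytes per chunk):

static u64 chunks_to_bytes_sketch(u64 chunks, unsigned int mon_scale)
{
	/* hardware counts in chunks; callers now see bytes */
	return chunks * mon_scale;
}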
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-22-james.morse@arm.com --- include/linux/resctrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index cb857f753322..0cf5b20c6ddf 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -227,7 +227,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @d: domain that the counter should be read from. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. - * @val: result of the counter read in chunks. + * @val: result of the counter read in bytes. * * Call from process context on a CPU that belongs to domain @d. * -- cgit v1.2.3 From 1e1f26635e5459db4134952369b76b8d59c50438 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 19 Sep 2022 19:58:03 -0700 Subject: ASoC: ssm2518: drop support for platform data There are currently no users of this driver's platform data in the mainline kernel, so let's drop it. Newer devices should use DT, ACPI, or static software properties to describe the hardware. Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20220920025804.1788667-1-dmitry.torokhov@gmail.com Signed-off-by: Mark Brown --- include/linux/platform_data/ssm2518.h | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 include/linux/platform_data/ssm2518.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ssm2518.h b/include/linux/platform_data/ssm2518.h deleted file mode 100644 index 3f9e632d6f63..000000000000 --- a/include/linux/platform_data/ssm2518.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSM2518 amplifier audio driver - * - * Copyright 2013 Analog Devices Inc. - * Author: Lars-Peter Clausen - */ - -#ifndef __LINUX_PLATFORM_DATA_SSM2518_H__ -#define __LINUX_PLATFORM_DATA_SSM2518_H__ - -/** - * struct ssm2518_platform_data - Platform data for the ssm2518 driver - * @enable_gpio: GPIO connected to the nSD pin. Set to -1 if the nSD pin is - * hardwired. - */ -struct ssm2518_platform_data { - int enable_gpio; -}; - -#endif -- cgit v1.2.3 From 22873deac9e7b273bbf17eee515c8170510d861a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 06:59:59 +0200 Subject: vfs: add vfs_tmpfile_open() helper This helper unifies tmpfile creation with opening. Existing vfs_tmpfile() callers outside of fs/namei.c will be converted to using this helper. There are two such callers: cachefile and overlayfs. The cachefiles code currently uses the open_with_fake_path() helper to open the tmpfile, presumably to disable accounting of the open file. Overlayfs uses tmpfile for copy_up, which means these struct file instances will be short lived, hence it doesn't really matter if they are accounted or not. Disable accounting in this helper too, which should be okay for both callers. Add MAY_OPEN permission checking for consistency. Like for create(2) read/write permissions are not checked. 
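A hedged sketch of a caller, in the style of the copy-up use mentioned above (the wrapper is illustrative; the returned file is unaccounted and already open):

static struct file *tmpfile_for_copy_up_sketch(struct user_namespace *userns,
					       const struct path *parent)
{
	struct file *file;

	file = vfs_tmpfile_open(userns, parent, S_IFREG | 0600,
				O_WRONLY, current_cred());
	if (IS_ERR(file))
		return file;

	/* ... write the data, then link in or discard; fput() when done */
	return file;
}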
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..15fafda95dd3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2007,6 +2007,10 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns, struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode, int open_flag); +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred); + int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); -- cgit v1.2.3 From 3e9d4c593558ea86f49e10e62373a54c7f5a63e4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: vfs: make vfs_tmpfile() static No callers outside of fs/namei.c anymore. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 15fafda95dd3..02646542f6bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2004,9 +2004,6 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns, WHITEOUT_DEV); } -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag); - struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); -- cgit v1.2.3 From 9751b338656f05a0ce918befd5118fcd970c71c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: vfs: move open right after ->tmpfile() Create a helper finish_open_simple() that opens the file with the original dentry. Handle the error case here as well to simplify callers. Call this helper right after ->tmpfile() is called. Next patch will change the tmpfile API and move this call into tmpfile instances. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 02646542f6bb..a3c50869e79b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2780,6 +2780,15 @@ extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); +/* Helper for the simple case when original dentry is used */ +static inline int finish_open_simple(struct file *file, int error) +{ + if (error) + return error; + + return finish_open(file, file->f_path.dentry, NULL); +} + /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); -- cgit v1.2.3 From 863f144f12add1f4eab80b70561a90857c524a8b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: vfs: open inside ->tmpfile() This is in preparation for adding tmpfile support to fuse, which requires that the tmpfile creation and opening are done as a single operation. Replace the 'struct dentry *' argument of i_op->tmpfile with 'struct file *'. Call finish_open_simple() as the last thing in ->tmpfile() instances (may be omitted in the error case). Change d_tmpfile() argument to 'struct file *' as well to make callers more readable. 
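A sketch of a ->tmpfile() instance under the new prototype (foo_new_inode() is a placeholder for a filesystem's inode allocation helper; error handling is reduced to the essentials):

static int foo_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
		       struct file *file, umode_t mode)
{
	struct inode *inode = foo_new_inode(mnt_userns, dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	d_tmpfile(file, inode);
	return finish_open_simple(file, 0);
}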
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/dcache.h | 3 ++- include/linux/fs.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c78ed02b54..bde9f8ff8869 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -16,6 +16,7 @@ #include struct path; +struct file; struct vfsmount; /* @@ -250,7 +251,7 @@ extern struct dentry * d_make_root(struct inode *); /* - the ramfs-type tree */ extern void d_genocide(struct dentry *); -extern void d_tmpfile(struct dentry *, struct inode *); +extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); diff --git a/include/linux/fs.h b/include/linux/fs.h index a3c50869e79b..8218d9964ff8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2168,7 +2168,7 @@ struct inode_operations { struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct user_namespace *, struct inode *, - struct dentry *, umode_t); + struct file *, umode_t); int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int); int (*fileattr_set)(struct user_namespace *mnt_userns, -- cgit v1.2.3 From 4a575865c1ea67018d96acda9b43e5d3d25b2366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 16 Sep 2022 13:20:49 +0100 Subject: mtd: allow getting MTD device associated with a specific DT node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MTD subsystem API allows interacting with MTD devices (e.g. reading, writing, handling bad blocks). So far a random driver could get an MTD device only by its name (get_mtd_device_nm()). This change allows getting one by its DT node as well. This API is required for drivers handling DT defined MTD partitions in a specific way (e.g. U-Boot (sub)partition with environment variables). Acked-by: Miquel Raynal Signed-off-by: Rafał Miłecki Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220916122100.170016-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mtd/mtd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 955aee14b0f7..6fc841ceef31 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -677,6 +677,7 @@ extern int mtd_device_unregister(struct mtd_info *master); extern struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num); extern int __get_mtd_device(struct mtd_info *mtd); extern void __put_mtd_device(struct mtd_info *mtd); +extern struct mtd_info *of_get_mtd_device_by_node(struct device_node *np); extern struct mtd_info *get_mtd_device_nm(const char *name); extern void put_mtd_device(struct mtd_info *mtd); -- cgit v1.2.3 From aade55c86033bee868a93e4bf3843c9c99e84526 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 22 Sep 2022 16:54:10 +0300 Subject: device property: Add const qualifier to device_get_match_data() parameter Add a const qualifier to the device_get_match_data() parameter. Some of the future users may utilize this function without forcing the type. All the same, dev_fwnode() may be used with a const qualifier.
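A minimal sketch of the const-friendly pattern this allows (struct foo_info, the property name, and foo_setup() are illustrative):

struct foo_info {
	int variant;
};

static int foo_setup(const struct device *dev)
{
	const struct foo_info *info = device_get_match_data(dev);

	if (!info)
		return -ENODEV;

	/* dev_fwnode() also accepts the const pointer now */
	return fwnode_property_present(dev_fwnode(dev), "enable-foo") ?
	       0 : -EINVAL;
}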
Reported-by: kernel test robot Acked-by: Heikki Krogerus Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220922135410.49694-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index a5b429d623f6..117cc200c656 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -32,7 +32,7 @@ enum dev_dma_attr { DEV_DMA_COHERENT, }; -struct fwnode_handle *dev_fwnode(struct device *dev); +struct fwnode_handle *dev_fwnode(const struct device *dev); bool device_property_present(struct device *dev, const char *propname); int device_property_read_u8_array(struct device *dev, const char *propname, @@ -387,7 +387,7 @@ bool device_dma_supported(struct device *dev); enum dev_dma_attr device_get_dma_attr(struct device *dev); -const void *device_get_match_data(struct device *dev); +const void *device_get_match_data(const struct device *dev); int device_get_phy_mode(struct device *dev); int fwnode_get_phy_mode(struct fwnode_handle *fwnode); -- cgit v1.2.3 From bf2ee8d0c385f883a00473768b67faf2189b2410 Mon Sep 17 00:00:00 2001 From: Jianmin Lv Date: Sun, 11 Sep 2022 17:06:34 +0800 Subject: ACPI: scan: Support multiple DMA windows with different offsets In DT system configurations, of_dma_get_range() returns struct bus_dma_region DMA regions; they are used to set up device DMA windows with different offsets available for translation between DMA address and CPU address. In ACPI system configurations, acpi_dma_get_range() does not return DMA regions yet, and that precludes setting up the dev->dma_range_map pointer and therefore DMA regions with multiple offsets. Update acpi_dma_get_range() to return struct bus_dma_region DMA regions like of_dma_get_range() does. After updating acpi_dma_get_range(), acpi_arch_dma_setup() is changed for ARM64: the original dma_addr and size arguments are removed as now redundant, and 0 and U64_MAX are passed for the dma_base and size of arch_setup_dma_ops(); this is a simplification consistent with what other ACPI architectures also pass to iommu_setup_dma_ops(). Reviewed-by: Robin Murphy Signed-off-by: Jianmin Lv Reviewed-by: Lorenzo Pieralisi Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..bb41623dab77 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -281,12 +281,12 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); -void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size); +void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } static inline void -acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { } +acpi_arch_dma_setup(struct device *dev) { } #endif int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); @@ -977,8 +977,7 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } -static inline int acpi_dma_get_range(struct device *dev, u64 *dma_addr, - u64 *offset, u64 *size) +static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) { return -ENODEV; } -- cgit v1.2.3 From c78c43fe7d42524c8f364aaf95ef3652e7f1186b Mon Sep 17 00:00:00 2001 From: Jianmin Lv Date: Sun, 11 Sep 2022 17:06:35 +0800 Subject: LoongArch: Use acpi_arch_dma_setup() and remove ARCH_HAS_PHYS_TO_DMA Use _DMA as defined in the ACPI spec for translation between DMA addresses and CPU addresses, and implement acpi_arch_dma_setup() to initialize dev->dma_range_map, where acpi_dma_get_range() is called to parse _DMA. E.g. if we have two DMA ranges:

  cpu address     dma address   size         offset
  0x200080000000  0x2080000000  0x400000000  0x1fe000000000
  0x400080000000  0x4080000000  0x400000000  0x3fc000000000

then _DMA for PCI devices should be declared in the host bridge as follows:

  Name (_DMA, ResourceTemplate() {
      QWordMemory (ResourceProducer, PosDecode, MinFixed, MaxFixed,
          NonCacheable, ReadWrite, 0x0, 0x4080000000, 0x447fffffff,
          0x3fc000000000, 0x400000000, , , )
      QWordMemory (ResourceProducer, PosDecode, MinFixed, MaxFixed,
          NonCacheable, ReadWrite, 0x0, 0x2080000000, 0x247fffffff,
          0x1fe000000000, 0x400000000, , , )
  })

Acked-by: Huacai Chen Signed-off-by: Jianmin Lv Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bb41623dab77..a71d73a0d43e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -279,14 +279,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); +#if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) +void acpi_arch_dma_setup(struct device *dev); +#else +static inline void acpi_arch_dma_setup(struct device *dev) { } +#endif + #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); -void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } -static inline void -acpi_arch_dma_setup(struct device *dev) { } #endif int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); -- cgit v1.2.3 From 43cf36974d760a3d1c705a83de89ac58059e5f0b Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Thu, 22 Sep 2022 00:04:37 +0100 Subject: platform/x86: int3472: Support multiple clock consumers At present, the tps68470.c only supports a single clock consumer when passing platform data to the clock driver. In some devices multiple sensors depend on the clock provided by a single TPS68470 and so all need to be able to acquire the clock. Support passing multiple consumers as platform data. Reviewed-by: Hans de Goede Signed-off-by: Daniel Scally Reviewed-by: Stephen Boyd Acked-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki --- include/linux/platform_data/tps68470.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/tps68470.h b/include/linux/platform_data/tps68470.h index 126d082c3f2e..e605a2cab07f 100644 --- a/include/linux/platform_data/tps68470.h +++ b/include/linux/platform_data/tps68470.h @@ -27,9 +27,14 @@ struct tps68470_regulator_platform_data { const struct regulator_init_data *reg_init_data[TPS68470_NUM_REGULATORS]; }; -struct tps68470_clk_platform_data { +struct tps68470_clk_consumer { const char *consumer_dev_name; const char *consumer_con_id; }; +struct tps68470_clk_platform_data { + unsigned int n_consumers; + struct tps68470_clk_consumer consumers[]; +}; + #endif -- cgit v1.2.3 From 7c7f9bc986e698873b489c371a08f206979d06b7 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 22 Sep 2022 18:27:33 +0200 Subject: serial: Deassert Transmit Enable on probe in driver-specific way MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a UART port is newly registered, uart_configure_port() seeks to deassert RS485 Transmit Enable by setting the RTS bit in port->mctrl. However a number of UART drivers interpret a set RTS bit as *assertion* instead of deassertion: Affected drivers include those using serial8250_em485_config() (except 8250_bcm2835aux.c) and some using mctrl_gpio (e.g. imx.c). Since the interpretation of the RTS bit is driver-specific, it is not suitable as a means to centrally deassert Transmit Enable in the serial core. Instead, the serial core must call on drivers to deassert it in their driver-specific way. One way to achieve that is to call ->rs485_config(). It implicitly deasserts Transmit Enable. So amend uart_configure_port() and uart_resume_port() to invoke uart_rs485_config(). 
That allows removing calls to uart_rs485_config() from drivers' ->probe() hooks and declaring the function static. Skip any invocation of ->set_mctrl() if RS485 is enabled. RS485 has no hardware flow control, so the modem control lines are irrelevant and need not be touched. When leaving RS485 mode, reset the modem control lines to the state stored in port->mctrl. That way, UARTs which are muxed between RS485 and RS232 transceivers drive the lines correctly when switched to RS232. (serial8250_do_startup() historically raises the OUT1 modem signal because otherwise interrupts are not signaled on ancient PC UARTs, but I believe that no longer applies to modern, RS485-capable UARTs and is thus safe to be skipped.) imx.c modifies port->mctrl whenever Transmit Enable is asserted and deasserted. Stop it from doing that so port->mctrl reflects the RS232 line state. 8250_omap.c deasserts Transmit Enable on ->runtime_resume() by calling ->set_mctrl(). Because that is now a no-op in RS485 mode, amend the function to call serial8250_em485_stop_tx(). fsl_lpuart.c retrieves and applies the RS485 device tree properties after registering the UART port. Because applying now happens on registration in uart_configure_port(), move retrieval of the properties ahead of uart_add_one_port(). Link: https://lore.kernel.org/all/20220329085050.311408-1-matthias.schiffer@ew.tq-group.com/ Link: https://lore.kernel.org/all/8f538a8903795f22f9acc94a9a31b03c9c4ccacb.camel@ginzinger.com/ Fixes: d3b3404df318 ("serial: Fix incorrect rs485 polarity on uart open") Cc: stable@vger.kernel.org # v4.14+ Reported-by: Matthias Schiffer Reported-by: Roosen Henri Tested-by: Matthias Schiffer Reviewed-by: Ilpo Järvinen Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/2de36eba3fbe11278d5002e4e501afe0ceaca039.1663863805.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index b4c21f84ad79..d657f2a42a7b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -951,5 +951,4 @@ static inline int uart_handle_break(struct uart_port *port) !((cflag) & CLOCAL)) int uart_get_rs485_mode(struct uart_port *port); -int uart_rs485_config(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From 1a77dd1c2bb5d4a58c16d198cf593720787c02e4 Mon Sep 17 00:00:00 2001 From: Arun Easi Date: Wed, 7 Sep 2022 16:33:08 -0700 Subject: scsi: tracing: Fix compile error in trace_array calls when TRACING is disabled Fix this compilation error seen when CONFIG_TRACING is not enabled: drivers/scsi/qla2xxx/qla_os.c: In function 'qla_trace_init': drivers/scsi/qla2xxx/qla_os.c:2854:25: error: implicit declaration of function 'trace_array_get_by_name'; did you mean 'trace_array_set_clr_event'? [-Werror=implicit-function-declaration] 2854 | qla_trc_array = trace_array_get_by_name("qla2xxx"); | ^~~~~~~~~~~~~~~~~~~~~~~ | trace_array_set_clr_event drivers/scsi/qla2xxx/qla_os.c: In function 'qla_trace_uninit': drivers/scsi/qla2xxx/qla_os.c:2869:9: error: implicit declaration of function 'trace_array_put' [-Werror=implicit-function-declaration] 2869 | trace_array_put(qla_trc_array); | ^~~~~~~~~~~~~~~ Link: https://lore.kernel.org/r/20220907233308.4153-2-aeasi@marvell.com Reported-by: kernel test robot Reviewed-by: Steven Rostedt (Google) Signed-off-by: Arun Easi Signed-off-by: Martin K. 
Petersen --- include/linux/trace.h | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace.h b/include/linux/trace.h index bf169612ffe1..b5e16e438448 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -2,8 +2,6 @@ #ifndef _LINUX_TRACE_H #define _LINUX_TRACE_H -#ifdef CONFIG_TRACING - #define TRACE_EXPORT_FUNCTION BIT(0) #define TRACE_EXPORT_EVENT BIT(1) #define TRACE_EXPORT_MARKER BIT(2) @@ -28,6 +26,8 @@ struct trace_export { int flags; }; +#ifdef CONFIG_TRACING + int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); @@ -48,6 +48,38 @@ void osnoise_arch_unregister(void); void osnoise_trace_irq_entry(int id); void osnoise_trace_irq_exit(int id, const char *desc); +#else /* CONFIG_TRACING */ +static inline int register_ftrace_export(struct trace_export *export) +{ + return -EINVAL; +} +static inline int unregister_ftrace_export(struct trace_export *export) +{ + return 0; +} +static inline void trace_printk_init_buffers(void) +{ +} +static inline int trace_array_printk(struct trace_array *tr, unsigned long ip, + const char *fmt, ...) +{ + return 0; +} +static inline int trace_array_init_printk(struct trace_array *tr) +{ + return -EINVAL; +} +static inline void trace_array_put(struct trace_array *tr) +{ +} +static inline struct trace_array *trace_array_get_by_name(const char *name) +{ + return NULL; +} +static inline int trace_array_destroy(struct trace_array *tr) +{ + return 0; +} #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ -- cgit v1.2.3 From 38622010a6de3a62cc72688348548854ed82dcf5 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 15 Aug 2022 13:54:28 -0700 Subject: btrfs: send: add support for fs-verity Preserve the fs-verity status of a btrfs file across send/recv. There is no facility for installing the Merkle tree contents directly on the receiving filesystem, so we package up the parameters used to enable verity found in the verity descriptor. This gives the receive side enough information to properly enable verity again. Note that this means that receive will have to re-compute the whole Merkle tree, similar to how compression worked before encoded_write. Since the file becomes read-only after verity is enabled, it is important that verity is added to the send stream after any file writes. Therefore, when we process a verity item, merely note that it happened, then actually create the command in the send stream during 'finish_inode_if_needed'. This also creates V3 of the send stream format, without any format changes besides adding the new commands and attributes. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- include/linux/fsverity.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 7af030fa3c36..40f14e5fed9d 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -22,6 +22,9 @@ */ #define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE +/* Arbitrary limit to bound the kmalloc() size. Can be changed. 
*/ +#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 + /* Verity operations for filesystems */ struct fsverity_operations { -- cgit v1.2.3 From 691cdf016d3be6f66a3ea384809be229e0f9c590 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 11:34:45 +0200 Subject: powerpc: Rely on generic definition of hugepd_t and is_hugepd when unused CONFIG_ARCH_HAS_HUGEPD is used to tell core mm when huge page directories are used. When they are not used, there is no need to provide hugepd_t or is_hugepd(); just rely on the core mm fallback definition. For that, change core mm behaviour so that CONFIG_ARCH_HAS_HUGEPD is used instead of testing indirectly for the existence of the is_hugepd macro. powerpc being the only user of huge page directories, there is no impact on other architectures. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/da81462d93069bb90fe5e762dd3283a644318937.1662543243.git.christophe.leroy@csgroup.eu --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3..1ec1535be04f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -17,7 +17,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; -#ifndef is_hugepd +#ifndef CONFIG_ARCH_HAS_HUGEPD typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -- cgit v1.2.3 From 4f58330fcc8482aa90674e1f40f601e82f18ed4a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 13 Sep 2022 12:47:20 +0100 Subject: iommu/iova: Fix module config properly IOMMU_IOVA is intended to be an optional library for users to select as and when they desire. Since it can be a module now, this means that built-in code which has chosen not to select it should not fail to link if it happens to have been selected as a module by someone else. Replace IS_ENABLED() with IS_REACHABLE() to do the right thing. CC: Thierry Reding Reported-by: John Garry Fixes: 15bbdec3931e ("iommu: Make the iova library a module") Signed-off-by: Robin Murphy Reviewed-by: Thierry Reding Link: https://lore.kernel.org/r/548c2f683ca379aface59639a8f0cccc3a1ac050.1663069227.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iova.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index c6ba6d95d79c..83c00fac2acb 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -75,7 +75,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } -#if IS_ENABLED(CONFIG_IOMMU_IOVA) +#if IS_REACHABLE(CONFIG_IOMMU_IOVA) int iova_cache_get(void); void iova_cache_put(void); -- cgit v1.2.3 From dc09fe1c5edd9c27a52cb6dc5a7bb4452d45c71c Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 16 Sep 2022 11:41:51 +0200 Subject: iommu/io-pgtable-dart: Add DART PTE support for t6000 The DARTs present in the M1 Pro/Max/Ultra SoCs use a different PTE format. They support a 42-bit physical address space by shifting the paddr and extending its mask inside the PTE. They also come with mandatory sub-page protection now, which we just configure to always allow access to the entire page. This feature is already present but optional on the previous DARTs, which allows it to be configured unconditionally.
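For context, a hedged sketch of how a DART driver might pick the new format (the is_t6000 flag and priv cookie are invented; APPLE_DART2 and alloc_io_pgtable_ops() come from the real io-pgtable API):

  #include <linux/io-pgtable.h>

  static struct io_pgtable_ops *dart_alloc_ops(struct io_pgtable_cfg *cfg,
                                               void *priv, bool is_t6000)
  {
          /* t6000-class DARTs need the shifted-paddr PTE layout. */
          enum io_pgtable_fmt fmt = is_t6000 ? APPLE_DART2 : APPLE_DART;

          return alloc_io_pgtable_ops(fmt, cfg, priv);
  }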
Signed-off-by: Sven Peter Co-developed-by: Janne Grunau Signed-off-by: Janne Grunau Reviewed-by: Rob Herring Acked-by: Hector Martin Link: https://lore.kernel.org/r/20220916094152.87137-5-j@jannau.net Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ca98aeadcc80..b768937382cd 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -17,6 +17,7 @@ enum io_pgtable_fmt { ARM_MALI_LPAE, AMD_IOMMU_V1, APPLE_DART, + APPLE_DART2, IO_PGTABLE_NUM_FMTS, }; -- cgit v1.2.3 From c59fb127583869350256656b7ed848c398bef879 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 21 Sep 2022 00:32:01 +0000 Subject: KVM: remove KVM_REQ_UNHALT KVM_REQ_UNHALT is now unnecessary because it is replaced by the return value of kvm_vcpu_block/kvm_vcpu_halt. Remove it. No functional change intended. Signed-off-by: Paolo Bonzini Signed-off-by: Sean Christopherson Acked-by: Marc Zyngier Message-Id: <20220921003201.1441511-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 04c7e5f2f727..32f259fa5801 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -151,12 +151,11 @@ static inline bool is_error_page(struct page *page) #define KVM_REQUEST_NO_ACTION BIT(10) /* * Architecture-independent vcpu->requests bit members - * Bits 4-7 are reserved for more arch-independent bits. + * Bits 3-7 are reserved for more arch-independent bits. */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 -#define KVM_REQ_UNHALT 3 #define KVM_REQUEST_ARCH_BASE 8 /* -- cgit v1.2.3 From 9fca7115827b2e5f48d84e50bceb4edfd4cb6375 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:45 -0700 Subject: cfi: Remove CONFIG_CFI_CLANG_SHADOW In preparation to switching to -fsanitize=kcfi, remove support for the CFI module shadow that will no longer be needed. 
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-4-samitolvanen@google.com --- include/linux/cfi.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index c6dfc1ed0626..4ab51c067007 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -20,18 +20,6 @@ extern void __cfi_check(uint64_t id, void *ptr, void *diag); #define __CFI_ADDRESSABLE(fn, __attr) \ const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn -#ifdef CONFIG_CFI_CLANG_SHADOW - -extern void cfi_module_add(struct module *mod, unsigned long base_addr); -extern void cfi_module_remove(struct module *mod, unsigned long base_addr); - -#else - -static inline void cfi_module_add(struct module *mod, unsigned long base_addr) {} -static inline void cfi_module_remove(struct module *mod, unsigned long base_addr) {} - -#endif /* CONFIG_CFI_CLANG_SHADOW */ - #else /* !CONFIG_CFI_CLANG */ #ifdef CONFIG_X86_KERNEL_IBT -- cgit v1.2.3 From 92efda8eb15295a07f450828b2db14485bfc09c2 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:46 -0700 Subject: cfi: Drop __CFI_ADDRESSABLE The __CFI_ADDRESSABLE macro is used for init_module and cleanup_module to ensure we have the address of the CFI jump table, and with CONFIG_X86_KERNEL_IBT to ensure LTO won't optimize away the symbols. As __CFI_ADDRESSABLE is no longer necessary with -fsanitize=kcfi, add a more flexible version of the __ADDRESSABLE macro and always ensure these symbols won't be dropped. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-5-samitolvanen@google.com --- include/linux/cfi.h | 20 -------------------- include/linux/compiler.h | 6 ++++-- include/linux/module.h | 4 ++-- 3 files changed, 6 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 4ab51c067007..2cdbc0fbd0ab 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -13,26 +13,6 @@ typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag); /* Compiler-generated function in each module, and the kernel */ extern void __cfi_check(uint64_t id, void *ptr, void *diag); -/* - * Force the compiler to generate a CFI jump table entry for a function - * and store the jump table address to __cfi_jt_. - */ -#define __CFI_ADDRESSABLE(fn, __attr) \ - const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn - -#else /* !CONFIG_CFI_CLANG */ - -#ifdef CONFIG_X86_KERNEL_IBT - -#define __CFI_ADDRESSABLE(fn, __attr) \ - const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn - -#endif /* CONFIG_X86_KERNEL_IBT */ - #endif /* CONFIG_CFI_CLANG */ -#ifndef __CFI_ADDRESSABLE -#define __CFI_ADDRESSABLE(fn, __attr) -#endif - #endif /* _LINUX_CFI_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7713d7bcdaea..7bfafc69172a 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -221,9 +221,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. 
*/ -#define __ADDRESSABLE(sym) \ - static void * __section(".discard.addressable") __used \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym; +#define __ADDRESSABLE(sym) \ + ___ADDRESSABLE(sym, __section(".discard.addressable")) /** * offset_to_ptr - convert a relative memory offset to an absolute pointer diff --git a/include/linux/module.h b/include/linux/module.h index 518296ea7f73..8937b020ec04 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -132,7 +132,7 @@ extern void cleanup_module(void); { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ - __CFI_ADDRESSABLE(init_module, __initdata); + ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ @@ -140,7 +140,7 @@ extern void cleanup_module(void); { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ - __CFI_ADDRESSABLE(cleanup_module, __exitdata); + ___ADDRESSABLE(cleanup_module, __exitdata); #endif -- cgit v1.2.3 From 89245600941e4e0f87d77f60ee269b5e61ef4e49 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:47 -0700 Subject: cfi: Switch to -fsanitize=kcfi Switch from Clang's original forward-edge control-flow integrity implementation to -fsanitize=kcfi, which is better suited for the kernel, as it doesn't require LTO, doesn't use a jump table that requires altering function references, and won't break cross-module function address equality. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-6-samitolvanen@google.com --- include/linux/cfi.h | 29 +++++++++++++++++++++++++---- include/linux/compiler-clang.h | 14 +++----------- include/linux/module.h | 6 +++--- 3 files changed, 31 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 2cdbc0fbd0ab..5e134f4ce8b7 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -2,17 +2,38 @@ /* * Clang Control Flow Integrity (CFI) support. 
* - * Copyright (C) 2021 Google LLC + * Copyright (C) 2022 Google LLC */ #ifndef _LINUX_CFI_H #define _LINUX_CFI_H +#include +#include + #ifdef CONFIG_CFI_CLANG -typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag); +enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, + unsigned long *target, u32 type); -/* Compiler-generated function in each module, and the kernel */ -extern void __cfi_check(uint64_t id, void *ptr, void *diag); +static inline enum bug_trap_type report_cfi_failure_noaddr(struct pt_regs *regs, + unsigned long addr) +{ + return report_cfi_failure(regs, addr, NULL, 0); +} +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +bool is_cfi_trap(unsigned long addr); +#endif #endif /* CONFIG_CFI_CLANG */ +#ifdef CONFIG_MODULES +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod); +#else +static inline void module_cfi_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) {} +#endif /* CONFIG_ARCH_USES_CFI_TRAPS */ +#endif /* CONFIG_MODULES */ + #endif /* _LINUX_CFI_H */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445..42e55579d649 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -66,17 +66,9 @@ # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#define __nocfi __attribute__((__no_sanitize__("cfi"))) -#define __cficanonical __attribute__((__cfi_canonical_jump_table__)) - -#if defined(CONFIG_CFI_CLANG) -/* - * With CONFIG_CFI_CLANG, the compiler replaces function address - * references with the address of the function's CFI jump table - * entry. The function_nocfi macro always returns the address of the - * actual function instead. - */ -#define function_nocfi(x) __builtin_function_start(x) +#if __has_feature(kcfi) +/* Disable CFI checking inside a function. */ +#define __nocfi __attribute__((__no_sanitize__("kcfi"))) #endif /* diff --git a/include/linux/module.h b/include/linux/module.h index 8937b020ec04..ec61fb53979a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -387,8 +386,9 @@ struct module { const s32 *crcs; unsigned int num_syms; -#ifdef CONFIG_CFI_CLANG - cfi_check_fn cfi_check; +#ifdef CONFIG_ARCH_USES_CFI_TRAPS + s32 *kcfi_traps; + s32 *kcfi_traps_end; #endif /* Kernel parameters. */ -- cgit v1.2.3 From e84e008e7b02c015047e76261726da1550130a59 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:48 -0700 Subject: cfi: Add type helper macros With CONFIG_CFI_CLANG, assembly functions called indirectly from C code must be annotated with type identifiers to pass CFI checking. In order to make this easier, the compiler emits a __kcfi_typeid_ symbol for each address-taken function declaration in C, which contains the expected type identifier that we can refer to in assembly code. Add a typed version of SYM_FUNC_START, which emits the type identifier before the function. Architectures that support KCFI can define their own __CFI_TYPE macro to override the default preamble format. 
As an example, for the x86_64 blowfish_dec_blk function, the compiler emits the following type symbol: $ readelf -sW vmlinux | grep __kcfi_typeid_blowfish_dec_blk 120204: 00000000ef478db5 0 NOTYPE WEAK DEFAULT ABS __kcfi_typeid_blowfish_dec_blk And SYM_TYPED_FUNC_START will generate the following preamble based on the __CFI_TYPE definition for the architecture: $ objdump -dr arch/x86/crypto/blowfish-x86_64-asm_64.o ... 0000000000000400 <__cfi_blowfish_dec_blk>: ... 40b: b8 00 00 00 00 mov $0x0,%eax 40c: R_X86_64_32 __kcfi_typeid_blowfish_dec_blk 0000000000000410 : ... Note that the address of all assembly functions annotated with SYM_TYPED_FUNC_START must be taken in C code that's linked into the binary or the missing __kcfi_typeid_ symbol will result in a linker error with CONFIG_CFI_CLANG. If the code that contains the indirect call is not always compiled in, __ADDRESSABLE(functionname) can be used to ensure that the __kcfi_typeid_ symbol is emitted. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-7-samitolvanen@google.com --- include/linux/cfi_types.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 include/linux/cfi_types.h (limited to 'include/linux') diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h new file mode 100644 index 000000000000..6b8713675765 --- /dev/null +++ b/include/linux/cfi_types.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Clang Control Flow Integrity (CFI) type definitions. + */ +#ifndef _LINUX_CFI_TYPES_H +#define _LINUX_CFI_TYPES_H + +#ifdef __ASSEMBLY__ +#include + +#ifdef CONFIG_CFI_CLANG +/* + * Use the __kcfi_typeid_ type identifier symbol to + * annotate indirectly called assembly functions. The compiler emits + * these symbols for all address-taken function declarations in C + * code. + */ +#ifndef __CFI_TYPE +#define __CFI_TYPE(name) \ + .4byte __kcfi_typeid_##name +#endif + +#define SYM_TYPED_ENTRY(name, linkage, align...) \ + linkage(name) ASM_NL \ + align ASM_NL \ + __CFI_TYPE(name) ASM_NL \ + name: + +#define SYM_TYPED_START(name, linkage, align...) \ + SYM_TYPED_ENTRY(name, linkage, align) + +#else /* CONFIG_CFI_CLANG */ + +#define SYM_TYPED_START(name, linkage, align...) \ + SYM_START(name, linkage, align) + +#endif /* CONFIG_CFI_CLANG */ + +#ifndef SYM_TYPED_FUNC_START +#define SYM_TYPED_FUNC_START(name) \ + SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _LINUX_CFI_TYPES_H */ -- cgit v1.2.3 From 5dbbb3eaa2a784342f3206b77381b516181d089c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:54 -0700 Subject: init: Drop __nocfi from __init It's no longer necessary to disable CFI checking for all __init functions. Drop the __nocfi attribute from __init. 
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-13-samitolvanen@google.com --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010..88f2964097f5 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -47,7 +47,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(".init.text") __cold __latent_entropy __noinitretpoline __nocfi +#define __init __section(".init.text") __cold __latent_entropy __noinitretpoline #define __initdata __section(".init.data") #define __initconst __section(".init.rodata") #define __exitdata __section(".exit.data") -- cgit v1.2.3 From 607289a7cd7a3ca42b8a6877fcb6072e6eb20c34 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:55 -0700 Subject: treewide: Drop function_nocfi With -fsanitize=kcfi, we no longer need function_nocfi() as the compiler won't change function references to point to a jump table. Remove all implementations and uses of the macro. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-14-samitolvanen@google.com --- include/linux/compiler.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7bfafc69172a..973a1bfd7ef5 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -203,16 +203,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) -/* - * With CONFIG_CFI_CLANG, the compiler replaces function addresses in - * instrumented C code with jump table addresses. Architectures that - * support CFI can define this macro to return the actual function address - * when needed. - */ -#ifndef function_nocfi -#define function_nocfi(x) (x) -#endif - #endif /* __KERNEL__ */ /* -- cgit v1.2.3 From 5659b598b4dcb352b1a567c55fc5a658bc80076c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:57 -0700 Subject: treewide: Drop __cficanonical CONFIG_CFI_CLANG doesn't use a jump table anymore and therefore, won't change function references to point elsewhere. Remove the __cficanonical attribute and all uses of it. Note that the Clang definition of the attribute was removed earlier, just clean up the no-op definition and users. 
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-16-samitolvanen@google.com --- include/linux/compiler_types.h | 4 ---- include/linux/init.h | 4 ++-- include/linux/pci.h | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..6f2ec0976e2d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -265,10 +265,6 @@ struct ftrace_likely_data { # define __nocfi #endif -#ifndef __cficanonical -# define __cficanonical -#endif - /* * Any place that could be marked with the "alloc_size" attribute is also * a place to be marked with the "malloc" attribute. Do this as part of the diff --git a/include/linux/init.h b/include/linux/init.h index 88f2964097f5..a0a90cd73ebe 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -220,8 +220,8 @@ extern bool initcall_debug; __initcall_name(initstub, __iid, id) #define __define_initcall_stub(__stub, fn) \ - int __init __cficanonical __stub(void); \ - int __init __cficanonical __stub(void) \ + int __init __stub(void); \ + int __init __stub(void) \ { \ return fn(); \ } \ diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd..5da0846aa3c1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2019,8 +2019,8 @@ enum pci_fixup_pass { #ifdef CONFIG_LTO_CLANG #define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ class_shift, hook, stub) \ - void __cficanonical stub(struct pci_dev *dev); \ - void __cficanonical stub(struct pci_dev *dev) \ + void stub(struct pci_dev *dev); \ + void stub(struct pci_dev *dev) \ { \ hook(dev); \ } \ -- cgit v1.2.3 From fa35198f39571bbdae53c5b321020021eaad6bd2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 19 Sep 2022 16:33:33 -0700 Subject: fortify: Explicitly check bounds are compile-time constants In preparation for replacing __builtin_object_size() with __builtin_dynamic_object_size(), all the compile-time size checks need to check that the bounds comparisons are, in fact, known at compile-time. Enforce what was guaranteed with __bos(). In other words, since all uses of __bos() were constant expressions, it was not required to test for this. When these change to __bdos(), they _may_ be constant expressions, and the checks are only valid when the prior condition holds. This results in no binary differences. 
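A short behavioral sketch of the new predicate (dst and get_len() are invented): the compile-time diagnostic now fires only when the comparison itself is provable, and everything else quietly falls through to the run-time check.

  char dst[8];
  size_t dyn = get_len();                            /* run-time value */

  bool a = __compiletime_lessthan(sizeof(dst), 16);  /* true: provably too small */
  bool b = __compiletime_lessthan(sizeof(dst), 4);   /* false: provably fits */
  bool c = __compiletime_lessthan(sizeof(dst), dyn); /* false: not a constant
                                                        expression, so only the
                                                        run-time check applies */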
Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/lkml/20220920192202.190793-3-keescook@chromium.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 49 ++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index ff879efe94ed..1c582224c525 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -80,6 +80,11 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) +#define __compiletime_lessthan(bounds, length) ( \ + __builtin_constant_p((bounds) < (length)) && \ + (bounds) < (length) \ +) + /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * @@ -117,7 +122,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 1); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(__func__); @@ -224,7 +229,7 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ - if (__builtin_constant_p(size) && size > p_size) + if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* @@ -281,15 +286,16 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where - * buffer sizes are known. + * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ - if (p_size > p_size_field && p_size < size) + if (__compiletime_lessthan(p_size_field, p_size) && + __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ - if (p_size_field < size) + if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* @@ -365,25 +371,28 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where - * buffer sizes are known. + * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ - if (p_size > p_size_field && p_size < size) + if (__compiletime_lessthan(p_size_field, p_size) && + __compiletime_lessthan(p_size, size)) __write_overflow(); - if (q_size > q_size_field && q_size < size) + if (__compiletime_lessthan(q_size_field, q_size) && + __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ - if (p_size_field < size) + if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. 
*/ - if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || p_size_field < size) && - q_size_field < size) + if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || + __compiletime_lessthan(p_size_field, size)) && + __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* @@ -494,7 +503,7 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -508,9 +517,9 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size_t q_size = __builtin_object_size(q, 0); if (__builtin_constant_p(size)) { - if (p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); - if (q_size < size) + if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size || q_size < size) @@ -523,7 +532,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -535,7 +544,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -547,7 +556,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -563,11 +572,13 @@ char *strcpy(char * const POS p, const char * const POS q) size_t size; /* If neither buffer size is known, immediately give up. */ - if (p_size == SIZE_MAX && q_size == SIZE_MAX) + if (__builtin_constant_p(p_size) && + __builtin_constant_p(q_size) && + p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) -- cgit v1.2.3 From 9f7d69c5cd23904a29178a7ecc4eee9c1cfba04b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 19 Sep 2022 19:50:32 -0700 Subject: fortify: Convert to struct vs member helpers In preparation for adding support for __builtin_dynamic_object_size(), wrap each instance of __builtin_object_size(p, N) with either the new __struct_size(p) as __bos(p, 0), or __member_size(p) as __bos(p, 1). This will allow us to replace the definitions with __bdos() next. There are no binary differences from this change. 
Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: linux-hardening@vger.kernel.org Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/lkml/20220920192202.190793-4-keescook@chromium.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 68 ++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 1c582224c525..b62c90cfafaf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -20,7 +20,7 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" ({ \ unsigned char *__p = (unsigned char *)(p); \ size_t __ret = SIZE_MAX; \ - size_t __p_size = __builtin_object_size(p, 1); \ + size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ @@ -72,13 +72,15 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __underlying_memcpy(dst, src, bytes) /* - * Clang's use of __builtin_object_size() within inlines needs hinting via - * __pass_object_size(). The preference is to only ever use type 1 (member + * Clang's use of __builtin_*object_size() within inlines needs hinting via + * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ -#define POS __pass_object_size(1) -#define POS0 __pass_object_size(0) +#define POS __pass_object_size(1) +#define POS0 __pass_object_size(0) +#define __struct_size(p) __builtin_object_size(p, 0) +#define __member_size(p) __builtin_object_size(p, 1) #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ @@ -120,7 +122,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); @@ -132,7 +134,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); if (p_size == SIZE_MAX) return __underlying_strcat(p, q); @@ -144,7 +146,7 @@ char *strcat(char * const POS p, const char *q) extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); size_t p_len = __compiletime_strlen(p); size_t ret; @@ -174,7 +176,7 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { __kernel_size_t ret; - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); /* Give up if we don't know how large p is. 
*/ if (p_size == SIZE_MAX) @@ -189,8 +191,8 @@ __kernel_size_t __fortify_strlen(const char * const POS p) extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) { - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); size_t q_len; /* Full count of source string length. */ size_t len; /* Count of characters going into destination. */ @@ -218,8 +220,8 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s { size_t len; /* Use string size rather than possible enclosing struct size. */ - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) @@ -264,8 +266,8 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { size_t p_len, copy_len; - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); @@ -323,11 +325,11 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, }) /* - * __builtin_object_size() must be captured here to avoid evaluating argument - * side-effects further into the macro layers. + * __struct_size() vs __member_size() must be captured here to avoid + * evaluating argument side-effects further into the macro layers. */ #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ - __builtin_object_size(p, 0), __builtin_object_size(p, 1)) + __struct_size(p), __member_size(p)) /* * To make sure the compiler can enforce protection against buffer overflows, @@ -420,7 +422,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, * fake flexible arrays, until they are all converted to * proper flexible arrays. * - * The implementation of __builtin_object_size() behaves + * The implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be @@ -486,22 +488,22 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, */ /* - * __builtin_object_size() must be captured here to avoid evaluating argument - * side-effects further into the macro layers. + * __struct_size() vs __member_size() must be captured here to avoid + * evaluating argument side-effects further into the macro layers. 
*/ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ - __builtin_object_size(p, 0), __builtin_object_size(q, 0), \ - __builtin_object_size(p, 1), __builtin_object_size(q, 1), \ + __struct_size(p), __struct_size(q), \ + __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ - __builtin_object_size(p, 0), __builtin_object_size(q, 0), \ - __builtin_object_size(p, 1), __builtin_object_size(q, 1), \ + __struct_size(p), __struct_size(q), \ + __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -513,8 +515,8 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); + size_t p_size = __struct_size(p); + size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) @@ -530,7 +532,7 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -542,7 +544,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -554,7 +556,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -567,8 +569,8 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ -- cgit v1.2.3 From 90bfc37b5ab91c1a6165e3e5cfc49bf04571b762 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Sep 2022 15:09:53 -0400 Subject: SUNRPC: Fix svcxdr_init_decode's end-of-buffer calculation Ensure that stream-based argument decoding can't go past the actual end of the receive buffer. xdr_init_decode's calculation of the value of xdr->end over-estimates the end of the buffer because the Linux kernel RPC server code does not remove the size of the RPC header from rqstp->rq_arg before calling the upper layer's dispatcher. 
The server-side still uses the svc_getnl() macros to decode the RPC call header. These macros reduce the length of the head iov but do not update the total length of the message in the buffer (buf->len). A proper fix for this would be to replace the use of svc_getnl() and friends in the RPC header decoder, but that would be a large and invasive change that would be difficult to backport. Fixes: 5191955d6fc6 ("SUNRPC: Prepare for xdr_stream-style decoding on the server-side") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index daecb009c05b..5a830b66f059 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -544,16 +544,27 @@ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) } /** - * svcxdr_init_decode - Prepare an xdr_stream for svc Call decoding + * svcxdr_init_decode - Prepare an xdr_stream for Call decoding * @rqstp: controlling server RPC transaction context * + * This function currently assumes the RPC header in rq_arg has + * already been decoded. Upon return, xdr->p points to the + * location of the upper layer header. */ static inline void svcxdr_init_decode(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; - struct kvec *argv = rqstp->rq_arg.head; + struct xdr_buf *buf = &rqstp->rq_arg; + struct kvec *argv = buf->head; - xdr_init_decode(xdr, &rqstp->rq_arg, argv->iov_base, NULL); + /* + * svc_getnl() and friends do not keep the xdr_buf's ::len + * field up to date. Refresh that field before initializing + * the argument decoding stream. + */ + buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len; + + xdr_init_decode(xdr, buf, argv->iov_base, NULL); xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); } -- cgit v1.2.3 From 1242a87da0d8cd2a428e96ca68e7ea899b0f4624 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Sep 2022 15:09:59 -0400 Subject: SUNRPC: Fix svcxdr_init_encode's buflen calculation Commit 2825a7f90753 ("nfsd4: allow encoding across page boundaries") added an explicit computation of the remaining length in the rq_res XDR buffer. The computation appears to suffer from an "off-by-one" bug. Because buflen is too large by one page, XDR encoding can run off the end of the send buffer by eventually trying to use the struct page address in rq_page_end, which always contains NULL. 
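A worked example of the off-by-one (the array indices are invented for illustration):

  /*
   * Suppose the reply owns four whole pages:
   *
   *    buf->pages         == &rqstp->rq_pages[1]
   *    rqstp->rq_page_end == &rqstp->rq_pages[5]   <- sentinel slot, NULL
   *
   * Correct count:   rq_page_end - buf->pages     == 4 pages
   * Old expression:  1 + rq_page_end - buf->pages == 5 pages
   *
   * The extra page let the encoder advance onto the NULL sentinel at
   * rq_pages[5] and run off the end of the send buffer.
   */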
Fixes: bddfdbcddbe2 ("NFSD: Extract the svcxdr_init_encode() helper") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5a830b66f059..0ca8a8ffb47e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -587,7 +587,7 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp) xdr->end = resv->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; buf->len = resv->iov_len; xdr->page_ptr = buf->pages - 1; - buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages); + buf->buflen = PAGE_SIZE * (rqstp->rq_page_end - buf->pages); buf->buflen -= rqstp->rq_auth_slack; xdr->rqst = NULL; } -- cgit v1.2.3 From 103cc1fafee48adb91fca0e19deb869fd23e46ab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Sep 2022 17:22:38 -0400 Subject: SUNRPC: Parametrize how much of argsize should be zeroed Currently, SUNRPC clears the whole of .pc_argsize before processing each incoming RPC transaction. Add an extra parameter to struct svc_procedure to enable upper layers to reduce the amount of each operation's argument structure that is zeroed by SUNRPC. The size of struct nfsd4_compoundargs, in particular, is a lot to clear on each incoming RPC Call. A subsequent patch will cut this down to something closer to what NFSv2 and NFSv3 uses. This patch should cause no behavior changes. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0ca8a8ffb47e..88de45491376 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -472,6 +472,7 @@ struct svc_procedure { /* XDR free result: */ void (*pc_release)(struct svc_rqst *); unsigned int pc_argsize; /* argument struct size */ + unsigned int pc_argzero; /* how much of argument to clear */ unsigned int pc_ressize; /* result struct size */ unsigned int pc_cachetype; /* cache info (NFS) */ unsigned int pc_xdrressize; /* maximum size of XDR reply */ -- cgit v1.2.3 From 98124f5bd6c76699d514fbe491dd95265369cc99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Sep 2022 17:22:56 -0400 Subject: NFSD: Refactor common code out of dirlist helpers The dust has settled a bit and it's become obvious what code is totally common between nfsd_init_dirlist_pages() and nfsd3_init_dirlist_pages(). Move that common code to SUNRPC. The new helper brackets the existing xdr_init_decode_pages() API. 
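A hedged sketch of a caller of the new bracket helper (the function shown is illustrative, not the actual nfsd code; xdr_init_encode_pages() has the prototype added in the diff below):

  /* Illustrative only: set up a reply stream whose payload lands in
   * the xdr_buf's page vector. */
  static void init_dirlist_stream(struct svc_rqst *rqstp,
                                  struct xdr_stream *xdr)
  {
          struct xdr_buf *buf = &rqstp->rq_res;

          xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
  }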
Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 69175029abbb..f84e2a1358e1 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -240,6 +240,8 @@ typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); +extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, struct rpc_rqst *rqst); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes); -- cgit v1.2.3 From 24291caf8447f6fc060c8d00136bdc30ee207f38 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:12 -0700 Subject: lib/bitmap: add bitmap_weight_and() The function calculates the Hamming weight of (bitmap1 & bitmap2). Currently we have to do it like this:

  tmp = bitmap_alloc(nbits);
  bitmap_and(tmp, map1, map2, nbits);
  weight = bitmap_weight(tmp, nbits);
  bitmap_free(tmp);

This requires additional memory, adds pressure on the alloc subsystem, and is far less cache-friendly than just:

  weight = bitmap_weight_and(map1, map2, nbits);

The following patches apply it to the cpumask functions. Signed-off-by: Yury Norov --- include/linux/bitmap.h | 12 ++++++++++++ include/linux/cpumask.h | 11 +++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index f65410a49fda..b2aef45af0db 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -51,6 +51,7 @@ struct device; * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits + * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area @@ -164,6 +165,8 @@ bool __bitmap_intersects(const unsigned long *bitmap1, bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); +unsigned int __bitmap_weight_and(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); @@ -439,6 +442,15 @@ unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) return __bitmap_weight(src, nbits); } +static __always_inline +unsigned long bitmap_weight_and(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); + return __bitmap_weight_and(src1, src2, nbits); +} + static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1b442fb2001f..9a71af8097ca 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -586,6 +586,17 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) + * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. + * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. + */ +static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +} + /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result -- cgit v1.2.3 From 3cea8d475327756066e2a54f0b651bb7284dd448 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:13 -0700 Subject: lib: add find_nth{,_and,_andnot}_bit() The kernel lacks a function that searches for the Nth set bit in a bitmap. Usually people do it like this: for_each_set_bit(bit, mask, size) if (n-- == 0) return bit; We can do it more efficiently if we: 1. find the word containing the Nth bit, using hweight(); and 2. find the bit within that word, using a helper fns() that works similarly to __ffs() and ffz(). fns() is implemented as a simple loop. For x86_64, there is the PDEP instruction to do that: ret = clz(pdep(1 << idx, num)). However, for large bitmaps most of the improvement comes from using hweight(), so I kept fns() simple. The new find_nth_bit() is ~70 times faster on x86_64/kvm in the find_bit benchmark: find_nth_bit: 7154190 ns, 16411 iterations for_each_bit: 505493126 ns, 16315 iterations With all that, a family of three new functions is added and used where appropriate in the following patches.
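For illustration, a self-contained user-space sketch of the two-step search described above; compiler builtins stand in for the kernel's hweight_long() and __ffs(), and the sketch assumes size is a multiple of the word size:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* find the n'th (0-based) set bit in a word, like the kernel's fns() */
static unsigned int fns(unsigned long word, unsigned int n)
{
	while (word) {
		unsigned int bit = __builtin_ctzl(word);

		if (n-- == 0)
			return bit;
		word &= word - 1;	/* clear lowest set bit */
	}
	return BITS_PER_LONG;
}

static unsigned long find_nth_bit(const unsigned long *addr,
				  unsigned long size, unsigned long n)
{
	for (unsigned long i = 0; i * BITS_PER_LONG < size; i++) {
		unsigned int w = __builtin_popcountl(addr[i]);

		if (n < w)	/* the Nth bit lives in this word */
			return i * BITS_PER_LONG + fns(addr[i], n);
		n -= w;		/* skip the whole word */
	}
	return size;		/* no such bit */
}

int main(void)
{
	unsigned long map[2] = { 0xf0f0UL, 0x1UL };

	/* bits 0..7 are in map[0]; the 8th set bit is bit 0 of map[1] */
	printf("%lu\n", find_nth_bit(map, 2 * BITS_PER_LONG, 8)); /* 64 */
	return 0;
}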
Signed-off-by: Yury Norov --- include/linux/bitops.h | 19 +++++++++++ include/linux/find.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8..d7dd83fafeba 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -247,6 +247,25 @@ static inline unsigned long __ffs64(u64 word) return __ffs((unsigned long)word); } +/** + * fns - find N'th set bit in a word + * @word: The word to search + * @n: Bit to find + */ +static inline unsigned long fns(unsigned long word, unsigned int n) +{ + unsigned int bit; + + while (word) { + bit = __ffs(word); + if (n-- == 0) + return bit; + __clear_bit(bit, &word); + } + + return BITS_PER_LONG; +} + /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set diff --git a/include/linux/find.h b/include/linux/find.h index dead6f53a97b..b100944daba0 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -15,6 +15,11 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); +unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n); +unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); @@ -136,6 +141,87 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) } #endif +/** + * find_nth_bit - find N'th set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * The following is semantically equivalent: + * idx = find_nth_bit(addr, size, 0); + * idx = find_first_bit(addr, size); + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static inline +unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? fns(val, n) : size; + } + + return __find_nth_bit(addr, size, n); +} + +/** + * find_nth_and_bit - find N'th set bit in 2 memory regions + * @addr1: The 1st address to start the search at + * @addr2: The 2nd address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static inline +unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? 
fns(val, n) : size; + } + + return __find_nth_and_bit(addr1, addr2, size, n); +} + +/** + * find_nth_andnot_bit - find N'th set bit in 2 memory regions, + * flipping bits in 2nd region + * @addr1: The 1st address to start the search at + * @addr2: The 2nd address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static inline +unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); + + return val ? fns(val, n) : size; + } + + return __find_nth_andnot_bit(addr1, addr2, size, n); +} + #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions -- cgit v1.2.3 From 97848c10f9f8a8ce4296b149d06cab424eba05b3 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:15 -0700 Subject: lib/bitmap: remove bitmap_ord_to_pos Now that we have find_nth_bit(), we can drop bitmap_ord_to_pos(). Signed-off-by: Yury Norov --- include/linux/bitmap.h | 1 - include/linux/nodemask.h | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index b2aef45af0db..7d6d73b78147 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -225,7 +225,6 @@ void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int n #else #define bitmap_copy_le bitmap_copy #endif -unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 4b71a96190a8..0c45fb066caa 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -508,8 +508,7 @@ static inline int node_random(const nodemask_t *maskp) w = nodes_weight(*maskp); if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() % w); return bit; #else return 0; -- cgit v1.2.3 From 944c417daeb63fa345fe0f754c57a5a23ca6d701 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:16 -0700 Subject: cpumask: add cpumask_nth_{,and,andnot} Add cpumask_nth_{,and,andnot} as wrappers around corresponding find functions, and use it in cpumask_local_spread(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9a71af8097ca..e4f9136a4a63 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -337,6 +337,50 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) return i; } +/** + * cpumask_nth - get the first cpu in a cpumask + * @srcp: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. 
+ */ +static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) +{ + return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); +} + +/** + * cpumask_nth_and - get the first cpu in 2 cpumasks + * @srcp1: the cpumask pointer + * @srcp2: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +static inline +unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), + nr_cpumask_bits, cpumask_check(cpu)); +} + +/** + * cpumask_nth_andnot - get the first cpu set in 1st cpumask, and clear in 2nd. + * @srcp1: the cpumask pointer + * @srcp2: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +static inline +unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), + nr_cpumask_bits, cpumask_check(cpu)); +} + #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ -- cgit v1.2.3 From eab3126571ed1e3e57ce0f066b566af472ebc47a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 3 Jun 2022 15:29:22 +0200 Subject: efi: libstub: simplify efi_get_memory_map() and struct efi_boot_memmap Currently, struct efi_boot_memmap is a struct that is passed around between callers of efi_get_memory_map() and the users of the resulting data, and which carries pointers to various variables whose values are provided by the EFI GetMemoryMap() boot service. This is overly complex, and it is much easier to carry these values in the struct itself. So turn the struct into one that carries these data items directly, including a flex array for the variable number of EFI memory descriptors that the boot service may return. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index d2b84c2fec39..f1b3e0d1b3fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -518,6 +518,15 @@ typedef union { efi_system_table_32_t mixed_mode; } efi_system_table_t; +struct efi_boot_memmap { + unsigned long map_size; + unsigned long desc_size; + u32 desc_ver; + unsigned long map_key; + unsigned long buff_size; + efi_memory_desc_t map[]; +}; + /* * Architecture independent structure for describing a memory map for the * benefit of efi_memmap_init_early(), and for passing context between -- cgit v1.2.3 From de185b56e8a62822d4e1cdb3e068b38ca709aa47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:05:00 +0200 Subject: blk-cgroup: pass a gendisk to blkcg_schedule_throttle Pass the gendisk to blkcg_schedule_throttle as part of moving the blk-cgroup infrastructure to be gendisk based. Remove the unused !BLK_CGROUP stub while we're at it. 
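Returning to the efi_boot_memmap conversion above: the key change is that the header and the variable number of descriptors now live in one allocation, via a flexible array member. A minimal user-space sketch of that layout (plain malloc() and an invented struct desc stand in for the EFI boot services and efi_memory_desc_t):

#include <stdio.h>
#include <stdlib.h>

struct desc { unsigned long phys, pages; };

struct boot_memmap {
	unsigned long map_size;		/* bytes used in map[] */
	unsigned long desc_size;	/* size of one descriptor */
	struct desc map[];		/* trailing descriptors */
};

static struct boot_memmap *alloc_memmap(unsigned long nr)
{
	struct boot_memmap *m;

	/* one allocation carries the header and the descriptors together */
	m = malloc(sizeof(*m) + nr * sizeof(m->map[0]));
	if (m) {
		m->desc_size = sizeof(m->map[0]);
		m->map_size = nr * m->desc_size;
	}
	return m;
}

int main(void)
{
	struct boot_memmap *m = alloc_memmap(4);

	if (!m)
		return 1;
	printf("%lu descriptors\n", m->map_size / m->desc_size);
	free(m);
	return 0;
}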
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9f40dbc65f82..dd5841a42c33 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -18,14 +18,14 @@ struct bio; struct cgroup_subsys_state; -struct request_queue; +struct gendisk; #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay); void blkcg_maybe_throttle_current(void); bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); @@ -39,7 +39,6 @@ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } -static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { return NULL; -- cgit v1.2.3 From 21c9e90ab9a4c991d21dd15cc5163c99a885d4a8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 30 Aug 2022 20:36:01 +0800 Subject: mm, hwpoison: use num_poisoned_pages_sub() to decrease num_poisoned_pages Use num_poisoned_pages_sub() to combine multiple atomic ops into one. Also, num_poisoned_pages_dec() can be killed, as it has no callers now. Link: https://lkml.kernel.org/r/20220830123604.25763-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swapops.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a3d435bf9f97..88825d1785d2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -485,11 +485,6 @@ static inline void num_poisoned_pages_inc(void) atomic_long_inc(&num_poisoned_pages); } -static inline void num_poisoned_pages_dec(void) -{ - atomic_long_dec(&num_poisoned_pages); -} - static inline void num_poisoned_pages_sub(long i) { atomic_long_sub(i, &num_poisoned_pages); -- cgit v1.2.3 From eba4d770efc86a3710e36b828190858abfa3bb74 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:26 -0400 Subject: mm/swap: comment all the ifdef in swapops.h swapops.h contains quite a few layers of ifdefs; some of the "else" and "endif" lines don't carry a comment naming the macro they pair with, so it's hard to follow what they refer to. Add the comments. Link: https://lkml.kernel.org/r/20220811161331.37055-3-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Nadav Amit Reviewed-by: Huang Ying Reviewed-by: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A .
Shutemov" Cc: Minchan Kim Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapops.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 88825d1785d2..7d1b74046520 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -247,8 +247,8 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, #ifdef CONFIG_HUGETLB_PAGE extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); -#endif -#else +#endif /* CONFIG_HUGETLB_PAGE */ +#else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); @@ -276,7 +276,7 @@ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, #ifdef CONFIG_HUGETLB_PAGE static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } -#endif +#endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; @@ -286,7 +286,7 @@ static inline int is_readable_migration_entry(swp_entry_t entry) return 0; } -#endif +#endif /* CONFIG_MIGRATION */ typedef unsigned long pte_marker; @@ -426,7 +426,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) { return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } -#else +#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { @@ -455,7 +455,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } -#endif +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -490,7 +490,7 @@ static inline void num_poisoned_pages_sub(long i) atomic_long_sub(i, &num_poisoned_pages); } -#else +#else /* CONFIG_MEMORY_FAILURE */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -509,7 +509,7 @@ static inline void num_poisoned_pages_inc(void) static inline void num_poisoned_pages_sub(long i) { } -#endif +#endif /* CONFIG_MEMORY_FAILURE */ static inline int non_swap_entry(swp_entry_t entry) { -- cgit v1.2.3 From 0d206b5d2e0d7d7f09ac9540e3ab3e35a34f536e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:27 -0400 Subject: mm/swap: add swp_offset_pfn() to fetch PFN from swap entry We've got a bunch of special swap entries that stores PFN inside the swap offset fields. To fetch the PFN, normally the user just calls swp_offset() assuming that'll be the PFN. Add a helper swp_offset_pfn() to fetch the PFN instead, fetching only the max possible length of a PFN on the host, meanwhile doing proper check with MAX_PHYSMEM_BITS to make sure the swap offsets can actually store the PFNs properly always using the BUILD_BUG_ON() in is_pfn_swap_entry(). One reason to do so is we never tried to sanitize whether swap offset can really fit for storing PFN. At the meantime, this patch also prepares us with the future possibility to store more information inside the swp offset field, so assuming "swp_offset(entry)" to be the PFN will not stand any more very soon. Replace many of the swp_offset() callers to use swp_offset_pfn() where proper. 
Note that many of the existing users are not candidates for the replacement, e.g.: (1) when the swap entry is not a pfn swap entry at all, or (2) when we want to keep the whole swp_offset but only change the swp type. The latter can happen when fork() triggers on a write-migration swap entry pte: we may want to change only the migration type from write to read but keep the rest, so it's not "fetching the PFN" but "changing the swap type only". Those cases are left aside so that, when there is more information within the swp offset, it will be carried over naturally. While at it, drop hwpoison_entry_to_pfn(), because that's exactly what the new swp_offset_pfn() is about. Link: https://lkml.kernel.org/r/20220811161331.37055-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapops.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 7d1b74046520..578212fbf2be 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -23,6 +23,20 @@ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) +/* + * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries + * can use the extra bits to store other information besides PFN. + */ +#ifdef MAX_PHYSMEM_BITS +#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#else /* MAX_PHYSMEM_BITS */ +#define SWP_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#endif /* MAX_PHYSMEM_BITS */ +#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) + +static inline bool is_pfn_swap_entry(swp_entry_t entry); + /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -64,6 +78,17 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } +/* + * This should only be called upon a pfn swap entry to get the PFN stored + * in the swap entry. Please refers to is_pfn_swap_entry() for definition
+ */ +static inline unsigned long swp_offset_pfn(swp_entry_t entry) +{ + VM_BUG_ON(!is_pfn_swap_entry(entry)); + return swp_offset(entry) & SWP_PFN_MASK; +} + /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { @@ -369,7 +394,7 @@ static inline int pte_none_mostly(pte_t pte) static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { - struct page *p = pfn_to_page(swp_offset(entry)); + struct page *p = pfn_to_page(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the @@ -387,6 +412,9 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { + /* Make sure the swp offset can always store the needed fields */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + return is_migration_entry(entry) || is_device_private_entry(entry) || is_device_exclusive_entry(entry); } @@ -475,11 +503,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) -{ - return swp_offset(entry); -} - static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); -- cgit v1.2.3 From 2e3468778dbe3ec389a10c21a703bb8e5be5cfbc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:29 -0400 Subject: mm: remember young/dirty bit for page migrations When page migration happens, we always ignore the young/dirty bit settings in the old pgtable, mark the page as old in the new page table using either pte_mkold() or pmd_mkold(), and keep the pte clean. That's fine functionally, but it's not friendly to page reclaim, because the page being moved may be actively accessed during the procedure. Not to mention that hardware setting the young bit can bring quite some overhead on some systems, e.g. x86_64 needs a few hundred nanoseconds to set the bit. The same slowdown applies to the dirty bit when the memory is first written after the page migration. Actually we can easily remember the A/D bit configuration and recover the information after the page is migrated. To achieve it, define a new set of bits in the migration swap offset field to cache the A/D bits of the old pte. Then, when removing/recovering the migration entry, we can recover the A/D bits even if the page has changed. One thing to mention is that here we use max_swapfile_size() to detect how many swp offset bits we have, and we only enable this feature if we know the swp offset is big enough to store both the PFN value and the A/D bits. Otherwise the A/D bits are dropped like before. Link: https://lkml.kernel.org/r/20220811161331.37055-6-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapops.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 578212fbf2be..11b874f212a2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -8,6 +8,10 @@ #ifdef CONFIG_MMU +#ifdef CONFIG_SWAP +#include <linux/swapfile.h> +#endif /* CONFIG_SWAP */ + /* * swapcache pages are stored in the swapper_space radix tree.
We want to * get good packing density in that tree, so the index should be dense in @@ -35,6 +39,31 @@ #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) +/** + * Migration swap entry specific bitfield definitions. Layout: + * + * |----------+--------------------| + * | swp_type | swp_offset | + * |----------+--------+-+-+-------| + * | | resv |D|A| PFN | + * |----------+--------+-+-+-------| + * + * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) + * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) + * + * Note: A/D bits will be stored in migration entries iff there're enough + * free bits in arch specific swp offset. By default we'll ignore A/D bits + * when migrating a page. Please refer to migration_entry_supports_ad() + * for more information. If there're more bits besides PFN and A/D bits, + * they should be reserved and always be zeros. + */ +#define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) +#define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) +#define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) + +#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) +#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) + static inline bool is_pfn_swap_entry(swp_entry_t entry); /* Clear all flags but only keep swp_entry_t related information */ @@ -265,6 +294,57 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(SWP_MIGRATION_WRITE, offset); } +/* + * Returns whether the host has large enough swap offset field to support + * carrying over pgtable A/D bits for page migrations. The result is + * pretty much arch specific. + */ +static inline bool migration_entry_supports_ad(void) +{ + /* + * max_swapfile_size() returns the max supported swp-offset plus 1. + * We can support the migration A/D bits iff the pfn swap entry has + * the offset large enough to cover all of them (PFN, A & D bits). 
+ */ +#ifdef CONFIG_SWAP + return max_swapfile_size() >= (1UL << SWP_MIG_TOTAL_BITS); +#else /* CONFIG_SWAP */ + return false; +#endif /* CONFIG_SWAP */ +} + +static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_entry(swp_type(entry), + swp_offset(entry) | SWP_MIG_YOUNG); + return entry; +} + +static inline bool is_migration_entry_young(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_entry(swp_type(entry), + swp_offset(entry) | SWP_MIG_DIRTY); + return entry; +} + +static inline bool is_migration_entry_dirty(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, @@ -311,6 +391,25 @@ static inline int is_readable_migration_entry(swp_entry_t entry) return 0; } +static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) +{ + return entry; +} + +static inline bool is_migration_entry_young(swp_entry_t entry) +{ + return false; +} + +static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) +{ + return entry; +} + +static inline bool is_migration_entry_dirty(swp_entry_t entry) +{ + return false; +} #endif /* CONFIG_MIGRATION */ typedef unsigned long pte_marker; -- cgit v1.2.3 From be45a4902c7caa717fee6b2f671e59b396ed395c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:30 -0400 Subject: mm/swap: cache maximum swapfile size when init swap We used to have max_swapfile_size() fetching the maximum swapfile size per-arch. As the callers of max_swapfile_size() grow, this patch introduces a variable "swapfile_maximum_size" to cache the value of the old max_swapfile_size(), so that we don't need to calculate the value every time. Caching the value in swapfile_init() is safe because by the time we reach that phase all the relevant information should have been initialized. Here the major arch to take care of is x86, which defines the max swapfile size based on the L1TF mitigation. Both X86_BUG_L1TF and l1tf_mitigation should have been set up properly when reaching swapfile_init(). As a reference, the code path looks like this for x86: - start_kernel - setup_arch - early_cpu_init - early_identify_cpu --> setup X86_BUG_L1TF - parse_early_param - l1tf_cmdline --> set l1tf_mitigation - check_bugs - l1tf_select_mitigation --> set l1tf_mitigation - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) The swapfile size only depends on the swp pte format on non-x86 archs, so caching it is safe too. While at it, rename max_swapfile_size() to arch_max_swapfile_size(), because an arch can define its own version, so it's more straightforward to have "arch_" as its prefix. At the same time, export swapfile_maximum_size to replace the old usages of max_swapfile_size().
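To make the size check above concrete, here is a user-space sketch of the offset layout it guards: a PFN in the low bits with the young/dirty flags stashed just above it. The widths below are invented for the example (it assumes a 64-bit unsigned long); only the encode/decode pattern matches the patch:

#include <assert.h>
#include <stdio.h>

#define PFN_BITS	40UL			/* made-up width */
#define MIG_YOUNG	(1UL << PFN_BITS)
#define MIG_DIRTY	(1UL << (PFN_BITS + 1))
#define PFN_MASK	(MIG_YOUNG - 1)

static unsigned long mk_young(unsigned long off) { return off | MIG_YOUNG; }
static unsigned long mk_dirty(unsigned long off) { return off | MIG_DIRTY; }
static int is_young(unsigned long off) { return !!(off & MIG_YOUNG); }
static unsigned long off_pfn(unsigned long off) { return off & PFN_MASK; }

int main(void)
{
	unsigned long off = 0x12345UL;	/* pretend PFN */

	/* flags ride along in the offset; the PFN is still recoverable */
	off = mk_dirty(mk_young(off));
	assert(is_young(off));
	printf("pfn=%#lx young=%d\n", off_pfn(off), is_young(off));
	return 0;
}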
[peterx@redhat.com: declare arch_max_swapfile_size() in swapfile.h] Link: https://lkml.kernel.org/r/YxTh1GuC6ro5fKL5@xz-m1.local Link: https://lkml.kernel.org/r/20220811161331.37055-7-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapfile.h | 5 ++++- include/linux/swapops.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 54078542134c..e2d11ae4e73d 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -8,6 +8,9 @@ */ extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); -extern unsigned long max_swapfile_size(void); +unsigned long arch_max_swapfile_size(void); + +/* Maximum swapfile size supported for the arch (not inclusive). */ +extern unsigned long swapfile_maximum_size; #endif /* _LINUX_SWAPFILE_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 11b874f212a2..027b4095e132 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -307,7 +307,7 @@ static inline bool migration_entry_supports_ad(void) * the offset large enough to cover all of them (PFN, A & D bits). */ #ifdef CONFIG_SWAP - return max_swapfile_size() >= (1UL << SWP_MIG_TOTAL_BITS); + return swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS); #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ -- cgit v1.2.3 From 5154e607967d3f587fda84a40abbf900275016c9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:31 -0400 Subject: mm/swap: cache swap migration A/D bits support Introduce a variable swap_migration_ad_supported to cache whether the arch supports swap migration A/D bits. One thing to mention here is that SWP_MIG_TOTAL_BITS will internally reference the other macro MAX_PHYSMEM_BITS, which is a function call on x86 (and a constant on all the rest of the archs). It's safe to reference it in swapfile_init() because by the time we reach it we're already at initcall level 4, so the 5-level pgtable for x86_64 must have been initialized (right after early_identify_cpu() finishes). - start_kernel - setup_arch - early_cpu_init - get_cpu_cap --> fetch from CPUID (including X86_FEATURE_LA57) - early_identify_cpu --> clear X86_FEATURE_LA57 (if early lvl5 not enabled (USE_EARLY_PGTABLE_L5)) - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) This should slightly speed up the migration swap entry handling. Link: https://lkml.kernel.org/r/20220811161331.37055-8-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Huang Ying Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapfile.h | 2 ++ include/linux/swapops.h | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e2d11ae4e73d..7ed529a77c5b 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -12,5 +12,7 @@ unsigned long arch_max_swapfile_size(void); /* Maximum swapfile size supported for the arch (not inclusive).
*/ extern unsigned long swapfile_maximum_size; +/* Whether swap migration entry supports storing A/D bits for the arch */ +extern bool swap_migration_ad_supported; #endif /* _LINUX_SWAPFILE_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 027b4095e132..86b95ccb81bb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -301,13 +301,8 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) */ static inline bool migration_entry_supports_ad(void) { - /* - * max_swapfile_size() returns the max supported swp-offset plus 1. - * We can support the migration A/D bits iff the pfn swap entry has - * the offset large enough to cover all of them (PFN, A & D bits). - */ #ifdef CONFIG_SWAP - return swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS); + return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ -- cgit v1.2.3 From aa1cf99b87e934e761b46ce2b925335a398980da Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 15 Aug 2022 07:11:35 +0000 Subject: delayacct: support re-entrance detection of thrashing accounting Once upon a time, we only supported accounting thrashing of the page cache. Then Joonsoo introduced workingset detection for anonymous pages and we gained the ability to account their thrashing too[1]. For page cache thrashing accounting, there is no suitable place to do it at the fs level like swap_readpage(), so we have to do it in folio_wait_bit_common(). Then, for anonymous page thrashing accounting, we have to do it in both swap_readpage() and folio_wait_bit_common(). This is like PSI, so thrashing accounting should support re-entrance detection. This patch prepares for complete thrashing accounting, and is based on the patch "filemap: make the accounting of thrashing more consistent".
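The re-entrance scheme mirrors what the delayacct diff below does with the new bool *in_thrashing parameter: the caller-provided flag records whether we were already inside a thrashing section, so nested start/end pairs only account the outermost one. A user-space sketch of the pattern (the task struct and the accounting calls are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static struct { bool in_thrashing; } current_task;

static void thrashing_start(bool *was_in_thrashing)
{
	*was_in_thrashing = current_task.in_thrashing;
	current_task.in_thrashing = true;
	if (!*was_in_thrashing)
		puts("start accounting");	/* outermost entry only */
}

static void thrashing_end(bool *was_in_thrashing)
{
	if (!*was_in_thrashing) {
		current_task.in_thrashing = false;
		puts("stop accounting");	/* matches the outermost start */
	}
}

int main(void)
{
	bool outer, inner;

	thrashing_start(&outer);	/* e.g. folio_wait_bit_common() */
	thrashing_start(&inner);	/* nested, e.g. swap_readpage() */
	thrashing_end(&inner);		/* no-op: still inside */
	thrashing_end(&outer);
	return 0;
}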
[1] commit aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU") Link: https://lkml.kernel.org/r/20220815071134.74551-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Signed-off-by: CGEL ZTE Reviewed-by: Ran Xiaokai Reviewed-by: wangyong Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 16 ++++++++-------- include/linux/sched.h | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 58aea2d7385c..0da97dba9ef8 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -73,8 +73,8 @@ extern int delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); -extern void __delayacct_thrashing_start(void); -extern void __delayacct_thrashing_end(void); +extern void __delayacct_thrashing_start(bool *in_thrashing); +extern void __delayacct_thrashing_end(bool *in_thrashing); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); extern void __delayacct_compact_start(void); @@ -143,22 +143,22 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); } -static inline void delayacct_thrashing_start(void) +static inline void delayacct_thrashing_start(bool *in_thrashing) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) - __delayacct_thrashing_start(); + __delayacct_thrashing_start(in_thrashing); } -static inline void delayacct_thrashing_end(void) +static inline void delayacct_thrashing_end(bool *in_thrashing) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) - __delayacct_thrashing_end(); + __delayacct_thrashing_end(in_thrashing); } static inline void delayacct_swapin_start(void) @@ -237,9 +237,9 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} -static inline void delayacct_thrashing_start(void) +static inline void delayacct_thrashing_start(bool *in_thrashing) {} -static inline void delayacct_thrashing_end(void) +static inline void delayacct_thrashing_end(bool *in_thrashing) {} static inline void delayacct_swapin_start(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..d9a2466664f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -944,6 +944,10 @@ struct task_struct { #ifdef CONFIG_CPU_SUP_INTEL unsigned reported_split_lock:1; #endif +#ifdef CONFIG_TASK_DELAY_ACCT + /* delay due to memory thrashing */ + unsigned in_thrashing:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ -- cgit v1.2.3 From e1fd09e3d1dd4a1a8b3b33bc1fd647eee9f4e475 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 01:59:58 -0600 Subject: mm: x86, arm64: add arch_has_hw_pte_young() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Multi-Gen LRU Framework", v14. What's new ========== 1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS, Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15. 2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs) machines. The old direct reclaim backoff, which tries to enforce a minimum fairness among all eligible memcgs, over-swapped by about (total_mem>>DEF_PRIORITY)-nr_to_reclaim. 
The new backoff, which pulls the plug on swapping once the target is met, trades some fairness for curtailed latency: https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/ 3. Fixed minior build warnings and conflicts. More comments and nits. TLDR ==== The current page reclaim is too expensive in terms of CPU usage and it often makes poor choices about what to evict. This patchset offers an alternative solution that is performant, versatile and straightforward. Patchset overview ================= The design and implementation overview is in patch 14: https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/ 01. mm: x86, arm64: add arch_has_hw_pte_young() 02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG Take advantage of hardware features when trying to clear the accessed bit in many PTEs. 03. mm/vmscan.c: refactor shrink_node() 04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into its sole caller" Minor refactors to improve readability for the following patches. 05. mm: multi-gen LRU: groundwork Adds the basic data structure and the functions that insert pages to and remove pages from the multi-gen LRU (MGLRU) lists. 06. mm: multi-gen LRU: minimal implementation A minimal implementation without optimizations. 07. mm: multi-gen LRU: exploit locality in rmap Exploits spatial locality to improve efficiency when using the rmap. 08. mm: multi-gen LRU: support page table walks Further exploits spatial locality by optionally scanning page tables. 09. mm: multi-gen LRU: optimize multiple memcgs Optimizes the overall performance for multiple memcgs running mixed types of workloads. 10. mm: multi-gen LRU: kill switch Adds a kill switch to enable or disable MGLRU at runtime. 11. mm: multi-gen LRU: thrashing prevention 12. mm: multi-gen LRU: debugfs interface Provide userspace with features like thrashing prevention, working set estimation and proactive reclaim. 13. mm: multi-gen LRU: admin guide 14. mm: multi-gen LRU: design doc Add an admin guide and a design doc. Benchmark results ================= Independent lab results ----------------------- Based on the popularity of searches [01] and the memory usage in Google's public cloud, the most popular open-source memory-hungry applications, in alphabetical order, are: Apache Cassandra Memcached Apache Hadoop MongoDB Apache Spark PostgreSQL MariaDB (MySQL) Redis An independent lab evaluated MGLRU with the most widely used benchmark suites for the above applications. They posted 960 data points along with kernel metrics and perf profiles collected over more than 500 hours of total benchmark time. Their final reports show that, with 95% confidence intervals (CIs), the above applications all performed significantly better for at least part of their benchmark matrices. On 5.14: 1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]% less wall time to sort three billion random integers, respectively, under the medium- and the high-concurrency conditions, when overcommitting memory. There were no statistically significant changes in wall time for the rest of the benchmark matrix. 2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]% more transactions per minute (TPM), respectively, under the medium- and the high-concurrency conditions, when overcommitting memory. There were no statistically significant changes in TPM for the rest of the benchmark matrix. 3. 
Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]% and [21.59, 30.02]% more operations per second (OPS), respectively, for sequential access, random access and Gaussian (distribution) access, when THP=always; 95% CIs [13.85, 15.97]% and [23.94, 29.92]% more OPS, respectively, for random access and Gaussian access, when THP=never. There were no statistically significant changes in OPS for the rest of the benchmark matrix. 4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and [2.16, 3.55]% more operations per second (OPS), respectively, for exponential (distribution) access, random access and Zipfian (distribution) access, when underutilizing memory; 95% CIs [8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS, respectively, for exponential access, random access and Zipfian access, when overcommitting memory. On 5.15: 5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]% and [4.11, 7.50]% more operations per second (OPS), respectively, for exponential (distribution) access, random access and Zipfian (distribution) access, when swap was off; 95% CIs [0.50, 2.60]%, [6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for exponential access, random access and Zipfian access, when swap was on. 6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]% less average wall time to finish twelve parallel TeraSort jobs, respectively, under the medium- and the high-concurrency conditions, when swap was on. There were no statistically significant changes in average wall time for the rest of the benchmark matrix. 7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per minute (TPM) under the high-concurrency condition, when swap was off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM, respectively, under the medium- and the high-concurrency conditions, when swap was on. There were no statistically significant changes in TPM for the rest of the benchmark matrix. 8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and [11.47, 19.36]% more total operations per second (OPS), respectively, for sequential access, random access and Gaussian (distribution) access, when THP=always; 95% CIs [1.27, 3.54]%, [10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively, for sequential access, random access and Gaussian access, when THP=never. Our lab results --------------- To supplement the above results, we ran the following benchmark suites on 5.16-rc7 and found no regressions [10]. fs_fio_bench_hdd_mq pft fs_lmbench pgsql-hammerdb fs_parallelio redis fs_postmark stream hackbench sysbenchthread kernbench tpcc_spark memcached unixbench multichase vm-scalability mutilate will-it-scale nginx [01] https://trends.google.com [02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/ [03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/ [04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/ [05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/ [06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/ [07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/ [08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/ [09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/ [10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/ Read-world applications ======================= Third-party testimonials ------------------------ Konstantin reported [11]: I have Archlinux with 8G RAM + zswap + swap. 
While developing, I have lots of apps opened such as multiple LSP-servers for different langs, chats, two browsers, etc... Usually, my system gets quickly to a point of SWAP-storms, where I have to kill LSP-servers, restart browsers to free memory, etc, otherwise the system lags heavily and is barely usable. 1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU patchset, and I started up by opening lots of apps to create memory pressure, and worked for a day like this. Till now I had not a single SWAP-storm, and mind you I got 3.4G in SWAP. I was never getting to the point of 3G in SWAP before without a single SWAP-storm. Vaibhav from IBM reported [12]: In a synthetic MongoDB Benchmark, seeing an average of ~19% throughput improvement on POWER10(Radix MMU + 64K Page Size) with MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across three different request distributions, namely, Exponential, Uniform and Zipfan. Shuang from U of Rochester reported [13]: With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]% and [9.26, 10.36]% higher throughput, respectively, for random access, Zipfian (distribution) access and Gaussian (distribution) access, when the average number of jobs per CPU is 1; 95% CIs [42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher throughput, respectively, for random access, Zipfian access and Gaussian access, when the average number of jobs per CPU is 2. Daniel from Michigan Tech reported [14]: With Memcached allocating ~100GB of byte-addressable Optante, performance improvement in terms of throughput (measured as queries per second) was about 10% for a series of workloads. Large-scale deployments ----------------------- We've rolled out MGLRU to tens of millions of ChromeOS users and about a million Android users. Google's fleetwide profiling [15] shows an overall 40% decrease in kswapd CPU usage, in addition to improvements in other UX metrics, e.g., an 85% decrease in the number of low-memory kills at the 75th percentile and an 18% decrease in app launch time at the 50th percentile. The downstream kernels that have been using MGLRU include: 1. Android [16] 2. Arch Linux Zen [17] 3. Armbian [18] 4. ChromeOS [19] 5. Liquorix [20] 6. OpenWrt [21] 7. post-factum [22] 8. XanMod [23] [11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/ [12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/ [13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/ [14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/ [15] https://dl.acm.org/doi/10.1145/2749469.2750392 [16] https://android.com [17] https://archlinux.org [18] https://armbian.com [19] https://chromium.org [20] https://liquorix.net [21] https://openwrt.org [22] https://codeberg.org/pf-kernel [23] https://xanmod.org Summary ======= The facts are: 1. The independent lab results and the real-world applications indicate substantial improvements; there are no known regressions. 2. Thrashing prevention, working set estimation and proactive reclaim work out of the box; there are no equivalent solutions. 3. There is a lot of new code; no smaller changes have been demonstrated similar effects. Our options, accordingly, are: 1. Given the amount of evidence, the reported improvements will likely materialize for a wide range of workloads. 2. Gauging the interest from the past discussions, the new features will likely be put to use for both personal computers and data centers. 3. 
Based on Google's track record, the new code will likely be well maintained in the long term. It'd be more difficult if not impossible to achieve similar effects with other approaches. This patch (of 14): Some architectures automatically set the accessed bit in PTEs, e.g., x86 and arm64 v8.2. On architectures that do not have this capability, clearing the accessed bit in a PTE usually triggers a page fault following the TLB miss of this PTE (to emulate the accessed bit). Being aware of this capability can help make better decisions, e.g., whether to spread the work out over a period of time to reduce bursty page faults when trying to clear the accessed bit in many PTEs. Note that theoretically this capability can be unreliable, e.g., hotplugged CPUs might be different from builtin ones. Therefore it should not be used in architecture-independent code that involves correctness, e.g., to determine whether TLB flushes are required (in combination with the accessed bit). Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Acked-by: Will Deacon Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: linux-arm-kernel@lists.infradead.org Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Miaohe Lin Cc: Mike Rapoport Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d13b4f7cc5be..375e8e7e64f4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef arch_has_hw_pte_young +/* + * Return whether the accessed bit is supported on the local CPU. + * + * This stub assumes accessing through an old PTE triggers a page fault. + * Architectures that automatically set the access bit should overwrite it. + */ +static inline bool arch_has_hw_pte_young(void) +{ + return false; +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From eed9a328aa1ae6ac1edaa026957e6882f57de0dd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 01:59:59 -0600 Subject: mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures support the accessed bit in non-leaf PMD entries, e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it as part of linear address translation [1]. Page table walkers that clear the accessed bit may use this capability to reduce their search space. Note that: 1. Although an inline function is preferable, this capability is added as a configuration option for consistency with the existing macros. 2. 
Due to the little interest in other varieties, this capability was only tested on Intel and AMD CPUs. Thanks to the following developers for their efforts [2][3]. Randy Dunlap Stephen Rothwell [1]: Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3 (June 2021), section 4.8 [2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/ [3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/ Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 375e8e7e64f4..a108b60a6962 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, BUILD_BUG(); return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -- cgit v1.2.3 From aa1b67903a19e026d1749241fad177f6185c2d42 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:01 -0600 Subject: Revert "include/linux/mm_inline.h: fold __update_lru_size() into its sole caller" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch undoes the following refactor: commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller") The upcoming changes to include/linux/mm_inline.h will reuse __update_lru_size(). 
Link: https://lkml.kernel.org/r/20220918080010.2920238-5-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Miaohe Lin Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7b25b53c474a..fb8aadb81cd6 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page) return folio_is_file_lru(page_folio(page)); } -static __always_inline void update_lru_size(struct lruvec *lruvec, +static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { @@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, enum zone_type zid, + long nr_pages) +{ + __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif -- cgit v1.2.3 From ec1c86b25f4bdd9dce6436c0539d2a6ae676e1c4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:02 -0600 Subject: mm: multi-gen LRU: groundwork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Evictable pages are divided into multiple generations for each lruvec. The youngest generation number is stored in lrugen->max_seq for both anon and file types as they are aged on an equal footing. The oldest generation numbers are stored in lrugen->min_seq[] separately for anon and file types as clean file pages can be evicted regardless of swap constraints. These three variables are monotonically increasing. Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into the gen counter in folio->flags. Each truncated generation number is an index to lrugen->lists[]. The sliding window technique is used to track at least MIN_NR_GENS and at most MAX_NR_GENS generations. The gen counter stores a value within [1, MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it stores 0. There are two conceptually independent procedures: "the aging", which produces young generations, and "the eviction", which consumes old generations. They form a closed-loop system, i.e., "the page reclaim". Both procedures can be invoked from userspace for the purposes of working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2]. 
To avoid confusion, the terms "hot" and "cold" will be applied to the multi-gen LRU, as a new convention; the terms "active" and "inactive" will be applied to the active/inactive LRU, as usual. The protection of hot pages and the selection of cold pages are based on page access channels and patterns. There are two access channels: one through page tables and the other through file descriptors. The protection of the former channel is by design stronger because: 1. The uncertainty in determining the access patterns of the former channel is higher due to the approximation of the accessed bit. 2. The cost of evicting the former channel is higher due to the TLB flushes required and the likelihood of encountering the dirty bit. 3. The penalty of underprotecting the former channel is higher because applications usually do not prepare themselves for major page faults like they do for blocked I/O. E.g., GUI applications commonly use dedicated I/O threads to avoid blocking rendering threads. There are also two access patterns: one with temporal locality and the other without. For the reasons listed above, the former channel is assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is present; the latter channel is assumed to follow the latter pattern unless outlying refaults have been observed [3][4]. The next patch will address the "outlying refaults". Three macros, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in this patch to make the entire patchset less diffy. A page is added to the youngest generation on faulting. The aging needs to check the accessed bit at least twice before handing this page over to the eviction. The first check takes care of the accessed bit set on the initial fault; the second check makes sure this page has not been used since then. This protocol, AKA second chance, requires a minimum of two generations, hence MIN_NR_GENS. 
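A toy model of the second-chance protocol just described (names are illustrative; the kernel's aging lives in mm/vmscan.c and is not part of this header-only diff):

  #include <stdbool.h>

  struct toy_page {
          bool accessed;		/* models the hardware accessed bit */
          unsigned long seq;	/* the generation the page lives in */
  };

  /* one aging pass over a page */
  static void age_page(struct toy_page *p, unsigned long max_seq)
  {
          if (p->accessed) {
                  p->accessed = false;	/* clear, so a later pass can re-test */
                  p->seq = max_seq;	/* hand the page a fresh generation */
          }
          /* else: seq stays put and the page drifts toward min_seq */
  }

Two consecutive passes must find the page idle before it reaches the oldest generation, which is exactly why MIN_NR_GENS is 2.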
[1] https://dl.acm.org/doi/10.1145/3297858.3304053 [2] https://dl.acm.org/doi/10.1145/3503222.3507731 [3] https://lwn.net/Articles/495543/ [4] https://lwn.net/Articles/815342/ Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 102 ++++++++++++++++++++++ include/linux/page-flags-layout.h | 13 +-- include/linux/page-flags.h | 4 +- include/linux/sched.h | 4 + 5 files changed, 291 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index fb8aadb81cd6..2ff703900fd0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); + lockdep_assert_held(&lruvec->lru_lock); + WARN_ON_ONCE(nr_pages != (int)nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); @@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) return lru; } +#ifdef CONFIG_LRU_GEN + +static inline bool lru_gen_enabled(void) +{ + return true; +} + +static inline bool lru_gen_in_fault(void) +{ + return current->in_lru_fault; +} + +static inline int lru_gen_from_seq(unsigned long seq) +{ + return seq % MAX_NR_GENS; +} + +static inline int folio_lru_gen(struct folio *folio) +{ + unsigned long flags = READ_ONCE(folio->flags); + + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +{ + unsigned long max_seq = lruvec->lrugen.max_seq; + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + + /* see the comment on MIN_NR_GENS */ + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); +} + +static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, + int old_gen, int new_gen) +{ + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + enum lru_list lru = type * LRU_INACTIVE_FILE; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); + + if (old_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], + lrugen->nr_pages[old_gen][type][zone] - delta); + if (new_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], + lrugen->nr_pages[new_gen][type][zone] + delta); + + /* addition */ + if (old_gen < 0) { + if (lru_gen_is_active(lruvec, new_gen)) + lru += LRU_ACTIVE; + 
__update_lru_size(lruvec, lru, zone, delta); + return; + } + + /* deletion */ + if (new_gen < 0) { + if (lru_gen_is_active(lruvec, old_gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, -delta); + return; + } +} + +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + unsigned long seq; + unsigned long flags; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); + + if (folio_test_unevictable(folio)) + return false; + /* + * There are three common cases for this page: + * 1. If it's hot, e.g., freshly faulted in or previously hot and + * migrated, add it to the youngest generation. + * 2. If it's cold but can't be evicted immediately, i.e., an anon page + * not in swapcache or a dirty page pending writeback, add it to the + * second oldest generation. + * 3. Everything else (clean, cold) is added to the oldest generation. + */ + if (folio_test_active(folio)) + seq = lrugen->max_seq; + else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + seq = lrugen->min_seq[type] + 1; + else + seq = lrugen->min_seq[type]; + + gen = lru_gen_from_seq(seq); + flags = (gen + 1UL) << LRU_GEN_PGOFF; + /* see the comment on MIN_NR_GENS about PG_active */ + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + + lru_gen_update_size(lruvec, folio, -1, gen); + /* for folio_rotate_reclaimable() */ + if (reclaiming) + list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + else + list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + + return true; +} + +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + unsigned long flags; + int gen = folio_lru_gen(folio); + + if (gen < 0) + return false; + + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + + /* for folio_migrate_flags() */ + flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? 
BIT(PG_active) : 0; + flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); + gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + lru_gen_update_size(lruvec, folio, gen, -1); + list_del(&folio->lru); + + return true; +} + +#else /* !CONFIG_LRU_GEN */ + +static inline bool lru_gen_enabled(void) +{ + return false; +} + +static inline bool lru_gen_in_fault(void) +{ + return false; +} + +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + return false; +} + +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_add_folio(lruvec, folio, false)) + return; + update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) @@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_add_folio(lruvec, folio, true)) + return; + update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ @@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_del_folio(lruvec, folio, false)) + return; + if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 18cf0fc5ce67..6f4ea078d90f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -317,6 +317,102 @@ enum lruvec_flags { */ }; +#endif /* !__GENERATING_BOUNDS_H */ + +/* + * Evictable pages are divided into multiple generations. The youngest and the + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the + * corresponding generation. The gen counter in folio->flags stores gen+1 while + * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the + * eviction. The first check takes care of the accessed bit set on the initial + * fault; the second check makes sure this page hasn't been used since then. + * This process, AKA second chance, requires a minimum of two generations, + * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive + * LRU, e.g., /proc/vmstat, these two generations are considered active; the + * rest of generations, if they exist, are considered inactive. See + * lru_gen_is_active(). + * + * PG_active is always cleared while a page is on one of lrugen->lists[] so that + * the aging needs not to worry about it. And it's set again when a page + * considered active is isolated for non-reclaiming purposes, e.g., migration. + * See lru_gen_add_folio() and lru_gen_del_folio(). + * + * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the + * number of categories of the active/inactive LRU when keeping track of + * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits + * in folio->flags. 
+ */ +#define MIN_NR_GENS 2U +#define MAX_NR_GENS 4U + +#ifndef __GENERATING_BOUNDS_H + +struct lruvec; + +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) + +#ifdef CONFIG_LRU_GEN + +enum { + LRU_GEN_ANON, + LRU_GEN_FILE, +}; + +/* + * The youngest generation number is stored in max_seq for both anon and file + * types as they are aged on an equal footing. The oldest generation numbers are + * stored in min_seq[] separately for anon and file types as clean file pages + * can be evicted regardless of swap constraints. + * + * Normally anon and file min_seq are in sync. But if swapping is constrained, + * e.g., out of swap space, file min_seq is allowed to advance and leave anon + * min_seq behind. + * + * The number of pages in each generation is eventually consistent and therefore + * can be transiently negative. + */ +struct lru_gen_struct { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; + /* the multi-gen LRU lists, lazily sorted on eviction */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; +}; + +void lru_gen_init_lruvec(struct lruvec *lruvec); + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg); +void lru_gen_exit_memcg(struct mem_cgroup *memcg); +#endif + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_init_lruvec(struct lruvec *lruvec) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ +} +#endif + +#endif /* CONFIG_LRU_GEN */ + struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ @@ -334,6 +430,10 @@ struct lruvec { unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ + struct lru_gen_struct lrugen; +#endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -749,6 +849,8 @@ static inline bool zone_is_empty(struct zone *zone) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. 
For non-existent diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index ef1e3e736e14..240905407a18 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -55,7 +55,8 @@ #define SECTIONS_WIDTH 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT #elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" @@ -89,8 +90,8 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ - <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 @@ -100,10 +101,12 @@ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS #error "Not enough bits in page flags" #endif +#define LRU_REFS_WIDTH 0 + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 465ff35a8c00..0b0ae5084e60 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED) + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. @@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (PAGEFLAGS_MASK & ~__PG_HWPOISON) + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/linux/sched.h b/include/linux/sched.h index d9a2466664f7..a2dcfb91df03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -914,6 +914,10 @@ struct task_struct { #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif -- cgit v1.2.3 From ac35a490237446b71e3b4b782b1596967edd0aa8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:03 -0600 Subject: mm: multi-gen LRU: minimal implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid confusion, the terms "promotion" and "demotion" will be applied to the multi-gen LRU, as a new convention; the terms "activation" and "deactivation" will be applied to the active/inactive LRU, as usual. The aging produces young generations. Given an lruvec, it increments max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. 
The aging promotes hot pages to the youngest generation when it finds them accessed through page tables; the demotion of cold pages happens consequently when it increments max_seq. Promotion in the aging path does not involve any LRU list operations, only the updates of the gen counter and lrugen->nr_pages[]; demotion, unless as the result of the increment of max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The aging has the complexity O(nr_hot_pages), since it is only interested in hot pages. The eviction consumes old generations. Given an lruvec, it increments min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty. A feedback loop modeled after the PID controller monitors refaults over anon and file types and decides which type to evict when both types are available from the same generation. The protection of pages accessed multiple times through file descriptors takes place in the eviction path. Each generation is divided into multiple tiers. A page accessed N times through file descriptors is in tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only bits in folio->flags. The aforementioned feedback loop also monitors refaults over all tiers and decides when to protect pages in which tiers (N>1), using the first tier (N=0,1) as a baseline. The first tier contains single-use unmapped clean pages, which are most likely the best choices. In contrast to promotion in the aging path, the protection of a page in the eviction path is achieved by moving this page to the next generation, i.e., min_seq+1, if the feedback loop decides so. This approach has the following advantages: 1. It removes the cost of activation in the buffered access path by inferring whether pages accessed multiple times through file descriptors are statistically hot and thus worth protecting in the eviction path. 2. It takes pages accessed through page tables into account and avoids overprotecting pages accessed multiple times through file descriptors. (Pages accessed through page tables are in the first tier, since N=0.) 3. More tiers provide better protection for pages accessed more than twice through file descriptors, when under heavy buffered I/O workloads. Server benchmark results: Single workload: fio (buffered I/O): +[30, 32]% IOPS BW 5.19-rc1: 2673k 10.2GiB/s patch1-6: 3491k 13.3GiB/s Single workload: memcached (anon): -[4, 6]% Ops/sec KB/sec 5.19-rc1: 1161501.04 45177.25 patch1-6: 1106168.46 43025.04 Configurations: CPU: two Xeon 6154 Mem: total 256G Node 1 was only used as a ram disk to reduce the variance in the results. 
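An aside before the reproduction scripts: a worked example of the tier mapping described above, mirroring the lru_tier_from_refs()/folio_lru_refs() helpers added in the diff further down (the table itself is illustrative):

  /*
   * refs is the number of accesses beyond PG_referenced, i.e., N - 1
   * for N > 1 total accesses through file descriptors:
   *
   *   N (total accesses)   refs     tier = order_base_2(refs + 1)
   *   0 or 1               0        0
   *   2                    1        1
   *   3 or 4               2 or 3   2
   *   5 to 8               4 to 7   3
   *
   * With MAX_NR_TIERS == 4, tier 3 is the last; how far refs can count
   * is bounded by the LRU_REFS_WIDTH bits available in folio->flags.
   */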
patch drivers/block/brd.c < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE; > page = alloc_pages_node(1, gfp_flags, 0); EOF
cat >>/etc/systemd/system.conf <>/etc/memcached.conf </sys/fs/cgroup/user.slice/test/memory.max
echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
    --buffered=1 --ioengine=io_uring --iodepth=128 \
    --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
    --rw=randread --random_distribution=random --norandommap \
    --time_based --ramp_time=10m --runtime=5m --group_reporting
cat memcached.sh
modprobe brd rd_nr=1 rd_size=113246208
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0
memtier_benchmark -S /var/run/memcached/memcached.sock \
    -P memcache_binary -n allkeys --key-minimum=1 \
    --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
    --ratio 1:0 --pipeline 8 -d 2000
memtier_benchmark -S /var/run/memcached/memcached.sock \
    -P memcache_binary -n allkeys --key-minimum=1 \
    --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
    --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
Client benchmark results:
kswapd profiles:
  5.19-rc1
    40.33%  page_vma_mapped_walk (overhead)
    21.80%  lzo1x_1_do_compress (real work)
     7.53%  do_raw_spin_lock
     3.95%  _raw_spin_unlock_irq
     2.52%  vma_interval_tree_iter_next
     2.37%  folio_referenced_one
     2.28%  vma_interval_tree_subtree_search
     1.97%  anon_vma_interval_tree_iter_first
     1.60%  ptep_clear_flush
     1.06%  __zram_bvec_write
  patch1-6
    39.03%  lzo1x_1_do_compress (real work)
    18.47%  page_vma_mapped_walk (overhead)
     6.74%  _raw_spin_unlock_irq
     3.97%  do_raw_spin_lock
     2.49%  ptep_clear_flush
     2.48%  anon_vma_interval_tree_iter_first
     1.92%  folio_referenced_one
     1.88%  __zram_bvec_write
     1.48%  memmove
     1.31%  vma_interval_tree_iter_next
Configurations: CPU: single Snapdragon 7c Mem: total 4G ChromeOS MemoryPressure [1] [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/ Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 36 ++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 41 +++++++++++++++++++++++++++++++++++++++ include/linux/page-flags-layout.h | 5 ++++- 3 files changed, 81 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2ff703900fd0..f2b2296a42f9 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -121,6 +121,33 @@ static inline int lru_gen_from_seq(unsigned long seq) return seq % MAX_NR_GENS; } +static inline int lru_hist_from_seq(unsigned long seq) +{ + return seq % NR_HIST_GENS; +} + +static inline int lru_tier_from_refs(int refs) +{ + VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); + + /* see the comment in folio_lru_refs() */ + return order_base_2(refs + 1); +} + 
+static inline int folio_lru_refs(struct folio *folio) +{ + unsigned long flags = READ_ONCE(folio->flags); + bool workingset = flags & BIT(PG_workingset); + + /* + * Return the number of accesses beyond PG_referenced, i.e., N-1 if the + * total number of accesses is N>1, since N=0,1 both map to the first + * tier. lru_tier_from_refs() will account for this off-by-one. Also see + * the comment on MAX_NR_TIERS. + */ + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; +} + static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); @@ -173,6 +200,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli __update_lru_size(lruvec, lru, zone, -delta); return; } + + /* promotion */ + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { + __update_lru_size(lruvec, lru, zone, -delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); + } + + /* demotion requires isolation, e.g., lru_deactivate_fn() */ + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6f4ea078d90f..7e343420bfb1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -350,6 +350,28 @@ enum lruvec_flags { #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U +/* + * Each generation is divided into multiple tiers. A page accessed N times + * through file descriptors is in tier order_base_2(N). A page in the first tier + * (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A page in any other tier (N>1) is marked by + * PG_referenced and PG_workingset. This implies a minimum of two tiers is + * supported without using additional bits in folio->flags. + * + * In contrast to moving across generations which requires the LRU lock, moving + * across tiers only involves atomic operations on folio->flags and therefore + * has a negligible cost in the buffered access path. In the eviction path, + * comparisons of refaulted/(evicted+protected) from the first tier and the + * rest infer whether pages accessed multiple times through file descriptors + * are statistically hot and thus worth protecting. + * + * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the + * number of categories of the active/inactive LRU when keeping track of + * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in + * folio->flags. + */ +#define MAX_NR_TIERS 4U + #ifndef __GENERATING_BOUNDS_H struct lruvec; @@ -364,6 +386,16 @@ enum { LRU_GEN_FILE, }; +#define MIN_LRU_BATCH BITS_PER_LONG +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) + +/* whether to keep historical stats from evicted generations */ +#ifdef CONFIG_LRU_GEN_STATS +#define NR_HIST_GENS MAX_NR_GENS +#else +#define NR_HIST_GENS 1U +#endif + /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. 
The oldest generation numbers are @@ -386,6 +418,15 @@ struct lru_gen_struct { struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; + /* the exponential moving average of evicted+protected */ + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; + /* the first tier doesn't need protection, hence the minus one */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can be modified without holding the LRU lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; }; void lru_gen_init_lruvec(struct lruvec *lruvec); diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 240905407a18..7d79818dc065 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -106,7 +106,10 @@ #error "Not enough bits in page flags" #endif -#define LRU_REFS_WIDTH 0 +/* see the comment on MAX_NR_TIERS */ +#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ + ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ + NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -- cgit v1.2.3 From 018ee47f14893d500131dfca2ff9f3ff8ebd4ed2 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:04 -0600 Subject: mm: multi-gen LRU: exploit locality in rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Searching the rmap for PTEs mapping each page on an LRU list (to test and clear the accessed bit) can be expensive because pages from different VMAs (PA space) are not cache friendly to the rmap (VA space). For workloads mostly using mapped pages, searching the rmap can incur the highest CPU cost in the reclaim path. This patch exploits spatial locality to reduce the trips into the rmap. When shrink_page_list() walks the rmap and finds a young PTE, a new function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent PTEs. On finding another young PTE, it clears the accessed bit and updates the gen counter of the page mapped by this PTE to (max_seq%MAX_NR_GENS)+1. 
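The idea can be sketched as a simplified stand-alone model (hypothetical names; the real lru_gen_look_around() added by this patch lives in mm/vmscan.c and operates on actual PTEs):

  #include <stdbool.h>

  #define MAX_NR_GENS	4U
  #define BITS_PER_LONG	64	/* assumed 64-bit for this sketch */

  struct pte_slot {
          bool young;		/* models the PTE accessed bit */
          unsigned long gen;	/* gen counter of the mapped page */
  };

  /*
   * On a hit at index 'hit', visit up to BITS_PER_LONG - 1 neighboring
   * slots in the same table; every young neighbor is aged on the spot,
   * saving a later trip into the rmap for its page.
   */
  static void look_around(struct pte_slot *table, int nr, int hit,
                          unsigned long max_seq)
  {
          int i, scanned = 0;

          for (i = 0; i < nr && scanned < BITS_PER_LONG - 1; i++) {
                  if (i == hit)
                          continue;
                  scanned++;
                  if (!table[i].young)
                          continue;
                  table[i].young = false;
                  /* the stored gen counter is (max_seq % MAX_NR_GENS) + 1 */
                  table[i].gen = (max_seq % MAX_NR_GENS) + 1;
          }
  }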
Server benchmark results: Single workload: fio (buffered I/O): no change Single workload: memcached (anon): +[3, 5]% Ops/sec KB/sec patch1-6: 1106168.46 43025.04 patch1-7: 1147696.57 44640.29 Configurations: no change Client benchmark results: kswapd profiles: patch1-6 39.03% lzo1x_1_do_compress (real work) 18.47% page_vma_mapped_walk (overhead) 6.74% _raw_spin_unlock_irq 3.97% do_raw_spin_lock 2.49% ptep_clear_flush 2.48% anon_vma_interval_tree_iter_first 1.92% folio_referenced_one 1.88% __zram_bvec_write 1.48% memmove 1.31% vma_interval_tree_iter_next patch1-7 48.16% lzo1x_1_do_compress (real work) 8.20% page_vma_mapped_walk (overhead) 7.06% _raw_spin_unlock_irq 2.92% ptep_clear_flush 2.53% __zram_bvec_write 2.11% do_raw_spin_lock 2.02% memmove 1.93% lru_gen_look_around 1.56% free_unref_page_list 1.40% memset Configurations: no change Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 31 +++++++++++++++++++++++++++++++ include/linux/mm.h | 5 +++++ include/linux/mmzone.h | 6 ++++++ 3 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a2461f9a8738..9b8ab121d948 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -445,6 +445,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. @@ -506,6 +507,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. 
@@ -960,6 +962,23 @@ void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); +/* try to stablize folio_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) @@ -1434,6 +1453,18 @@ static inline void folio_memcg_unlock(struct folio *folio) { } +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match folio_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + static inline void mem_cgroup_handle_over_high(void) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a5ad9d050bf..7cc9ffc19e7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,11 @@ static inline unsigned long folio_pfn(struct folio *folio) return page_to_pfn(&folio->page); } +static inline struct folio *pfn_folio(unsigned long pfn) +{ + return page_folio(pfn_to_page(pfn)); +} + static inline atomic_t *folio_pincount_ptr(struct folio *folio) { return &folio_page(folio, 1)->compound_pincount; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e343420bfb1..9ef5aa37c60c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -375,6 +375,7 @@ enum lruvec_flags { #ifndef __GENERATING_BOUNDS_H struct lruvec; +struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) @@ -430,6 +431,7 @@ struct lru_gen_struct { }; void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg); @@ -442,6 +444,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ +} + #ifdef CONFIG_MEMCG static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { -- cgit v1.2.3 From bd74fdaea146029e4fa12c6de89adbe0779348a9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:05 -0600 Subject: mm: multi-gen LRU: support page table walks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To further exploit spatial locality, the aging prefers to walk page tables to search for young PTEs and promote hot pages. A kill switch will be added in the next patch to disable this behavior. When disabled, the aging relies on the rmap only. NB: this behavior has nothing similar with the page table scanning in the 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages to swapcache and unmaps them. To avoid confusion, the term "iteration" specifically means the traversal of an entire mm_struct list; the term "walk" will be applied to page tables and the rmap, as usual. An mm_struct list is maintained for each memcg, and an mm_struct follows its owner task to the new memcg when this task is migrated. 
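As the next paragraph explains, concurrent page table walkers each claim a distinct mm_struct from this list. A toy model of that hand-off (illustrative only; the kernel's lru_gen_mm_list/lru_gen_mm_state, declared in the diff below, also track per-iteration head and tail):

  #include <pthread.h>
  #include <stddef.h>

  struct toy_mm { struct toy_mm *next; };

  struct toy_mm_list {
          struct toy_mm *head;	/* where the current iteration continues */
          pthread_mutex_t lock;	/* protects head */
  };

  /* each walker claims a unique mm; NULL means the iteration is done */
  static struct toy_mm *claim_next(struct toy_mm_list *l)
  {
          struct toy_mm *mm;

          pthread_mutex_lock(&l->lock);
          mm = l->head;
          if (mm)
                  l->head = mm->next;
          pthread_mutex_unlock(&l->lock);
          return mm;
  }

Because claiming is serialized but processing is not, the walkers proceed concurrently on different mm_structs.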
Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls walk_page_range() with each mm_struct on this list to promote hot pages before it increments max_seq. When multiple page table walkers iterate the same list, each of them gets a unique mm_struct; therefore they can run concurrently. Page table walkers ignore any misplaced pages, e.g., if an mm_struct was migrated, pages it left in the previous memcg will not be promoted when its current memcg is under reclaim. Similarly, page table walkers will not promote pages from nodes other than the one under reclaim. This patch uses the following optimizations when walking page tables: 1. It tracks the usage of mm_struct's between context switches so that page table walkers can skip processes that have been sleeping since the last iteration. 2. It uses generational Bloom filters to record populated branches so that page table walkers can reduce their search space based on the query results, e.g., to skip page tables containing mostly holes or misplaced pages. 3. It takes advantage of the accessed bit in non-leaf PMD entries when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. 4. It does not zigzag between a PGD table and the same PMD table spanning multiple VMAs. IOW, it finishes all the VMAs within the range of the same PMD table before it returns to a PGD table. This improves the cache performance for workloads that have large numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5. Server benchmark results: Single workload: fio (buffered I/O): no change Single workload: memcached (anon): +[8, 10]% Ops/sec KB/sec patch1-7: 1147696.57 44640.29 patch1-8: 1245274.91 48435.66 Configurations: no change Client benchmark results: kswapd profiles: patch1-7 48.16% lzo1x_1_do_compress (real work) 8.20% page_vma_mapped_walk (overhead) 7.06% _raw_spin_unlock_irq 2.92% ptep_clear_flush 2.53% __zram_bvec_write 2.11% do_raw_spin_lock 2.02% memmove 1.93% lru_gen_look_around 1.56% free_unref_page_list 1.40% memset patch1-8 49.44% lzo1x_1_do_compress (real work) 6.19% page_vma_mapped_walk (overhead) 5.97% _raw_spin_unlock_irq 3.13% get_pfn_folio 2.85% ptep_clear_flush 2.42% __zram_bvec_write 2.08% do_raw_spin_lock 1.92% memmove 1.44% alloc_zspage 1.36% memset Configurations: no change Thanks to the following developers for their efforts [3]. 
kernel test robot [1] https://lwn.net/Articles/23732/ [2] https://llvm.org/docs/ScudoHardenedAllocator.html [3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++ include/linux/mm_types.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 56 +++++++++++++++++++++++++++++++++- include/linux/swap.h | 4 +++ 4 files changed, 140 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b8ab121d948..344022f102c2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -350,6 +350,11 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_LRU_GEN + /* per-memcg mm_struct list */ + struct lru_gen_mm_list mm_list; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf97f3884fda..e1797813cc2c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -672,6 +672,22 @@ struct mm_struct { */ unsigned long ksm_merging_pages; #endif +#ifdef CONFIG_LRU_GEN + struct { + /* this mm_struct is on lru_gen_mm_list */ + struct list_head list; + /* + * Set when switching to this mm_struct, as a hint of + * whether it has been used since the last time per-node + * page table walkers cleared the corresponding bits. + */ + unsigned long bitmap; +#ifdef CONFIG_MEMCG + /* points to the memcg of "owner" above */ + struct mem_cgroup *memcg; +#endif + } lru_gen; +#endif /* CONFIG_LRU_GEN */ } __randomize_layout; /* @@ -698,6 +714,66 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +#ifdef CONFIG_LRU_GEN + +struct lru_gen_mm_list { + /* mm_struct list for page table walkers */ + struct list_head fifo; + /* protects the list above */ + spinlock_t lock; +}; + +void lru_gen_add_mm(struct mm_struct *mm); +void lru_gen_del_mm(struct mm_struct *mm); +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm); +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ + INIT_LIST_HEAD(&mm->lru_gen.list); + mm->lru_gen.bitmap = 0; +#ifdef CONFIG_MEMCG + mm->lru_gen.memcg = NULL; +#endif +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ + /* + * When the bitmap is set, page reclaim knows this mm_struct has been + * used since the last time it cleared the bitmap. So it might be worth + * walking the page tables of this mm_struct to clear the accessed bit. 
+ */ + WRITE_ONCE(mm->lru_gen.bitmap, -1); +} + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_add_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_del_mm(struct mm_struct *mm) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_migrate_mm(struct mm_struct *mm) +{ +} +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ +} + +#endif /* CONFIG_LRU_GEN */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9ef5aa37c60c..b1635c4020dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -408,7 +408,7 @@ enum { * min_seq behind. * * The number of pages in each generation is eventually consistent and therefore - * can be transiently negative. + * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_struct { /* the aging increments the youngest generation number */ @@ -430,6 +430,53 @@ struct lru_gen_struct { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; }; +enum { + MM_LEAF_TOTAL, /* total leaf entries */ + MM_LEAF_OLD, /* old leaf entries */ + MM_LEAF_YOUNG, /* young leaf entries */ + MM_NONLEAF_TOTAL, /* total non-leaf entries */ + MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ + MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ + NR_MM_STATS +}; + +/* double-buffering Bloom filters */ +#define NR_BLOOM_FILTERS 2 + +struct lru_gen_mm_state { + /* set to max_seq after each iteration */ + unsigned long seq; + /* where the current iteration continues (inclusive) */ + struct list_head *head; + /* where the last iteration ended (exclusive) */ + struct list_head *tail; + /* to wait for the last page table walker to finish */ + struct wait_queue_head wait; + /* Bloom filters flip after each iteration */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* the mm stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; + /* the number of concurrent page table walkers */ + int nr_walkers; +}; + +struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; + /* unstable max_seq from lru_gen_struct */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; + /* to batch promoted pages */ + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* to batch the mm stats */ + int mm_stats[NR_MM_STATS]; + /* total batched items */ + int batched; + bool can_swap; + bool force_scan; +}; + void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); @@ -480,6 +527,8 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_struct lrugen; + /* to concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; #endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; @@ -1176,6 +1225,11 @@ typedef struct pglist_data { unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* kswap mm walk data */ + struct lru_gen_mm_walk mm_walk; +#endif + ZONE_PADDING(_pad2_) /* Per-node vmstats */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 43150b9bbc5c..6308150b234a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -162,6 +162,10 @@ union swap_header { */ struct reclaim_state { unsigned long reclaimed_slab; +#ifdef CONFIG_LRU_GEN + /* 
per-thread mm walk data */ + struct lru_gen_mm_walk *mm_walk; +#endif }; #ifdef __KERNEL__ -- cgit v1.2.3 From 354ed597442952fb680c9cafc7e4eb8a76f9514c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:07 -0600 Subject: mm: multi-gen LRU: kill switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that can be disabled include: 0x0001: the multi-gen LRU core 0x0002: walking page table, when arch_has_hw_pte_young() returns true 0x0004: clearing the accessed bit in non-leaf PMD entries, when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y [yYnN]: apply to all the components above E.g., echo y >/sys/kernel/mm/lru_gen/enabled cat /sys/kernel/mm/lru_gen/enabled 0x0007 echo 5 >/sys/kernel/mm/lru_gen/enabled cat /sys/kernel/mm/lru_gen/enabled 0x0005 NB: the page table walks happen on the scale of seconds under heavy memory pressure, in which case the mmap_lock contention is a lesser concern, compared with the LRU lock contention and the I/O congestion. So far the only well-known case of the mmap_lock contention happens on Android, due to Scudo [1] which allocates several thousand VMAs for merely a few hundred MBs. The SPF and the Maple Tree also have provided their own assessments [2][3]. However, if walking page tables does worsen the mmap_lock contention, the kill switch can be used to disable it. In this case the multi-gen LRU will suffer a minor performance degradation, as shown previously. Clearing the accessed bit in non-leaf PMD entries can also be disabled, since this behavior was not tested on x86 varieties other than Intel and AMD. [1] https://source.android.com/devices/tech/debug/scudo [2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ [3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 15 ++++++++++++++- include/linux/mm_inline.h | 15 +++++++++++++-- include/linux/mmzone.h | 9 +++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ac5d0515680e..9179463c3c9f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } +extern struct mutex cgroup_mutex; + +static inline void cgroup_lock(void) +{ + mutex_lock(&cgroup_mutex); +} + +static inline void cgroup_unlock(void) +{ + mutex_unlock(&cgroup_mutex); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for @@ -446,7 +458,6 @@ static inline void cgroup_put(struct 
cgroup *cgrp) * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ @@ -708,6 +719,8 @@ struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} +static inline void cgroup_lock(void) {} +static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f2b2296a42f9..4949eda9a9a2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) #ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { - return true; + DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); + + return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); +} +#else +static inline bool lru_gen_enabled(void) +{ + DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); + + return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } +#endif static inline bool lru_gen_in_fault(void) { @@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* * There are three common cases for this page: diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b1635c4020dc..95c58c7fbdff 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -387,6 +387,13 @@ enum { LRU_GEN_FILE, }; +enum { + LRU_GEN_CORE, + LRU_GEN_MM_WALK, + LRU_GEN_NONLEAF_YOUNG, + NR_LRU_GEN_CAPS +}; + #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) @@ -428,6 +435,8 @@ struct lru_gen_struct { /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multi-gen LRU is enabled */ + bool enabled; }; enum { -- cgit v1.2.3 From 1332a809d95a4fc763cabe5ecb6d4fb6a6d941b2 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:08 -0600 Subject: mm: multi-gen LRU: thrashing prevention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as requested by many desktop users [1]. When set to value N, it prevents the working set of N milliseconds from getting evicted. The OOM killer is triggered if this working set cannot be kept in memory. Based on the average human detectable lag (~100ms), N=1000 usually eliminates intolerable lags due to thrashing. Larger values like N=3000 make lags less noticeable at the risk of premature OOM kills. Compared with the size-based approach [2], this time-based approach has the following advantages: 1. It is easier to configure because it is agnostic to applications and memory sizes. 2. It is more reliable because it is directly wired to the OOM killer. 
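A minimal sketch of the time-based cutoff described above (assumed names; the real check sits in the aging path in mm/vmscan.c, using the per-generation timestamps added below):

  #include <stdbool.h>

  /*
   * birth is the timestamp of the oldest generation; if every evictable
   * generation is younger than min_ttl, reclaim must not proceed and
   * the OOM killer is invoked instead.
   */
  static bool protected_by_min_ttl(unsigned long birth, unsigned long now,
                                   unsigned long min_ttl)
  {
          return now - birth < min_ttl;	/* all in the same time unit */
  }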
[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/ [2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-12-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 95c58c7fbdff..87347945270b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -422,6 +422,8 @@ struct lru_gen_struct { unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ -- cgit v1.2.3 From d6c3af7d8a2ba5602c28841248c551a712ac50f5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:09 -0600 Subject: mm: multi-gen LRU: debugfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /sys/kernel/debug/lru_gen for working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2]. Compared with the page table-based approach and the PFN-based approach, this lruvec-based approach has the following advantages: 1. It offers better choices because it is aware of memcgs, NUMA nodes, shared mappings and unmapped page cache. 2. It is more scalable because it is O(nr_hot_pages), whereas the PFN-based approach is O(nr_total_pages). Add /sys/kernel/debug/lru_gen_full for debugging. 
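For a sense of how these files are driven from userspace, a small sketch; the command strings follow the format this series documents in its admin guide, so treat them here as an assumption rather than a reference:

  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

          if (!f) {
                  perror("lru_gen");
                  return 1;
          }
          /* assumed: "+ memcg_id node_id max_seq [can_swap [force_scan]]" ages */
          fputs("+ 1 0 100\n", f);
          /* assumed: "- memcg_id node_id min_seq [swappiness [nr_to_reclaim]]" evicts */
          fputs("- 1 0 99 0 1000\n", f);
          fclose(f);
          return 0;
  }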
[1] https://dl.acm.org/doi/10.1145/3297858.3304053 [2] https://dl.acm.org/doi/10.1145/3503222.3507731 Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Qi Zheng Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 4b71a96190a8..3a0eec9f2faa 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) +#define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U -- cgit v1.2.3 From 992bf77591cb7e696fcc59aa7e64d1200b673513 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:33 +0530 Subject: mm/demotion: add support for explicit memory tiers Patch series "mm/demotion: Memory tiers and demotion", v15. The current kernel has the basic memory tiering support: Inactive pages on a higher tier NUMA node can be migrated (demoted) to a lower tier NUMA node to make room for new allocations on the higher tier NUMA node. Frequently accessed pages on a lower tier NUMA node can be migrated (promoted) to a higher tier NUMA node to improve the performance. In the current kernel, memory tiers are defined implicitly via a demotion path relationship between NUMA nodes, which is created during the kernel initialization and updated when a NUMA node is hot-added or hot-removed. The current implementation puts all nodes with CPU into the highest tier, and builds the tier hierarchy tier-by-tier by establishing the per-node demotion targets based on the distances between nodes. This current memory tier kernel implementation needs to be improved for several important use cases: * The current tier initialization code always initializes each memory-only NUMA node into a lower tier. But a memory-only NUMA node may have a high performance memory device (e.g. a DRAM-backed memory-only node on a virtual machine) and that should be put into a higher tier. * The current tier hierarchy always puts CPU nodes into the top tier. But on a system with HBM (e.g. GPU memory) devices, these memory-only HBM NUMA nodes should be in the top tier, and DRAM nodes with CPUs are better to be placed into the next lower tier. * Also because the current tier hierarchy always puts CPU nodes into the top tier, when a CPU is hot-added (or hot-removed) and triggers a memory node from CPU-less into a CPU node (or vice versa), the memory tier hierarchy gets changed, even though no memory node is added or removed. This can make the tier hierarchy unstable and make it difficult to support tier-based memory accounting. 
* A higher tier node can only be demoted to nodes with the shortest distance on the next lower tier as defined by the demotion path, not any other node from any lower tier. This strict demotion order does not work in all use cases (e.g. some use cases may want to allow cross-socket demotion to another node in the same demotion tier as a fallback when the preferred demotion node is out of space), and has resulted in the feature request for an interface to override the system-wide, per-node demotion order from userspace. This demotion order is also inconsistent with the page allocation fallback order when all the nodes in a higher tier are out of space: The page allocation can fall back to any node from any lower tier, whereas the demotion order doesn't allow that. This patch series makes the creation of memory tiers explicit, under the control of device drivers. Memory Tier Initialization ========================== The Linux kernel presents memory devices as NUMA nodes and each memory device is of a specific type. The memory type of a device is represented by its abstract distance. A memory tier corresponds to a range of abstract distance. This allows for classifying memory devices with a specific performance range into a memory tier. By default, all memory nodes are assigned to the default tier with abstract distance 512. A device driver can move its memory nodes from the default tier. For example, PMEM can move its memory nodes below the default tier, whereas GPU can move its memory nodes above the default tier. The kernel initialization code makes the decision on which exact tier a memory node should be assigned to based on the requests from the device drivers as well as the memory device hardware information provided by the firmware. Hot-adding/removing CPUs doesn't affect the memory tier hierarchy. This patch (of 10): In the current kernel, memory tiers are defined implicitly via a demotion path relationship between NUMA nodes, which is created during the kernel initialization and updated when a NUMA node is hot-added or hot-removed. The current implementation puts all nodes with CPU into the highest tier, and builds the tier hierarchy by establishing the per-node demotion targets based on the distances between nodes. This current memory tier kernel implementation needs to be improved for several important use cases. The current tier initialization code always initializes each memory-only NUMA node into a lower tier. But a memory-only NUMA node may have a high performance memory device (e.g. a DRAM-backed memory-only node on a virtual machine) that should be put into a higher tier. The current tier hierarchy always puts CPU nodes into the top tier. But on a system with HBM or GPU devices, the memory-only NUMA nodes mapping these devices should be in the top tier, and DRAM nodes with CPUs are better to be placed into the next lower tier. With the current kernel, a higher tier node can only be demoted to nodes with the shortest distance on the next lower tier as defined by the demotion path, not any other node from any lower tier. This strict demotion order does not work in all use cases (e.g. some use cases may want to allow cross-socket demotion to another node in the same demotion tier as a fallback when the preferred demotion node is out of space). This demotion order is also inconsistent with the page allocation fallback order when all the nodes in a higher tier are out of space: The page allocation can fall back to any node from any lower tier, whereas the demotion order doesn't allow that.
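To make the tier assignment concrete: a memory tier is a 128-wide window of abstract distance, so a tier index can be derived by dropping the low seven bits. The helper below is a hypothetical sketch (the chunk constants are spelled out in the next paragraphs and in the diff; memtier_index() is not a kernel function):

	#define MEMTIER_CHUNK_BITS	7
	#define MEMTIER_CHUNK_SIZE	(1 << MEMTIER_CHUNK_BITS)	/* 128 */

	/* hypothetical: which tier window an abstract distance falls into */
	static inline int memtier_index(int adistance)
	{
		return adistance >> MEMTIER_CHUNK_BITS;
	}

With the default DRAM abstract distance of 512, memtier_index() returns 4 for every distance in 512-639, so devices slightly faster or slower than plain DRAM can still share the DRAM tier.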
This patch series addresses the above by defining memory tiers explicitly. The Linux kernel presents memory devices as NUMA nodes and each memory device is of a specific type. The memory type of a device is represented by its abstract distance. A memory tier corresponds to a range of abstract distance. This allows for classifying memory devices with a specific performance range into a memory tier. This patch configures the range/chunk size to be 128. The default DRAM abstract distance is 512. We can have 4 memory tiers below the default DRAM tier with abstract distance ranges 0 - 127, 128 - 255, 256 - 383, 384 - 511. Faster memory devices can be placed in these faster (higher) memory tiers. Slower memory devices like persistent memory will have an abstract distance higher than the default DRAM level. [akpm@linux-foundation.org: fix comment, per Aneesh] Link: https://lkml.kernel.org/r/20220818131042.113280-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: Jagdish Gediya Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/linux/memory-tiers.h (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h new file mode 100644 index 000000000000..ecada7bf4091 --- /dev/null +++ b/include/linux/memory-tiers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_TIERS_H +#define _LINUX_MEMORY_TIERS_H + +/* + * Each tier covers an abstract distance chunk of size 128 + */ +#define MEMTIER_CHUNK_BITS 7 +#define MEMTIER_CHUNK_SIZE (1 << MEMTIER_CHUNK_BITS) +/* + * Smaller abstract distance values imply faster (higher) memory tiers. Offset + * the DRAM adistance so that we can accommodate devices with a slightly lower + * adistance value (slightly faster) than default DRAM adistance to be part of + * the same memory tier. + */ +#define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) + +#endif /* _LINUX_MEMORY_TIERS_H */ -- cgit v1.2.3 From 9195244022788935eac0df16132394ffa5613542 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:34 +0530 Subject: mm/demotion: move memory demotion related code This moves memory demotion related code to mm/memory-tiers.c. No functional change in this patch.
Link: https://lkml.kernel.org/r/20220818131042.113280-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 8 ++++++++ include/linux/migrate.h | 2 -- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index ecada7bf4091..5f24396da76c 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -15,4 +15,12 @@ */ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) +#ifdef CONFIG_NUMA +#include +extern bool numa_demotion_enabled; + +#else + +#define numa_demotion_enabled false +#endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 22c0a0cf5e0c..96f8c84413fe 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -103,7 +103,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA) extern void set_migration_target_nodes(void); extern void migrate_on_reclaim_init(void); -extern bool numa_demotion_enabled; extern int next_demotion_node(int node); #else static inline void set_migration_target_nodes(void) {} @@ -112,7 +111,6 @@ static inline int next_demotion_node(int node) { return NUMA_NO_NODE; } -#define numa_demotion_enabled false #endif #ifdef CONFIG_COMPACTION -- cgit v1.2.3 From c6123a19c9f040e597f55f856c679651c26b31d1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:35 +0530 Subject: mm/demotion: add hotplug callbacks to handle new numa node onlined If a newly onlined NUMA node doesn't have an abstract distance assigned, the kernel adds it to the default memory tier. [aneesh.kumar@linux.ibm.com: fix kernel error with memory hotplug] Link: https://lkml.kernel.org/r/20220825092019.379069-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 5f24396da76c..7ae6308b2191 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -14,6 +14,7 @@ * the same memory tier.
*/ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) +#define MEMTIER_HOTPLUG_PRIO 100 #ifdef CONFIG_NUMA #include -- cgit v1.2.3 From 7b88bda3761b95856cf97822efe8281c8100067b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:36 +0530 Subject: mm/demotion/dax/kmem: set node's abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE By default, all nodes are assigned to the default memory tier, which is the memory tier designated for nodes with DRAM. Set the dax kmem device node's tier to a slower memory tier by setting its abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE. Low-level drivers like papr_scm or ACPI NFIT can initialize the memory device type to a more accurate value based on device tree details or HMAT. If the kernel doesn't find the memory type initialized, a default slower memory type is assigned by the kmem driver. [aneesh.kumar@linux.ibm.com: assign correct memory type for multiple dax devices with the same node affinity] Link: https://lkml.kernel.org/r/20220826100224.542312-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7ae6308b2191..49d281866dca 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -2,6 +2,9 @@ #ifndef _LINUX_MEMORY_TIERS_H #define _LINUX_MEMORY_TIERS_H +#include +#include +#include /* * Each tier covers an abstract distance chunk of size 128 */ @@ -16,12 +19,49 @@ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) #define MEMTIER_HOTPLUG_PRIO 100 +struct memory_tier; +struct memory_dev_type { + /* list of memory types that are part of same tier as this type */ + struct list_head tier_sibiling; + /* abstract distance for this specific memory type */ + int adistance; + /* Nodes of same abstract distance */ + nodemask_t nodes; + struct kref kref; + struct memory_tier *memtier; +}; + #ifdef CONFIG_NUMA -#include extern bool numa_demotion_enabled; +struct memory_dev_type *alloc_memory_type(int adistance); +void destroy_memory_type(struct memory_dev_type *memtype); +void init_node_memory_type(int node, struct memory_dev_type *default_type); +void clear_node_memory_type(int node, struct memory_dev_type *memtype); #else #define numa_demotion_enabled false +/* + * The CONFIG_NUMA implementation returns a non-NULL value or an error + * pointer; these !CONFIG_NUMA stubs return NULL.
+ */ +static inline struct memory_dev_type *alloc_memory_type(int adistance) +{ + return NULL; +} + +static inline void destroy_memory_type(struct memory_dev_type *memtype) +{ + +} + +static inline void init_node_memory_type(int node, struct memory_dev_type *default_type) +{ + +} + +static inline void clear_node_memory_type(int node, struct memory_dev_type *memtype) +{ + +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ -- cgit v1.2.3 From 6c542ab75714fe90dae292aeb3e91ac53f5ff599 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:37 +0530 Subject: mm/demotion: build demotion targets based on explicit memory tiers This patch switches the demotion target building logic to use memory tiers instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the default memory tier and additional memory tiers will be added by drivers like dax kmem. This patch builds the demotion target for a NUMA node by looking at all memory tiers below the tier to which the NUMA node belongs. The closest node in the immediately following memory tier is used as a demotion target. Since we are now only building demotion targets for N_MEMORY NUMA nodes, the CPU hotplug calls are removed in this patch. Link: https://lkml.kernel.org/r/20220818131042.113280-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 13 +++++++++++++ include/linux/migrate.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 49d281866dca..ce9c6bac6725 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -37,6 +37,14 @@ struct memory_dev_type *alloc_memory_type(int adistance); void destroy_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); +#ifdef CONFIG_MIGRATION +int next_demotion_node(int node); +#else +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} +#endif #else @@ -63,5 +71,10 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt { } + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 96f8c84413fe..704a04f5a074 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -100,19 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ -#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA) -extern void set_migration_target_nodes(void); -extern void migrate_on_reclaim_init(void); -extern int next_demotion_node(int node); -#else -static inline void set_migration_target_nodes(void) {} -static inline void migrate_on_reclaim_init(void) {} -static inline int next_demotion_node(int node) -{ - return NUMA_NO_NODE; -} -#endif - #ifdef CONFIG_COMPACTION bool PageMovable(struct page *page); void __SetPageMovable(struct page *page, const struct movable_operations *ops); -- cgit v1.2.3 From
7766cf7a7e7545ab434a16c6f9531b09efe14dc1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:38 +0530 Subject: mm/demotion: add pg_data_t member to track node memory tier details Also update the different helpers to use NODE_DATA()->memtier. Since the node-specific memtier can change when a NUMA node is reassigned to a different memory tier, accessing NODE_DATA()->memtier needs to happen under an RCU read lock or memory_tier_lock. Link: https://lkml.kernel.org/r/20220818131042.113280-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 87347945270b..e335a492c2eb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1246,6 +1246,9 @@ typedef struct pglist_data { /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; +#ifdef CONFIG_NUMA + struct memory_tier __rcu *memtier; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) -- cgit v1.2.3 From b26ac6f3ba38fac83db2d72551e6d994d0e0516f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:39 +0530 Subject: mm/demotion: drop memtier from memtype Now that we track node-specific memtier in pg_data_t, we can drop memtier from memtype. Link: https://lkml.kernel.org/r/20220818131042.113280-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index ce9c6bac6725..7ca52ad2789f 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -28,7 +28,6 @@ struct memory_dev_type { /* Nodes of same abstract distance */ nodemask_t nodes; struct kref kref; - struct memory_tier *memtier; }; -- cgit v1.2.3 From 32008027289239100d8d2876f50b15d92bde1855 Mon Sep 17 00:00:00 2001 From: Jagdish Gediya Date: Thu, 18 Aug 2022 18:40:40 +0530 Subject: mm/demotion: demote pages according to allocation fallback order Currently, a higher tier node can only be demoted to selected nodes on the next lower tier as defined by the demotion path. This strict demotion order does not work in all use cases (e.g. some use cases may want to allow cross-socket demotion to another node in the same demotion tier as a fallback when the preferred demotion node is out of space). This demotion order is also inconsistent with the page allocation fallback order when all the nodes in a higher tier are out of space: The page allocation can fall back to any node from any lower tier, whereas the demotion order doesn't allow that currently. This patch adds support to get all the allowed demotion targets for a memory tier.
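As an illustration of how these two interfaces fit together, here is a hedged sketch of a caller; pick_demotion_target() is a hypothetical name, and the real demote_page_list() instead passes the allowed mask to the migration allocation callback rather than picking a node up front:

	/* hypothetical caller, for illustration only */
	static int pick_demotion_target(int node)
	{
		nodemask_t allowed;
		int target = next_demotion_node(node);	/* preferred target */

		if (target != NUMA_NO_NODE)
			return target;

		/* fall back to any allowed node in the lower tiers */
		node_get_allowed_targets(NODE_DATA(node), &allowed);
		return node_random(&allowed);	/* NUMA_NO_NODE if empty */
	}
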
The demote_page_list() function is now modified to use this allowed node mask as the fallback allocation mask. Link: https://lkml.kernel.org/r/20220818131042.113280-9-aneesh.kumar@linux.ibm.com Signed-off-by: Jagdish Gediya Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7ca52ad2789f..42791554b9b9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * Each tier covers an abstract distance chunk of size 128 */ @@ -38,11 +39,17 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); +void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); #else static inline int next_demotion_node(int node) { return NUMA_NO_NODE; } + +static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) +{ + *targets = NODE_MASK_NONE; +} #endif #else @@ -75,5 +82,10 @@ static inline int next_demotion_node(int node) { return NUMA_NO_NODE; } + +static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) +{ + *targets = NODE_MASK_NONE; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ -- cgit v1.2.3 From 467b171af881282fc627328e6c164f044a6df888 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:41 +0530 Subject: mm/demotion: update node_is_toptier to work with memory tiers With memory tier support we can have memory-only NUMA nodes in the top tier, for which we want to avoid NUMA-fault promotion tracking. Update node_is_toptier to work with memory tiers. All NUMA nodes are by default top tier nodes.
With lower (slower) memory tiers added, we consider all memory tiers above a memory tier that has CPU NUMA nodes to be top memory tiers. [sj@kernel.org: include missed header file, memory-tiers.h] Link: https://lkml.kernel.org/r/20220820190720.248704-1-sj@kernel.org [akpm@linux-foundation.org: mm/memory.c needs linux/memory-tiers.h] [aneesh.kumar@linux.ibm.com: make toptier_distance inclusive upper bound of toptiers] Link: https://lkml.kernel.org/r/20220830081457.118960-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-10-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 11 +++++++++++ include/linux/node.h | 5 ----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 42791554b9b9..965009aa01d7 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -40,6 +40,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); +bool node_is_toptier(int node); #else static inline int next_demotion_node(int node) { @@ -50,6 +51,11 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target { *targets = NODE_MASK_NONE; } + +static inline bool node_is_toptier(int node) +{ + return true; +} #endif #else @@ -87,5 +93,10 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target { *targets = NODE_MASK_NONE; } + +static inline bool node_is_toptier(int node) +{ + return true; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/node.h b/include/linux/node.h index 40d641a8bfb0..9ec680dd607f 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -185,9 +185,4 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg, #define to_node(device) container_of(device, struct node, dev) -static inline bool node_is_toptier(int node) -{ - return node_state(node, N_CPU); -} - #endif /* _LINUX_NODE_H_ */ -- cgit v1.2.3 From 3e061d924fe9c7b487ca45935f92fcd414ce6f1a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:42 +0530 Subject: lib/nodemask: optimize node_random for nodemask with single NUMA node The most common case for certain node_random usage (demotion nodemask) is with nodemask weight 1. We can avoid calling get_random_int() in that case and always return the only node set in the nodemask.
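For instance, a demotion nodemask typically has exactly one node set, so with this change the call below returns that node without consulting the random number generator (a hedged illustration, not taken from the patch):

	nodemask_t nmask = NODE_MASK_NONE;
	int target;

	node_set(1, &nmask);		/* typical demotion mask: one node */
	target = node_random(&nmask);	/* returns node 1, no get_random_int() */
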
A simple test as below: before = rdtsc_ordered(); for (i = 0; i < 100; i++) { rand = node_random(&nmask); } after = rdtsc_ordered(); Without the fix: after - before = 16438. With the fix: after - before = 816. Link: https://lkml.kernel.org/r/20220818131042.113280-11-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 3a0eec9f2faa..e66742db741c 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -505,12 +505,21 @@ static inline int num_node_state(enum node_states state) static inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) - int w, bit = NUMA_NO_NODE; + int w, bit; w = nodes_weight(*maskp); - if (w) + switch (w) { + case 0: + bit = NUMA_NO_NODE; + break; + case 1: + bit = first_node(*maskp); + break; + default: bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); + get_random_int() % w, MAX_NUMNODES); + break; + } return bit; #else return 0; -- cgit v1.2.3 From 54a611b605901c7d5d05b6b8f5d04a6ceb0962aa Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:39 +0000 Subject: Maple Tree: add new data structure Patch series "Introducing the Maple Tree" The maple tree is an RCU-safe range-based B-tree designed to use modern processor cache efficiently. There are a number of places in the kernel that a non-overlapping range-based tree would be beneficial, especially one with a simple interface. If you use an rbtree with other data structures to improve performance or an interval tree to track non-overlapping ranges, then this is for you. The tree has a branching factor of 10 for non-leaf nodes and 16 for leaf nodes. With the increased branching factor, it is significantly shorter than the rbtree so it has fewer cache misses. The removal of the linked list between subsequent entries also reduces the cache misses and the need to pull in the previous and next VMA during many tree alterations. The first user that is covered in this patch set is the vm_area_struct, where three data structures are replaced by the maple tree: the augmented rbtree, the vma cache, and the linked list of VMAs in the mm_struct. The long term goal is to reduce or remove the mmap_lock contention. The plan is to get to the point where we use the maple tree in RCU mode. Readers will not block for writers. A single write operation will be allowed at a time. A reader re-walks if stale data is encountered. VMAs would be RCU enabled and this mode would be entered once multiple tasks are using the mm_struct. Davidlohr said : Yes I like the maple tree, and at this stage I don't think we can ask for : more from this series wrt the MM - albeit there seems to still be some : folks reporting breakage. Fundamentally I see Liam's work to (re)move : complexity out of the MM (not to say that the actual maple tree is not : complex) by consolidating the three complimentary data structures very : much worth it considering performance does not take a hit.
This was very : much a turn off with the range locking approach, which worst case scenario : incurred in prohibitive overhead. Also as Liam and Matthew have : mentioned, RCU opens up a lot of nice performance opportunities, and in : addition academia[1] has shown outstanding scalability of address spaces : with the foundation of replacing the locked rbtree with RCU aware trees. Similar work has been discovered in the academic press: https://pdos.csail.mit.edu/papers/rcuvm:asplos12.pdf Sheer coincidence. We designed our tree with the intention of solving the hardest problem first. Upon settling on a b-tree variant and a rough outline, we researched range-based b-trees and RCU b-trees and did find that article. So it was nice to find reassurances that we were on the right path, but our design choice of using ranges made that paper unusable for us. This patch (of 70): The maple tree is an RCU-safe range-based B-tree designed to use modern processor cache efficiently. There are a number of places in the kernel that a non-overlapping range-based tree would be beneficial, especially one with a simple interface. If you use an rbtree with other data structures to improve performance or an interval tree to track non-overlapping ranges, then this is for you. The tree has a branching factor of 10 for non-leaf nodes and 16 for leaf nodes. With the increased branching factor, it is significantly shorter than the rbtree so it has fewer cache misses. The removal of the linked list between subsequent entries also reduces the cache misses and the need to pull in the previous and next VMA during many tree alterations. The first user that is covered in this patch set is the vm_area_struct, where three data structures are replaced by the maple tree: the augmented rbtree, the vma cache, and the linked list of VMAs in the mm_struct. The long term goal is to reduce or remove the mmap_lock contention. The plan is to get to the point where we use the maple tree in RCU mode. Readers will not block for writers. A single write operation will be allowed at a time. A reader re-walks if stale data is encountered. VMAs would be RCU enabled and this mode would be entered once multiple tasks are using the mm_struct. There are additional BUG_ON() calls added within the tree, most of which are in debug code. These will be replaced with a WARN_ON() call in the future. There are also additional BUG_ON() calls within the code which will also be reduced in number at a later date. These exist to catch things such as out-of-range accesses which would crash anyway. Link: https://lkml.kernel.org/r/20220906194824.2110408-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220906194824.2110408-2-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: David Howells Tested-by: Sven Schnelle Tested-by: Yu Zhao Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Catalin Marinas Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 685 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 include/linux/maple_tree.h (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h new file mode 100644 index 000000000000..2effab72add1 --- /dev/null +++ b/include/linux/maple_tree.h @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_MAPLE_TREE_H +#define _LINUX_MAPLE_TREE_H +/* + * Maple Tree - An RCU-safe adaptive tree for storing ranges + * Copyright (c) 2018-2022 Oracle + * Authors: Liam R. Howlett + * Matthew Wilcox + */ + +#include +#include +#include +/* #define CONFIG_MAPLE_RCU_DISABLED */ +/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ + +/* + * Allocated nodes are mutable until they have been inserted into the tree, + * at which time they cannot change their type until they have been removed + * from the tree and an RCU grace period has passed. + * + * Removed nodes have their ->parent set to point to themselves. RCU readers + * check ->parent before relying on the value that they loaded from the + * slots array. This lets us reuse the slots array for the RCU head. + * + * Nodes in the tree point to their parent unless bit 0 is set. + */ +#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) +/* 64bit sizes */ +#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) +#else +/* 32bit sizes */ +#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) +#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ + +#define MAPLE_NODE_MASK 255UL + +/* + * The node->parent of the root node has bit 0 set and the rest of the pointer + * is a pointer to the tree itself. No more bits are available in this pointer + * (on m68k, the data structure may only be 2-byte aligned). + * + * Internal non-root nodes can only have maple_range_* nodes as parents. The + * parent pointer is 256B aligned like all other tree nodes. When storing 32 + * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an + * extra bit to store the offset. This extra bit comes from a reuse of the last + * bit in the node type. This is possible by using bit 1 to indicate if bit 2 + * is part of the type or the slot. + * + * Once the type is decided, the decision of an allocation range type or a range + * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE + * flag. 
+ * + * Node types: + * 0x??1 = Root + * 0x?00 = 16 bit nodes + * 0x010 = 32 bit nodes + * 0x110 = 64 bit nodes + * + * Slot size and location in the parent pointer: + * type : slot location + * 0x??1 : Root + * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0x010 : 32 bit values, type in 0-2, slot in 3-6 + * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + */ + +/* + * This metadata is used to optimize the gap updating code and in reverse + * searching for gaps or any other code that needs to find the end of the data. + */ +struct maple_metadata { + unsigned char end; + unsigned char gap; +}; + +/* + * Leaf nodes do not store pointers to nodes, they store user data. Users may + * store almost any bit pattern. As noted above, the optimisation of storing an + * entry at 0 in the root pointer cannot be done for data which have the bottom + * two bits set to '10'. We also reserve values with the bottom two bits set to + * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs + * return errnos as a negative errno shifted left by two bits and the bottom + * two bits set to '10', and while choosing to store these values in the array + * is not an error, it may lead to confusion if you're testing for an error with + * mas_is_err(). + * + * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits + * 3-6); bit 2 is reserved. That leaves bits 0-1 unused for now. + * + * In regular B-Tree terms, pivots are called keys. The term pivot is used to + * indicate that the tree is specifying ranges. Pivots may appear in the + * subtree with an entry attached to the value, whereas keys are unique to a + * specific position of a B-tree. Pivot values are inclusive of the slot with + * the same index. + */ + +struct maple_range_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; + union { + void __rcu *slot[MAPLE_RANGE64_SLOTS]; + struct { + void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; + struct maple_metadata meta; + }; + }; +}; + +/* + * At tree creation time, the user can specify that they're willing to trade off + * storing fewer entries in a tree in return for storing more information in + * each node. + * + * The maple tree supports recording the largest range of NULL entries available + * in this node, also called gaps. This optimises the tree for allocating a + * range. 
+ */ +struct maple_arange_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; + void __rcu *slot[MAPLE_ARANGE64_SLOTS]; + unsigned long gap[MAPLE_ARANGE64_SLOTS]; + struct maple_metadata meta; +}; + +struct maple_alloc { + unsigned long total; + unsigned char node_count; + unsigned int request_count; + struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; +}; + +struct maple_topiary { + struct maple_pnode *parent; + struct maple_enode *next; /* Overlaps the pivot */ +}; + +enum maple_type { + maple_dense, + maple_leaf_64, + maple_range_64, + maple_arange_64, +}; + + +/** + * DOC: Maple tree flags + * + * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree + * * MT_FLAGS_USE_RCU - Operate in RCU mode + * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags + * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value + * * MT_FLAGS_LOCK_MASK - How the mt_lock is used + * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe + * * MT_FLAGS_LOCK_BH - Acquired bh-safe + * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used + * + * MAPLE_HEIGHT_MAX The largest height that can be stored + */ +#define MT_FLAGS_ALLOC_RANGE 0x01 +#define MT_FLAGS_USE_RCU 0x02 +#define MT_FLAGS_HEIGHT_OFFSET 0x02 +#define MT_FLAGS_HEIGHT_MASK 0x7C +#define MT_FLAGS_LOCK_MASK 0x300 +#define MT_FLAGS_LOCK_IRQ 0x100 +#define MT_FLAGS_LOCK_BH 0x200 +#define MT_FLAGS_LOCK_EXTERN 0x300 + +#define MAPLE_HEIGHT_MAX 31 + + +#define MAPLE_NODE_TYPE_MASK 0x0F +#define MAPLE_NODE_TYPE_SHIFT 0x03 + +#define MAPLE_RESERVED_RANGE 4096 + +#ifdef CONFIG_LOCKDEP +typedef struct lockdep_map *lockdep_map_p; +#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock) +#define mt_set_external_lock(mt, lock) \ + (mt)->ma_external_lock = &(lock)->dep_map +#else +typedef struct { /* nothing */ } lockdep_map_p; +#define mt_lock_is_held(mt) 1 +#define mt_set_external_lock(mt, lock) do { } while (0) +#endif + +/* + * If the tree contains a single entry at index 0, it is usually stored in + * tree->ma_root. To optimise for the page cache, an entry which ends in '00', + * '01' or '11' is stored in the root, but an entry which ends in '10' will be + * stored in a node. Bits 3-6 are used to store enum maple_type. + * + * The flags are used both to store some immutable information about this tree + * (set at tree creation time) and dynamic information set under the spinlock. + * + * Another use of flags is to indicate global states of the tree. This is the + * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in + * RCU mode. This mode was added to allow the tree to reuse nodes instead of + * re-allocating and RCU freeing nodes when there is a single user. + */ +struct maple_tree { + union { + spinlock_t ma_lock; + lockdep_map_p ma_external_lock; + }; + void __rcu *ma_root; + unsigned int ma_flags; +}; + +/** + * MTREE_INIT() - Initialize a maple tree + * @name: The maple tree name + * @__flags: The maple tree flags + * + */ +#define MTREE_INIT(name, __flags) { \ + .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ + .ma_flags = __flags, \ + .ma_root = NULL, \ +} + +/** + * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. 
+ * @name: The tree name + * @__flags: The maple tree flags + * @__lock: The external lock + */ +#ifdef CONFIG_LOCKDEP +#define MTREE_INIT_EXT(name, __flags, __lock) { \ + .ma_external_lock = &(__lock).dep_map, \ + .ma_flags = (__flags), \ + .ma_root = NULL, \ +} +#else +#define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) +#endif + +#define DEFINE_MTREE(name) \ + struct maple_tree name = MTREE_INIT(name, 0) + +#define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) + +/* + * The Maple Tree squeezes various bits in at various points which aren't + * necessarily obvious. Usually, this is done by observing that pointers are + * N-byte aligned and thus the bottom log_2(N) bits are available for use. We + * don't use the high bits of pointers to store additional information because + * we don't know what bits are unused on any given architecture. + * + * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 + * low bits for our own purposes. Nodes are currently of 4 types: + * 1. Single pointer (Range is 0-0) + * 2. Non-leaf Allocation Range nodes + * 3. Non-leaf Range nodes + * 4. Leaf Range nodes. + * All nodes consist of a number of node slots, + * pivots, and a parent pointer. + */ + +struct maple_node { + union { + struct { + struct maple_pnode *parent; + void __rcu *slot[MAPLE_NODE_SLOTS]; + }; + struct { + void *pad; + struct rcu_head rcu; + struct maple_enode *piv_parent; + unsigned char parent_slot; + enum maple_type type; + unsigned char slot_len; + unsigned int ma_flags; + }; + struct maple_range_64 mr64; + struct maple_arange_64 ma64; + struct maple_alloc alloc; + }; +}; + +/* + * More complicated stores can cause two nodes to become one or three and + * potentially alter the height of the tree. Either half of the tree may need + * to be rebalanced against the other. The ma_topiary struct is used to track + * which nodes have been 'cut' from the tree so that the change can be done + * safely at a later date. This is done to support RCU. + */ +struct ma_topiary { + struct maple_enode *head; + struct maple_enode *tail; + struct maple_tree *mtree; +}; + +void *mtree_load(struct maple_tree *mt, unsigned long index); + +int mtree_insert(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +int mtree_insert_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); +int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); + +int mtree_store_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_store(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +void *mtree_erase(struct maple_tree *mt, unsigned long index); + +void mtree_destroy(struct maple_tree *mt); +void __mt_destroy(struct maple_tree *mt); + +/** + * mtree_empty() - Determine if a tree has any present entries. + * @mt: Maple Tree. + * + * Context: Any context. + * Return: %true if the tree contains only NULL pointers. 
+ */ +static inline bool mtree_empty(const struct maple_tree *mt) +{ + return mt->ma_root == NULL; +} + +/* Advanced API */ + +/* + * The maple state is defined in the struct ma_state and is used to keep track + * of information during operations, and even between operations when using the + * advanced API. + * + * If state->node has bit 0 set then it references a tree location which is not + * a node (eg the root). If bit 1 is set, the rest of the bits are a negative + * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the + * node type. + * + * state->alloc either has a requested number of nodes or an allocated node. If + * state->alloc has a requested number of nodes, the first bit will be set (0x1) + * and the remaining bits are the value. If state->alloc is a node, then the + * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for + * storing more allocated nodes, a total number of nodes allocated, and the + * node_count in this node. node_count is the number of allocated nodes in this + * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further + * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc + * by removing a node from the state->alloc node until state->alloc->node_count + * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted + * to state->alloc. Nodes are pushed onto state->alloc by putting the current + * state->alloc into the pushed node's slot[0]. + * + * The state also contains the implied min/max of the state->node, the depth of + * this search, and the offset. The implied min/max are either from the parent + * node or are 0-oo for the root node. The depth is incremented or decremented + * every time a node is walked down or up. The offset is the slot/pivot of + * interest in the node - either for reading or writing. + * + * When returning a value the maple state index and last respectively contain + * the start and end of the range for the entry. Ranges are inclusive in the + * Maple Tree. + */ +struct ma_state { + struct maple_tree *tree; /* The tree we're operating in */ + unsigned long index; /* The index we're operating on - range start */ + unsigned long last; /* The last index we're operating on - range end */ + struct maple_enode *node; /* The node containing this entry */ + unsigned long min; /* The minimum index of this node - implied pivot min */ + unsigned long max; /* The maximum index of this node - implied pivot max */ + struct maple_alloc *alloc; /* Allocated nodes for this operation */ + unsigned char depth; /* depth of tree descent during write */ + unsigned char offset; + unsigned char mas_flags; +}; + +struct ma_wr_state { + struct ma_state *mas; + struct maple_node *node; /* Decoded mas->node */ + unsigned long r_min; /* range min */ + unsigned long r_max; /* range max */ + enum maple_type type; /* mas->node type */ + unsigned char offset_end; /* The offset where the write ends */ + unsigned char node_end; /* mas->node end */ + unsigned long *pivots; /* mas->node->pivots pointer */ + unsigned long end_piv; /* The pivot at the offset end */ + void __rcu **slots; /* mas->node->slots pointer */ + void *entry; /* The entry to write */ + void *content; /* The existing entry that is being overwritten */ +}; + +#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) + + +/* + * Special values for ma_state.node. + * MAS_START means we have not searched the tree. 
+ * MAS_ROOT means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * MAS_NONE means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * MA_ERROR represents an errno. After dropping the lock and attempting + * to resolve the error, the walk would have to be restarted from the + * top of the tree as the tree may have been modified. + */ +#define MAS_START ((struct maple_enode *)1UL) +#define MAS_ROOT ((struct maple_enode *)5UL) +#define MAS_NONE ((struct maple_enode *)9UL) +#define MAS_PAUSE ((struct maple_enode *)17UL) +#define MA_ERROR(err) \ + ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) + +#define MA_STATE(name, mt, first, end) \ + struct ma_state name = { \ + .tree = mt, \ + .index = first, \ + .last = end, \ + .node = MAS_START, \ + .min = 0, \ + .max = ULONG_MAX, \ + .alloc = NULL, \ + } + +#define MA_WR_STATE(name, ma_state, wr_entry) \ + struct ma_wr_state name = { \ + .mas = ma_state, \ + .content = NULL, \ + .entry = wr_entry, \ + } + +#define MA_TOPIARY(name, tree) \ + struct ma_topiary name = { \ + .head = NULL, \ + .tail = NULL, \ + .mtree = tree, \ + } + +void *mas_walk(struct ma_state *mas); +void *mas_store(struct ma_state *mas, void *entry); +void *mas_erase(struct ma_state *mas); +int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); +void mas_store_prealloc(struct ma_state *mas, void *entry); +void *mas_find(struct ma_state *mas, unsigned long max); +void *mas_find_rev(struct ma_state *mas, unsigned long min); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +bool mas_is_err(struct ma_state *mas); + +bool mas_nomem(struct ma_state *mas, gfp_t gfp); +void mas_pause(struct ma_state *mas); +void maple_tree_init(void); +void mas_destroy(struct ma_state *mas); +int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); + +void *mas_prev(struct ma_state *mas, unsigned long min); +void *mas_next(struct ma_state *mas, unsigned long max); + +int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, + unsigned long size); + +/* Checks if a mas has not found anything */ +static inline bool mas_is_none(struct ma_state *mas) +{ + return mas->node == MAS_NONE; +} + +/* Checks if a mas has been paused */ +static inline bool mas_is_paused(struct ma_state *mas) +{ + return mas->node == MAS_PAUSE; +} + +void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); +void mas_dup_store(struct ma_state *mas, void *entry); + +/* + * This finds an empty area from the highest address to the lowest. + * AKA the "Topdown" version. + */ +int mas_empty_area_rev(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size); +/** + * mas_reset() - Reset a Maple Tree operation state. + * @mas: Maple Tree operation state. + * + * Resets the error or walk state of the @mas so future walks of the + * tree will start from the root. Use this if you have dropped the + * lock and want to reuse the ma_state. + * + * Context: Any context. + */ +static inline void mas_reset(struct ma_state *mas) +{ + mas->node = MAS_START; +} + +/** + * mas_for_each() - Iterate over a range of the maple tree. 
+ * @__mas: Maple Tree operation state (maple_state) + * @__entry: Entry retrieved from the tree + * @__max: maximum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + * + */ +#define mas_for_each(__mas, __entry, __max) \ + while (((__entry) = mas_find((__mas), (__max))) != NULL) + + +/** + * mas_set_range() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * Move the operation state to refer to a different range. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline +void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) +{ + mas->index = start; + mas->last = last; + mas->node = MAS_START; +} + +/** + * mas_set() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @index: New index into the Maple Tree. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline void mas_set(struct ma_state *mas, unsigned long index) +{ + + mas_set_range(mas, index, index); +} + +static inline bool mt_external_lock(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; +} + +/** + * mt_init_flags() - Initialise an empty maple tree with flags. + * @mt: Maple Tree + * @flags: maple tree flags. + * + * If you need to initialise a Maple Tree with special flags (eg, an + * allocation tree), use this function. + * + * Context: Any context. + */ +static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt->ma_flags = flags; + if (!mt_external_lock(mt)) + spin_lock_init(&mt->ma_lock); + rcu_assign_pointer(mt->ma_root, NULL); +} + +/** + * mt_init() - Initialise an empty maple tree. + * @mt: Maple Tree + * + * An empty Maple Tree. + * + * Context: Any context. + */ +static inline void mt_init(struct maple_tree *mt) +{ + mt_init_flags(mt, 0); +} + +static inline bool mt_in_rcu(struct maple_tree *mt) +{ +#ifdef CONFIG_MAPLE_RCU_DISABLED + return false; +#endif + return mt->ma_flags & MT_FLAGS_USE_RCU; +} + +/** + * mt_clear_in_rcu() - Switch the tree to non-RCU mode. + * @mt: The Maple Tree + */ +static inline void mt_clear_in_rcu(struct maple_tree *mt) +{ + if (!mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +/** + * mt_set_in_rcu() - Switch the tree to RCU safe mode. 
+ * @mt: The Maple Tree + */ +static inline void mt_set_in_rcu(struct maple_tree *mt) +{ + if (mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags |= MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags |= MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); +void *mt_find_after(struct maple_tree *mt, unsigned long *index, + unsigned long max); +void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); +void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); + +/** + * mt_for_each - Iterate over each entry starting at index until max. + * @__tree: The Maple Tree + * @__entry: The current entry + * @__index: The index to update to track the location in the tree + * @__max: The maximum limit for @index + * + * Note: Will not return the zero entry. + */ +#define mt_for_each(__tree, __entry, __index, __max) \ + for (__entry = mt_find(__tree, &(__index), __max); \ + __entry; __entry = mt_find_after(__tree, &(__index), __max)) + + +#ifdef CONFIG_DEBUG_MAPLE_TREE +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt); +void mt_validate(struct maple_tree *mt); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) +#else +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + +#endif /*_LINUX_MAPLE_TREE_H */ -- cgit v1.2.3 From d4af56c5c7c6781ca6ca8075e2cf5bc119ed33d1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:45 +0000 Subject: mm: start tracking VMAs with maple tree Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree. The maple tree is added to the mm_struct, including the mm_init struct; support is added in the required mm/mmap functions; tracking is added in kernel/fork for process forking; and the tree is used to find the unmapped_area and is checked against what the rbtree finds. This also moves the mmap_lock() in exit_mmap(), since the oom reaper does walk the VMAs; otherwise lockdep will be unhappy if oom happens. When splitting a vma fails due to a maple tree node allocation failure, the error path in __split_vma() calls new->vm_ops->close(new). The page accounting for hugetlb is actually in the close() operation, so it accounts for the removal of 1/2 of the VMA which was not adjusted. This results in a negative exit value. To avoid the negative charge, set vm_start = vm_end and vm_pgoff = 0. There is also a potential accounting issue in special mappings from insert_vm_struct() failing to allocate, so reverse the charge there in the failure scenario. Link: https://lkml.kernel.org/r/20220906194824.2110408-9-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/mm_types.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cc9ffc19e7f..896d04248e66 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2567,6 +2567,8 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); +/* mmap.c */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, @@ -2630,6 +2632,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); + static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1797813cc2c..425bc5f7d477 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,7 @@ struct kioctx_table; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ + struct maple_tree mm_mt; struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU @@ -697,6 +699,7 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */ -- cgit v1.2.3 From f39af05949a4280b9f04d5dd0f606b81aac3dae8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:46 +0000 Subject: mm: add VMA iterator This thin layer of abstraction over the maple tree state is for iterating over VMAs. You can go forwards, go backwards or ask where the iterator is. Rename the existing vma_next() to __vma_next() -- it will be removed by the end of this series. Link: https://lkml.kernel.org/r/20220906194824.2110408-10-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 21 +++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 896d04248e66..3701da1fac5f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -661,6 +661,38 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses vma_find() to get the first VMA when the iterator starts. 
+ * Calling mas_next() could skip the first entry. + */ + return vma_find(vmi, ULONG_MAX); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 425bc5f7d477..d0b51fbdf5d4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,6 +777,27 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) #endif /* CONFIG_LRU_GEN */ +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = MAS_START, \ + }, \ + } + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + vmi->mas.tree = &mm->mm_mt; + vmi->mas.index = addr; + vmi->mas.node = MAS_START; +} + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); -- cgit v1.2.3 From c9dbe82cb99db5b6029c6bc43fcf7881d3f50268 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:47 +0000 Subject: kernel/fork: use maple tree for dup_mmap() during forking The maple tree was already tracking VMAs in this function by an earlier commit, but the rbtree iterator was being used to iterate the list. Change the iterator to use a maple tree native iterator and switch to the maple tree advanced API to avoid multiple walks of the tree during insert operations. Unexport the now-unused vma_mas_store() function. For performance reasons we bulk allocate the maple tree nodes. The node calculations are done internally to the tree and use the VMA count and assume the worst-case node requirements. The VM_DONTCOPY flag does not allow for the most efficient copy method of the tree and so a bulk loading algorithm is used. Link: https://lkml.kernel.org/r/20220906194824.2110408-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3701da1fac5f..646ea4d3bd74 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2599,8 +2599,6 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* mmap.c */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, -- cgit v1.2.3 From 524e00b36e8c547f5582eef3fb645a8d9fc5e3df Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:48 +0000 Subject: mm: remove rb tree.
Remove the RB tree and start using the maple tree for vm_area_struct tracking. Drop validate_mm() calls in expand_upwards() and expand_downwards() as the lock is not held. Link: https://lkml.kernel.org/r/20220906194824.2110408-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- include/linux/mm_types.h | 14 -------------- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 646ea4d3bd74..dfce1aaa7a64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2654,8 +2654,6 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *, extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d0b51fbdf5d4..ac747273c4d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,19 +410,6 @@ struct vm_area_struct { /* linked list of VM areas per task, sorted by address */ struct vm_area_struct *vm_next, *vm_prev; - - struct rb_node vm_rb; - - /* - * Largest free memory gap in bytes to the left of this VMA. - * Either between this VMA and vma->vm_prev, or between one of the - * VMAs below us in the VMA rbtree and its ->vm_prev. This helps - * get_unmapped_area find a free area of the right size. - */ - unsigned long rb_subtree_gap; - - /* Second cache line starts here. */ - struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -488,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, -- cgit v1.2.3 From dc8635b25e87232f62276c02899b9d21dd0793c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: mm: optimize find_exact_vma() to use vma_lookup() Use vma_lookup() to walk the tree to the start value requested. If the vma at the start does not match, then the answer is NULL and there is no need to look at the next vma the way that find_vma() would. Link: https://lkml.kernel.org/r/20220906194824.2110408-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dfce1aaa7a64..a80083091f53 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2850,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; -- cgit v1.2.3 From abdba2dda0c477ca708a939b02f9b2e74666ed2d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: mm: use maple tree operations for find_vma_intersection() Move find_vma_intersection() to mmap.c and change implementation to maple tree. When searching for a vma within a range, it is easier to use the maple tree interface. Export find_vma_intersection() for the kvm module. Link: https://lkml.kernel.org/r/20220906194824.2110408-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a80083091f53..06a6b8db75b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2778,26 +2778,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/** - * find_vma_intersection() - Look up the first VMA which intersects the interval - * @mm: The process address space. - * @start_addr: The inclusive start user address. - * @end_addr: The exclusive end user address. - * - * Returns: The first VMA within the provided range, %NULL otherwise. Assumes - * start_addr < end_addr. +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - struct vm_area_struct *vma = find_vma(mm, start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address -- cgit v1.2.3 From 7964cf8caa4dfa42c4149f3833d3878713cda3dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: mm: remove vmacache By using the maple tree and the maple tree state, the vmacache is no longer beneficial and is complicating the VMA code. Remove the vmacache to reduce the work in keeping it up to date and code complexity. Link: https://lkml.kernel.org/r/20220906194824.2110408-26-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 1 - include/linux/mm_types_task.h | 12 ------------ include/linux/sched.h | 1 - include/linux/vm_event_item.h | 4 ---- include/linux/vmacache.h | 28 ---------------------------- include/linux/vmstat.h | 6 ------ 6 files changed, 52 deletions(-) delete mode 100644 include/linux/vmacache.h (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ac747273c4d6..4541b74b1bdb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -475,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125c..0bb4b6da9993 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -24,18 +24,6 @@ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -/* - * The per task VMA cache array: - */ -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - -struct vmacache { - u64 seqnum; - struct vm_area_struct *vmas[VMACACHE_SIZE]; -}; - /* * When updating this, please also update struct resident_page_types[] in * kernel/fork.c diff --git a/include/linux/sched.h b/include/linux/sched.h index a2dcfb91df03..fbac3c19fe35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,7 +861,6 @@ struct task_struct { struct mm_struct *active_mm; /* Per-thread vma caching: */ - struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f3fc36cd2276..3518dba1e02f 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,10 +129,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - VMACACHE_FIND_CALLS, - VMACACHE_FIND_HITS, -#endif #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h deleted file mode 100644 index 6fce268a4588..000000000000 --- a/include/linux/vmacache.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_VMACACHE_H -#define __LINUX_VMACACHE_H - -#include -#include - -static inline void vmacache_flush(struct task_struct *tsk) -{ - memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); -} - -extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); -extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, - unsigned long addr); - -#ifndef CONFIG_MMU -extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end); -#endif - -static inline void vmacache_invalidate(struct mm_struct *mm) -{ - mm->vmacache_seqnum++; -} - -#endif /* __LINUX_VMACACHE_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index bfe38869498d..19cf5b6892ce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ 
-125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) -- cgit v1.2.3 From d7c62295570f012e1d386ae6ed472b36baf037ad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: mm: convert vma_lookup() to use mtree_load() Unlike the rbtree, the Maple Tree will return a NULL if there's nothing at a particular address. Since the previous commit dropped the vmacache, it is now possible to consult the tree directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06a6b8db75b7..49a58807719b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2795,12 +2795,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && addr < vma->vm_start) - vma = NULL; - - return vma; + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) -- cgit v1.2.3 From 11f9a21ab65542189372b7d64bb2d2937dfdc9dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: mm/mmap: reorganize munmap to use maple states Remove __do_munmap() in favour of do_munmap(), do_mas_munmap(), and do_mas_align_munmap(). do_munmap() is a wrapper to create a maple state for any callers that have not been converted to the maple tree. do_mas_munmap() takes a maple state to munmap a range. This is just a small function which checks for error conditions and aligns the end of the range. do_mas_align_munmap() uses the aligned range to munmap a range. do_mas_align_munmap() starts with the first VMA in the range, then finds the last VMA in the range. Both start and end are split if necessary. Then the VMAs are removed from the linked list and the mm mlock count is updated at the same time. This is followed by a single tree operation that overwrites the area with a NULL entry. Finally, the detached list is unmapped and freed. By reorganizing the munmap calls as outlined, it is now possible to avoid the extra work of aligning pre-aligned callers which are known to be safe, and to avoid extra VMA lookups or tree walks for modifications. detach_vmas_to_be_unmapped() is no longer used, so drop this code. vm_brk_flags() can just call do_mas_munmap() as it checks for intersecting VMAs directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-29-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 49a58807719b..579449d6c23b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2710,8 +2710,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); -- cgit v1.2.3 From 69dbe6daf1041e32e003f966d71f70f20c63af53 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:57 +0000 Subject: userfaultfd: use maple tree iterator to iterate VMAs Don't use the mm_struct linked list or the vma->vm_next in prep for removal. Link: https://lkml.kernel.org/r/20220906194824.2110408-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1b8a915e9e9..f07e6998bb68 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -175,9 +175,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf); +extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); @@ -258,7 +257,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, +static inline int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf) { -- cgit v1.2.3 From 763ecb035029f500d7e6dc99acd1ad299b7726a1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: mm: remove the vma linked list Replace any vm_next use with vma_find(). Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the maple tree. Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap(). At the same time, alter the loop to be more compact. Now that free_pgtables() and unmap_vmas() take a maple tree as an argument, rearrange do_mas_align_munmap() to use the new tree to hold the vmas to remove. 
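As a rough sketch of the conversion pattern this series applies throughout the tree (illustrative only; handle_vma() is a hypothetical callback, and real call sites differ), a linked-list walk becomes a maple tree walk using the VMA_ITERATOR() and for_each_vma() helpers introduced earlier in this series:

	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* before: for (vma = mm->mmap; vma; vma = vma->vm_next) */
	for_each_vma(vmi, vma)
		handle_vma(vma);	/* hypothetical per-VMA work */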
Remove __vma_link_list() and __vma_unlink_list() as they are exclusively used to update the linked list. Drop linked list update from __insert_vm_struct(). Rework validation of the tree as it was depending on the linked list. [yang.lee@linux.alibaba.com: fix one kernel-doc comment] Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=1949 Link: https://lkml.kernel.org/r/20220824021918.94116-1-yang.lee@linux.alibaba.com Link: https://lkml.kernel.org/r/20220906194824.2110408-69-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Yang Li Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++-- include/linux/mm_types.h | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 579449d6c23b..37384a84f71a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1857,8 +1857,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4541b74b1bdb..5e32211cb5a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -408,8 +408,6 @@ struct vm_area_struct { unsigned long vm_end; /* The first byte after our end address within vm_mm. */ - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -473,7 +471,6 @@ struct kioctx_table; struct mm_struct { struct { - struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, @@ -488,7 +485,6 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER -- cgit v1.2.3 From bf3980c85212fc71512d27a46f5aab66f46ca284 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:30:59 -0700 Subject: mm: drop oom code from exit_mmap The primary reason to invoke the oom reaper from the exit_mmap path used to be to prevent excessive oom killing if the oom victim's exit races with the oom reaper (see [1] for more details). The invocation has moved around since then because of the interaction with the munlock logic, but the underlying reason has remained the same (see [2]). The munlock code is no longer a problem since [3], and there shouldn't be any blocking operation before the memory is unmapped by exit_mmap, so the oom reaper invocation can be dropped. The unmapping part can be done with the non-exclusive mmap_sem, and the exclusive one is only required when page tables are freed. Remove the oom_reaper from exit_mmap, which will make the code easier to read.
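The resulting locking scheme in exit_mmap() then looks roughly like this (a simplified sketch using the pre-series unmap_vmas()/free_pgtables() signatures; tlb setup and the surrounding teardown are elided):

	mmap_read_lock(mm);
	/* no blocking operations happen before the memory is unmapped */
	unmap_vmas(&tlb, vma, 0, -1);
	mmap_read_unlock(mm);

	/* freeing the page tables requires the exclusive lock */
	mmap_write_lock(mm);
	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
	mmap_write_unlock(mm);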
This is really unlikely to make any observable difference although some microbenchmarks could benefit from one less branch that needs to be evaluated even though it almost never is true. [1] 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") [2] 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") [3] a213e5cf71cb ("mm/munlock: delete munlock_vma_pages_all(), allow oomreap") [akpm@linux-foundation.org: restore Suren's mmap_read_lock() optimization] Link: https://lkml.kernel.org/r/20220531223100.510392-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Jann Horn Cc: Johannes Weiner Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/oom.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 02d1e7bbd8cd..6cdde62b078b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -106,8 +106,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) return 0; } -bool __oom_reap_task_mm(struct mm_struct *mm); - long oom_badness(struct task_struct *p, unsigned long totalpages); -- cgit v1.2.3 From b3541d912a84dc40cabb516f2deeac9ae6fa30da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:31:00 -0700 Subject: mm: delete unused MMF_OOM_VICTIM flag With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is now unused and can be removed. [akpm@linux-foundation.org: remove comment about now-removed mm_is_oom_victim()] Link: https://lkml.kernel.org/r/20220531223100.510392-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Minchan Kim Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Peter Xu Cc: John Hubbard Cc: Shuah Khan Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/oom.h | 9 --------- include/linux/sched/coredump.h | 7 +++---- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 6cdde62b078b..7d0c9c48a0c5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } -/* - * Use this helper if tsk->mm != mm and the victim mm needs a special - * handling. This is guaranteed to stay true after once set. - */ -static inline bool mm_is_oom_victim(struct mm_struct *mm) -{ - return test_bit(MMF_OOM_VICTIM, &mm->flags); -} - /* * Checks whether a page fault on the given mm is still reliable. 
* This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d0a5be28b70..8270ad7ae14c 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into @@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm) * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ -#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ -- cgit v1.2.3 From 474098edac262ae26bfab1c48445877075a31cbd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:57 +0200 Subject: mm/gup: replace FOLL_NUMA by gup_can_follow_protnone() Patch series "mm: minor cleanups around NUMA hinting". Working on some GUP cleanups (e.g., getting rid of some FOLL_ flags) and preparing for other GUP changes (getting rid of FOLL_FORCE|FOLL_WRITE for taking an R/O longterm pin), this is something I can easily send out independently. Get rid of FOLL_NUMA, allow FOLL_FORCE access to PROT_NONE mapped pages in GUP-fast, and fix up some documentation around NUMA hinting. This patch (of 3): No need for a special flag that is not even properly documented to be internal-only. Let's just factor this check out and get rid of this flag. The separate function has the nice benefit that we can centralize comments.
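For illustration, the caller side in mm/gup.c then boils down to a pattern like this (a simplified sketch of the follow-page path, not an exact hunk from this patch):

	if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
		goto no_page;	/* let a NUMA hinting fault resolve it */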
Link: https://lkml.kernel.org/r/20220825164659.89824-2-david@redhat.com Link: https://lkml.kernel.org/r/20220825164659.89824-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 37384a84f71a..eb25cae06c55 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2933,7 +2933,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ @@ -3054,6 +3053,21 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) return !PageAnonExclusive(page); } +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. + */ +static inline bool gup_can_follow_protnone(unsigned int flags) +{ + /* + * FOLL_FORCE has to be able to make progress even if the VMA is + * inaccessible. Further, FOLL_FORCE access usually does not represent + * application behaviour and we should avoid triggering NUMA hinting + * faults. + */ + return flags & FOLL_FORCE; +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -- cgit v1.2.3 From 7014887a01587d8c50871d5985cd572ca08b29c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:59 +0200 Subject: mm: fixup documentation regarding pte_numa() and PROT_NUMA pte_numa() no longer exists -- replaced by pte_protnone() -- and PROT_NUMA probably never existed: MM_CP_PROT_NUMA also ends up using PROT_NONE. Let's fixup the doc. Link: https://lkml.kernel.org/r/20220825164659.89824-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e32211cb5a9..26573ba485f3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -614,22 +614,22 @@ struct mm_struct { #endif #ifdef CONFIG_NUMA_BALANCING /* - * numa_next_scan is the next time that the PTEs will be marked - * pte_numa. NUMA hinting faults will gather statistics and - * migrate pages to new nodes if necessary. + * numa_next_scan is the next time that PTEs will be remapped + * PROT_NONE to trigger NUMA hinting faults; such faults gather + * statistics and migrate pages to new nodes if necessary. */ unsigned long numa_next_scan; - /* Restart point for scanning and setting pte_numa */ + /* Restart point for scanning and remapping PTEs. 
*/ unsigned long numa_scan_offset; - /* numa_scan_seq prevents two threads setting pte_numa */ + /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when - * moving a PROT_NONE or PROT_NUMA mapped page. + * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -- cgit v1.2.3 From 974f4367dd315acc15ad4a6453f8304aea60dfbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 23 Aug 2022 11:22:30 +0200 Subject: mm: reduce noise in show_mem for lowmem allocations While discussing early DMA pool pre-allocation failure with Christoph [1] I have realized that the allocation failure warning is rather noisy for constrained allocations like GFP_DMA{32}. Those zones are often not populated on all nodes, as their memory ranges are constrained. This is an attempt to reduce the ballast that doesn't provide any relevant information for investigating those allocation failures. Please note that I have only compile tested it (in my default config setup) and I am throwing it out mostly to see what people think about it. [1] http://lkml.kernel.org/r/20220817060647.1032426-1-hch@lst.de [mhocko@suse.com: update] Link: https://lkml.kernel.org/r/Yw29bmJTIkKogTiW@dhcp22.suse.cz [mhocko@suse.com: fix build] [akpm@linux-foundation.org: fix it for mapletree] [akpm@linux-foundation.org: update it for Michal's update] [mhocko@suse.com: fix arch/powerpc/xmon/xmon.c] Link: https://lkml.kernel.org/r/Ywh3C4dKB9B93jIy@dhcp22.suse.cz [akpm@linux-foundation.org: fix arch/sparc/kernel/setup_32.c] Link: https://lkml.kernel.org/r/YwScVmVofIZkopkF@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb25cae06c55..e56dd8f7eae1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1838,7 +1838,11 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) +{ + __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); +} #ifdef CONFIG_MMU extern bool can_do_mlock(void); @@ -2578,7 +2582,12 @@ extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags, nodemask_t *nodemask); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +{ + __show_mem(flags, nodemask, MAX_NR_ZONES - 1); +} extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -- cgit v1.2.3 From e6ad640bc404eb298dd1880113131768ddf5c6a8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 26 Aug 2022 23:06:42 +0000 Subject: mm: deduplicate cacheline padding code There are three users (mmzone.h, memcontrol.h, page_counter.h) using
similar code for forcing cacheline padding between fields of different structures. Dedup that code. Link: https://lkml.kernel.org/r/20220826230642.566725-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Feng Tang Reviewed-by: Feng Tang Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/cache.h | 13 +++++++++++++ include/linux/memcontrol.h | 13 ++----------- include/linux/mmzone.h | 24 +++++------------------- include/linux/page_counter.h | 13 ++----------- 4 files changed, 22 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cache.h b/include/linux/cache.h index d742c57eaee5..5da1bbd96154 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,4 +85,17 @@ #define cache_line_size() L1_CACHE_BYTES #endif +/* + * Helper to add padding within a struct to ensure data fall into separate + * cachelines. + */ +#if defined(CONFIG_SMP) +struct cacheline_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define CACHELINE_PADDING(name) struct cacheline_padding name +#else +#define CACHELINE_PADDING(name) +#endif + #endif /* __LINUX_CACHE_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 344022f102c2..60545e4a1c03 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -185,15 +185,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -#if defined(CONFIG_SMP) -struct memcg_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define MEMCG_PADDING(name) struct memcg_padding name -#else -#define MEMCG_PADDING(name) -#endif - /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss @@ -304,7 +295,7 @@ struct mem_cgroup { spinlock_t move_lock; unsigned long move_lock_flags; - MEMCG_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* memory.stat */ struct memcg_vmstats vmstats; @@ -326,7 +317,7 @@ struct mem_cgroup { struct list_head objcg_list; #endif - MEMCG_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); /* * set > 0 if pages under this cgroup are moving to other cgroup. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e335a492c2eb..c69c08156822 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -121,20 +121,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) struct pglist_data; -/* - * Add a wild amount of padding here to ensure data fall into separate - * cachelines. There are very few zone structures in the machine, so space - * consumption is not a concern here. - */ -#if defined(CONFIG_SMP) -struct zone_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define ZONE_PADDING(name) struct zone_padding name; -#else -#define ZONE_PADDING(name) -#endif - #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ @@ -837,7 +823,7 @@ struct zone { int initialized; /* Write-intensive fields used from the page allocator */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; @@ -849,7 +835,7 @@ struct zone { spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. 
*/ - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken @@ -886,7 +872,7 @@ struct zone { bool contiguous; - ZONE_PADDING(_pad3_) + CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; @@ -1196,7 +1182,7 @@ typedef struct pglist_data { #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -1241,7 +1227,7 @@ typedef struct pglist_data { struct lru_gen_mm_walk mm_walk; #endif - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 78a1c934e416..c141ea9a95ef 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,22 +7,13 @@ #include #include -#if defined(CONFIG_SMP) -struct pc_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define PC_PADDING(name) struct pc_padding name -#else -#define PC_PADDING(name) -#endif - struct page_counter { /* * Make sure 'usage' does not share cacheline with any other field. The * memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; - PC_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -38,7 +29,7 @@ struct page_counter { unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ - PC_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); unsigned long min; unsigned long low; -- cgit v1.2.3 From cb4df4cae4f2bd8cf7a32eff81178fce31600f7c Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 30 Aug 2022 14:38:38 +0000 Subject: ksm: count allocated ksm rmap_items for each process Patch series "ksm: count allocated rmap_items and update documentation", v5. KSM can save memory by merging identical pages, but can also consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. To help determine how beneficial the ksm-policy (like madvise) they are using is, we add a new interface /proc/<pid>/ksm_stat for each process. The value "ksm_rmap_items" in it indicates the total allocated ksm rmap_items of this process. The detailed description can be seen in the following patches' commit message. This patch (of 2): KSM can save memory by merging identical pages, but can also consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. Some of these pages may be merged, but some may not be able to be merged even after being checked several times; the memory consumed for those is unprofitable. Whether KSM saves or consumes memory system-wide can be determined by a comprehensive calculation over pages_sharing, pages_shared, pages_unshared and pages_volatile. A simple approximate calculation: profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * sizeof(rmap_item); where all_rmap_items equals the sum of pages_sharing, pages_shared, pages_unshared and pages_volatile. But we cannot calculate this kind of ksm profit for a single process, because the number of ksm rmap_items allocated by a process is not available.
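To make the scale concrete (illustrative numbers only, assuming 4 KiB pages and a rmap_item of roughly 64 bytes on 64-bit): with pages_sharing = 1000 and all_rmap_items = 1000, profit =~ 1000 * 4096 - 1000 * 64 = 4032000 bytes, i.e. about 3.8 MiB saved; the profit turns negative when most rmap_items guard pages that never end up merged.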
If user applications can obtain this kind of information, it helps them know how beneficial the ksm-policy (like madvise) they are using is, and then optimize their application code. For example, if one application madvises 1000 pages as MERGEABLE while only a few pages are really merged, then it's not cost-efficient. So we add a new interface /proc/<pid>/ksm_stat for each process, in which only the value of ksm_rmap_items is shown for now, so that more values can be added in the future. So similarly, we can calculate the ksm profit approximately for a single process by: profit =~ ksm_merging_pages * sizeof(page) - ksm_rmap_items * sizeof(rmap_item); where ksm_merging_pages is shown at /proc/<pid>/ksm_merging_pages, and ksm_rmap_items is shown in /proc/<pid>/ksm_stat. Link: https://lkml.kernel.org/r/20220830143731.299702-1-xu.xin16@zte.com.cn Link: https://lkml.kernel.org/r/20220830143838.299758-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: CGEL ZTE Cc: Alexey Dobriyan Cc: Bagas Sanjaya Cc: Hugh Dickins Cc: Izik Eidus Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26573ba485f3..8f30f262431c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -654,6 +654,11 @@ struct mm_struct { * merging. */ unsigned long ksm_merging_pages; + /* + * Represent how many pages are checked for ksm merging + * including merged and not merged. + */ + unsigned long ksm_rmap_items; #endif #ifdef CONFIG_LRU_GEN struct { -- cgit v1.2.3 From bf7a87f1075f67c286f794519f0fedfa8b0b18cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:35 +0200 Subject: kprobes: Add new KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag Add the KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag to indicate that the attach address is on a function entry. This is used in the following changes to the get_func_ip helper to return the correct function address. Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 55041d2f884d..a0b92be98984 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -103,6 +103,7 @@ struct kprobe { * this flag is only for optimized_kprobe. */ #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ +#define KPROBE_FLAG_ON_FUNC_ENTRY 16 /* probe is on the function entry */ /* Has this kprobe gone ? */ static inline bool kprobe_gone(struct kprobe *p) -- cgit v1.2.3 From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:38 -0700 Subject: bpf: use bpf_prog_pack for bpf_dispatcher Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher can share pages with bpf programs. arch_prepare_bpf_dispatcher() is updated to provide an RW buffer as working area for arch code to write to. This also fixes CPA W^X warning like: CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ...
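For arch implementations, the contract of the new signature is roughly the following (a sketch only; emit_dispatcher_code() is a hypothetical emitter, and the x86 JIT is the real reference): instructions are written through the writable buffer, while branch offsets are computed against the final read-only+executable image address.

	int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
	{
		/* emit into 'buf' (RW working area); 'image' is only the
		 * target address used for offset calculations, since it is
		 * mapped RO+X */
		return emit_dispatcher_code(buf, image, funcs, num_funcs);
	}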
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/linux/filter.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edd43edb27d6..9ae155c75014 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -946,6 +946,7 @@ struct bpf_dispatcher { struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; + void *rw_image; u32 image_off; struct bpf_ksym ksym; }; @@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 98e28126c24b..efc42a6e3aed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_prog_pack_free(struct bpf_binary_header *hdr); + static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || -- cgit v1.2.3 From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:39 -0700 Subject: bpf: Enforce W^X for bpf trampoline Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that the trampoline follows the W^X rule strictly. This will turn off warnings like CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Also remove bpf_jit_alloc_exec_page(), since it is not used any more. Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9ae155c75014..5161fac0513f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs.
*/ -void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); -- cgit v1.2.3 From 1c32a8012b7fabe469b6af826edfd4ae2a6201d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Sep 2022 15:38:58 +0200 Subject: nvme: improve the NVME_CONNECT_AUTHREQ* definitions Mark them as unsigned so that we don't need extra casts, and define them relative to cdword0 instead of requiring extra shifts. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ae53d74f3696..050d7d0cd81b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1482,8 +1482,8 @@ struct nvmf_connect_command { }; enum { - NVME_CONNECT_AUTHREQ_ASCR = (1 << 2), - NVME_CONNECT_AUTHREQ_ATR = (1 << 1), + NVME_CONNECT_AUTHREQ_ASCR = (1U << 18), + NVME_CONNECT_AUTHREQ_ATR = (1U << 17), }; struct nvmf_connect_data { -- cgit v1.2.3 From f4dc7fffa9873db50ec25624572f8217a6225de8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 16 Sep 2022 14:03:06 +0200 Subject: efi: libstub: unify initrd loading between architectures Use an EFI configuration table to pass the initrd to the core kernel, instead of per-arch methods. This cleans up the code considerably, and should make it easier for architectures to get rid of their reliance on DT for doing EFI boot in the future. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index f1b3e0d1b3fa..778ddb22f7da 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1330,6 +1330,11 @@ struct linux_efi_coco_secret_area { u64 size; }; +struct linux_efi_initrd { + unsigned long base; + unsigned long size; +}; + /* Header of a populated EFI secret area */ #define EFI_SECRET_TABLE_HEADER_GUID EFI_GUID(0x1e74f542, 0x71dd, 0x4d66, 0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b) -- cgit v1.2.3 From 171539f5a90e3fdf7d17f5396fac79d7e44ad68e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 15 Sep 2022 23:20:06 +0200 Subject: efi: libstub: install boot-time memory map as config table Expose the EFI boot time memory map to the kernel via a configuration table. This is arch agnostic and enables future changes that remove the dependency on DT on architectures that don't otherwise rely on it.
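Consumers can then locate the map like any other Linux-specific EFI table (a minimal sketch modeled on the stub's get_efi_config_table() helper; it assumes native word size and elides error handling):

	efi_config_table_t *tbl = (efi_config_table_t *)systab->tables;
	unsigned long i;

	for (i = 0; i < systab->nr_tables; i++)
		if (!efi_guidcmp(tbl[i].guid, LINUX_EFI_BOOT_MEMMAP_GUID))
			return (void *)tbl[i].table;	/* the boot memory map */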
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 778ddb22f7da..252b9b328577 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -410,6 +410,7 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +#define LINUX_EFI_BOOT_MEMMAP_GUID EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) #define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) -- cgit v1.2.3 From 3c6edd9034240ce9582be3392112321336bd25bb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 22 Sep 2022 12:03:52 +0200 Subject: efi: zboot: create MemoryMapped() device path for the parent if needed LoadImage() is supposed to install an instance of the protocol EFI_LOADED_IMAGE_DEVICE_PATH_PROTOCOL onto the loaded image's handle so that the program can figure out where it was loaded from. The reference implementation even does this (with a NULL protocol pointer) if the call to LoadImage() used the source buffer and size arguments, and passed NULL for the image device path. Hand rolled implementations of LoadImage may behave differently, though, and so it is better to tolerate situations where the protocol is missing. And actually, concatenating an Offset() node to a NULL device path (as we do currently) is not great either. So in cases where the protocol is absent, or when it points to NULL, construct a MemoryMapped() device node as the base node that describes the parent image's footprint in memory. Cc: Daan De Meyer Cc: Jeremy Linton Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 89f16ec3ebab..da3974bf05d3 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1004,6 +1004,13 @@ struct efi_rel_offset_dev_path { u64 ending_offset; } __packed; +struct efi_mem_mapped_dev_path { + struct efi_generic_dev_path header; + u32 memory_type; + u64 starting_addr; + u64 ending_addr; +} __packed; + struct efi_dev_path { union { struct efi_generic_dev_path header; -- cgit v1.2.3 From 4bf207d7a54d49637da94dbc00d2c025b74764d1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 1 Sep 2022 11:20:53 -0300 Subject: net/mlx5: Add IFC bits for mkey ATS Allows telling a mkey to use PCI ATS for DMA that flows through it. 
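On the driver side, consuming the new bits might look like this (an illustrative sketch only; whether to enable ATS for a given mkey remains driver policy):

	/* only set the mkey context field when the device reports support */
	if (MLX5_CAP_GEN(mdev, ats))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);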
Link: https://lore.kernel.org/r/1-v1-bd147097458e+ede-umem_dmabuf_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..92602e33a82c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1707,7 +1707,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 steering_format_version[0x4]; u8 create_qp_start_hint[0x18]; - u8 reserved_at_460[0x3]; + u8 reserved_at_460[0x1]; + u8 ats[0x1]; + u8 reserved_at_462[0x1]; u8 log_max_uctx[0x5]; u8 reserved_at_468[0x2]; u8 ipsec_offload[0x1]; @@ -3873,7 +3875,9 @@ struct mlx5_ifc_mkc_bits { u8 lw[0x1]; u8 lr[0x1]; u8 access_mode_1_0[0x2]; - u8 reserved_at_18[0x8]; + u8 reserved_at_18[0x2]; + u8 ma_translation_mode[0x2]; + u8 reserved_at_1c[0x4]; u8 qpn[0x18]; u8 mkey_7_0[0x8]; @@ -11134,7 +11138,8 @@ struct mlx5_ifc_dealloc_memic_out_bits { struct mlx5_ifc_umem_bits { u8 reserved_at_0[0x80]; - u8 reserved_at_80[0x1b]; + u8 ats[0x1]; + u8 reserved_at_81[0x1a]; u8 log_page_size[0x5]; u8 page_offset[0x20]; -- cgit v1.2.3 From 987f20a9dcce3989e48d87cff3952c095c994445 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Sep 2022 17:15:32 -0500 Subject: a.out: Remove the a.out implementation In commit 19e8b701e258 ("a.out: Stop building a.out/osf1 support on alpha and m68k") the last users of a.out were disabled. As nothing has turned up to cause this change to be reverted, let's remove the code implementing a.out support as well. There may be userspace users of the uapi bits left, so the uapi headers have been left untouched. Signed-off-by: "Eric W. Biederman" Acked-by: Arnd Bergmann # arm defconfigs Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/871qrx3hq3.fsf@email.froward.int.ebiederm.org --- include/linux/a.out.h | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 include/linux/a.out.h (limited to 'include/linux') diff --git a/include/linux/a.out.h b/include/linux/a.out.h deleted file mode 100644 index 600cf45645c6..000000000000 --- a/include/linux/a.out.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __A_OUT_GNU_H__ -#define __A_OUT_GNU_H__ - -#include - -#ifndef __ASSEMBLY__ -#ifdef linux -#include -#if defined(__i386__) || defined(__mc68000__) -#else -#ifndef SEGMENT_SIZE -#define SEGMENT_SIZE PAGE_SIZE -#endif -#endif -#endif -#endif /*__ASSEMBLY__ */ -#endif /* __A_OUT_GNU_H__ */ -- cgit v1.2.3 From 568ec936bf1384fc15873908c96a9aeb62536edb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 09:58:15 +0200 Subject: block: replace blk_queue_nowait with bdev_nowait Replace blk_queue_nowait with a bdev_nowait helper that takes the block_device, given that the I/O submission path should not have to look into the request_queue.
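A typical caller-side conversion in the submission path then reads (an illustrative sketch; obtaining 'bdev' from the iocb is elided):

	/* before: blk_queue_nowait(bdev_get_queue(bdev)) */
	if ((iocb->ki_flags & IOCB_NOWAIT) && !bdev_nowait(bdev))
		return -EOPNOTSUPP;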
Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220927075815.269694-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..4750772ef228 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,7 +618,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) -#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); @@ -1280,6 +1279,11 @@ static inline bool bdev_fua(struct block_device *bdev) return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); } +static inline bool bdev_nowait(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 644c4229559257cadc4267fc36c2dc22ee9c040f Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 21 Sep 2022 02:44:58 +0200 Subject: clk: qcom: smd: Add SM6375 clocks Add support for controlling SMD RPM clocks on SM6375. Signed-off-by: Konrad Dybcio Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220921004458.151842-3-konrad.dybcio@somainline.org --- include/linux/soc/qcom/smd-rpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index 82c9d489833a..3ab8c07f71c0 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -41,6 +41,7 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_HWKM_CLK 0x6d6b7768 #define QCOM_SMD_RPM_PKA_CLK 0x616b70 #define QCOM_SMD_RPM_MCFG_CLK 0x6766636d +#define QCOM_SMD_RPM_MMXI_CLK 0x69786d6d int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm, int state, -- cgit v1.2.3 From 3008119a3dd8052b7e5c07488e20a7abb0d287f2 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 23 Sep 2022 17:00:12 +0800 Subject: ftrace: Remove obsoleted code from ftrace and task_struct The trace of "struct task_struct" was no longer used since commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do"), and the functions about flags for current->trace is useless, so remove them. 
Link: https://lkml.kernel.org/r/20220923090012.505990-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 41 ----------------------------------------- include/linux/sched.h | 3 --- 2 files changed, 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b61371e287b..62557d4bffc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1122,47 +1122,6 @@ static inline void unpause_graph_tracing(void) { } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_TRACING - -/* flags for current->trace */ -enum { - TSK_TRACE_FL_TRACE_BIT = 0, - TSK_TRACE_FL_GRAPH_BIT = 1, -}; -enum { - TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, - TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, -}; - -static inline void set_tsk_trace_trace(struct task_struct *tsk) -{ - set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); -} - -static inline void clear_tsk_trace_trace(struct task_struct *tsk) -{ - clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); -} - -static inline int test_tsk_trace_trace(struct task_struct *tsk) -{ - return tsk->trace & TSK_TRACE_FL_TRACE; -} - -static inline void set_tsk_trace_graph(struct task_struct *tsk) -{ - set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); -} - -static inline void clear_tsk_trace_graph(struct task_struct *tsk) -{ - clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); -} - -static inline int test_tsk_trace_graph(struct task_struct *tsk) -{ - return tsk->trace & TSK_TRACE_FL_GRAPH; -} - enum ftrace_dump_mode; extern enum ftrace_dump_mode ftrace_dump_on_oops; diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..c7ee04e9147a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1381,9 +1381,6 @@ struct task_struct { #endif #ifdef CONFIG_TRACING - /* State flags for use by tracers: */ - unsigned long trace; - /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -- cgit v1.2.3 From 976a859c9c68fdf160379bfc154431489f318292 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 7 Sep 2022 16:36:23 -0700 Subject: net/mlx5: Expose NPPS related registers Add management capability bits indicating firmware may support N pulses per second. Add corresponding fields in MTPPS register. 
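A consumer might read the new capability fields from an MTPPS query roughly as follows; mlx5_core_access_reg() and the IFC accessors are the existing interfaces, while the buffer handling, names, and the interpretation of the field are sketch-only assumptions:

	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {};
	u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {};
	int err;

	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
				   MLX5_REG_MTPPS, 0, 0);
	if (!err) {
		u8 log_min_period = MLX5_GET(mtpps_reg, out,
					     cap_log_min_npps_period);
		/* minimal NPPS period derived from log_min_period
		 * (unit and encoding assumed for this sketch) */
	}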
Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 06eab92b9fb3..e929b0dcd985 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9792,7 +9792,9 @@ struct mlx5_ifc_pcam_reg_bits { struct mlx5_ifc_mcam_enhanced_features_bits { u8 reserved_at_0[0x5d]; u8 mcia_32dwords[0x1]; - u8 reserved_at_5e[0xc]; + u8 out_pulse_duration_ns[0x1]; + u8 npps_period[0x1]; + u8 reserved_at_60[0xa]; u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; @@ -10292,7 +10294,12 @@ struct mlx5_ifc_mtpps_reg_bits { u8 reserved_at_18[0x4]; u8 cap_max_num_of_pps_out_pins[0x4]; - u8 reserved_at_20[0x24]; + u8 reserved_at_20[0x13]; + u8 cap_log_min_npps_period[0x5]; + u8 reserved_at_38[0x3]; + u8 cap_log_min_out_pulse_duration_ns[0x5]; + + u8 reserved_at_40[0x4]; u8 cap_pin_3_mode[0x4]; u8 reserved_at_48[0x4]; u8 cap_pin_2_mode[0x4]; @@ -10311,7 +10318,9 @@ struct mlx5_ifc_mtpps_reg_bits { u8 cap_pin_4_mode[0x4]; u8 field_select[0x20]; - u8 reserved_at_a0[0x60]; + u8 reserved_at_a0[0x20]; + + u8 npps_period[0x40]; u8 enable[0x1]; u8 reserved_at_101[0xb]; @@ -10320,7 +10329,8 @@ struct mlx5_ifc_mtpps_reg_bits { u8 pin_mode[0x4]; u8 pin[0x8]; - u8 reserved_at_120[0x20]; + u8 reserved_at_120[0x2]; + u8 out_pulse_duration_ns[0x1e]; u8 time_stamp[0x40]; -- cgit v1.2.3 From f0462bc3e9024e9738fcd1a026456238317d84c7 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 7 Sep 2022 16:36:24 -0700 Subject: net/mlx5: Add support for NPPS with real time mode Add support for setting NPPS. NPPS is currently available in REAL_TIME_CLOCK mode only. In addition allow the user to set the pulse duration. When NPPS pulse duration is not set explicitly by the user, driver set it to 50% of the NPPS period. Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 954af9cafb1d..481506376344 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -698,6 +698,8 @@ struct mlx5_pps { struct work_struct out_work; u64 start[MAX_PIN_NUM]; u8 enabled; + u64 min_npps_period; + u64 min_out_pulse_duration_ns; }; struct mlx5_timer { -- cgit v1.2.3 From 8d1ac895fff96a228db20db92243e93687659ef7 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Wed, 7 Sep 2022 16:36:25 -0700 Subject: net/mlx5: add IFC bits for bypassing port select flow table port_select_flow_table_bypass - When set, device supports bypass port select flow table. active_port - Bitmask indicates the current active ports in PORT_SELECT_FT LAG. MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION - op_mod to operate PORT_SELECTION_Capabilities. 
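For illustration, a LAG-side check of the new bit could look like the following; MLX5_CAP_PORT_SELECTION() is assumed to be the accessor for this capability section, and the flag it feeds is hypothetical:

	/* Sketch: only let explicit tx affinity coexist with the port
	 * select flow table when the device can bypass that table.
	 */
	if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass))
		allow_explicit_affinity = false;	/* hypothetical flag */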
Signed-off-by: Liu, Changcheng Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e929b0dcd985..b7f4e93df0f2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -68,6 +68,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, + MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, }; enum { @@ -814,7 +815,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits { struct mlx5_ifc_port_selection_cap_bits { u8 reserved_at_0[0x10]; u8 port_select_flow_table[0x1]; - u8 reserved_at_11[0xf]; + u8 reserved_at_11[0x1]; + u8 port_select_flow_table_bypass[0x1]; + u8 reserved_at_13[0xd]; u8 reserved_at_20[0x1e0]; @@ -10933,7 +10936,9 @@ struct mlx5_ifc_lagc_bits { u8 reserved_at_18[0x5]; u8 lag_state[0x3]; - u8 reserved_at_20[0x14]; + u8 reserved_at_20[0xc]; + u8 active_port[0x4]; + u8 reserved_at_30[0x4]; u8 tx_remap_affinity_2[0x4]; u8 reserved_at_38[0x4]; u8 tx_remap_affinity_1[0x4]; -- cgit v1.2.3 From a83bb5df2ac604ab418fbe0a8720f55de46652eb Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Wed, 7 Sep 2022 16:36:26 -0700 Subject: RDMA/mlx5: Don't set tx affinity when lag is in hash mode In hash mode, without setting tx affinity explicitly, the port select flow table decides which port is used for the traffic. If port_select_flow_table_bypass capability is supported and tx affinity is set explicitly for QP/TIS, they will be added into the explicit affinity table in FW to check which port is used for the traffic. 1. The overloaded explicit affinity table may affect performance. To avoid this, do not set tx affinity explicitly by default. 2. The packets of the same flow need to be transmitted on the same port. Because the packets of the same flow use different QPs in slow & fast path, it shouldn't set tx affinity explicitly for these QPs. 
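The intended consumer pattern is therefore roughly the following sketch; the wrapper function is hypothetical, the two predicates are the exported mlx5 helpers:

	static bool example_use_explicit_tx_affinity(struct mlx5_core_dev *mdev)
	{
		/* in hash mode the port select flow table picks the port,
		 * so do not pin QPs/TISes to a port explicitly */
		return mlx5_lag_is_active(mdev) && !mlx5_lag_mode_is_hash(mdev);
	}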
Signed-off-by: Liu, Changcheng Reviewed-by: Mark Bloch Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 481506376344..9114caceb2b5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1153,6 +1153,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 66af4fe37119dbf6a82078949a82e625155777ef Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Sep 2022 16:36:30 -0700 Subject: net/mlx5: Remove unused functions Remove functions which are no longer used in the driver: mlx5e_ipsec_is_tx_flow mlx5_health_flush get_cqe_enhanced_num_mini_cqes get_cqe_l3_hdr_type mlx5_health_flush mlx5_fs_is_ipsec_flow _mlx5_fs_is_outer_ipproto_flow mlx5_fs_is_outer_tcp_flow mlx5_fs_is_outer_udp_flow mlx5_fs_is_vxlan_flow mlx5_fs_is_outer_ipsec_flow Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 11 ---------- include/linux/mlx5/driver.h | 1 - include/linux/mlx5/fs_helpers.h | 48 ----------------------------------------- 3 files changed, 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5b41b9fb3d48..753753f3ce12 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -874,12 +874,6 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } -static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) -{ - /* num_of_mini_cqes is zero based */ - return get_cqe_opcode(cqe) + 1; -} - static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; @@ -890,11 +884,6 @@ static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe) return (cqe->l4_l3_hdr_type >> 4) & 0x7; } -static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) -{ - return (cqe->l4_l3_hdr_type >> 2) & 0x3; -} - static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { return cqe->tls_outer_l3_tunneled & 0x1; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9114caceb2b5..52f34fe47049 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1018,7 +1018,6 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); -void mlx5_health_flush(struct mlx5_core_dev *dev); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/fs_helpers.h b/include/linux/mlx5/fs_helpers.h index 9db21cd0e92c..bc5125bc0561 100644 --- a/include/linux/mlx5/fs_helpers.h +++ b/include/linux/mlx5/fs_helpers.h @@ -38,46 +38,6 @@ #define MLX5_FS_IPV4_VERSION 4 #define MLX5_FS_IPV6_VERSION 6 -static inline bool mlx5_fs_is_ipsec_flow(const u32 *match_c) -{ - void *misc_params_c = 
MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); -} - -static inline bool _mlx5_fs_is_outer_ipproto_flow(const u32 *match_c, - const u32 *match_v, u8 match) -{ - const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, - outer_headers); - const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, - outer_headers); - - return MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_protocol) == 0xff && - MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol) == match; -} - -static inline bool mlx5_fs_is_outer_tcp_flow(const u32 *match_c, - const u32 *match_v) -{ - return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_TCP); -} - -static inline bool mlx5_fs_is_outer_udp_flow(const u32 *match_c, - const u32 *match_v) -{ - return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_UDP); -} - -static inline bool mlx5_fs_is_vxlan_flow(const u32 *match_c) -{ - void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, vxlan_vni); -} - static inline bool _mlx5_fs_is_outer_ipv_flow(struct mlx5_core_dev *mdev, const u32 *match_c, const u32 *match_v, int version) @@ -131,12 +91,4 @@ mlx5_fs_is_outer_ipv6_flow(struct mlx5_core_dev *mdev, const u32 *match_c, MLX5_FS_IPV6_VERSION); } -static inline bool mlx5_fs_is_outer_ipsec_flow(const u32 *match_c) -{ - void *misc_params_c = - MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); -} - #endif -- cgit v1.2.3 From b53ff37fcd5c7038d9dd000dcf682575a8f47999 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Sep 2022 16:36:31 -0700 Subject: net/mlx5: Remove unused structs Remove structs which are no longer used in the driver: mlx5dr_cmd_qp_create_attr mlx5_fs_dr_ns mlx5_pas Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 52f34fe47049..949feb7f889f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -857,11 +857,6 @@ struct mlx5_cmd_work_ent { refcount_t refcnt; }; -struct mlx5_pas { - u64 pa; - u8 log_sz; -}; - enum phy_port_state { MLX5_AAA_111 }; -- cgit v1.2.3 From 9175d8103780084e70bc89f2040ea62dc30f6f60 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 7 Sep 2022 16:36:32 -0700 Subject: net/mlx5: Remove from FPGA IFC file not-needed definitions Move IP layout bits definitions to be close to the place that actually uses it, together with removal extra defines that not in-use. 
Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 16 ++++++++++++++++ include/linux/mlx5/mlx5_ifc_fpga.h | 24 ------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b7f4e93df0f2..c25f2ce75734 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -478,6 +478,22 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits { u8 reserved_at_6[0x1a]; }; +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_at_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_at_0[0x80]; +}; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 45c7c0d67635..0596472923ad 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -32,30 +32,6 @@ #ifndef MLX5_IFC_FPGA_H #define MLX5_IFC_FPGA_H -struct mlx5_ifc_ipv4_layout_bits { - u8 reserved_at_0[0x60]; - - u8 ipv4[0x20]; -}; - -struct mlx5_ifc_ipv6_layout_bits { - u8 ipv6[16][0x8]; -}; - -union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { - struct mlx5_ifc_ipv6_layout_bits ipv6_layout; - struct mlx5_ifc_ipv4_layout_bits ipv4_layout; - u8 reserved_at_0[0x80]; -}; - -enum { - MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX = 0x2c9, -}; - -enum { - MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, -}; - struct mlx5_ifc_fpga_shell_caps_bits { u8 max_num_qps[0x10]; u8 reserved_at_10[0x8]; -- cgit v1.2.3 From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- include/linux/perf_event.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. 
*/ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* -- cgit v1.2.3 From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could check with the NULL, follow the same rule with other fields. Remove the raw field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- include/linux/perf_event.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { -- cgit v1.2.3 From b8a94bfb33952bb17fbc65f8903d242a721c533d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 5 Apr 2021 05:03:50 +0200 Subject: kallsyms: increase maximum kernel symbol length to 512 Rust symbols can become quite long due to namespacing introduced by modules, types, traits, generics, etc. For instance, the following code: pub mod my_module { pub struct MyType; pub struct MyGenericType(T); pub trait MyTrait { fn my_method() -> u32; } impl MyTrait for MyGenericType { fn my_method() -> u32 { 42 } } } generates a symbol of length 96 when using the upcoming v0 mangling scheme: _RNvXNtCshGpAVYOtgW1_7example9my_moduleINtB2_13MyGenericTypeNtB2_6MyTypeENtB2_7MyTrait9my_method At the moment, Rust symbols may reach up to 300 in length. Setting 512 as the maximum seems like a reasonable choice to keep some headroom. 
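Code that stack-allocates name buffers keeps working as long as it sizes them with the macro, e.g. this illustrative use of the existing lookup API:

	char namebuf[KSYM_NAME_LEN];
	unsigned long size, offset;
	char *modname;
	const char *name;

	/* namebuf must hold the longest possible (Rust-mangled) symbol */
	name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);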
Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Signed-off-by: Miguel Ojeda --- include/linux/kallsyms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ad39636e0c3f..649faac31ddb 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -15,7 +15,7 @@ #include -#define KSYM_NAME_LEN 128 +#define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ -- cgit v1.2.3 From 2f7ab1267dc9b2d1f29695aff3211c87483480f3 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 16:42:57 +0200 Subject: Kbuild: add Rust support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having most of the new files in place, we now enable Rust support in the build system, including `Kconfig` entries related to Rust, the Rust configuration printer and a few other bits. Reviewed-by: Kees Cook Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reviewed-by: Greg Kroah-Hartman Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Michael Ellerman Signed-off-by: Michael Ellerman Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Co-developed-by: Douglas Su Signed-off-by: Douglas Su Co-developed-by: Dariusz Sosnowski Signed-off-by: Dariusz Sosnowski Co-developed-by: Antonio Terceiro Signed-off-by: Antonio Terceiro Co-developed-by: Daniel Xu Signed-off-by: Daniel Xu Co-developed-by: Björn Roy Baron Signed-off-by: Björn Roy Baron Co-developed-by: Martin Rodriguez Reboredo Signed-off-by: Martin Rodriguez Reboredo Signed-off-by: Miguel Ojeda --- include/linux/compiler_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..50b3f6b9502e 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -4,8 +4,12 @@ #ifndef __ASSEMBLY__ +/* + * Skipped when running bindgen due to a libclang issue; + * see https://github.com/rust-lang/rust-bindgen/issues/2244. + */ #if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ - __has_attribute(btf_type_tag) + __has_attribute(btf_type_tag) && !defined(__BINDGEN__) # define BTF_TYPE_TAG(value) __attribute__((btf_type_tag(#value))) #else # define BTF_TYPE_TAG(value) /* nothing */ -- cgit v1.2.3 From 5aec788aeb8eb74282b75ac1b317beb0fbb69a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2022 21:02:34 +0200 Subject: sched: Fix TASK_state comparisons Task state is fundamentally a bitmask; direct comparisons are probably not working as intended. 
Specifically the normal wait-state have a number of possible modifiers: TASK_UNINTERRUPTIBLE: TASK_WAKEKILL, TASK_NOLOAD, TASK_FREEZABLE TASK_INTERRUPTIBLE: TASK_FREEZABLE Specifically, the addition of TASK_FREEZABLE wrecked __wait_is_interruptible(). This however led to an audit of direct comparisons yielding the rest of the changes. Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Reported-by: Christian Borntraeger Debugged-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Tested-by: Christian Borntraeger --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 14ad8a0e9fac..7f5a51aae0a7 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -281,7 +281,7 @@ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ - state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); -- cgit v1.2.3 From 99df7a2810b6d24651d4887ab61a142e042fb235 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 26 Sep 2022 08:16:42 -0500 Subject: powerpc/pseries: block untrusted device tree changes when locked down The /proc/powerpc/ofdt interface allows the root user to freely alter the in-kernel device tree, enabling arbitrary physical address writes via drivers that could bind to malicious device nodes, thus making it possible to disable lockdown. Historically this interface has been used on the pseries platform to facilitate the runtime addition and removal of processor, memory, and device resources (aka Dynamic Logical Partitioning or DLPAR). Years ago, the processor and memory use cases were migrated to designs that happen to be lockdown-friendly: device tree updates are communicated directly to the kernel from firmware without passing through untrusted user space. I/O device DLPAR via the "drmgr" command in powerpc-utils remains the sole legitimate user of /proc/powerpc/ofdt, but it is already broken in lockdown since it uses /dev/mem to allocate argument buffers for the rtas syscall. So only illegitimate uses of the interface should see a behavior change when running on a locked down kernel. Signed-off-by: Nathan Lynch Acked-by: Paul Moore (LSM) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926131643.146502-2-nathanl@linux.ibm.com --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..7da801ceb5a4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -114,6 +114,7 @@ enum lockdown_reason { LOCKDOWN_IOPORT, LOCKDOWN_MSR, LOCKDOWN_ACPI_TABLES, + LOCKDOWN_DEVICE_TREE, LOCKDOWN_PCMCIA_CIS, LOCKDOWN_TIOCSSERIAL, LOCKDOWN_MODULE_PARAMETERS, -- cgit v1.2.3 From b8f3e48834fe8c86b4f21739c6effd160e2c2c19 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 26 Sep 2022 08:16:43 -0500 Subject: powerpc/rtas: block error injection when locked down The error injection facility on pseries VMs allows corruption of arbitrary guest memory, potentially enabling a sufficiently privileged user to disable lockdown or perform other modifications of the running kernel via the rtas syscall. Block the PAPR error injection facility from being opened or called when locked down. 
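The check itself follows the standard lockdown pattern; this is a sketch of the powerpc side, and its exact placement in the open/call paths is illustrative:

	if (security_locked_down(LOCKDOWN_RTAS_ERROR_INJECTION))
		return -EPERM;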
Signed-off-by: Nathan Lynch Acked-by: Paul Moore (LSM) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926131643.146502-3-nathanl@linux.ibm.com --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 7da801ceb5a4..a6d67600759e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -123,6 +123,7 @@ enum lockdown_reason { LOCKDOWN_XMON_WR, LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_DBG_WRITE_KERNEL, + LOCKDOWN_RTAS_ERROR_INJECTION, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, -- cgit v1.2.3 From 141f3d6256e58103ece1c3dd2835e871f1dde240 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 24 Sep 2022 15:18:26 +0900 Subject: ata: libata-sata: Fix device queue depth control The function __ata_change_queue_depth() uses the helper ata_scsi_find_dev() to get the ata_device structure of a scsi device and set that device maximum queue depth. However, when the ata device is managed by libsas, ata_scsi_find_dev() returns NULL, turning __ata_change_queue_depth() into a nop, which prevents the user from setting the maximum queue depth of ATA devices used with libsas based HBAs. Fix this by renaming __ata_change_queue_depth() to ata_change_queue_depth() and adding a pointer to the ata_device structure of the target device as argument. This pointer is provided by ata_scsi_change_queue_depth() using ata_scsi_find_dev() in the case of a libata managed device and by sas_change_queue_depth() using sas_to_ata_dev() in the case of a libsas managed ata device. Signed-off-by: Damien Le Moal Tested-by: John Garry --- include/linux/libata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 698032e5ef2d..20765d1c5f80 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1136,8 +1136,8 @@ extern int ata_scsi_slave_config(struct scsi_device *sdev); extern void ata_scsi_slave_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); -extern int __ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev, - int queue_depth); +extern int ata_change_queue_depth(struct ata_port *ap, struct ata_device *dev, + struct scsi_device *sdev, int queue_depth); extern struct ata_device *ata_dev_pair(struct ata_device *adev); extern int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev); extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap); -- cgit v1.2.3 From f25723dcef4a38f6a39e17afeabd1adf6402230e Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Tue, 27 Sep 2022 13:21:14 +0200 Subject: spi: Save current RX and TX DMA devices Save the current RX and TX DMA devices to avoid having to duplicate the logic to pick them, since we'll need access to them in some more functions to fix a bug in the cache handling. 
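A simplified sketch of the save in the mapping path, mirroring the selection logic in __spi_map_msg() (the dma_map_dev fallback is omitted here):

	struct device *tx_dev, *rx_dev;

	tx_dev = ctlr->dma_tx ? ctlr->dma_tx->device->dev : ctlr->dev.parent;
	rx_dev = ctlr->dma_rx ? ctlr->dma_rx->device->dev : ctlr->dev.parent;

	/* remember the choice so later sync/unmap paths can reuse it */
	ctlr->cur_tx_dma_dev = tx_dev;
	ctlr->cur_rx_dma_dev = rx_dev;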
Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20220927112117.77599-2-vincent.whitchurch@axis.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 19acd4c9c7e3..4b4041e9895a 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -378,6 +378,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @cleanup: frees controller-specific state * @can_dma: determine whether this controller supports DMA * @dma_map_dev: device which can be used for DMA mapping + * @cur_rx_dma_dev: device which is currently used for RX DMA mapping + * @cur_tx_dma_dev: device which is currently used for TX DMA mapping * @queued: whether this controller is providing an internal message queue * @kworker: pointer to thread struct for message pump * @pump_messages: work struct for scheduling work to the message pump @@ -609,6 +611,8 @@ struct spi_controller { struct spi_device *spi, struct spi_transfer *xfer); struct device *dma_map_dev; + struct device *cur_rx_dma_dev; + struct device *cur_tx_dma_dev; /* * These hooks are for drivers that want to use the generic -- cgit v1.2.3 From 612d5494aef9bd2ab68d585a8c0ac2b16d12d520 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 27 Sep 2022 20:45:57 +0800 Subject: irqchip: Make irqchip_init() usable on pure ACPI systems Pure ACPI systems (e.g., LoongArch) do not need OF_IRQ, but still require irqchip_init() to perform the ACPI irqchip probing, even when OF_IRQ isn't selected. Relax the dependency to enable the generic irqchip support when ACPI_GENERIC_GSI is configured. Signed-off-by: Huacai Chen Tested-by: Tiezhu Yang [maz: revamped commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220927124557.3246737-1-chenhuacai@loongson.cn --- include/linux/of_irq.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 83fccd0c9bba..d6d3eae2f145 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -37,9 +37,8 @@ extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data); extern int of_irq_to_resource(struct device_node *dev, int index, struct resource *r); -extern void of_irq_init(const struct of_device_id *matches); - #ifdef CONFIG_OF_IRQ +extern void of_irq_init(const struct of_device_id *matches); extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); @@ -57,6 +56,9 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else +static inline void of_irq_init(const struct of_device_id *matches) +{ +} static inline int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq) { -- cgit v1.2.3 From 334f7d42db3eb0274aa6b4aba7ce14d87df3fef0 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 22 Sep 2022 11:12:42 -0500 Subject: irqchip: Allow extra fields to be passed to IRQCHIP_PLATFORM_DRIVER_END IRQCHIP_PLATFORM_DRIVER_* doesn't allow some fields (such as .pm) to be set in the platform_driver structure. Make IRQCHIP_PLATFORM_DRIVER_END variadic so that .pm or another field can be set if needed. 
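With the variadic form a driver can now write, for example (driver, compatible string, and ops names are illustrative):

	IRQCHIP_PLATFORM_DRIVER_BEGIN(example_intc)
	IRQCHIP_MATCH("vendor,example-intc", example_intc_init)
	IRQCHIP_PLATFORM_DRIVER_END(example_intc, .pm = &example_intc_pm_ops)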
Signed-off-by: Frank Li [maz: revamped commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220922161246.20586-3-Frank.Li@nxp.com --- include/linux/irqchip.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 3a091d0710ae..d5e6024cb2a8 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -44,7 +44,8 @@ static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ .data = typecheck_irq_init_cb(fn), }, -#define IRQCHIP_PLATFORM_DRIVER_END(drv_name) \ + +#define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \ {}, \ }; \ MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table); \ @@ -56,6 +57,7 @@ static struct platform_driver drv_name##_driver = { \ .owner = THIS_MODULE, \ .of_match_table = drv_name##_irqchip_match_table, \ .suppress_bind_attrs = true, \ + __VA_ARGS__ \ }, \ }; \ builtin_platform_driver(drv_name##_driver) -- cgit v1.2.3 From 2d48bfca42a6d2b5f92d24ceae4425fc4fae14ab Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Mon, 8 Aug 2022 12:38:07 -0500 Subject: mfd: rk808: Add Rockchip rk817 battery charger support Add rk817 charger support cell to rk808 mfd driver. Signed-off-by: Chris Morgan Signed-off-by: Maya Matuszczyk Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220808173809.11320-3-macroalpha82@gmail.com --- include/linux/mfd/rk808.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h index 58602032e642..9af1f3105f80 100644 --- a/include/linux/mfd/rk808.h +++ b/include/linux/mfd/rk808.h @@ -519,6 +519,77 @@ enum rk809_reg_id { #define MIC_DIFF_DIS (0x0 << 7) #define MIC_DIFF_EN (0x1 << 7) +/* RK817 Battery Registers */ +#define RK817_GAS_GAUGE_ADC_CONFIG0 0x50 +#define RK817_GG_EN (0x1 << 7) +#define RK817_SYS_VOL_ADC_EN (0x1 << 6) +#define RK817_TS_ADC_EN (0x1 << 5) +#define RK817_USB_VOL_ADC_EN (0x1 << 4) +#define RK817_BAT_VOL_ADC_EN (0x1 << 3) +#define RK817_BAT_CUR_ADC_EN (0x1 << 2) + +#define RK817_GAS_GAUGE_ADC_CONFIG1 0x55 + +#define RK817_VOL_CUR_CALIB_UPD BIT(7) + +#define RK817_GAS_GAUGE_GG_CON 0x56 +#define RK817_GAS_GAUGE_GG_STS 0x57 + +#define RK817_BAT_CON (0x1 << 4) +#define RK817_RELAX_VOL_UPD (0x3 << 2) +#define RK817_RELAX_STS (0x1 << 1) + +#define RK817_GAS_GAUGE_RELAX_THRE_H 0x58 +#define RK817_GAS_GAUGE_RELAX_THRE_L 0x59 +#define RK817_GAS_GAUGE_OCV_THRE_VOL 0x62 +#define RK817_GAS_GAUGE_OCV_VOL_H 0x63 +#define RK817_GAS_GAUGE_OCV_VOL_L 0x64 +#define RK817_GAS_GAUGE_PWRON_VOL_H 0x6b +#define RK817_GAS_GAUGE_PWRON_VOL_L 0x6c +#define RK817_GAS_GAUGE_PWRON_CUR_H 0x6d +#define RK817_GAS_GAUGE_PWRON_CUR_L 0x6e +#define RK817_GAS_GAUGE_OFF_CNT 0x6f +#define RK817_GAS_GAUGE_Q_INIT_H3 0x70 +#define RK817_GAS_GAUGE_Q_INIT_H2 0x71 +#define RK817_GAS_GAUGE_Q_INIT_L1 0x72 +#define RK817_GAS_GAUGE_Q_INIT_L0 0x73 +#define RK817_GAS_GAUGE_Q_PRES_H3 0x74 +#define RK817_GAS_GAUGE_Q_PRES_H2 0x75 +#define RK817_GAS_GAUGE_Q_PRES_L1 0x76 +#define RK817_GAS_GAUGE_Q_PRES_L0 0x77 +#define RK817_GAS_GAUGE_BAT_VOL_H 0x78 +#define RK817_GAS_GAUGE_BAT_VOL_L 0x79 +#define RK817_GAS_GAUGE_BAT_CUR_H 0x7a +#define RK817_GAS_GAUGE_BAT_CUR_L 0x7b +#define RK817_GAS_GAUGE_USB_VOL_H 0x7e +#define RK817_GAS_GAUGE_USB_VOL_L 0x7f +#define RK817_GAS_GAUGE_SYS_VOL_H 0x80 +#define RK817_GAS_GAUGE_SYS_VOL_L 0x81 +#define RK817_GAS_GAUGE_Q_MAX_H3 0x82 
+#define RK817_GAS_GAUGE_Q_MAX_H2 0x83 +#define RK817_GAS_GAUGE_Q_MAX_L1 0x84 +#define RK817_GAS_GAUGE_Q_MAX_L0 0x85 +#define RK817_GAS_GAUGE_SLEEP_CON_SAMP_CUR_H 0x8f +#define RK817_GAS_GAUGE_SLEEP_CON_SAMP_CUR_L 0x90 +#define RK817_GAS_GAUGE_CAL_OFFSET_H 0x91 +#define RK817_GAS_GAUGE_CAL_OFFSET_L 0x92 +#define RK817_GAS_GAUGE_VCALIB0_H 0x93 +#define RK817_GAS_GAUGE_VCALIB0_L 0x94 +#define RK817_GAS_GAUGE_VCALIB1_H 0x95 +#define RK817_GAS_GAUGE_VCALIB1_L 0x96 +#define RK817_GAS_GAUGE_IOFFSET_H 0x97 +#define RK817_GAS_GAUGE_IOFFSET_L 0x98 +#define RK817_GAS_GAUGE_BAT_R1 0x9a +#define RK817_GAS_GAUGE_BAT_R2 0x9b +#define RK817_GAS_GAUGE_BAT_R3 0x9c +#define RK817_GAS_GAUGE_DATA0 0x9d +#define RK817_GAS_GAUGE_DATA1 0x9e +#define RK817_GAS_GAUGE_DATA2 0x9f +#define RK817_GAS_GAUGE_DATA3 0xa0 +#define RK817_GAS_GAUGE_DATA4 0xa1 +#define RK817_GAS_GAUGE_DATA5 0xa2 +#define RK817_GAS_GAUGE_CUR_ADC_K0 0xb0 + #define RK817_POWER_EN_REG(i) (0xb1 + (i)) #define RK817_POWER_SLP_EN_REG(i) (0xb5 + (i)) @@ -544,10 +615,30 @@ enum rk809_reg_id { #define RK817_LDO_ON_VSEL_REG(idx) (0xcc + (idx) * 2) #define RK817_BOOST_OTG_CFG (0xde) +#define RK817_PMIC_CHRG_OUT 0xe4 +#define RK817_CHRG_VOL_SEL (0x07 << 4) +#define RK817_CHRG_CUR_SEL (0x07 << 0) + +#define RK817_PMIC_CHRG_IN 0xe5 +#define RK817_USB_VLIM_EN (0x01 << 7) +#define RK817_USB_VLIM_SEL (0x07 << 4) +#define RK817_USB_ILIM_EN (0x01 << 3) +#define RK817_USB_ILIM_SEL (0x07 << 0) +#define RK817_PMIC_CHRG_TERM 0xe6 +#define RK817_CHRG_TERM_ANA_DIG (0x01 << 2) +#define RK817_CHRG_TERM_ANA_SEL (0x03 << 0) +#define RK817_CHRG_EN (0x01 << 6) + +#define RK817_PMIC_CHRG_STS 0xeb +#define RK817_BAT_EXS BIT(7) +#define RK817_CHG_STS (0x07 << 4) + #define RK817_ID_MSB 0xed #define RK817_ID_LSB 0xee #define RK817_SYS_STS 0xf0 +#define RK817_PLUG_IN_STS (0x1 << 6) + #define RK817_SYS_CFG(i) (0xf1 + (i)) #define RK817_ON_SOURCE_REG 0xf5 -- cgit v1.2.3 From a47137a5134be2f0b4724fe548362a253727d8b1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 5 Sep 2022 13:58:10 +0200 Subject: mfd/omap1: htc-i2cpld: Convert to a pure GPIO driver Instead of passing GPIO numbers pertaining to ourselves through platform data, just request GPIO descriptors from our own GPIO chips and use them, and cut down on the unnecessary complexity. Cc: Aaro Koskinen Cc: Janusz Krzysztofik Cc: Tony Lindgren Cc: Cory Maccarrone Cc: linux-omap@vger.kernel.org Signed-off-by: Linus Walleij Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905115810.5987-1-linus.walleij@linaro.org --- include/linux/htcpld.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/htcpld.h b/include/linux/htcpld.h index 842fce69ac06..5f8ac9b1d724 100644 --- a/include/linux/htcpld.h +++ b/include/linux/htcpld.h @@ -13,8 +13,6 @@ struct htcpld_chip_platform_data { }; struct htcpld_core_platform_data { - unsigned int int_reset_gpio_hi; - unsigned int int_reset_gpio_lo; unsigned int i2c_adapter_id; struct htcpld_chip_platform_data *chip; -- cgit v1.2.3 From b1cab78ba392162a5ef11173d02db6e182e04edd Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 26 Sep 2022 19:59:31 +0000 Subject: Revert "net: set proper memcg for net_init hooks allocations" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1d0403d20f6c281cb3d14c5f1db5317caeec48e9. 
Anatoly Pugachev reported that the commit 1d0403d20f6c ("net: set proper memcg for net_init hooks allocations") is somehow causing the sparc64 VMs failed to boot and the VMs boot fine with that patch reverted. So, revert the patch for now and later we can debug the issue. Link: https://lore.kernel.org/all/20220918092849.GA10314@u164.east.ru/ Reported-by: Anatoly Pugachev Signed-off-by: Shakeel Butt Cc: Vasily Averin Cc: Jakub Kicinski Cc: Michal Koutný Cc: Andrew Morton Cc: cgroups@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Tested-by: Anatoly Pugachev Acked-by: Johannes Weiner Fixes: 1d0403d20f6c ("net: set proper memcg for net_init hooks allocations") Reviewed-by: Muchun Song Acked-by: Roman Gushchin Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6257867fbf95..567f12323f55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1788,42 +1788,6 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_unlock(); } -/** - * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object. - * @p: pointer to object from which memcg should be extracted. It can be NULL. - * - * Retrieves the memory group into which the memory of the pointed kernel - * object is accounted. If memcg is found, its reference is taken. - * If a passed kernel object is uncharged, or if proper memcg cannot be found, - * as well as if mem_cgroup is disabled, NULL is returned. - * - * Return: valid memcg pointer with taken reference or NULL. - */ -static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - do { - memcg = mem_cgroup_from_obj(p); - } while (memcg && !css_tryget(&memcg->css)); - rcu_read_unlock(); - return memcg; -} - -/** - * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup. - * @memcg: pointer to a valid memory cgroup or NULL. - * - * If passed argument is not NULL, returns it without any additional checks - * and changes. Otherwise, root_mem_cgroup is returned. - * - * NOTE: root_mem_cgroup can be NULL during early boot. - */ -static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) -{ - return memcg ? 
memcg : root_mem_cgroup; -} #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1880,15 +1844,6 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) -{ - return NULL; -} - -static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) -{ - return NULL; -} #endif /* CONFIG_MEMCG_KMEM */ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From 49f27f2b4bfa8b6e26f02df615e544f52648bfb2 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 28 Sep 2022 14:47:55 +0800 Subject: remoteproc: Introduce rproc features remote processor may support: - boot recovery with help from main processor - self recovery without help from main processor - iommu - etc Introduce rproc features could simplify code to avoid adding more bool flags Acked-by: Arnaud Pouliquen Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220928064756.4059662-2-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- include/linux/remoteproc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 1abf56ad02da..fe8978eb69f1 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -489,6 +489,20 @@ struct rproc_dump_segment { loff_t offset; }; +/** + * enum rproc_features - features supported + * + * @RPROC_FEAT_ATTACH_ON_RECOVERY: The remote processor does not need help + * from Linux to recover, such as firmware + * loading. Linux just needs to attach after + * recovery. + */ + +enum rproc_features { + RPROC_FEAT_ATTACH_ON_RECOVERY, + RPROC_MAX_FEATURES, +}; + /** * struct rproc - represents a physical remote processor device * @node: list node of this rproc object @@ -530,6 +544,7 @@ struct rproc_dump_segment { * @elf_machine: firmware ELF machine * @cdev: character device of the rproc * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release + * @features: indicate remoteproc features */ struct rproc { struct list_head node; @@ -570,6 +585,7 @@ struct rproc { u16 elf_machine; struct cdev cdev; bool cdev_put_on_release; + DECLARE_BITMAP(features, RPROC_MAX_FEATURES); }; /** -- cgit v1.2.3 From f3304ecd7f060db1d4197fbdce5a503259f770d3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 16 Sep 2022 15:29:53 +0900 Subject: linux/export: use inline assembler to populate symbol CRCs Since commit 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS"), the module versioning on the (non-upstreamed-yet) kvx Linux port is broken due to unexpected padding for __crc_* symbols. The kvx GCC adds padding so u32 gets 8-byte alignment instead of 4. I do not know if this happens for upstream architectures in general, but any compiler has the freedom to insert padding for faster access. Use the inline assembler to directly specify the wanted data layout. This is how we previously did before the breakage. 
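For reference, SYMBOL_CRC(abort, 0x12345678, "") now expands to roughly the following assembler (CRC value made up), which fixes __crc_abort to exactly four bytes regardless of the compiler's alignment choices:

	.section "___kcrctab+abort","a"
	__crc_abort:
	.long 0x12345678
	.previous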
Link: https://lore.kernel.org/lkml/20220817161438.32039-1-ysionneau@kalray.eu/ Link: https://lore.kernel.org/linux-kbuild/31ce5305-a76b-13d7-ea55-afca82c46cf2@kalray.eu/ Fixes: 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS") Reported-by: Yann Sionneau Signed-off-by: Masahiro Yamada Tested-by: Yann Sionneau --- include/linux/export-internal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index c2b1d4fd5987..fe7e6ba918f1 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -10,8 +10,10 @@ #include #include -/* __used is needed to keep __crc_* for LTO */ #define SYMBOL_CRC(sym, crc, sec) \ - u32 __section("___kcrctab" sec "+" #sym) __used __crc_##sym = crc + asm(".section \"___kcrctab" sec "+" #sym "\",\"a\"" "\n" \ + "__crc_" #sym ":" "\n" \ + ".long " #crc "\n" \ + ".previous" "\n") #endif /* __LINUX_EXPORT_INTERNAL_H__ */ -- cgit v1.2.3 From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, -- cgit v1.2.3 From 7e9fbbb1b776d8d7969551565bc246f74ec53b27 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 28 Sep 2022 13:39:38 -0400 Subject: ring-buffer: Add ring_buffer_wake_waiters() On closing of a file that represents a ring buffer or flushing the file, there may be waiters on the ring buffer that needs to be woken up and exit the ring_buffer_wait() function. 
Add ring_buffer_wake_waiters() to wake up the waiters on the ring buffer and allow them to exit the wait loop. Link: https://lkml.kernel.org/r/20220928133938.28dc2c27@gandalf.local.home Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code") Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dac53fd3afea..2504df9a0453 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); - +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu); #define RING_BUFFER_ALL_CPUS -1 -- cgit v1.2.3 From f3ddb74ad0790030c9592229fb14d8c451f4e9a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 27 Sep 2022 19:15:27 -0400 Subject: tracing: Wake up ring buffer waiters on closing of the file When the file that represents the ring buffer is closed, there may be waiters waiting on more input from the ring buffer. Call ring_buffer_wake_waiters() to wake up any waiters when the file is closed. Link: https://lkml.kernel.org/r/20220927231825.182416969@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8401dec93c15..20749bd9db71 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -92,6 +92,7 @@ struct trace_iterator { unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; + long wait_index; /* trace_seq for __print_flags() and __print_symbolic() etc. */ struct trace_seq tmp_seq; -- cgit v1.2.3 From 6eaab4dfdd30ea8fb0dd4ee04940676c12b728e8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Sep 2022 17:39:01 +0100 Subject: net: introduce struct ubuf_info_msgzc We're going to split struct ubuf_info and leave there only mandatory fields. Users are free to extend it. Add struct ubuf_info_msgzc, which will be an extended version for MSG_ZEROCOPY and some other users. It duplicates of struct ubuf_info for now and will be removed in a couple of patches. 
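A MSG_ZEROCOPY completion callback will then reach the extended fields through the new container_of() helper; the callback body below is illustrative, the signature matches struct ubuf_info::callback:

	static void example_zerocopy_done(struct sk_buff *skb,
					  struct ubuf_info *uarg,
					  bool success)
	{
		struct ubuf_info_msgzc *msgzc = uarg_to_msgzc(uarg);

		/* ->id, ->len, ->bytelen live in the extended structure */
		pr_debug("zerocopy notify %u..%u\n",
			 msgzc->id, msgzc->id + msgzc->len - 1);
	}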
Signed-off-by: Pavel Begunkov Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f15d5b62539b..fd7dcb977fdf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -554,7 +554,28 @@ struct ubuf_info { } mmp; }; +struct ubuf_info_msgzc { + struct ubuf_info ubuf; + + union { + struct { + unsigned long desc; + void *ctx; + }; + struct { + u32 id; + u16 len; + u16 zerocopy:1; + u32 bytelen; + }; + }; + + struct mmpin mmp; +}; + #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ + ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); -- cgit v1.2.3 From e7d2b510165fff6bedc9cca88c071ad846850c74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Sep 2022 17:39:04 +0100 Subject: net: shrink struct ubuf_info We can benefit from a smaller struct ubuf_info, so leave only mandatory fields and let users to decide how they want to extend it. Convert MSG_ZEROCOPY to struct ubuf_info_msgzc and remove duplicated fields. This reduces the size from 48 bytes to just 16. Signed-off-by: Pavel Begunkov Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fd7dcb977fdf..920eb6413fee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -533,25 +533,8 @@ enum { struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); - union { - struct { - unsigned long desc; - void *ctx; - }; - struct { - u32 id; - u16 len; - u16 zerocopy:1; - u32 bytelen; - }; - }; refcount_t refcnt; u8 flags; - - struct mmpin { - struct user_struct *user; - unsigned int num_pg; - } mmp; }; struct ubuf_info_msgzc { @@ -570,7 +553,10 @@ struct ubuf_info_msgzc { }; }; - struct mmpin mmp; + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) -- cgit v1.2.3 From b48b89f9c189d24eb5e2b4a0ac067da5a24ee86d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Sep 2022 06:27:53 -0700 Subject: net: drop the weight argument from netif_napi_add We tell driver developers to always pass NAPI_POLL_WEIGHT as the weight to netif_napi_add(). This may be confusing to newcomers, drop the weight argument, those who really need to tweak the weight can use netif_napi_add_weight(). Acked-by: Marc Kleine-Budde # for CAN Link: https://lore.kernel.org/r/20220927132753.750069-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f42fc871c3b..7a084a1c14e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2553,16 +2553,15 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * @dev: network device * @napi: NAPI context * @poll: polling function - * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. 
*/ static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) + int (*poll)(struct napi_struct *, int)) { - netif_napi_add_weight(dev, napi, poll, weight); + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void -- cgit v1.2.3 From 40b72108f9c609c5043903efb70c52d46b8aa871 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 27 Sep 2022 13:35:56 -0700 Subject: net/mlx5: Add the log_min_mkey_entity_size capability Add the capability that will allow the driver to determine the minimal MTT page size, to be able to map the smallest possible pages in XSK. Older firmwares that don't have this capability default to 12 (i.e. 4096-byte pages). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3d963c0e47e4..1ad762e22d86 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1878,7 +1878,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 max_reformat_remove_size[0x8]; u8 max_reformat_remove_offset[0x8]; - u8 reserved_at_c0[0x160]; + u8 reserved_at_c0[0xe0]; + + u8 reserved_at_1a0[0xb]; + u8 log_min_mkey_entity_size[0x5]; + u8 reserved_at_1b0[0x10]; + + u8 reserved_at_1c0[0x60]; u8 reserved_at_220[0x1]; u8 sw_vhca_id_valid[0x1]; -- cgit v1.2.3 From 9ed9cac1850a2a55674b4a17100c50b46f645921 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:07 -0700 Subject: slab: Remove __malloc attribute from realloc functions The __malloc attribute should not be applied to "realloc" functions, as the returned pointer may alias the storage of the prior pointer. Instead of splitting __malloc from __alloc_size, which would be a huge amount of churn, just create __realloc_size for the few cases where it is needed. Thanks to Geert Uytterhoeven for reporting build failures with gcc-8 in an earlier version, which tried to remove the #ifdef. While the "alloc_size" attribute is available on all GCC versions, I forgot that it gets disabled explicitly by the kernel in GCC < 9.1 due to misbehaviors. Add a note to the compiler_attributes.h entry for it. Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Marco Elver Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/compiler_attributes.h | 3 ++- include/linux/compiler_types.h | 8 +++++--- include/linux/slab.h | 12 ++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..96a4ed11b4be 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -35,7 +35,8 @@ /* * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally - * available and includes other attributes. + * available and includes other attributes. For GCC < 9.1, __alloc_size__ gets undefined + * in compiler-gcc.h, due to misbehaviors.
* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..0717534f8364 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -271,14 +271,16 @@ struct ftrace_likely_data { /* * Any place that could be marked with the "alloc_size" attribute is also - * a place to be marked with the "malloc" attribute. Do this as part of the - * __alloc_size macro to avoid redundant attributes and to avoid missing a - * __malloc marking. + * a place to be marked with the "malloc" attribute, except those that may + * be performing a _reallocation_, as that may alias the existing pointer. + * For these, use __realloc_size(). */ #ifdef __alloc_size__ # define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +# define __realloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) #else # define __alloc_size(x, ...) __malloc +# define __realloc_size(x, ...) #endif #ifndef asm_volatile_goto diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..41bd036e7551 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,7 +184,7 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -647,10 +647,10 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, - size_t new_n, - size_t new_size, - gfp_t flags) +static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -774,7 +774,7 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla } extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __alloc_size(3); + __realloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -- cgit v1.2.3 From 05a940656e1eb2026d9ee31019d5b47e9545124d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:08 -0700 Subject: slab: Introduce kmalloc_size_roundup() In the effort to help the compiler reason about buffer sizes, the __alloc_size attribute was added to allocators. This improves the scope of the compiler's ability to apply CONFIG_UBSAN_BOUNDS and (in the near future) CONFIG_FORTIFY_SOURCE. For most allocations, this works well, as the vast majority of callers are not expecting to use more memory than what they asked for. There is, however, one common exception to this: anticipatory resizing of kmalloc allocations. These cases all use ksize() to determine the actual bucket size of a given allocation (e.g. 128 when 126 was asked for). This comes in two styles in the kernel: 1) An allocation has been determined to be too small, and needs to be resized. 
Instead of the caller choosing its own next best size, it wants to minimize the number of calls to krealloc(), so it just uses ksize() plus some additional bytes, forcing the realloc into the next bucket size, from which it can learn how large it is now. For example: data = krealloc(data, ksize(data) + 1, gfp); data_len = ksize(data); 2) The minimum size of an allocation is calculated, but since it may grow in the future, just use all the space available in the chosen bucket immediately, to avoid needing to reallocate later. A good example of this is skbuff's allocators: data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ... /* kmalloc(size) might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ osize = ksize(data); size = SKB_WITH_OVERHEAD(osize); In both cases, the "how much was actually allocated?" question is answered _after_ the allocation, where the compiler hinting is not in an easy place to make the association any more. This mismatch between the compiler's view of the buffer length and the code's intention about how much it is going to actually use has already caused problems[1]. It is possible to fix this by reordering the use of the "actual size" information. We can serve the needs of users of ksize() and still have accurate buffer length hinting for the compiler by doing the bucket size calculation _before_ the allocation. Code can instead ask "how large an allocation would I get for a given size?". Introduce kmalloc_size_roundup(), to serve this function so we can start replacing the "anticipatory resizing" uses of ksize(). [1] https://github.com/ClangBuiltLinux/linux/issues/1599 https://github.com/KSPP/linux/issues/183 [ vbabka@suse.cz: add SLOB version ] Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 41bd036e7551..727640173568 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -188,7 +188,21 @@ void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __r void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); + +/** + * ksize - Report actual allocation size of associated object + * + * @objp: Pointer returned from a prior kmalloc()-family allocation. + * + * This should not be used for writing beyond the originally requested + * allocation size. Either use krealloc() or round up the allocation size + * with kmalloc_size_roundup() prior to allocation. If this is used to + * access beyond the originally requested allocation size, UBSAN_BOUNDS + * and/or FORTIFY_SOURCE may trip, since they only know about the + * originally allocated size via the __alloc_size attribute. + */ size_t ksize(const void *objp); + #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -779,6 +793,23 @@ extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); + +/** + * kmalloc_size_roundup - Report allocation bucket size for the given size + * + * @size: Number of bytes to round up from. 
+ * + * This returns the number of bytes that would be available in a kmalloc() + * allocation of @size bytes. For example, a 126 byte request would be + * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly + * for the general-purpose kmalloc()-based allocations, and is not for the + * pre-sized kmem_cache_alloc()-based allocations.) + * + * Use this to kmalloc() the full bucket size ahead of time instead of using + * ksize() to query the size after an allocation. + */ +size_t kmalloc_size_roundup(size_t size); + void __init kmem_cache_init_late(void); #if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -- cgit v1.2.3 From c60ba2d34608dc6e224440267ed392adf65e05f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Sep 2022 02:10:37 +0206 Subject: printk: Make pr_flush() static No user outside the printk code and no reason to export this. Signed-off-by: Thomas Gleixner Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220924000454.3319186-2-john.ogness@linutronix.de --- include/linux/printk.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 091fba7283e1..b70a42f94031 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,8 +170,6 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit -extern bool pr_flush(int timeout_ms, bool reset_on_progress); - /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -222,11 +220,6 @@ static inline void printk_deferred_exit(void) { } -static inline bool pr_flush(int timeout_ms, bool reset_on_progress) -{ - return true; -} - static inline int printk_ratelimit(void) { return 0; -- cgit v1.2.3 From e3f12f0602833c88ad2cbe3aff397acd0cfd2695 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Sep 2022 02:10:38 +0206 Subject: printk: Declare log_wait properly kernel/printk/printk.c:365:1: warning: symbol 'log_wait' was not declared. Should it be static? Signed-off-by: Thomas Gleixner Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220924000454.3319186-3-john.ogness@linutronix.de --- include/linux/syslog.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 86af908e2663..955f80e34d4f 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -8,6 +8,8 @@ #ifndef _LINUX_SYSLOG_H #define _LINUX_SYSLOG_H +#include <linux/wait.h> + /* Close the log. Currently a NOP. */ #define SYSLOG_ACTION_CLOSE 0 /* Open the log. Currently a NOP. */ #define SYSLOG_ACTION_OPEN 1 @@ -35,5 +37,6 @@ #define SYSLOG_FROM_PROC 1 int do_syslog(int type, char __user *buf, int count, int source); +extern wait_queue_head_t log_wait; #endif /* _LINUX_SYSLOG_H */ -- cgit v1.2.3 From 8cafdb5ab94cda3ebb0975be16e2d564a05132ea Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 29 Sep 2022 09:47:44 +0200 Subject: block: adapt blk_mq_plug() to not plug for writes that require a zone lock The current implementation of blk_mq_plug() disables plugging for all operations that involve a transfer to the device, as we just check the last bit of the opcode via the op_is_write() function.
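For reference, op_is_write() only tests the lowest bit of the opcode; a paraphrased sketch of the helper (see include/linux/blk_types.h):

    /* Paraphrased: the low bit of a REQ_OP_* value encodes the data
     * direction, so every to-device operation (REQ_OP_DISCARD,
     * REQ_OP_SECURE_ERASE, ...) tests true here, not only the writes
     * that can actually require a zone lock. */
    static inline bool op_is_write(blk_opf_t op)
    {
            return !!(op & 1);
    }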
Modify blk_mq_plug() to disable plugging only for REQ_OP_WRITE and REQ_OP_WRITE_ZEROES, as they might require a zone lock. Suggested-by: Christoph Hellwig Suggested-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Pankaj Raghav Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220929074745.103073-2-p.raghav@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4750772ef228..49373d002631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1304,6 +1304,15 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline bool bdev_op_is_zoned_write(struct block_device *bdev, + blk_opf_t op) +{ + if (!bdev_is_zoned(bdev)) + return false; + + return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; +} + static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 39d6d08b2edf99c4b39a689a70bf0adee065b357 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:08 -0700 Subject: tracing/user_events: Use bits vs bytes for enabled status page data User processes may require many events, and when they do, the cache performance of a byte-index status check is less ideal than a bit index. The previous per-page event limit was 4096; the new limit is 32,768. This change adds a bitwise index to the user_reg struct. Programs check whether the bit at status_bit is set within the status page(s). Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 736e05603463..592a3fbed98e 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,15 +20,6 @@ #define USER_EVENTS_SYSTEM "user_events" #define USER_EVENTS_PREFIX "u:" -/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ -#define EVENT_BIT_FTRACE 0 -#define EVENT_BIT_PERF 1 -#define EVENT_BIT_OTHER 7 - -#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) -#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) -#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) - /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) @@ -45,12 +36,12 @@ struct user_reg { /* Input: Pointer to string with event name, description and flags */ __u64 name_args; - /* Output: Byte index of the event within the status page */ - __u32 status_index; + /* Output: Bitwise index of the event within the status page */ + __u32 status_bit; /* Output: Index of the event to use when writing data */ __u32 write_index; -}; +} __attribute__((__packed__)); #define DIAG_IOC_MAGIC '*' -- cgit v1.2.3 From adfdfcbdbd32b356323a3db6d3a683270051a7e6 Mon Sep 17 00:00:00 2001 From: Jerome Neanne Date: Thu, 29 Sep 2022 15:25:25 +0200 Subject: regulator: gpio: Add input_supply support in gpio_regulator_config This is similar to fixed-regulator. It is used to extract the regulator parent from the device tree.
Without that property, the parent regulator can be shut down (if it is not always-on), leading to inappropriate behavior: On am62-SP-SK this fix is required to avoid tps65219 ldo1 (SDMMC rail) being shut down after boot completion. Signed-off-by: Jerome Neanne Link: https://lore.kernel.org/r/20220929132526.29427-2-jneanne@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/gpio-regulator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/gpio-regulator.h b/include/linux/regulator/gpio-regulator.h index fdeb312cdabd..c223e50ff9f7 100644 --- a/include/linux/regulator/gpio-regulator.h +++ b/include/linux/regulator/gpio-regulator.h @@ -42,6 +42,7 @@ struct gpio_regulator_state { /** * struct gpio_regulator_config - config structure * @supply_name: Name of the regulator supply + * @input_supply: Name of the input regulator supply * @enabled_at_boot: Whether regulator has been enabled at * boot or not. 1 = Yes, 0 = No * This is used to keep the regulator at */ struct gpio_regulator_config { const char *supply_name; + const char *input_supply; unsigned enabled_at_boot:1; unsigned startup_delay; -- cgit v1.2.3 From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:03 -0700 Subject: bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline The struct_ops prog allows using bpf to implement the functions in a struct (eg. a kernel module). The current usage is to implement the tcp_congestion ops. The kernel does not call the tcp-cc's ops (ie. the bpf prog) in a recursive way. The struct_ops shares the tracing trampoline's enter/exit function, which tracks prog->active to avoid recursion. It is needed for tracing progs. However, it turns out the struct_ops bpf prog can hit this prog->active check and unnecessarily skip running the struct_ops prog. eg. '.ssthresh' may run in_task() and then be interrupted by a softirq that runs the same '.ssthresh'. Skipping that '.ssthresh' run ends up returning a random value to the caller. The patch adds __bpf_prog_{enter,exit}_struct_ops for the struct_ops trampoline. They do not track prog->active to detect recursion. One exception is when the tcp_congestion's '.init' ops is doing bpf_setsockopt(TCP_CONGESTION) and then recurs to the same '.init' ops. This will be addressed in the following patches.
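A simplified sketch of what the new enter helper amounts to (the real implementation lives in kernel/bpf/trampoline.c); note the absence of a prog->active test before dispatch:

    /* Sketch: RCU/migration protection and a start timestamp for stats,
     * but no prog->active recursion check, so a softirq re-entering the
     * same struct_ops prog is allowed to run. */
    u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
                                            struct bpf_tramp_run_ctx *run_ctx)
    {
            rcu_read_lock();
            migrate_disable();
            run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
            return bpf_prog_start_time();
    }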
Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f3eaf3ed98c..9e7d46d16032 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); -- cgit v1.2.3 From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:06 -0700 Subject: bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur itself When a bad bpf prog '.init' calls bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop: .init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ... ... => .init => bpf_setsockopt(tcp_cc). It was prevented by the prog->active counter before, but the prog->active detection cannot be used in struct_ops, as explained in the earlier patch of the set. In this patch, the second bpf_setsockopt(tcp_cc) is not allowed, in order to break the loop. This is done by using a bit of an existing 1 byte hole in tcp_sock to check if there is an on-going bpf_setsockopt(TCP_CONGESTION) on this tcp_sock. Note that this essentially means only the first '.init' can call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. the peer does not support ECN), and the second '.init' cannot fall back to another cc. This applies even when the second bpf_setsockopt(TCP_CONGESTION) would not cause a loop. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..3bdf687e2fb3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -388,6 +388,12 @@ struct tcp_sock { u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ + u8 bpf_chg_cc_inprogress:1; /* In the middle of + * bpf_setsockopt(TCP_CONGESTION), + * it is to avoid the bpf_tcp_cc->init() + * to recur itself by calling + * bpf_setsockopt(TCP_CONGESTION, "itself"). + */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 -- cgit v1.2.3 From f62384995e4cb7703e5295779c44135c5311770d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 26 Sep 2022 17:43:14 +0200 Subject: random: split initialization into early step and later step The full RNG initialization relies on some timestamps, made possible with initialization functions like time_init() and timekeeping_init(). However, these are only available rather late in initialization. Meanwhile, other things, such as memory allocator functions, make use of the RNG much earlier.
So split RNG initialization into two phases. We can provide arch randomness very early on, and then later, after timekeeping and such are available, initialize the rest. This ensures that, for example, slabs are properly randomized if RDRAND is available. Without this, CONFIG_SLAB_FREELIST_RANDOM=y loses a degree of its security, because its random seed is potentially deterministic, since it hasn't yet incorporated RDRAND. It also makes it possible to use a better seed in kfence, which currently relies on only the cycle counter. Another positive consequence is that on systems with RDRAND, running with CONFIG_WARN_ALL_UNSEEDED_RANDOM=y results in no warnings at all. One subtle side effect of this change is that on systems with no RDRAND, RDTSC is now only queried by random_init() once, committing the moment of the function call, instead of multiple times as before. This is intentional, as the multiple RDTSCs in a loop before weren't accomplishing very much, with jitter being better provided by try_to_generate_entropy(). Plus, filling blocks with RDTSC is still being done in extract_entropy(), which is necessarily called before random bytes are served anyway. Cc: Andrew Morton Reviewed-by: Kees Cook Reviewed-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 3fec206487f6..a9e6e16f9774 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -72,7 +72,8 @@ static inline unsigned long get_random_canary(void) return get_random_long() & CANARY_MASK; } -int __init random_init(const char *command_line); +void __init random_init_early(const char *command_line); +void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); -- cgit v1.2.3 From 585cd5fe9f7378601b1d4915ad6e9088333b5e5e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 28 Sep 2022 18:47:30 +0200 Subject: random: add 8-bit and 16-bit batches There are numerous places in the kernel that would be sped up by having smaller batches. Currently those callsites do `get_random_u32() & 0xff` or similar. Since these are pretty spread out, and will require patches to multiple different trees, let's get ahead of the curve and lay the foundation for `get_random_u8()` and `get_random_u16()`, so that it's then possible to start submitting conversion patches leisurely. Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index a9e6e16f9774..2c130f8f18e5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -38,6 +38,8 @@ static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { #endif void get_random_bytes(void *buf, size_t len); +u8 get_random_u8(void); +u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned int get_random_int(void) -- cgit v1.2.3 From 4c95236a335d8b62aa8dbd587bed6a5f30d8265a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 29 Sep 2022 11:49:35 +0200 Subject: prandom: make use of smaller types in prandom_u32_max When possible at compile-time, make use of smaller types in prandom_u32_max(), so that we can use smaller batches from random.c, which in turn leads to a 2x or 4x performance boost. 
This makes a difference, for example, in kfence, which needs a fast stream of small numbers (booleans). At the same time, we use the occasion to update the old documentation on these functions. prandom_u32() and prandom_bytes() have direct replacements now in random.h, while prandom_u32_max() remains useful as a prandom.h function, since it's not cryptographically secure by virtue of not being evenly distributed. Cc: Dominik Brodowski Acked-by: Marco Elver Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index deace5fb4e62..78db003bc290 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -12,11 +12,13 @@ #include <linux/percpu.h> #include <linux/random.h> +/* Deprecated: use get_random_u32 instead. */ static inline u32 prandom_u32(void) { return get_random_u32(); } +/* Deprecated: use get_random_bytes instead. */ static inline void prandom_bytes(void *buf, size_t nbytes) { return get_random_bytes(buf, nbytes); @@ -37,17 +39,20 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) * @ep_ro: right open interval endpoint * - * Returns a pseudo-random number that is in interval [0, ep_ro). Note - * that the result depends on PRNG being well distributed in [0, ~0U] - * u32 space. Here we use maximally equidistributed combined Tausworthe - * generator, that is, prandom_u32(). This is useful when requesting a - * random index of an array containing ep_ro elements, for example. + * Returns a pseudo-random number that is in interval [0, ep_ro). This is + * useful when requesting a random index of an array containing ep_ro elements, + * for example. The result is somewhat biased when ep_ro is not a power of 2, + * so do not use this for cryptographic purposes. * * Returns: pseudo-random number in interval [0, ep_ro) */ static inline u32 prandom_u32_max(u32 ep_ro) { - return (u32)(((u64) prandom_u32() * ep_ro) >> 32); + if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8) + return (get_random_u8() * ep_ro) >> 8; + if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16) + return (get_random_u16() * ep_ro) >> 16; + return ((u64)get_random_u32() * ep_ro) >> 32; } /* -- cgit v1.2.3 From 9f4beead610c83065cc0410bfe97ff51d8e9578d Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 29 Sep 2022 22:39:03 +0200 Subject: binfmt: remove taso from linux_binprm struct With commit 987f20a9dcce ("a.out: Remove the a.out implementation"), the use of the special taso flag for alpha architectures in the linux_binprm struct is gone. Remove the definition of taso in the linux_binprm struct. No functional change. Signed-off-by: Lukas Bulwahn Reviewed-by: "Eric W. Biederman" Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220929203903.9475-1-lukas.bulwahn@gmail.com --- include/linux/binfmts.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 3dc20c4f394c..8d51f69f9f5e 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -43,9 +43,6 @@ struct linux_binprm { * original userspace.
*/ point_of_no_return:1; -#ifdef __alpha__ - unsigned int taso:1; -#endif struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3 From f5290d8e4f0caa81a491448a27dd70e726095d07 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 16 Sep 2022 09:17:38 +0300 Subject: clk: asm9260: use parent index to link the reference clock Rewrite clk-asm9260 to use a parent index to link the reference clock. During this rework two helpers are added: - clk_hw_register_mux_table_parent_data() to supplement clk_hw_register_mux_table() but using parent_data instead of parent_names - clk_hw_register_fixed_rate_parent_accuracy() to be used instead of directly calling __clk_hw_register_fixed_rate(). The latter function is an internal API, which is better not called directly. Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220916061740.87167-2-dmitry.baryshkov@linaro.org Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..86140ac2f9a5 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -439,6 +439,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), NULL, (flags), \ (fixed_rate), (fixed_accuracy), 0) +/** + * clk_hw_register_fixed_rate_parent_accuracy - register fixed-rate clock with + * the clock framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @fixed_rate: non-adjustable clock rate + */ +#define clk_hw_register_fixed_rate_parent_accuracy(dev, name, parent_data, \ + flags, fixed_rate) \ + __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ + (parent_data), (flags), (fixed_rate), 0, \ + CLK_FIXED_RATE_PARENT_ACCURACY) void clk_unregister_fixed_rate(struct clk *clk); void clk_hw_unregister_fixed_rate(struct clk_hw *hw); @@ -957,6 +971,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, (parent_names), NULL, NULL, (flags), (reg), \ (shift), (mask), (clk_mux_flags), (table), \ (lock)) +#define clk_hw_register_mux_table_parent_data(dev, name, parent_data, \ + num_parents, flags, reg, shift, mask, \ + clk_mux_flags, table, lock) \ + __clk_hw_register_mux((dev), NULL, (name), (num_parents), \ + NULL, NULL, (parent_data), (flags), (reg), \ + (shift), (mask), (clk_mux_flags), (table), \ + (lock)) #define clk_hw_register_mux(dev, name, parent_names, num_parents, flags, reg, \ shift, width, clk_mux_flags, lock) \ __clk_hw_register_mux((dev), NULL, (name), (num_parents), \ -- cgit v1.2.3 From 1d7d20658534c7d36fe6f4252f6f1a27d9631a99 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 16 Sep 2022 09:17:39 +0300 Subject: clk: fixed-rate: add devm_clk_hw_register_fixed_rate Add devm_clk_hw_register_fixed_rate(), a devres-managed helper to register a fixed-rate clock.
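A minimal usage sketch for the new helper (driver name, clock name and rate are made up for illustration):

    /* Sketch: a devres-managed fixed-rate clock registered from probe;
     * no explicit unregister is needed, devres drops it on unbind. */
    static int foo_clk_probe(struct platform_device *pdev)
    {
            struct clk_hw *hw;

            hw = devm_clk_hw_register_fixed_rate(&pdev->dev, "foo-ref",
                                                 NULL, 0, 25000000);
            if (IS_ERR(hw))
                    return PTR_ERR(hw);

            return devm_of_clk_add_hw_provider(&pdev->dev,
                                               of_clk_hw_simple_get, hw);
    }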
Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220916061740.87167-3-dmitry.baryshkov@linaro.org Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 86140ac2f9a5..49405b336cad 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -350,7 +350,7 @@ struct clk_hw *__clk_hw_register_fixed_rate(struct device *dev, const char *parent_name, const struct clk_hw *parent_hw, const struct clk_parent_data *parent_data, unsigned long flags, unsigned long fixed_rate, unsigned long fixed_accuracy, - unsigned long clk_fixed_flags); + unsigned long clk_fixed_flags, bool devm); struct clk *clk_register_fixed_rate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned long fixed_rate); @@ -365,7 +365,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, */ #define clk_hw_register_fixed_rate(dev, name, parent_name, flags, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \ - NULL, (flags), (fixed_rate), 0, 0) + NULL, (flags), (fixed_rate), 0, 0, false) + +/** + * devm_clk_hw_register_fixed_rate - register fixed-rate clock with the clock + * framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @fixed_rate: non-adjustable clock rate + */ +#define devm_clk_hw_register_fixed_rate(dev, name, parent_name, flags, fixed_rate) \ + __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \ + NULL, (flags), (fixed_rate), 0, 0, true) /** * clk_hw_register_fixed_rate_parent_hw - register fixed-rate clock with * the clock framework @@ -378,7 +391,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, #define clk_hw_register_fixed_rate_parent_hw(dev, name, parent_hw, flags, \ fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw), \ - NULL, (flags), (fixed_rate), 0, 0) + NULL, (flags), (fixed_rate), 0, 0, false) /** * clk_hw_register_fixed_rate_parent_data - register fixed-rate clock with * the clock framework @@ -392,7 +405,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), (flags), (fixed_rate), 0, \ - 0) + 0, false) /** * clk_hw_register_fixed_rate_with_accuracy - register fixed-rate clock with * the clock framework @@ -408,7 +421,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, fixed_accuracy) \ __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), \ NULL, NULL, (flags), (fixed_rate), \ - (fixed_accuracy), 0) + (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_with_accuracy_parent_hw - register fixed-rate * clock with the clock framework @@ -423,7 +436,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, parent_hw, flags, fixed_rate, fixed_accuracy) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw) \ NULL, NULL, (flags), (fixed_rate), \ - (fixed_accuracy), 0) + (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_with_accuracy_parent_data - register fixed-rate * clock with the clock framework @@ -438,7 +451,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char 
*name, parent_data, flags, fixed_rate, fixed_accuracy) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), NULL, (flags), \ - (fixed_rate), (fixed_accuracy), 0) + (fixed_rate), (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_parent_accuracy - register fixed-rate clock with * the clock framework @@ -452,7 +465,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, flags, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), (flags), (fixed_rate), 0, \ - CLK_FIXED_RATE_PARENT_ACCURACY) + CLK_FIXED_RATE_PARENT_ACCURACY, false) void clk_unregister_fixed_rate(struct clk *clk); void clk_hw_unregister_fixed_rate(struct clk_hw *hw); -- cgit v1.2.3 From dbae2b062824fc2d35ae2d5df2f500626c758e80 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 28 Sep 2022 10:43:09 +0200 Subject: net: skb: introduce and use a single page frag cache After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") we are observing 10-20% regressions in performance tests with small packets. The perf trace points to high pressure on the slab allocator. This change tries to improve the allocation schema for small packets using an idea originally suggested by Eric: a new per CPU page frag is introduced and used in __napi_alloc_skb to cope with small allocation requests. To ensure that the above does not lead to excessive truesize underestimation, the frag size for small allocations is inflated to 1K and all the above is restricted to builds with a 4K page size. Note that we need to update accordingly the run-time check introduced with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper"). Alex suggested a smart page refcount schema to reduce the number of atomic operations and deal properly with pfmemalloc pages. Under small-packet UDP flood, I measure a 15% peak tput increase. Suggested-by: Eric Dumazet Suggested-by: Alexander H Duyck Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/6b6f65957c59f86a353fc09a5127e83a32ab5999.1664350652.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7a084a1c14e9..64c3a5864969 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3821,6 +3821,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); -- cgit v1.2.3 From aac4daa8941ea6566563ac001e9e5d4e54a674e2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 28 Sep 2022 12:51:57 +0300 Subject: net/sched: query offload capabilities through ndo_setup_tc() When adding optional new features to Qdisc offloads, existing drivers must reject the new configuration until they are coded up to act on it. Since modifying all drivers in lockstep with the changes in the Qdisc can create problems of its own, it would be nice if there existed an automatic opt-in mechanism for offloading optional features.
Jakub proposes that we multiplex one more kind of call through ndo_setup_tc(): one where the driver populates a Qdisc-specific capability structure. The first user will be taprio, in further changes. Here we are introducing the definitions for the base functionality. Link: https://patchwork.kernel.org/project/netdevbpf/patch/20220923163310.3192733-3-vladimir.oltean@nxp.com/ Suggested-by: Jakub Kicinski Co-developed-by: Jakub Kicinski Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64c3a5864969..eddf8ee270e7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -940,6 +940,7 @@ struct net_device_path_ctx { }; enum tc_setup_type { + TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, -- cgit v1.2.3 From 5bdf402a05fafc55c876b9993b4c948bb2bc2361 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 18 Aug 2022 10:34:40 +0530 Subject: fs/buffer: make submit_bh & submit_bh_wbc return type as void submit_bh/submit_bh_wbc are non-blocking functions which just submit the bio and return. The caller of submit_bh/submit_bh_wbc needs to wait on the buffer until I/O completion and then check the buffer head's b_state field to know if there was any I/O error. Hence there is no need for these functions to return any value; even now they always return 0. Hence drop the return value and make their return type void to avoid any confusion. Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/cb66ef823374cdd94d2d03083ce13de844fffd41.1660788334.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..dcc0e90d8979 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -229,7 +229,7 @@ void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); -int submit_bh(blk_opf_t, struct buffer_head *); +void submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); -- cgit v1.2.3 From cbfecb927f429a6fa613d74b998496bd71e4438a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 25 Aug 2022 12:06:57 +0200 Subject: fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE Currently I_DIRTY_TIME will never get set if the inode already has I_DIRTY_INODE, with the assumption that it supersedes I_DIRTY_TIME. That's true; however, ext4 will only update the on-disk inode in ->dirty_inode(), not on actual writeback. As a result, if the inode already has the I_DIRTY_INODE state by the time we get to __mark_inode_dirty() only with I_DIRTY_TIME, the time was already filled into the on-disk inode and will not get updated until the next I_DIRTY_INODE update, which might never come if we crash or get a power failure. The problem can be reproduced on ext4 by running xfstest generic/622 with the -o iversion mount option. Fix it by allowing I_DIRTY_TIME to be set even if the inode already has I_DIRTY_INODE.
Also make sure that the case is properly handled in writeback_single_inode(). Additionally, changes were made in xfs_fs_dirty_inode() to accommodate the I_DIRTY_TIME flag. Thanks to Jan Kara for suggestions on how to make this work properly. Cc: Dave Chinner Cc: Christoph Hellwig Cc: stable@kernel.org Signed-off-by: Lukas Czerner Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220825100657.44217-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- include/linux/fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..56a4b4b02477 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2371,13 +2371,14 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself only has dirty timestamps, and the + * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. I.e. - * either I_DIRTY_TIME *or* I_DIRTY_INODE can be set in - * i_state, but not both. I_DIRTY_PAGES may still be set. + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and -- cgit v1.2.3 From d427c8999b071af2003203b42a007c4a80156c18 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Wed, 28 Sep 2022 14:55:31 +0200 Subject: net-next: skbuff: refactor pskb_pull pskb_may_pull already contains all of the checks performed by pskb_pull. Use pskb_may_pull for validation in pskb_pull, eliminating the duplication and making __pskb_pull obsolete. Replace __pskb_pull with pskb_pull where applicable. Signed-off-by: Richard Gobert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 920eb6413fee..9fcf534f2d92 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2616,20 +2616,6 @@ void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); -static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb_headlen(skb) && - !__pskb_pull_tail(skb, len - skb_headlen(skb))) - return NULL; - skb->len -= len; - return skb->data += len; -} - -static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) -{ - return unlikely(len > skb->len) ?
NULL : __pskb_pull(skb, len); - } - static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) @@ -2639,6 +2625,15 @@ static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; } +static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) +{ + if (!pskb_may_pull(skb, len)) + return NULL; + + skb->len -= len; + return skb->data += len; +} + void skb_condense(struct sk_buff *skb); /** -- cgit v1.2.3 From f4ce91ce12a7c6ead19b128ffa8cff6e3ded2a14 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Sep 2022 16:03:31 -0400 Subject: tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited This commit fixes a bug in the tracking of max_packets_out and is_cwnd_limited. This bug can cause the connection to fail to remember that is_cwnd_limited is true, causing the connection to fail to grow cwnd when it should, causing throughput to be lower than it should be. The following event sequence is an example that triggers the bug: (a) The connection is cwnd_limited, but packets_out is not at its peak due to TSO deferral deciding not to send another skb yet. In such cases the connection can advance max_packets_seq and set tp->is_cwnd_limited to true and max_packets_out to a small number. (b) Then later in the round trip the connection is pacing-limited (not cwnd-limited), and packets_out is larger. In such cases the connection would raise max_packets_out to a bigger number but (unexpectedly) flip tp->is_cwnd_limited from true to false. This commit fixes that bug. One straightforward fix would be to separately track (a) the next window after max_packets_out reaches a maximum, and (b) the next window after tp->is_cwnd_limited is set to true. But this would require consuming an extra u32 sequence number. Instead, to save space we track only the most important information. Specifically, we track the strongest available signal of the degree to which the cwnd is fully utilized: (1) If the connection is cwnd-limited then we remember that fact for the current window. (2) If the connection is not cwnd-limited then we track the maximum number of outstanding packets in the current window. In particular, note that the new logic cannot trigger the buggy (a)/(b) sequence above because with the new logic a condition where tp->packets_out > tp->max_packets_out can only trigger an update of tp->is_cwnd_limited if tp->is_cwnd_limited is false. This first showed up in testing of a BBRv2 dev branch, but this buggy behavior highlighted a general issue with the tcp_cwnd_validate() logic that can cause cwnd to fail to increase at the proper rate for any TCP congestion control, including Reno or CUBIC. Fixes: ca8a22634381 ("tcp: make cwnd-limited checks measurement-based, and gentler") Signed-off-by: Neal Cardwell Signed-off-by: Kevin(Yudong) Yang Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..4791fd801945 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -295,7 +295,7 @@ struct tcp_sock { u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u32 max_packets_out; /* max packets_out in last window */ - u32 max_packets_seq; /* right edge of max_packets_out flight */ + u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ -- cgit v1.2.3 From 650ae67bbf7ba5ac193f053969612fbb93247b64 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:38 -0400 Subject: counter: Introduce the Signal polarity component The Signal polarity component represents the active level of a respective Signal. There are two possible states: positive (rising edge) and negative (falling edge); enum counter_signal_polarity represents these states. A convenience macro COUNTER_COMP_POLARITY() is provided for driver authors to declare a Signal polarity component. Cc: Julien Panis Link: https://lore.kernel.org/r/8f47d6e1db71a11bb1e2666f8e2a6e9d256d4131.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/b6e53438badcb6318997d13dd2fc052f97d808ac.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index a81234bc8ea8..b3fb6b68881a 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -31,6 +31,7 @@ enum counter_comp_type { COUNTER_COMP_ENUM, COUNTER_COMP_COUNT_DIRECTION, COUNTER_COMP_COUNT_MODE, + COUNTER_COMP_SIGNAL_POLARITY, }; /** @@ -477,6 +478,15 @@ struct counter_available { #define COUNTER_COMP_FLOOR(_read, _write) \ COUNTER_COMP_COUNT_U64("floor", _read, _write) +#define COUNTER_COMP_POLARITY(_read, _write, _available) \ +{ \ + .type = COUNTER_COMP_SIGNAL_POLARITY, \ + .name = "polarity", \ + .signal_u32_read = (_read), \ + .signal_u32_write = (_write), \ + .priv = &(_available), \ +} + #define COUNTER_COMP_PRESET(_read, _write) \ COUNTER_COMP_COUNT_U64("preset", _read, _write) -- cgit v1.2.3 From 45d2918520b2d8e640e4fb3fbf664dfb823dc520 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:40 -0400 Subject: counter: Introduce the Count capture component Some devices provide a latch function to save historic Count values. This patch standardizes exposure of such functionality as Count capture components. A COUNTER_COMP_CAPTURE macro is provided for driver authors to define a capture component. A new event COUNTER_EVENT_CAPTURE is introduced to represent Count value capture events. 
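A sketch of how a counter driver might expose a capture value with the macro introduced below (the foo_* names are hypothetical):

    /* Sketch: read-only capture extension on a Count; a real driver
     * would read its capture latch register here. */
    static int foo_capture_read(struct counter_device *counter,
                                struct counter_count *count, u64 *val)
    {
            *val = 0; /* placeholder for the latched Count value */
            return 0;
    }

    static struct counter_comp foo_count_ext[] = {
            COUNTER_COMP_CAPTURE(foo_capture_read, NULL),
    };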
Cc: Julien Panis Link: https://lore.kernel.org/r/c239572ab4208d0d6728136e82a88ad464369a7a.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/3cebaa0b807a225eb277d771504fe6dba7269ffd.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index b3fb6b68881a..e160197971dd 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -453,6 +453,9 @@ struct counter_available { .priv = &(_available), \ } +#define COUNTER_COMP_CAPTURE(_read, _write) \ + COUNTER_COMP_COUNT_U64("capture", _read, _write) + #define COUNTER_COMP_CEILING(_read, _write) \ COUNTER_COMP_COUNT_U64("ceiling", _read, _write) -- cgit v1.2.3 From d2011be1e22f7769c7c71d6d7f777ffcc544808d Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:42 -0400 Subject: counter: Introduce the COUNTER_COMP_ARRAY component type The COUNTER_COMP_ARRAY Counter component type is introduced to enable support for Counter array components. With Counter array components, exposure for buffers on counter devices can be defined via new Counter array component macros. This should simplify code for driver authors who would otherwise need to define individual Counter components for each array element. Eight Counter array component macros are introduced:: DEFINE_COUNTER_ARRAY_U64(_name, _length) DEFINE_COUNTER_ARRAY_CAPTURE(_name, _length) DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) COUNTER_COMP_DEVICE_ARRAY_U64(_name, _read, _write, _array) COUNTER_COMP_COUNT_ARRAY_U64(_name, _read, _write, _array) COUNTER_COMP_SIGNAL_ARRAY_U64(_name, _read, _write, _array) COUNTER_COMP_ARRAY_CAPTURE(_read, _write, _array) COUNTER_COMP_ARRAY_POLARITY(_read, _write, _array) Eight Counter array callbacks are introduced as well:: int (*signal_array_u32_read)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u32 *val); int (*signal_array_u32_write)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u32 val); int (*device_array_u64_read)(struct counter_device *counter, size_t idx, u64 *val); int (*count_array_u64_read)(struct counter_device *counter, struct counter_count *count, size_t idx, u64 *val); int (*signal_array_u64_read)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u64 *val); int (*device_array_u64_write)(struct counter_device *counter, size_t idx, u64 val); int (*count_array_u64_write)(struct counter_device *counter, struct counter_count *count, size_t idx, u64 val); int (*signal_array_u64_write)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u64 val); Driver authors can handle reads/writes for an array component by receiving an element index via the `idx` parameter and processing the respective value via the `val` parameter. 
For example, suppose a driver wants to expose a Count's read-only capture buffer of four elements using a callback `foobar_capture_read()`:: DEFINE_COUNTER_ARRAY_CAPTURE(foobar_capture_array, 4); COUNTER_COMP_ARRAY_CAPTURE(foobar_capture_read, NULL, foobar_capture_array) Respective sysfs attributes for each array element would appear for the respective Count: * /sys/bus/counter/devices/counterX/countY/capture0 * /sys/bus/counter/devices/counterX/countY/capture1 * /sys/bus/counter/devices/counterX/countY/capture2 * /sys/bus/counter/devices/counterX/countY/capture3 If a user tries to read _capture2_ for example, `idx` will be `2` when passed to the `foobar_capture_read()` callback, and thus the driver knows which array element to handle. Counter arrays for polarity elements can be defined in a similar manner as u64 elements:: const enum counter_signal_polarity foobar_polarity_states[] = { COUNTER_SIGNAL_POLARITY_POSITIVE, COUNTER_SIGNAL_POLARITY_NEGATIVE, }; DEFINE_COUNTER_ARRAY_POLARITY(foobar_polarity_array, foobar_polarity_states, 4); COUNTER_COMP_ARRAY_POLARITY(foobar_polarity_read, foobar_polarity_write, foobar_polarity_array) Tested-by: Julien Panis Link: https://lore.kernel.org/r/5310c22520aeae65b1b74952419f49ac4c8e1ec1.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/a51fd608704bdfc5a0efa503fc5481df34241e0a.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 134 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index e160197971dd..c41fa602ed28 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -32,6 +32,7 @@ enum counter_comp_type { COUNTER_COMP_COUNT_DIRECTION, COUNTER_COMP_COUNT_MODE, COUNTER_COMP_SIGNAL_POLARITY, + COUNTER_COMP_ARRAY, }; /** @@ -69,6 +70,30 @@ enum counter_comp_type { * @signal_u64_read: Signal u64 component read callback. The read value of * the respective Signal u64 component should be passed * back via the val parameter. + * @signal_array_u32_read: Signal u32 array component read callback. The + * index of the respective Count u32 array + * component element is passed via the idx + * parameter. The read value of the respective + * Count u32 array component element should be + * passed back via the val parameter. + * @device_array_u64_read: Device u64 array component read callback. The + * index of the respective Device u64 array + * component element is passed via the idx + * parameter. The read value of the respective + * Device u64 array component element should be + * passed back via the val parameter. + * @count_array_u64_read: Count u64 array component read callback. The + * index of the respective Count u64 array + * component element is passed via the idx + * parameter. The read value of the respective + * Count u64 array component element should be + * passed back via the val parameter. + * @signal_array_u64_read: Signal u64 array component read callback. The + * index of the respective Count u64 array + * component element is passed via the idx + * parameter. The read value of the respective + * Count u64 array component element should be + * passed back via the val parameter. * @action_write: Synapse action mode write callback. The write value of * the respective Synapse action mode is passed via the * action parameter. 
@@ -99,6 +124,30 @@ enum counter_comp_type { * @signal_u64_write: Signal u64 component write callback. The write value of * the respective Signal u64 component is passed via the * val parameter. + * @signal_array_u32_write: Signal u32 array component write callback. The + * index of the respective Signal u32 array + * component element is passed via the idx + * parameter. The write value of the respective + * Signal u32 array component element is passed via + * the val parameter. + * @device_array_u64_write: Device u64 array component write callback. The + * index of the respective Device u64 array + * component element is passed via the idx + * parameter. The write value of the respective + * Device u64 array component element is passed via + * the val parameter. + * @count_array_u64_write: Count u64 array component write callback. The + * index of the respective Count u64 array + * component element is passed via the idx + * parameter. The write value of the respective + * Count u64 array component element is passed via + * the val parameter. + * @signal_array_u64_write: Signal u64 array component write callback. The + * index of the respective Signal u64 array + * component element is passed via the idx + * parameter. The write value of the respective + * Signal u64 array component element is passed via + * the val parameter. */ struct counter_comp { enum counter_comp_type type; @@ -126,6 +175,17 @@ struct counter_comp { struct counter_count *count, u64 *val); int (*signal_u64_read)(struct counter_device *counter, struct counter_signal *signal, u64 *val); + int (*signal_array_u32_read)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u32 *val); + int (*device_array_u64_read)(struct counter_device *counter, + size_t idx, u64 *val); + int (*count_array_u64_read)(struct counter_device *counter, + struct counter_count *count, + size_t idx, u64 *val); + int (*signal_array_u64_read)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u64 *val); }; union { int (*action_write)(struct counter_device *counter, @@ -149,6 +209,17 @@ struct counter_comp { struct counter_count *count, u64 val); int (*signal_u64_write)(struct counter_device *counter, struct counter_signal *signal, u64 val); + int (*signal_array_u32_write)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u32 val); + int (*device_array_u64_write)(struct counter_device *counter, + size_t idx, u64 val); + int (*count_array_u64_write)(struct counter_device *counter, + struct counter_count *count, + size_t idx, u64 val); + int (*signal_array_u64_write)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u64 val); }; }; @@ -453,6 +524,57 @@ struct counter_available { .priv = &(_available), \ } +struct counter_array { + enum counter_comp_type type; + const struct counter_available *avail; + union { + size_t length; + size_t idx; + }; +}; + +#define DEFINE_COUNTER_ARRAY_U64(_name, _length) \ + struct counter_array _name = { \ + .type = COUNTER_COMP_U64, \ + .length = (_length), \ + } + +#define DEFINE_COUNTER_ARRAY_CAPTURE(_name, _length) \ + DEFINE_COUNTER_ARRAY_U64(_name, _length) + +#define DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) \ + DEFINE_COUNTER_AVAILABLE(_name##_available, _enums); \ + struct counter_array _name = { \ + .type = COUNTER_COMP_SIGNAL_POLARITY, \ + .avail = &(_name##_available), \ + .length = (_length), \ + } + +#define COUNTER_COMP_DEVICE_ARRAY_U64(_name, _read, _write, _array) \ +{ \ + .type = 
COUNTER_COMP_ARRAY, \ + .name = (_name), \ + .device_array_u64_read = (_read), \ + .device_array_u64_write = (_write), \ + .priv = &(_array), \ +} +#define COUNTER_COMP_COUNT_ARRAY_U64(_name, _read, _write, _array) \ +{ \ + .type = COUNTER_COMP_ARRAY, \ + .name = (_name), \ + .count_array_u64_read = (_read), \ + .count_array_u64_write = (_write), \ + .priv = &(_array), \ +} +#define COUNTER_COMP_SIGNAL_ARRAY_U64(_name, _read, _write, _array) \ +{ \ + .type = COUNTER_COMP_ARRAY, \ + .name = (_name), \ + .signal_array_u64_read = (_read), \ + .signal_array_u64_write = (_write), \ + .priv = &(_array), \ +} + #define COUNTER_COMP_CAPTURE(_read, _write) \ COUNTER_COMP_COUNT_U64("capture", _read, _write) @@ -496,4 +618,16 @@ struct counter_available { #define COUNTER_COMP_PRESET_ENABLE(_read, _write) \ COUNTER_COMP_COUNT_BOOL("preset_enable", _read, _write) +#define COUNTER_COMP_ARRAY_CAPTURE(_read, _write, _array) \ + COUNTER_COMP_COUNT_ARRAY_U64("capture", _read, _write, _array) + +#define COUNTER_COMP_ARRAY_POLARITY(_read, _write, _array) \ +{ \ + .type = COUNTER_COMP_ARRAY, \ + .name = "polarity", \ + .signal_array_u32_read = (_read), \ + .signal_array_u32_write = (_write), \ + .priv = &(_array), \ +} + #endif /* _COUNTER_H_ */ -- cgit v1.2.3 From de671d6116b5210097cd6fbb877bac92536f265b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 15:19:54 -0600 Subject: block: change request end_io handler to pass back a return value In preparation for allowing the end_io handler to pass ownership of the request back to the block layer rather than retain it, change the end_io handler to return a value. Everything is just converted to returning RQ_END_IO_NONE, so there should be no functional changes with this patch. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 00a15808c137..e6fa49dd6196 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -14,7 +14,12 @@ struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 -typedef void (rq_end_io_fn)(struct request *, blk_status_t); +enum rq_end_io_ret { + RQ_END_IO_NONE, + RQ_END_IO_FREE, +}; + +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); /* * request flags */ -- cgit v1.2.3 From ab3e1d3bbab9e973aeb4dd4603251578658a47ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:24:16 -0600 Subject: block: allow end_io based requests in the completion batch handling With end_io handlers now being able to potentially pass ownership of the request upon completion, we can allow requests with end_io handlers in the batch completion handling.
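For illustration, a minimal sketch of a completion handler under the new rq_end_io_fn signature (the driver names here are hypothetical, not taken from these patches)::

	static enum rq_end_io_ret my_drv_end_io(struct request *rq,
						blk_status_t error)
	{
		struct my_drv_cmd *cmd = rq->end_io_data; /* assumed driver data */

		cmd->status = error;
		complete(&cmd->done); /* wake the submitter */

		/*
		 * The submitter keeps ownership of rq. Returning
		 * RQ_END_IO_FREE instead would hand the request back to
		 * the block layer to be freed.
		 */
		return RQ_END_IO_NONE;
	}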
Reviewed-by: Anuj Gupta Reviewed-by: Keith Busch Co-developed-by: Stefan Roesch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e6fa49dd6196..50811d0fb143 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -853,8 +853,9 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror) + if (!iob || (req->rq_flags & RQF_ELV) || ioerror) return false; + if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) -- cgit v1.2.3 From a9216fac3ed8819cbbda5d39dd5fcaa43dfd35d8 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:38 +0530 Subject: io_uring: add io_uring_cmd_import_fixed This is a new helper that callers can use to obtain a bvec iterator for the previously mapped buffer. This is preparatory work to enable fixed-buffer support for io_uring_cmd. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 58676c0a398f..1dbf51115c30 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -4,6 +4,7 @@ #include #include +#include enum io_uring_cmd_flags { IO_URING_F_COMPLETE_DEFER = 1, @@ -32,6 +33,8 @@ struct io_uring_cmd { }; #if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *)); @@ -59,6 +62,11 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t ret2) { -- cgit v1.2.3 From 9cda70f622cdcf049521a9c2886e5fd8a90a0591 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:39 +0530 Subject: io_uring: introduce fixed buffer support for io_uring_cmd Add IORING_URING_CMD_FIXED flag that is to be used for sending io_uring command with previously registered buffers. User-space passes the buffer index in sqe->buf_index, same as done in read/write variants that use fixed buffers.
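For illustration, a hedged liburing-style userspace snippet showing how the flag and buffer index fit together (MY_DEV_CMD and dev_fd are assumptions, and the buffer at index 0 must have been registered beforehand via io_uring_register_buffers())::

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = dev_fd;			/* device file descriptor */
	sqe->cmd_op = MY_DEV_CMD;		/* device-defined command */
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
	sqe->buf_index = 0;			/* previously registered buffer */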
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-3-anuj20.g@samsung.com [axboe: shuffle valid flags check before acting on it] Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1dbf51115c30..e10c5cc81082 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -28,7 +28,7 @@ struct io_uring_cmd { void *cookie; }; u32 cmd_op; - u32 pad; + u32 flags; u8 pdu[32]; /* available inline for free use */ }; -- cgit v1.2.3 From 557654025df5706785d395558244890dc4b2c875 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:40 +0530 Subject: block: add blk_rq_map_user_io Create a helper blk_rq_map_user_io for mapping of vectored as well as non-vectored requests. This will help in saving duplication of code at a few places in scsi and nvme. Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50811d0fb143..ba18e9bdb799 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -985,6 +985,8 @@ struct rq_map_data { int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); +int blk_rq_map_user_io(struct request *, struct rq_map_data *, + void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); -- cgit v1.2.3 From e5a3cc83d54019a90119ce0cf17b5e21729b79bf Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:42 -0700 Subject: net/mlx5e: Use runtime page_shift for striding RQ This commit allows striding RQ to determine MTT page size at runtime, instead of sticking to the compile-time PAGE_SIZE. This functionality will be used by a following commit that adjusts the MTT page size to the XSK frame size. Stick with PAGE_SIZE for XSK on legacy RQ: frag_stride is not used in the data path; it only helps calculate how pages are partitioned into fragments, and PAGE_SIZE will ensure each fragment starts at the beginning of a new allocation unit (XSK frame). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index be640c749d5e..afac93f9552c 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -162,6 +162,8 @@ enum { MLX5_SEND_WQE_MAX_WQEBBS = 16, }; +#define MLX5_SEND_WQE_MAX_SIZE (MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQE_BB) + enum { MLX5_WQE_FMR_PERM_LOCAL_READ = 1 << 27, MLX5_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, -- cgit v1.2.3 From 6470d2e7e8ed8e9dd560d8dc3e09d1100a17ee26 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:46 -0700 Subject: net/mlx5e: xsk: Use KSM for unaligned XSK UMR MTTs used in striding RQ have certain alignment requirements.
While it's guaranteed to work when UMR pages are aligned to the UMR page size, in practice it works when UMR pages are aligned to 8 bytes. However, it's still not enough flexibility for the unaligned mode of XSK. This patch leverages KSM to map UMR pages without alignment requirements, when unaligned XSK is active. The downside is that KSM entries are twice as big as MTTs, which limits the maximum WQE size, so regular RQs and aligned XSK continue using MTTs. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index afac93f9552c..4657d5c54abe 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -478,6 +478,12 @@ struct mlx5_klm { __be64 va; }; +struct mlx5_ksm { + __be32 reserved; + __be32 key; + __be64 va; +}; + struct mlx5_stride_block_entry { __be16 stride; __be16 bcount; -- cgit v1.2.3 From 82b1ec794d701478381482264f3bfada3a7bf2d9 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 27 Sep 2022 21:17:09 +0530 Subject: thermal: core: Increase maximum number of trip points On one of the Chrome systems, if we define more than 12 trip points, the thermal sensor probe fails with "int3403 thermal: probe of INTC1046:03 failed with error -22" and throws an error as "thermal_sys: Error: Incorrect number of thermal trips". The thermal_zone_device_register() interface needs the maximum number of trip points supported in a zone as an argument. This number can't exceed THERMAL_MAX_TRIPS, which is currently set to 12. To address this issue, the THERMAL_MAX_TRIPS value has to be increased. This interface also has an argument to specify a mask of trips which are writable. This mask is defined as an int. This mask sets the ceiling for increasing the maximum number of supported trips. With the current implementation, the maximum number of trips that can be supported is 31. Also, the THERMAL_MAX_TRIPS macro is used in only one place. So remove the THERMAL_MAX_TRIPS macro and compare num_trips directly against BITS_PER_TYPE(int) - 1. Signed-off-by: Sumeet Pawnikar Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 86c24ddd5985..6f1ec4fb7ef8 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -17,8 +17,6 @@ #include #include -#define THERMAL_MAX_TRIPS 12 - /* invalid cooling state */ #define THERMAL_CSTATE_INVALID -1UL -- cgit v1.2.3 From 88269151be679b9accb2f1e73487eaeb9eae9e39 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 29 Sep 2022 17:32:49 -0700 Subject: of: base: make of_device_compatible_match() accept const device node of_device_is_compatible() accepts a const device node pointer; there is no reason why of_device_compatible_match() can't do the same.
Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/YzY5MaU5N4A2st5R@google.com Signed-off-by: Rob Herring --- include/linux/of.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 766d002bddb9..6b79ef9a6541 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -342,7 +342,7 @@ extern int of_property_read_string_helper(const struct device_node *np, const char **out_strs, size_t sz, int index); extern int of_device_is_compatible(const struct device_node *device, const char *); -extern int of_device_compatible_match(struct device_node *device, +extern int of_device_compatible_match(const struct device_node *device, const char *const *compat); extern bool of_device_is_available(const struct device_node *device); extern bool of_device_is_big_endian(const struct device_node *device); @@ -562,7 +562,7 @@ static inline int of_device_is_compatible(const struct device_node *device, return 0; } -static inline int of_device_compatible_match(struct device_node *device, +static inline int of_device_compatible_match(const struct device_node *device, const char *const *compat) { return 0; -- cgit v1.2.3 From 1c8934b4802d2744c97e7c97d244af967f4bf141 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Jun 2022 14:57:17 +0300 Subject: clk: Remove never used devm_of_clk_del_provider() For the entire history of devm_of_clk_del_provider()'s existence (since 2017), it was never used. Remove it for good. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220623115719.52683-1-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..305e18eb23cf 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1454,7 +1454,7 @@ int devm_of_clk_add_hw_provider(struct device *dev, void *data), void *data); void of_clk_del_provider(struct device_node *np); -void devm_of_clk_del_provider(struct device *dev); + struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec, void *data); struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec, @@ -1491,7 +1491,7 @@ static inline int devm_of_clk_add_hw_provider(struct device *dev, return 0; } static inline void of_clk_del_provider(struct device_node *np) {} -static inline void devm_of_clk_del_provider(struct device *dev) {} + static inline struct clk *of_clk_src_simple_get( struct of_phandle_args *clkspec, void *data) { -- cgit v1.2.3 From 07bdf48d3fee12268bf9a179821aee9b80f5239c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Jun 2022 14:57:18 +0300 Subject: clkdev: Remove never used devm_clk_release_clkdev() For the entire history of devm_clk_release_clkdev()'s existence (since 2018), it was never used. Remove it for good.
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220623115719.52683-2-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index 8a8423eb8e9a..45570bc21a43 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -46,6 +46,4 @@ int clk_hw_register_clkdev(struct clk_hw *, const char *, const char *); int devm_clk_hw_register_clkdev(struct device *dev, struct clk_hw *hw, const char *con_id, const char *dev_id); -void devm_clk_release_clkdev(struct device *dev, const char *con_id, - const char *dev_id); #endif -- cgit v1.2.3 From 854701ba4c39afae2362ba19a580c461cb183e4f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:54 -0700 Subject: net: fix cpu_max_bits_warn() usage in netif_attrmask_next{,_and} The functions are required to be passed a cpu index one prior to the first index to search, so the valid input range is [-1, nr_cpu_ids-1). However, the code checks against [-1, nr_cpu_ids). Acked-by: Jakub Kicinski Signed-off-by: Yury Norov --- include/linux/netdevice.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 05d6f3facd5a..4d6d5a2dd82e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3643,9 +3643,8 @@ static inline bool netif_attr_test_online(unsigned long j, static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { - /* -1 is a legal arg here. */ - if (n != -1) - cpu_max_bits_warn(n, nr_bits); + /* n is a prior cpu */ + cpu_max_bits_warn(n + 1, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); @@ -3666,9 +3665,8 @@ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { - /* -1 is a legal arg here. */ - if (n != -1) - cpu_max_bits_warn(n, nr_bits); + /* n is a prior cpu */ + cpu_max_bits_warn(n + 1, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); -- cgit v1.2.3 From 33e67710beda78aed38a2fe10be6088d4aeb1c53 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:55 -0700 Subject: cpumask: switch for_each_cpu{,_not} to use for_each_bit() The difference between for_each_cpu() and for_each_set_bit() is that the former uses cpumask_next() instead of find_next_bit(), and so calls cpumask_check(). This check is useless because the iterator value is not provided by the user. It generates false positives for the very last iteration of for_each_cpu(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 12 +++--------- include/linux/find.h | 5 +++++ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e4f9136a4a63..9c85774d45a4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -246,9 +246,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, * After the loop, cpu is >= nr_cpu_ids. 
*/ #define for_each_cpu(cpu, mask) \ - for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) + for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) /** * for_each_cpu_not - iterate over every cpu in a complemented mask @@ -258,9 +256,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_not(cpu, mask) \ - for ((cpu) = -1; \ (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) + for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) #if NR_CPUS == 1 static inline @@ -313,9 +309,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ - for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) + for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits) /** * cpumask_any_but - return a "random" cpu in a cpumask, but not this one. diff --git a/include/linux/find.h b/include/linux/find.h index b100944daba0..128615a3f93e 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -390,6 +390,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +#define for_each_and_bit(bit, addr1, addr2, size) \ + for ((bit) = find_next_and_bit((addr1), (addr2), (size), 0); \ + (bit) < (size); \ + (bit) = find_next_and_bit((addr1), (addr2), (size), (bit) + 1)) + /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ -- cgit v1.2.3 From 6cc18331a987c4a29d66b9c4fd292587fba4d7bd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:56 -0700 Subject: lib/find_bit: add find_next{,_and}_bit_wrap The helper is better optimized for the worst case: in the case of an empty cpumask, the current code traverses 2 * size: next = cpumask_next_and(prev, src1p, src2p); if (next >= nr_cpu_ids) next = cpumask_first_and(src1p, src2p); At bitmap level we can stop earlier after checking 'size + offset' bits. Signed-off-by: Yury Norov --- include/linux/find.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 128615a3f93e..77c087b7a451 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -290,6 +290,52 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) } #endif +/** + * find_next_and_bit_wrap - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit, or first set bit up to @offset. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_and_bit_wrap(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size, unsigned long offset) +{ + unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); + + if (bit < size) + return bit; + + bit = find_first_and_bit(addr1, addr2, offset); + return bit < offset ? 
bit : size; +} + +/** + * find_next_bit_wrap - find the next set bit in a memory region + * @addr: The address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit, or first set bit up to @offset. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_bit_wrap(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + unsigned long bit = find_next_bit(addr, size, offset); + + if (bit < size) + return bit; + + bit = find_first_bit(addr, offset); + return bit < offset ? bit : size; +} + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump -- cgit v1.2.3 From 4fe49b3b97c2640147c46519c2a6fdb06df34f5f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:57 -0700 Subject: lib/bitmap: introduce for_each_set_bit_wrap() macro Add for_each_set_bit_wrap() macro and use it in for_each_cpu_wrap(). The new macro is based on the __for_each_wrap() iterator, which is simpler and smaller than cpumask_next_wrap(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 6 ++---- include/linux/find.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9c85774d45a4..13b32dd9803b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -289,10 +289,8 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * * After the loop, cpu is >= nr_cpu_ids. */ -#define for_each_cpu_wrap(cpu, mask, start) \ - for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ - (cpu) < nr_cpumask_bits; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) +#define for_each_cpu_wrap(cpu, mask, start) \ + for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks diff --git a/include/linux/find.h b/include/linux/find.h index 77c087b7a451..3b746a183216 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -336,6 +336,32 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, return bit < offset ? bit : size; } +/* + * Helper for for_each_set_bit_wrap(). Make sure you're doing the right thing + * before using it alone. + */ +static inline +unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, + unsigned long start, unsigned long n) +{ + unsigned long bit; + + /* If not wrapped around */ + if (n > start) { + /* and have a bit, just return it. */ + bit = find_next_bit(bitmap, size, n); + if (bit < size) + return bit; + + /* Otherwise, wrap around and ... */ + n = 0; + } + + /* Search the other part. */ + bit = find_next_bit(bitmap, start, n); + return bit < start ? bit : size; +} + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump @@ -514,6 +540,19 @@ unsigned long find_next_bit_le(const void *addr, unsigned (b) = find_next_zero_bit((addr), (size), (e) + 1), \ (e) = find_next_bit((addr), (size), (b) + 1)) +/** + * for_each_set_bit_wrap - iterate over all set bits starting from @start, and + * wrapping around the end of the bitmap. 
+ * @bit: offset for current iteration + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + * @start: Starting bit for bitmap traversing, wrapping around the bitmap end + */ +#define for_each_set_bit_wrap(bit, addr, size, start) \ + for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ + (bit) < (size); \ + (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset -- cgit v1.2.3 From fdae96a3fc7f70eb8ff9619550d7fa604719626a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:58 -0700 Subject: lib/find: optimize for_each() macros Moving the iterator of the macros inside the conditional part of the for-loop helps to generate better code. This was first implemented in commit 7baac8b91f9871ba ("cpumask: make for_each_cpu_mask a bit smaller"). Now that cpumask for-loops are the aliases to bitmap loops, it's worth optimizing them the same way. Bloat-o-meter says: add/remove: 8/12 grow/shrink: 147/592 up/down: 4876/-24416 (-19540) Signed-off-by: Yury Norov --- include/linux/find.h | 56 +++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 3b746a183216..0cdfab9734a6 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -458,31 +458,25 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), 0); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) + for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ - for ((bit) = find_next_and_bit((addr1), (addr2), (size), 0); \ - (bit) < (size); \ - (bit) = find_next_and_bit((addr1), (addr2), (size), (bit) + 1)) + for ((bit) = 0; \ + (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ + (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) + for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), 0); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + for ((bit) = 0; \ + (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) @@ -492,11 +486,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ - for ((b) = find_next_bit((addr), (size), 0), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + for ((b) = 0; \ + (b) = find_next_bit((addr), (size), b), \ + (e) = find_next_zero_bit((addr), 
(size), (b) + 1), \ (b) < (size); \ - (b) = find_next_bit((addr), (size), (e) + 1), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) @@ -506,11 +500,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ - for ((b) = find_next_bit((addr), (size), (b)), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + for (; \ + (b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ - (b) = find_next_bit((addr), (size), (e) + 1), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) @@ -520,11 +514,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ - for ((b) = find_next_zero_bit((addr), (size), 0), \ - (e) = find_next_bit((addr), (size), (b) + 1); \ + for ((b) = 0; \ + (b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ - (b) = find_next_zero_bit((addr), (size), (e) + 1), \ - (e) = find_next_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) @@ -534,11 +528,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ - for ((b) = find_next_zero_bit((addr), (size), (b)), \ - (e) = find_next_bit((addr), (size), (b) + 1); \ + for (; \ + (b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ - (b) = find_next_zero_bit((addr), (size), (e) + 1), \ - (e) = find_next_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and -- cgit v1.2.3 From 78e5a3399421ad79fc024e6d78e2deb7809d26af Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:53 -0700 Subject: cpumask: fix checking valid cpu range The range of valid CPUs is [0, nr_cpu_ids). Some cpumask functions are passed a shifted CPU index, and for them, the valid range is [-1, nr_cpu_ids-1). Currently for those functions, we check the index against [-1, nr_cpu_ids), which is wrong. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 13b32dd9803b..286804bfe3b7 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -174,9 +174,8 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { - /* -1 is a legal arg here. */ - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); } @@ -189,9 +188,8 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { - /* -1 is a legal arg here. 
*/ - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } @@ -231,9 +229,8 @@ static inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { - /* -1 is a legal arg here. */ - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits, n + 1); } @@ -263,8 +260,8 @@ static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); /* * Return the first available CPU when wrapping, or when starting before cpu0, -- cgit v1.2.3 From fd580c9830316edad6f8b1d9f542563730658efe Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 30 Sep 2022 16:21:00 +0200 Subject: net: sfp: augment SFP parsing with phy_interface_t bitmap We currently parse the SFP EEPROM to a bitmap of ethtool link modes, and then attempt to convert the link modes to a PHY interface mode. While this works at present, there are cases where this is sub-optimal. For example, where a module can operate with several different PHY interface modes. To start addressing this, arrange for the SFP EEPROM parsing to also provide a bitmap of the possible PHY interface modes. Signed-off-by: Russell King Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/sfp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 302094b855fb..d1f343853b6c 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -535,7 +535,7 @@ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support); + unsigned long *support, unsigned long *interfaces); phy_interface_t sfp_select_interface(struct sfp_bus *bus, unsigned long *link_modes); @@ -568,7 +568,8 @@ static inline bool sfp_may_have_phy(struct sfp_bus *bus, static inline void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support) + unsigned long *support, + unsigned long *interfaces) { } -- cgit v1.2.3 From eca68a3c7d05b38b4e728cead0c49718f2bc1d5a Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 30 Sep 2022 16:21:03 +0200 Subject: net: phylink: pass supported host PHY interface modes to phylib for SFP's PHYs Pass the supported PHY interface types to phylib if the PHY we are connecting is inside an SFP, so that the PHY driver can select an appropriate host configuration mode for its interface according to the host capabilities. For example, the Marvell 88X3310 PHY inside RollBall SFP modules defaults to 10gbase-r mode on the host's side, and the marvell10g driver currently does not change this setting. But a host may not support 10gbase-r. For example, Turris Omnia only supports sgmii, 1000base-x and 2500base-x modes. The PHY can be configured to use those modes, but in order for the PHY driver to do that, it needs to know which modes are supported.
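To illustrate how a PHY driver might consume this information, a minimal sketch (hypothetical driver code; the actual marvell10g changes are in a separate patch)::

	static int my_phy_config_init(struct phy_device *phydev)
	{
		/* Fall back to 2500base-x when the host cannot do 10gbase-r. */
		if (!test_bit(PHY_INTERFACE_MODE_10GBASER,
			      phydev->host_interfaces) &&
		    test_bit(PHY_INTERFACE_MODE_2500BASEX,
			     phydev->host_interfaces))
			phydev->interface = PHY_INTERFACE_MODE_2500BASEX;

		return 0;
	}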
Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9c66f357f489..d65fc76fe0ae 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -571,6 +571,7 @@ struct macsec_ops; * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @lp_advertising: Current link partner advertised linkmodes + * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode @@ -670,6 +671,9 @@ struct phy_device { /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); + /* Host supported PHY interface types. Should be ignored if empty. */ + DECLARE_PHY_INTERFACE_MASK(host_interfaces); + /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; -- cgit v1.2.3 From e85b1347ace677c3822c12d9332dfaaffe594da6 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 30 Sep 2022 16:21:08 +0200 Subject: net: sfp: create/destroy I2C mdiobus before PHY probe/after PHY release Instead of configuring the I2C mdiobus when the SFP driver is probed, create/destroy the mdiobus before the PHY is probed for/after it is released. This way we can tell the mdio-i2c code which protocol to use for each SFP transceiver. Move the code that determines the MDIO I2C protocol from sfp_sm_probe_for_phy() to sfp_sm_mod_probe(), where most of the SFP ID parsing is done. Don't allocate the I2C bus if no PHY is expected. Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/mdio/mdio-i2c.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h index b1d27f7cd23f..3bde1a555a49 100644 --- a/include/linux/mdio/mdio-i2c.h +++ b/include/linux/mdio/mdio-i2c.h @@ -11,6 +11,12 @@ struct device; struct i2c_adapter; struct mii_bus; +enum mdio_i2c_proto { + MDIO_I2C_NONE, + MDIO_I2C_MARVELL_C22, + MDIO_I2C_C45, +}; + struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); #endif -- cgit v1.2.3 From 09bbedac72d5a9267088c15d1a71c8c3a8fb47e7 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 30 Sep 2022 16:21:09 +0200 Subject: net: phy: mdio-i2c: support I2C MDIO protocol for RollBall SFP modules Some multigig SFPs from RollBall and Hilink do not expose functional MDIO access to the internal PHY of the SFP via I2C address 0x56 (although there seems to be read-only clause 22 access on this address). Instead, these SFPs' PHY can be accessed via the SFP Enhanced Digital Diagnostic Interface at I2C address 0x51. SFP_PAGE has to be set to 3 and the password must be filled with 0xff bytes for this PHY communication to work. This extends the mdio-i2c driver to support this protocol by adding a special parameter to the mdio_i2c_alloc() function via which the RollBall protocol can be selected. Signed-off-by: Marek Behún Cc: Andrew Lunn Cc: Russell King Signed-off-by: David S.
Miller --- include/linux/mdio/mdio-i2c.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h index 3bde1a555a49..65b550a6fc32 100644 --- a/include/linux/mdio/mdio-i2c.h +++ b/include/linux/mdio/mdio-i2c.h @@ -15,8 +15,10 @@ enum mdio_i2c_proto { MDIO_I2C_NONE, MDIO_I2C_MARVELL_C22, MDIO_I2C_C45, + MDIO_I2C_ROLLBALL, }; -struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); +struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c, + enum mdio_i2c_proto protocol); #endif -- cgit v1.2.3 From 62c07983bef9d3e78e71189441e1a470f0d1e653 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Oct 2022 13:51:02 -0700 Subject: once: add DO_ONCE_SLOW() for sleepable contexts Christophe Leroy reported a ~80ms latency spike happening at first TCP connect() time. This is because __inet_hash_connect() uses get_random_once() to populate a perturbation table which became quite big after commit 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16"). get_random_once() uses DO_ONCE(), which blocks hard irqs for the duration of the operation. This patch adds DO_ONCE_SLOW() which uses a mutex instead of a spinlock for operations where we prefer to stay in process context. Then __inet_hash_connect() can use get_random_slow_once() to populate its perturbation table. Fixes: 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16") Fixes: 190cc82489f4 ("tcp: change source port randomizarion at connect() time") Reported-by: Christophe Leroy Link: https://lore.kernel.org/netdev/CANn89iLAEYBaoYajy0Y9UmGFff5GPxDUoG-ErVB2jDdRNQ5Tug@mail.gmail.com/T/#t Signed-off-by: Eric Dumazet Cc: Willy Tarreau Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/once.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index b14d8b309d52..176ab75b42df 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,10 +5,18 @@ #include #include +/* Helpers used from arbitrary contexts. + * Hard irqs are blocked, be cautious. + */ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod); +/* Variant for process contexts only. */ +bool __do_once_slow_start(bool *done); +void __do_once_slow_done(bool *done, struct static_key_true *once_key, + struct module *mod); + /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only * once, where DO_ONCE() can live in the fast-path. After @func has @@ -52,7 +60,27 @@ void __do_once_done(bool *done, struct static_key_true *once_key, ___ret; \ }) +/* Variant of DO_ONCE() for process/sleepable contexts. */ +#define DO_ONCE_SLOW(func, ...) 
\ + ({ \ + bool ___ret = false; \ + static bool __section(".data.once") ___done = false; \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ + ___ret = __do_once_slow_start(&___done); \ + if (unlikely(___ret)) { \ + func(__VA_ARGS__); \ + __do_once_slow_done(&___done, &___once_key, \ + THIS_MODULE); \ + } \ + } \ + ___ret; \ + }) + #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) +#define get_random_slow_once(buf, nbytes) \ + DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes)) + #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From 79e1119b7e0099c6c9379ca3129ffb7aa2a1c249 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:47 +0800 Subject: ksm: remove redundant declarations in ksm.h Currently, struct stable_node is used neither in the include/linux/ksm.h file nor in the files that include it. struct mem_cgroup is likewise not used in ksm.h. These forward declarations are redundant, so just remove them. Link: https://lkml.kernel.org/r/20220831031951.43152-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/ksm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0b4f17418f64..7e232ba59b86 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -15,9 +15,6 @@ #include #include -struct stable_node; -struct mem_cgroup; - #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); -- cgit v1.2.3 From 379708ffde1b049bc41084e0a0572c44c8a1d2c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:58 +0100 Subject: mm: add the first tail page to struct folio Some of the static checkers get confused by extracting the page from the folio and referring to fields in the first tail page. Adding these fields to struct folio lets us avoid doing that. It has the risk that people will refer to those fields without checking that the folio is actually a large folio, so prefix them with underscores and document the preferred function to use instead. Link: https://lkml.kernel.org/r/20220902194653.1739778-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f30f262431c..5c87d0f292a2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -245,6 +245,13 @@ struct page { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @_flags_1: For large folios, additional page flags. + * @__head: Points to the folio. Do not use. + * @_folio_dtor: Which destructor to use for this folio. + * @_folio_order: Do not use directly, call folio_order(). + * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). + * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * * A folio is a physically, virtually and logically contiguous set * of bytes. 
It is a power-of-two in size, and it is aligned to that @@ -283,9 +290,17 @@ struct folio { }; struct page page; }; + unsigned long _flags_1; + unsigned long __head; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; +#ifdef CONFIG_64BIT + unsigned int _folio_nr_pages; +#endif }; -static_assert(sizeof(struct page) == sizeof(struct folio)); #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); @@ -300,6 +315,19 @@ FOLIO_MATCH(_refcount, _refcount); FOLIO_MATCH(memcg_data, memcg_data); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + sizeof(struct page)) +FOLIO_MATCH(flags, _flags_1); +FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_dtor, _folio_dtor); +FOLIO_MATCH(compound_order, _folio_order); +FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_pincount, _pincount); +#ifdef CONFIG_64BIT +FOLIO_MATCH(compound_nr, _folio_nr_pages); +#endif +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { -- cgit v1.2.3 From c3a15bff46cb5149aeae4c8ae69443d791fa6578 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:59 +0100 Subject: mm: reimplement folio_order() and folio_nr_pages() Instead of calling compound_order() and compound_nr(), use the folio directly. Saves 1905 bytes from mm/filemap.o due to folio_test_large() now being a cheaper check than PageHead(). Link: https://lkml.kernel.org/r/20220902194653.1739778-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e56dd8f7eae1..a37c8a29c49b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,7 +729,9 @@ static inline unsigned int compound_order(struct page *page) */ static inline unsigned int folio_order(struct folio *folio) { - return compound_order(&folio->page); + if (!folio_test_large(folio)) + return 0; + return folio->_folio_order; } #include @@ -1659,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ static inline long folio_nr_pages(struct folio *folio) { - return compound_nr(&folio->page); + if (!folio_test_large(folio)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif } /** -- cgit v1.2.3 From d788f5b374c2ba204fed57e39acf2452acc24812 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:00 +0100 Subject: mm: add split_folio() This wrapper removes the need to use split_huge_page(&folio->page). Convert two callers.
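The conversion at call sites is mechanical; a brief before/after sketch (hypothetical caller)::

	/* Before: the caller had to reach into the folio. */
	if (split_huge_page(&folio->page))
		goto out;

	/* After: using the wrapper added here. */
	if (split_folio(folio))
		goto out;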
Link: https://lkml.kernel.org/r/20220902194653.1739778-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 38265f9f782e..a1341fdcf666 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -444,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio, return split_huge_page_to_list(&folio->page, list); } +static inline int split_folio(struct folio *folio) +{ + return split_folio_to_list(folio, NULL); +} + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to -- cgit v1.2.3 From 681ecf6301786cb06942b57f0ef7103b07ae6813 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:01 +0100 Subject: mm: add folio_add_lru_vma() Convert lru_cache_add_inactive_or_unevictable() to folio_add_lru_vma() and add a compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-6-willy@infradead.org Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6308150b234a..2ede1e3695d9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -379,11 +379,11 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -extern void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_pages); -extern void lru_note_cost_folio(struct folio *); -extern void folio_add_lru(struct folio *); -extern void lru_cache_add(struct page *); +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); +void lru_note_cost_folio(struct folio *); +void folio_add_lru(struct folio *); +void folio_add_lru_vma(struct folio *, struct vm_area_struct *); +void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); -- cgit v1.2.3 From 907ea17eb2b436f07332c935476d77893abae735 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:04 +0100 Subject: shmem: convert shmem_replace_page() to use folios throughout Introduce folio_set_swap_entry() to abstract how both folio->private and swp_entry_t work. Use swap_address_space() directly instead of indirecting through folio_mapping(). Include an assertion that the old folio is not large as we only allocate a single-page folio to replace it. Use folio_put_refs() instead of calling folio_put() twice. 
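A short illustrative pairing of the new setter with the existing folio_swap_entry() getter (sketch only, not a hunk from this patch)::

	swp_entry_t entry = folio_swap_entry(old_folio);

	/* Instead of open-coding new_folio->private = old_folio->private: */
	folio_set_swap_entry(new_folio, entry);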
Link: https://lkml.kernel.org/r/20220902194653.1739778-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ede1e3695d9..61e13d1a4cab 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) +{ + folio->private = (void *)entry.val; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); -- cgit v1.2.3 From bdb0ed54a4768dc3c2613d4c45f94c887d43cd7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:06 +0100 Subject: mm/swapfile: convert try_to_free_swap() to folio_free_swap() Add kernel-doc for folio_free_swap() and make it return bool. Add a try_to_free_swap() compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 61e13d1a4cab..dac6308d878e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -490,6 +490,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); +bool folio_free_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -606,6 +607,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio) return entry; } +static inline bool folio_free_swap(struct folio *folio) +{ + return false; +} + static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) -- cgit v1.2.3 From 4081f7446d95a9d3ced12dc04ff02c187a761e90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:09 +0100 Subject: mm/swap: convert put_swap_page() to put_swap_folio() With all callers now using a folio, we can convert this function. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index dac6308d878e..42cbef554de6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -491,7 +491,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); -extern void put_swap_page(struct page *page, swp_entry_t entry); +void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -576,7 +576,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void put_swap_page(struct page *page, swp_entry_t swp) +static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } -- cgit v1.2.3 From 6599591816f522c1cc8ec4eb5cea75738963756a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:12 +0100 Subject: memcg: convert mem_cgroup_swapin_charge_page() to mem_cgroup_swapin_charge_folio() All callers now have a folio, so pass it in here and remove an unnecessary call to page_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60545e4a1c03..ca0df42662ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -688,7 +688,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1254,7 +1254,7 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } -static inline int mem_cgroup_swapin_charge_page(struct page *page, +static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; -- cgit v1.2.3 From 4e1fc793ad9892cec67b40c9f67583160e08f695 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:20 +0100 Subject: shmem: add shmem_get_folio() With no remaining callers of shmem_getpage_gfp(), add shmem_get_folio() and reimplement shmem_getpage() as a call to shmem_get_folio(). 
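One plausible shape for the reimplemented wrapper, assuming the mm/shmem.c side mirrors the header change below (a sketch, not the actual implementation):

	int shmem_getpage(struct inode *inode, pgoff_t index,
			  struct page **pagep, enum sgp_type sgp)
	{
		struct folio *folio = NULL;
		int ret = shmem_get_folio(inode, index, &folio, sgp);

		/* a large-folio-aware caller might use folio_file_page(folio, index) */
		*pagep = folio ? &folio->page : NULL;
		return ret;
	}
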
Link: https://lkml.kernel.org/r/20220902194653.1739778-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff0b990de83d..f4bd50b08a91 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,6 +113,8 @@ enum sgp_type { extern int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) -- cgit v1.2.3 From 923e2f0e7c30db5c1ee5d680050ab781e6c114fb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:29 +0100 Subject: shmem: remove shmem_getpage() With all callers removed, remove this wrapper function. The flags are now mysteriously called SGP, but I think we can live with that. Link: https://lkml.kernel.org/r/20220902194653.1739778-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f4bd50b08a91..f24071e3c826 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,7 +102,7 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); -/* Flag allocation requirements to shmem_getpage */ +/* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ @@ -111,8 +111,6 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -extern int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp); int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); -- cgit v1.2.3 From 9202d527b715f67bcdccbb9b712b65fe053f8109 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). 
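For illustration only, the call-site conversion at a hypothetical caller that already holds a folio:

	bool full;

	/* before: */ full = mem_cgroup_swap_full(&folio->page);
	/* after, no compound_head() round trip: */ full = mem_cgroup_swap_full(folio);
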
Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 42cbef554de6..d8bd6401c3e7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -692,7 +692,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); -extern bool mem_cgroup_swap_full(struct page *page); +extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { @@ -714,7 +714,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return get_nr_swap_pages(); } -static inline bool mem_cgroup_swap_full(struct page *page) +static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } -- cgit v1.2.3 From 3b344157c0c15b8f9588e3021dfb22ee25f4508a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:44 +0100 Subject: mm: remove try_to_free_swap() All callers have now been converted to folio_free_swap() and we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-49-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d8bd6401c3e7..fc8d98660326 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -595,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int try_to_free_swap(struct page *page) -{ - return 0; -} - static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; -- cgit v1.2.3 From 29eea9b5a9c9ecf21164a082a42bfabe06fdcb30 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:50 +0100 Subject: mm: convert page_get_anon_vma() to folio_get_anon_vma() With all callers now passing in a folio, rename the function and convert all callers. Removes a couple of calls to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-55-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 72b2bcc37f73..3d56e3712bb2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -163,7 +163,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *page_get_anon_vma(struct page *page); +struct anon_vma *folio_get_anon_vma(struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. 
*/ typedef int __bitwise rmap_t; -- cgit v1.2.3 From 0c826c0b6a176b9ed5ace7106fd1770bb48f1898 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:51 +0100 Subject: rmap: remove page_unlock_anon_vma_read() This was simply an alias for anon_vma_unlock_read() since 2011. Link: https://lkml.kernel.org/r/20220902194653.1739778-56-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d56e3712bb2..ca3e4ba6c58c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -458,13 +458,8 @@ struct rmap_walk_control { void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); - -/* - * Called by memory-failure.c to kill processes. - */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, struct rmap_walk_control *rwc); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ -- cgit v1.2.3 From 19672a9e4a75252871cba319f4e3b859b8fdf671 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:53 +0100 Subject: mm: convert lock_page_or_retry() to folio_lock_or_retry() Remove a call to compound_head() in each of the two callers. Link: https://lkml.kernel.org/r/20220902194653.1739778-58-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09de43e36a64..32846b6306db 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page) } /* - * lock_page_or_retry - Lock the page, unless this would block and the + * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, - unsigned int flags) +static inline bool folio_lock_or_retry(struct folio *folio, + struct mm_struct *mm, unsigned int flags) { - struct folio *folio; might_sleep(); - - folio = page_folio(page); return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } -- cgit v1.2.3 From f372bde922e2ced8e0b5a928887b4cf587cc4453 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:29 +0200 Subject: kasan: only define kasan_metadata_size for Generic mode KASAN provides a helper for calculating the size of per-object metadata stored in the redzone. As now only the Generic mode uses per-object metadata, only define kasan_metadata_size() for this mode. 
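A hedged sketch of the kind of caller this serves; 'cache->object_size' stands in for the slab allocator's real bookkeeping and is an assumption here:

	/* usable object bytes exclude the per-object KASAN metadata in the redzone */
	size_t usable = cache->object_size - kasan_metadata_size(cache);
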
Link: https://lkml.kernel.org/r/8f81d4938b80446bc72538a08217009f328a3e23.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b092277bf48d..027df7599573 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -150,14 +150,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) __kasan_cache_create_kmalloc(cache); } -size_t __kasan_metadata_size(struct kmem_cache *cache); -static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) -{ - if (kasan_enabled()) - return __kasan_metadata_size(cache); - return 0; -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -282,7 +274,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -333,6 +324,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +size_t kasan_metadata_size(struct kmem_cache *cache); + void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -340,6 +333,12 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return 0; +} + static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -- cgit v1.2.3 From 3b7f8813e9ecf7fe91f2f8dc3b581a111cd374a5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:30 +0200 Subject: kasan: only define kasan_never_merge for Generic mode KASAN prevents merging of slab caches whose objects have per-object metadata stored in redzones. As now only the Generic mode uses per-object metadata, define kasan_never_merge() only for this mode. 
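A hedged sketch of the consumer side; the flag mask shown is illustrative, the real composition lives in mm/slab_common.c:

	/* caches carrying any of these flags must never be merged */
	slab_flags_t never_merge = SLAB_TYPESAFE_BY_RCU | kasan_never_merge();
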
Link: https://lkml.kernel.org/r/81ed01f29ff3443580b7e2fe362a8b47b1e8006d.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 027df7599573..9743d4b3a918 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -103,14 +103,6 @@ struct kasan_cache { bool is_kmalloc; }; -slab_flags_t __kasan_never_merge(void); -static __always_inline slab_flags_t kasan_never_merge(void) -{ - if (kasan_enabled()) - return __kasan_never_merge(); - return 0; -} - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -261,10 +253,6 @@ static __always_inline bool kasan_check_byte(const void *addr) #else /* CONFIG_KASAN */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} @@ -325,6 +313,7 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC size_t kasan_metadata_size(struct kmem_cache *cache); +slab_flags_t kasan_never_merge(void); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -338,6 +327,11 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +/* And thus nothing prevents cache merging. */ +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} -- cgit v1.2.3 From 26f21f3ac76df6cf3b447e8231f8754991165475 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:31 +0200 Subject: kasan: only define metadata offsets for Generic mode Hide the definitions of alloc_meta_offset and free_meta_offset under an ifdef CONFIG_KASAN_GENERIC check, as these fields are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/d4bafa0534facafd1a23c465a94261e64f366493.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9743d4b3a918..a212c2e3f32d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -98,8 +98,10 @@ static inline bool kasan_has_integrated_init(void) #ifdef CONFIG_KASAN struct kasan_cache { +#ifdef CONFIG_KASAN_GENERIC int alloc_meta_offset; int free_meta_offset; +#endif bool is_kmalloc; }; -- cgit v1.2.3 From 682ed08924407b719fa0b1123a26971748d76ace Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:33 +0200 Subject: kasan: only define kasan_cache_create for Generic mode Right now, kasan_cache_create() assigns SLAB_KASAN for all KASAN modes and then sets up metadata-related cache parameters for the Generic mode. SLAB_KASAN is used in two places: 1. 
In slab_ksize() to account for per-object metadata when calculating the size of the accessible memory within the object. 2. In slab_common.c via kasan_never_merge() to prevent merging of caches with per-object metadata. Both cases are only relevant when per-object metadata is present, which is only the case with the Generic mode. Thus, assign SLAB_KASAN and define kasan_cache_create() only for the Generic mode. Also update the SLAB_KASAN-related comment. Link: https://lkml.kernel.org/r/61faa2aa1906e2d02c97d00ddf99ce8911dda095.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ include/linux/slab.h | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a212c2e3f32d..d811b3d7d2a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -128,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page, __kasan_unpoison_pages(page, order, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); -static __always_inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, slab_flags_t *flags) -{ - if (kasan_enabled()) - __kasan_cache_create(cache, size, flags); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache); static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) { @@ -260,9 +251,6 @@ static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_unpoison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, - slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -316,6 +304,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} size_t kasan_metadata_size(struct kmem_cache *cache); slab_flags_t kasan_never_merge(void); +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -334,6 +324,10 @@ static inline slab_flags_t kasan_never_merge(void) { return 0; } +/* And no cache-related metadata initialization is required. 
*/ +static inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/slab.h b/include/linux/slab.h index 352e3f082acc..617a39f7db46 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,7 +106,7 @@ # define SLAB_ACCOUNT 0 #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 -- cgit v1.2.3 From 36001cba4f728e7fa2a58bc69fece22eaeef5cca Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 6 Sep 2022 23:18:47 +0800 Subject: mm/damon/core: iterate the regions list from current point in damon_set_regions() We iterate the whole regions list every time to get the first/last regions intersecting with the specific range in damon_set_regions(), in order to add a new region or resize existing regions to fit in the specific range. Actually, it is unnecessary to iterate the newly added regions and the front regions that have already been checked. Just iterate the regions list from the current point using list_for_each_entry_from() every time to improve performance. The kunit tests passed: [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 Link: https://lkml.kernel.org/r/1662477527-13003-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7b1f4a488230..d54acec048d6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -463,9 +463,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) return list_last_entry(&t->regions_list, struct damon_region, list); } +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + #define damon_for_each_region_safe(r, next, t) \ list_for_each_entry_safe(r, next, &t->regions_list, list) -- cgit v1.2.3 From 4f9bc69ac5ce34071a9a51343bc81ca76cb2e3f1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:42 +0800 Subject: mm: reuse pageblock_start/end_pfn() macro Move pageblock_start_pfn()/pageblock_end_pfn() into pageblock-flags.h so they can be used elsewhere, not only in compaction. Also use ALIGN_DOWN() instead of round_down() to pair with ALIGN(); the two should be the same for pageblock usage.
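As a worked example of the ALIGN_DOWN()/ALIGN() pairing (illustrative only, assuming pageblock_order == 9 so that pageblock_nr_pages == 512):

	pageblock_start_pfn(1000);	/* == 512:  ALIGN_DOWN(1000, 512) */
	pageblock_end_pfn(1000);	/* == 1024: ALIGN(1000 + 1, 512), the first pfn past the block */
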
Link: https://lkml.kernel.org/r/20220907060844.126891-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 83c7248053a1..a09b7fe6bbf8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,8 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) +#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) /* Forward declaration */ struct page; -- cgit v1.2.3 From 5f7fa13fa858c17580ed513bd5e0a4b36d68fdd6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:43 +0800 Subject: mm: add pageblock_align() macro Add pageblock_align() macro and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index a09b7fe6bbf8..293c76630fa8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,7 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) -- cgit v1.2.3 From ee0913c4719610204315a0d8a35122c6233249e0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:44 +0800 Subject: mm: add pageblock_aligned() macro Add pageblock_aligned() and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Cc: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 293c76630fa8..5f1ae07d724b 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,6 +54,7 @@ extern unsigned int pageblock_order; #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) -- cgit v1.2.3 From 410f8e82689e1e66044fea51ef852054a09502b7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:35 +0000 Subject: memcg: extract memcg_vmstats from struct mem_cgroup Patch series "memcg: reduce memory overhead of memory cgroups". Currently a lot of memory is wasted to maintain the vmevents for memory cgroups as we have multiple arrays of size NR_VM_EVENT_ITEMS which can be as large as 110. 
However memcg code uses a small portion of those entries. This patch series eliminates this overhead by removing the unneeded vmevent entries from memory cgroup data structures. This patch (of 3): This is a preparatory patch to reduce the memory overhead of memory cgroup. The struct memcg_vmstats is the largest object embedded into the struct mem_cgroup. This patch extracts struct memcg_vmstats from struct mem_cgroup to ease the following patches in reducing the size of struct memcg_vmstats. Link: https://lkml.kernel.org/r/20220907043537.3457014-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220907043537.3457014-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 37 ++++--------------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ca0df42662ad..dc7d40e575d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,29 +80,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct memcg_vmstats_percpu { - /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct memcg_vmstats { - /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; -}; +struct memcg_vmstats_percpu; +struct memcg_vmstats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; @@ -298,7 +277,7 @@ struct mem_cgroup { CACHELINE_PADDING(_pad1_); /* memory.stat */ - struct memcg_vmstats vmstats; + struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -1001,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) -- cgit v1.2.3 From f5a79d7c0c87c8d88bb5e3f3c898258fdf1b3b05 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 8 Sep 2022 19:14:43 +0000 Subject: mm/damon: introduce struct damos_access_pattern damon_new_scheme() has too many parameters, so introduce struct damos_access_pattern to simplify it. In addition, we can't use a bpf trace kprobe that has more than 5 parameters.
Link: https://lkml.kernel.org/r/20220908191443.129534-1-sj@kernel.org Signed-off-by: Yajun Deng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d54acec048d6..90f20675da22 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -216,13 +216,26 @@ struct damos_stat { }; /** - * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -230,10 +243,8 @@ struct damos_stat { * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the - * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, - * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to - * those. To avoid consuming too much CPU time or IO resources for the - * &action, &quota is used. + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, &quota is used. * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the @@ -248,12 +259,7 @@ struct damos_stat { * &action is applied.
*/ struct damos { - unsigned long min_sz_region; - unsigned long max_sz_region; - unsigned int min_nr_accesses; - unsigned int max_nr_accesses; - unsigned int min_age_region; - unsigned int max_age_region; + struct damos_access_pattern pattern; enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; @@ -509,12 +515,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks); +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -- cgit v1.2.3 From 0d83b2d89dbfad17b62d4e7fb8f0b0525ba1a204 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 9 Sep 2022 21:36:06 +0000 Subject: mm/damon: remove duplicate get_monitoring_region() definitions Both lru_sort.c and reclaim.c define a get_monitoring_region() function; there is no need to define it separately in each file. As 'get_monitoring_region()' is no longer a 'static' function, use a prefix to distinguish it from other functions and rename it to 'damon_find_biggest_system_ram'.
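A hedged usage sketch of the now-shared helper, mirroring what lru_sort.c and reclaim.c previously open-coded (hypothetical caller):

	unsigned long start, end;

	if (!damon_find_biggest_system_ram(&start, &end))
		return -EINVAL;
	/* set the monitoring target to the physical address range [start, end) */
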
Link: https://lkml.kernel.org/r/20220909025711.32012-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Acked-by: Jens Axboe Cc: Bart Van Assche Cc: David Howells Cc: Matthew Wilcox Cc: NeilBrown Cc: Vlastimil Babka Cc: zhanglianjie Signed-off-by: Andrew Morton --- include/linux/writeback.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f045f6d6c4f..06f9291b6fd5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -17,20 +17,12 @@ struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* - * The 1/4 region under the global dirty thresh is for smooth dirty throttling: - * - * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) - * - * Further beyond, all dirtier tasks will enter a loop waiting (possibly long - * time) for the dirty pages to drop, unless written enough pages. - * * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 -#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) struct backing_dev_info; -- cgit v1.2.3 From cbeaa77b044938cfe91818821ece6b0b1511e967 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:32 +0000 Subject: mm/damon/core: use a dedicated struct for monitoring attributes DAMON monitoring attributes are directly defined as fields of 'struct damon_ctx'. This makes 'struct damon_ctx' a little long and complicated. This commit defines and uses a struct, 'struct damon_attrs', which is dedicated for only the monitoring attributes to make the purpose of the five values clearer and simplify 'struct damon_ctx'. Link: https://lkml.kernel.org/r/20220913174449.50645-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 016b6c9c03d6..2ceee8b07726 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -389,13 +389,15 @@ struct damon_callback { }; /** - * struct damon_ctx - Represents a context for each monitoring. This is the - * main interface that allows users to set the attributes and get the results - * of the monitoring. + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to @@ -405,7 +407,21 @@ struct damon_callback { * @ops_update_interval. All time intervals are in micro-seconds. * Please refer to &struct damon_operations and &struct damon_callback for more * detail. 
+ */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @attrs: Monitoring attributes for accuracy/overhead control. * @kdamond: Kernel thread who does the monitoring. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * @@ -427,15 +443,11 @@ struct damon_callback { * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @min_nr_regions: The minimum number of adaptive monitoring regions. - * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { - unsigned long sample_interval; - unsigned long aggr_interval; - unsigned long ops_update_interval; + struct damon_attrs attrs; /* private: internal use only */ struct timespec64 last_aggregation; @@ -448,8 +460,6 @@ struct damon_ctx { struct damon_operations ops; struct damon_callback callback; - unsigned long min_nr_regions; - unsigned long max_nr_regions; struct list_head adaptive_targets; struct list_head schemes; }; -- cgit v1.2.3 From bead3b00088eb8016b32cafa7e0701b3283e68a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:33 +0000 Subject: mm/damon/core: reduce parameters for damon_set_attrs() Number of parameters for 'damon_set_attrs()' is six. As it could be confusing and verbose, this commit reduces the number by receiving a single pointer to a 'struct damon_attrs'. Link: https://lkml.kernel.org/r/20220913174449.50645-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2ceee8b07726..c5dc0c77c772 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -540,9 +540,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); -- cgit v1.2.3 From b958d4d08fbfe938af24ea06ebbf839b48fa18a9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:02 +0800 Subject: mm: hugetlb: simplify per-node sysfs creation and removal Patch series "simplify handling of per-node sysfs creation and removal", v4. This patch (of 2): The following commit offloaded per-node sysfs creation and removal to a kworker but did not say why that was needed. It also said "I don't know that this is absolutely required", so it seems the author was not sure either. Since it only complicates the code, this patch will revert the changes to simplify the code.
39da08cb074c ("hugetlb: offload per node attribute registrations") We could use a memory hotplug notifier to do per-node sysfs creation and removal instead of inserting those operations into node registration and unregistration. Then, it can reduce the code coupling between node.c and hugetlb.c. Also, it can simplify the code. Link: https://lkml.kernel.org/r/20220914072603.60293-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220914072603.60293-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Muchun Song Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- include/linux/node.h | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd607f..427a5975cf40 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) -- cgit v1.2.3 From a4a00b451ef5e1deb959088e25e248f4ee399792 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:03 +0800 Subject: mm: hugetlb: eliminate memory-less nodes handling The memory-notify-based approach aims to handle memory-less nodes, however, it just adds complexity to the code, as pointed out by David in thread [1].
The handling of memory-less nodes is introduced by commit 4faf8d950ec4 ("hugetlb: handle memory hot-plug events"). From its commit message, we cannot find any necessity of handling this case. So, we can simply register/unregister sysfs entries in register_node/unregister_node to simplify the code. BTW, the hotplug callback was added because hugetlb_register_all_nodes() registers sysfs nodes only for N_MEMORY nodes; see commit 9b5e5d0fdc91, which said it was a preparation for handling memory-less nodes via memory hotplug. Since we want to remove the memory hotplug handling, make sure we only register per-node sysfs for online (N_ONLINE) nodes in hugetlb_register_all_nodes(). https://lore.kernel.org/linux-mm/60933ffc-b850-976c-78a0-0ee6e0ea9ef0@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220914072603.60293-3-songmuchun@bytedance.com Suggested-by: David Hildenbrand Signed-off-by: Muchun Song Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 57e72954a482..6d7f39754060 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,6 +16,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -935,6 +936,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1109,6 +1115,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From c195c3215741746b1eb7ab7980b926ddc37a4be3 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Wed, 14 Sep 2022 10:17:38 +0800 Subject: mm/filemap: make folio_put_wait_locked static It's only used in mm/filemap.c, since commit ("mm/migrate.c: rework migration_entry_wait() to not take a pageref"). Make it static. Link: https://lkml.kernel.org/r/20220914021738.3228011-1-sunke@kylinos.cn Signed-off-by: Ke Sun Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 32846b6306db..23125ab87ded 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1039,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page) return folio_wait_locked_killable(page_folio(page)); } -int folio_put_wait_locked(struct folio *folio, int state); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); -- cgit v1.2.3 From 7e1813d48dd30e6c6f235f6661d1bc108fcab528 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:04 -0700 Subject: hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache remove_huge_page removes a hugetlb page from the page cache. Change to hugetlb_delete_from_page_cache as it is a more descriptive name.
huge_add_to_page_cache is global in scope, but only deals with hugetlb pages. For consistency and clarity, rename to hugetlb_add_to_page_cache. Link: https://lkml.kernel.org/r/20220914221810.95771-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d7f39754060..4893d6d07099 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -666,7 +666,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); -- cgit v1.2.3 From 8d9bfb2608145cf3e408428c224099e1585471af Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:07 -0700 Subject: hugetlb: add vma based lock for pmd sharing Allocate a new hugetlb_vma_lock structure and hang it off vm_private_data for synchronization use by vmas that could be involved in pmd sharing. This data structure contains a rw semaphore that is the primary tool used for synchronization. This new structure is ref counted, so that it can exist when NOT attached to a vma. This is only helpful in resolving lock ordering issues where code may need to obtain the vma_lock while there is no guarantee the vma will not go away. By obtaining a ref on the structure, it can be guaranteed that at least the rw semaphore will not go away. Only add infrastructure for the new lock here. Actual use will be added in subsequent patches. [mike.kravetz@oracle.com: fix build issue for missing hugetlb_vma_lock_release] Link: https://lkml.kernel.org/r/YyNUtA1vRASOE4+M@monkey Link: https://lkml.kernel.org/r/20220914221810.95771-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A.
Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4893d6d07099..7b70aa931729 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -115,6 +115,12 @@ struct file_region { #endif }; +struct hugetlb_vma_lock { + struct kref refs; + struct rw_semaphore rw_sema; + struct vm_area_struct *vma; +}; + extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -127,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); -void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, @@ -215,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags); +void hugetlb_vma_lock_read(struct vm_area_struct *vma); +void hugetlb_vma_unlock_read(struct vm_area_struct *vma); +void hugetlb_vma_lock_write(struct vm_area_struct *vma); +void hugetlb_vma_unlock_write(struct vm_area_struct *vma); +int hugetlb_vma_trylock_write(struct vm_area_struct *vma); +void hugetlb_vma_assert_locked(struct vm_area_struct *vma); +void hugetlb_vma_lock_release(struct kref *kref); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -226,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ -static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } @@ -337,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file, return -EINVAL; } +static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + static inline int pmd_huge(pmd_t pmd) { return 0; -- cgit v1.2.3 From 83a4f1ef45a90d740bc6edf6a2533b14a3e5d183 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:36 +0200 Subject: stackdepot: reserve 5 extra bits in depot_stack_handle_t Some users (currently only KMSAN) may want to use spare bits in depot_stack_handle_t. Let them do so by adding @extra_bits to __stack_depot_save() to store arbitrary flags, and providing stack_depot_get_extra_bits() to retrieve those flags. Also adapt KASAN to the new prototype by passing extra_bits=0, as KASAN does not intend to store additional information in the stack handle. 
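A hedged sketch of how a user such as KMSAN might exploit the spare bits; 'tag' is illustrative, not KMSAN's actual encoding:

	unsigned int tag = 3;	/* must fit in STACK_DEPOT_EXTRA_BITS */
	depot_stack_handle_t handle = __stack_depot_save(entries, nr_entries,
							 tag, GFP_NOWAIT, true);
	/* later, recover the tag straight from the handle */
	unsigned int tag_out = stack_depot_get_extra_bits(handle);
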
Link: https://lkml.kernel.org/r/20220915150417.722975-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index bc2797955de9..9ca7798d7a31 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,9 +14,15 @@ #include typedef u32 depot_stack_handle_t; +/* + * Number of bits in the handle that stack depot doesn't use. Users may store + * information in them. + */ +#define STACK_DEPOT_EXTRA_BITS 5 depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); /* @@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); + int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -- cgit v1.2.3 From 33b75c1d884e81ec97525e0a6fdcb187adf273f4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:37 +0200 Subject: instrumented.h: allow instrumenting both sides of copy_from_user() Introduce instrument_copy_from_user_before() and instrument_copy_from_user_after() hooks to be invoked before and after the call to copy_from_user(). KASAN and KCSAN will be only using instrument_copy_from_user_before(), but for KMSAN we'll need to insert code after copy_from_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-4-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 21 +++++++++++++++++++-- include/linux/uaccess.h | 19 ++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 42faebbaa202..ee8f7d17d34f 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -120,7 +120,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) } /** - * instrument_copy_from_user - instrument writes of copy_from_user + * instrument_copy_from_user_before - add instrumentation before copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. @@ -130,10 +130,27 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) * @n number of bytes to copy */ static __always_inline void -instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) +instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } +/** + * instrument_copy_from_user_after - add instrumentation after copy_from_user + * + * Instrument writes to kernel memory, that are due to copy_from_user (and + * variants). The instrumentation should be inserted after the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + * @left number of bytes not copied (as returned by copy_from_user) + */ +static __always_inline void +instrument_copy_from_user_after(const void *to, const void __user *from, + unsigned long n, unsigned long left) +{ +} + #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 47e5d374c7eb..afb18f198843 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -58,20 +58,28 @@ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - instrument_copy_from_user(to, from, n); + unsigned long res; + + instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long res; + might_fault(); + instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; - instrument_copy_from_user(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } /** @@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); -- cgit v1.2.3 From 888f84a6da4d17e453058169fa7b235fff34f5bf Mon Sep 17 00:00:00 2001 
From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:38 +0200 Subject: x86: asm: instrument usercopy in get_user() and put_user() Use hooks from instrumented.h to notify bug detection tools about usercopy events in variations of get_user() and put_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-5-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index ee8f7d17d34f..9f1dba8f717b 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -153,4 +153,32 @@ instrument_copy_from_user_after(const void *to, const void __user *from, { } +/** + * instrument_get_user() - add instrumentation to get_user()-like macros + * + * get_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @to destination variable, may not be address-taken + */ +#define instrument_get_user(to) \ +({ \ +}) + +/** + * instrument_put_user() - add instrumentation to put_user()-like macros + * + * put_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the kernel to the userspace. + * + * @from source address + * @ptr userspace pointer to copy to + * @size number of bytes to copy + */ +#define instrument_put_user(from, ptr, size) \ +({ \ +}) + #endif /* _LINUX_INSTRUMENTED_H */ -- cgit v1.2.3 From 9b448bc25b776daab3215393c3ce6953dd3bb8ad Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:41 +0200 Subject: kmsan: introduce __no_sanitize_memory and __no_kmsan_checks __no_sanitize_memory is a function attribute that instructs KMSAN to skip a function during instrumentation. This is needed to e.g. implement the noinstr functions. __no_kmsan_checks is a function attribute that makes KMSAN ignore the uninitialized values coming from the function's inputs, and initialize the function's outputs. Functions marked with this attribute can't be inlined into functions not marked with it, and vice versa. This behavior is overridden by __always_inline. __SANITIZE_MEMORY__ is a macro that's defined iff the file is instrumented with KMSAN. This is not the same as CONFIG_KMSAN, which is defined for every file.
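A minimal sketch of how the two attributes are meant to be used (the functions below are hypothetical, for illustration only):

	/* KMSAN will not instrument this function at all. */
	static __no_sanitize_memory unsigned long read_raw_reg(unsigned long *r)
	{
		return *r;
	}

	/*
	 * KMSAN instruments this function but suppresses shadow checks in it,
	 * and treats its outputs as initialized.
	 */
	static __no_kmsan_checks int parse_first_byte(const char *buf)
	{
		return buf[0] == 'y';
	}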
Link: https://lkml.kernel.org/r/20220915150417.722975-8-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 23 +++++++++++++++++++++++ include/linux/compiler-gcc.h | 6 ++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445..4fa0cc4cbd2c 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -51,6 +51,29 @@ #define __no_sanitize_undefined #endif +#if __has_feature(memory_sanitizer) +#define __SANITIZE_MEMORY__ +/* + * Unlike other sanitizers, KMSAN still inserts code into functions marked with + * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation + * provides the behavior consistent with other __no_sanitize_ attributes, + * guaranteeing that __no_sanitize_memory functions remain uninstrumented. + */ +#define __no_sanitize_memory __disable_sanitizer_instrumentation + +/* + * The __no_kmsan_checks attribute ensures that a function does not produce + * false positive reports by: + * - initializing all local variables and memory stores in this function; + * - skipping all shadow checks; + * - passing initialized arguments to this function's callees. + */ +#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory"))) +#else +#define __no_sanitize_memory +#define __no_kmsan_checks +#endif + /* * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together * with no_sanitize("coverage"). Prior versions of Clang support coverage diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 9b157b71036f..f55a37efdb97 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -114,6 +114,12 @@ #define __SANITIZE_ADDRESS__ #endif +/* + * GCC does not support KMSAN. + */ +#define __no_sanitize_memory +#define __no_kmsan_checks + /* * Turn individual warnings and errors on and off locally, depending * on version. -- cgit v1.2.3 From 5de0ce85f5a4d2883eae6f48eb015bc5dfbd91e9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:42 +0200 Subject: kmsan: mark noinstr as __no_sanitize_memory noinstr functions should never be instrumented, so make KMSAN skip them by applying the __no_sanitize_memory attribute. 
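For illustration (a sketch; the function below is hypothetical), any function declared noinstr now implicitly carries __no_sanitize_memory:

	/* Runs in a context where calling instrumented code is not allowed. */
	noinstr void arch_early_trap_handler(void)
	{
		/* KMSAN, like KASAN and KCSAN, emits no checks in this body. */
	}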
Link: https://lkml.kernel.org/r/20220915150417.722975-9-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..015207a6e2bf 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -229,7 +229,8 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ noinline notrace __attribute((__section__(".noinstr.text"))) \ - __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage + __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ + __no_sanitize_memory #endif /* __KERNEL__ */ -- cgit v1.2.3 From f80be4571b19b9fd8dd1528cd2a2f123aff51f70 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:45 +0200 Subject: kmsan: add KMSAN runtime core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each memory location KernelMemorySanitizer maintains two types of metadata: 1. The so-called shadow of that location - a byte:byte mapping describing whether individual bits of memory are initialized (shadow is 0) or not (shadow is 1). 2. The origins of that location - a 4-byte:4-byte mapping containing 4-byte IDs of the stack traces where uninitialized values were created. Each struct page now contains pointers to two struct pages holding KMSAN metadata (shadow and origins) for the original struct page. Utility routines in mm/kmsan/core.c and mm/kmsan/shadow.c handle the metadata creation, addressing, copying and checking. mm/kmsan/report.c performs error reporting in cases where an uninitialized value is used in a way that leads to undefined behavior. KMSAN compiler instrumentation is responsible for tracking the metadata along with the kernel memory. mm/kmsan/instrumentation.c provides the implementation for instrumentation hooks that are called from files compiled with -fsanitize=kernel-memory. To aid parameter passing (also done at instrumentation level), each task_struct now contains a struct kmsan_task_state used to track the metadata of function parameters and return values for that task. Finally, this patch provides CONFIG_KMSAN that enables KMSAN, and declares CFLAGS_KMSAN, which are applied to files compiled with KMSAN. The KMSAN_SANITIZE:=n Makefile directive can be used to completely disable KMSAN instrumentation for certain files. Similarly, KMSAN_ENABLE_CHECKS:=n disables KMSAN checks and makes newly created stack memory initialized.
Users can also use functions from include/linux/kmsan-checks.h to mark certain memory regions as uninitialized or initialized (this is called "poisoning" and "unpoisoning") or check that a particular region is initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-12-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan-checks.h | 64 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/kmsan_types.h | 35 ++++++++++++++++++++++++ include/linux/mm_types.h | 12 +++++++++ include/linux/sched.h | 5 ++++ 4 files changed, 116 insertions(+) create mode 100644 include/linux/kmsan-checks.h create mode 100644 include/linux/kmsan_types.h (limited to 'include/linux') diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h new file mode 100644 index 000000000000..a6522a0c28df --- /dev/null +++ b/include/linux/kmsan-checks.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN checks to be used for one-off annotations in subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef _LINUX_KMSAN_CHECKS_H +#define _LINUX_KMSAN_CHECKS_H + +#include + +#ifdef CONFIG_KMSAN + +/** + * kmsan_poison_memory() - Mark the memory range as uninitialized. + * @address: address to start with. + * @size: size of buffer to poison. + * @flags: GFP flags for allocations done by this function. + * + * Until other data is written to this range, KMSAN will treat it as + * uninitialized. Error reports for this memory will reference the call site of + * kmsan_poison_memory() as origin. + */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags); + +/** + * kmsan_unpoison_memory() - Mark the memory range as initialized. + * @address: address to start with. + * @size: size of buffer to unpoison. + * + * Until other data is written to this range, KMSAN will treat it as + * initialized. + */ +void kmsan_unpoison_memory(const void *address, size_t size); + +/** + * kmsan_check_memory() - Check the memory range for being initialized. + * @address: address to start with. + * @size: size of buffer to check. + * + * If any piece of the given range is marked as uninitialized, KMSAN will report + * an error. 
+ */ +void kmsan_check_memory(const void *address, size_t size); + +#else + +static inline void kmsan_poison_memory(const void *address, size_t size, + gfp_t flags) +{ +} +static inline void kmsan_unpoison_memory(const void *address, size_t size) +{ +} +static inline void kmsan_check_memory(const void *address, size_t size) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h new file mode 100644 index 000000000000..8bfa6c98176d --- /dev/null +++ b/include/linux/kmsan_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal header declaring types added by KMSAN to existing kernel structs. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_TYPES_H +#define _LINUX_KMSAN_TYPES_H + +/* These constants are defined in the MSan LLVM instrumentation pass. */ +#define KMSAN_RETVAL_SIZE 800 +#define KMSAN_PARAM_SIZE 800 + +struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + u32 retval_origin_tls; +}; + +#undef KMSAN_PARAM_SIZE +#undef KMSAN_RETVAL_SIZE + +struct kmsan_ctx { + struct kmsan_context_state cstate; + int kmsan_in_runtime; + bool allow_reporting; +}; + +#endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5c87d0f292a2..500e536796ca 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,6 +224,18 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_KMSAN + /* + * KMSAN metadata for this page: + * - shadow page: every bit indicates whether the corresponding + * bit of the original page is initialized (0) or not (1); + * - origin page: every 4 bytes contain an id of the stack trace + * where the uninitialized value was created. 
+ */ + struct page *kmsan_shadow; + struct page *kmsan_origin; +#endif + #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fbac3c19fe35..88a043f7235e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1362,6 +1363,10 @@ struct task_struct { #endif #endif +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif -- cgit v1.2.3 From b073d7f8aee4ebf05d10e3380df377b73120cf16 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:48 +0200 Subject: mm: kmsan: maintain KMSAN metadata for page operations Insert KMSAN hooks that make the necessary bookkeeping changes: - poison page shadow and origins in alloc_pages()/free_page(); - clear page shadow and origins in clear_page(), copy_user_highpage(); - copy page metadata in copy_highpage(), wp_page_copy(); - handle vmap()/vunmap()/iounmap(); Link: https://lkml.kernel.org/r/20220915150417.722975-15-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/highmem.h | 3 + include/linux/kmsan.h | 145 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 include/linux/kmsan.h (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25679035ca28..e9912da5441b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } @@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h new file mode 100644 index 000000000000..b36bf3db835e --- /dev/null +++ b/include/linux/kmsan.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN API for subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include +#include +#include + +struct page; + +#ifdef CONFIG_KMSAN + +/** + * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. + * @page: struct page pointer returned by alloc_pages(). + * @order: order of allocated struct page. 
+ * @flags: GFP flags used by alloc_pages() + * + * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless + * @flags contain __GFP_ZERO. + */ +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags); + +/** + * kmsan_free_page() - Notify KMSAN about a free_pages() call. + * @page: struct page pointer passed to free_pages(). + * @order: order of deallocated struct page. + * + * KMSAN marks freed memory as uninitialized. + */ +void kmsan_free_page(struct page *page, unsigned int order); + +/** + * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages. + * @dst: destination page. + * @src: source page. + * + * KMSAN copies the contents of metadata pages for @src into the metadata pages + * for @dst. If @dst has no associated metadata pages, nothing happens. + * If @src has no associated metadata pages, @dst metadata pages are unpoisoned. + */ +void kmsan_copy_page_meta(struct page *dst, struct page *src); + +/** + * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. + * @start: start of vmapped range. + * @end: end of vmapped range. + * @prot: page protection flags used for vmap. + * @pages: array of pages. + * @page_shift: page_shift passed to vmap_range_noflush(). + * + * KMSAN maps shadow and origin pages of @pages into contiguous ranges in + * vmalloc metadata address range. + */ +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +/** + * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. + * @start: start of vunmapped range. + * @end: end of vunmapped range. + * + * KMSAN unmaps the contiguous metadata ranges created by + * kmsan_map_kernel_range_noflush(). + */ +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); + +/** + * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call. + * @addr: range start. + * @end: range end. + * @phys_addr: physical range start. + * @prot: page protection flags used for ioremap_page_range(). + * @page_shift: page_shift argument passed to vmap_range_noflush(). + * + * KMSAN creates new metadata pages for the physical pages mapped into the + * virtual memory. + */ +void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); + +/** + * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. + * @start: range start. + * @end: range end. + * + * KMSAN unmaps the metadata pages for the given range and, unlike for + * vunmap_page_range(), also deallocates them. 
+ */ +void kmsan_iounmap_page_range(unsigned long start, unsigned long end); + +#else + +static inline int kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) +{ + return 0; +} + +static inline void kmsan_free_page(struct page *page, unsigned int order) +{ +} + +static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ +} + +static inline void kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) +{ +} + +static inline void kmsan_vunmap_range_noflush(unsigned long start, + unsigned long end) +{ +} + +static inline void kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) +{ +} + +static inline void kmsan_iounmap_page_range(unsigned long start, + unsigned long end) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From 68ef169a1dd20df5cfa5a161b7304ad9fdd14c36 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:49 +0200 Subject: mm: kmsan: call KMSAN hooks from SLUB code In order to report uninitialized memory coming from heap allocations KMSAN has to poison them unless they're created with __GFP_ZERO. It's handy that we need KMSAN hooks in the places where init_on_alloc/init_on_free initialization is performed. In addition, we apply __no_kmsan_checks to get_freepointer_safe() to suppress reports when accessing freelist pointers that reside in freed objects. Link: https://lkml.kernel.org/r/20220915150417.722975-16-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index b36bf3db835e..5c4e0079054e 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -14,6 +14,7 @@ #include struct page; +struct kmem_cache; #ifdef CONFIG_KMSAN @@ -48,6 +49,44 @@ void kmsan_free_page(struct page *page, unsigned int order); */ void kmsan_copy_page_meta(struct page *dst, struct page *src); +/** + * kmsan_slab_alloc() - Notify KMSAN about a slab allocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * @flags: GFP flags passed to the allocator. + * + * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the + * newly created object, marking it as initialized or uninitialized. + */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); + +/** + * kmsan_slab_free() - Notify KMSAN about a slab deallocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * + * KMSAN marks the freed object as uninitialized. 
+ */ +void kmsan_slab_free(struct kmem_cache *s, void *object); + +/** + * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation. + * @ptr: object pointer. + * @size: object size. + * @flags: GFP flags passed to the allocator. + * + * Similar to kmsan_slab_alloc(), but for large allocations. + */ +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); + +/** + * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation. + * @ptr: object pointer. + * + * Similar to kmsan_slab_free(), but for large allocations. + */ +void kmsan_kfree_large(const void *ptr); + /** * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. * @start: start of vmapped range. @@ -114,6 +153,24 @@ static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) { } +static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ +} + +static inline void kmsan_slab_free(struct kmem_cache *s, void *object) +{ +} + +static inline void kmsan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) +{ +} + +static inline void kmsan_kfree_large(const void *ptr) +{ +} + static inline void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, -- cgit v1.2.3 From 50b5e49ca694a60f84a2a12d62b6cb6ec8e3649f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:50 +0200 Subject: kmsan: handle task creation and exiting Tell KMSAN that a new task is created, so the tool creates a backing metadata structure for that task. Link: https://lkml.kernel.org/r/20220915150417.722975-17-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 5c4e0079054e..354aee6f7b1a 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -15,9 +15,22 @@ struct page; struct kmem_cache; +struct task_struct; #ifdef CONFIG_KMSAN +/** + * kmsan_task_create() - Initialize KMSAN state for the task. + * @task: task to initialize. + */ +void kmsan_task_create(struct task_struct *task); + +/** + * kmsan_task_exit() - Notify KMSAN that a task has exited. + * @task: task about to finish. + */ +void kmsan_task_exit(struct task_struct *task); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). 
@@ -139,6 +152,14 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_task_create(struct task_struct *task) +{ +} + +static inline void kmsan_task_exit(struct task_struct *task) +{ +} + static inline int kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { -- cgit v1.2.3 From 3c206509826094e85ead0b056f484db96829248d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:51 +0200 Subject: init: kmsan: call KMSAN initialization routines kmsan_init_shadow() scans the mappings created at boot time and creates metadata pages for those mappings. When the memblock allocator returns pages to pagealloc, we reserve 2/3 of those pages and use them as metadata for the remaining 1/3. Once KMSAN starts, every page allocated by pagealloc has its associated shadow and origin pages. kmsan_initialize() initializes the bookkeeping for init_task and enables KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-18-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 354aee6f7b1a..e00de976ee43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -31,6 +31,28 @@ void kmsan_task_create(struct task_struct *task); */ void kmsan_task_exit(struct task_struct *task); +/** + * kmsan_init_shadow() - Initialize KMSAN shadow at boot time. + * + * Allocate and initialize KMSAN metadata for early allocations. + */ +void __init kmsan_init_shadow(void); + +/** + * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN. + */ +void __init kmsan_init_runtime(void); + +/** + * kmsan_memblock_free_pages() - handle freeing of memblock pages. + * @page: struct page to free. + * @order: order of @page. + * + * Freed pages are either returned to buddy allocator or held back to be used + * as metadata pages. + */ +bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -152,6 +174,20 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_init_shadow(void) +{ +} + +static inline void kmsan_init_runtime(void) +{ +} + +static inline bool kmsan_memblock_free_pages(struct page *page, + unsigned int order) +{ + return true; +} + static inline void kmsan_task_create(struct task_struct *task) { } -- cgit v1.2.3 From 75cf0290271bf6dae9dee982aef15242dadf97e4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:52 +0200 Subject: instrumented.h: add KMSAN support To avoid false positives, KMSAN needs to unpoison the data copied from the userspace. 
To detect infoleaks - check the memory buffer passed to copy_to_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-19-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 18 +++++++++++++----- include/linux/kmsan-checks.h | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 9f1dba8f717b..501fa8486749 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -2,7 +2,7 @@ /* * This header provides generic wrappers for memory access instrumentation that - * the compiler cannot emit for: KASAN, KCSAN. + * the compiler cannot emit for: KASAN, KCSAN, KMSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H @@ -10,6 +10,7 @@ #include #include #include +#include #include /** @@ -117,6 +118,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); + kmsan_copy_to_user(to, from, n, 0); } /** @@ -151,6 +153,7 @@ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { + kmsan_unpoison_memory(to, n - left); } /** @@ -162,10 +165,14 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * * @to destination variable, may not be address-taken */ -#define instrument_get_user(to) \ -({ \ +#define instrument_get_user(to) \ +({ \ + u64 __tmp = (u64)(to); \ + kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ + to = __tmp; \ }) + /** * instrument_put_user() - add instrumentation to put_user()-like macros * @@ -177,8 +184,9 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * @ptr userspace pointer to copy to * @size number of bytes to copy */ -#define instrument_put_user(from, ptr, size) \ -({ \ +#define instrument_put_user(from, ptr, size) \ +({ \ + kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h index a6522a0c28df..c4cae333deec 100644 --- a/include/linux/kmsan-checks.h +++ b/include/linux/kmsan-checks.h @@ -46,6 +46,21 @@ void kmsan_unpoison_memory(const void *address, size_t size); */ void kmsan_check_memory(const void *address, size_t size); +/** + * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace. + * @to: destination address in the userspace. + * @from: source address in the kernel. + * @to_copy: number of bytes to copy. + * @left: number of bytes not copied. + * + * If this is a real userspace data transfer, KMSAN checks the bytes that were + * actually copied to ensure there was no information leak. 
If @to belongs to + * the kernel space (which is possible for compat syscalls), KMSAN just copies + * the metadata. + */ +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left); + #else static inline void kmsan_poison_memory(const void *address, size_t size, @@ -58,6 +73,10 @@ static inline void kmsan_unpoison_memory(const void *address, size_t size) static inline void kmsan_check_memory(const void *address, size_t size) { } +static inline void kmsan_copy_to_user(void __user *to, const void *from, + size_t to_copy, size_t left) +{ +} #endif -- cgit v1.2.3 From 7ade4f10779cb46f5c29ced9b7a41f68501cf0ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:55 +0200 Subject: dma: kmsan: unpoison DMA mappings KMSAN doesn't know about DMA memory writes performed by devices. We unpoison such memory when it's mapped to avoid false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-22-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e00de976ee43..dac296da45c5 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -9,6 +9,7 @@ #ifndef _LINUX_KMSAN_H #define _LINUX_KMSAN_H +#include #include #include #include @@ -16,6 +17,7 @@ struct page; struct kmem_cache; struct task_struct; +struct scatterlist; #ifdef CONFIG_KMSAN @@ -172,6 +174,35 @@ void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, */ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); +/** + * kmsan_handle_dma() - Handle a DMA data transfer. + * @page: first page of the buffer. + * @offset: offset of the buffer within the first page. + * @size: buffer size. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffer, if it is copied to device; + * * initializes the buffer, if it is copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir); + +/** + * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist. + * @sg: scatterlist holding DMA buffers. + * @nents: number of scatterlist entries. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffers in the scatterlist, if they are copied to device; + * * initializes the buffers, if they are copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. 
+ */ +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir); + #else static inline void kmsan_init_shadow(void) @@ -254,6 +285,16 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } +static inline void kmsan_handle_dma(struct page *page, size_t offset, + size_t size, enum dma_data_direction dir) +{ +} + +static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From 553a80188a5d7164d2b0688b06bf3fe297023bfe Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:57 +0200 Subject: kmsan: handle memory sent to/from USB Depending on the value of is_out, kmsan_handle_urb() either marks the data copied to the kernel from a USB device as initialized, or checks the data sent to the device for being initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-24-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index dac296da45c5..c473e0e21683 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -18,6 +18,7 @@ struct page; struct kmem_cache; struct task_struct; struct scatterlist; +struct urb; #ifdef CONFIG_KMSAN @@ -203,6 +204,16 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir); +/** + * kmsan_handle_urb() - Handle a USB data transfer. + * @urb: struct urb pointer. + * @is_out: data transfer direction (true means output to hardware). + * + * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise, + * KMSAN initializes the transfer buffer. + */ +void kmsan_handle_urb(const struct urb *urb, bool is_out); + #else static inline void kmsan_init_shadow(void) @@ -295,6 +306,10 @@ static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, { } +static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From ff901d80fff6d65ada6f2a60a1f7d180ee2e0416 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:09 +0200 Subject: x86: kmsan: use __msan_ string functions where possible. Unless stated otherwise (by explicitly calling __memcpy(), __memset() or __memmove()) we want all string functions to call their __msan_ versions (e.g. __msan_memcpy() instead of memcpy()), so that shadow and origin values are updated accordingly. The bootloader must still use the default string functions to avoid crashes.
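To illustrate the resulting behavior (a sketch; the function below is hypothetical):

	static void copy_example(void *dst, const void *src, size_t len)
	{
		/* Instrumented: compiled as __msan_memcpy(), which also copies
		 * the shadow and origin metadata describing the bytes. */
		memcpy(dst, src, len);

		/* Explicit opt-out: a plain byte copy, metadata left untouched. */
		__memcpy(dst, src, len);
	}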
Link: https://lkml.kernel.org/r/20220915150417.722975-36-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/fortify-string.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f374..6c8a1a29d0b6 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -285,8 +285,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. */ +#ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __builtin_object_size(p, 0), __builtin_object_size(p, 1)) +#endif /* * To make sure the compiler can enforce protection against buffer overflows, -- cgit v1.2.3 From 6cae637fa26df867449c6bc20ea8bc693abe49b0 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:14 +0200 Subject: entry: kmsan: introduce kmsan_unpoison_entry_regs() struct pt_regs passed into IRQ entry code is set up by uninstrumented asm functions, therefore KMSAN may not notice the registers are initialized. kmsan_unpoison_entry_regs() unpoisons the contents of struct pt_regs, preventing potential false positives. Unlike kmsan_unpoison_memory(), it can be called under kmsan_in_runtime(), which is often the case in IRQ entry code. Link: https://lkml.kernel.org/r/20220915150417.722975-41-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index c473e0e21683..e38ae3c34618 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -214,6 +214,17 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, */ void kmsan_handle_urb(const struct urb *urb, bool is_out); +/** + * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code. + * @regs: struct pt_regs pointer received from assembly code. + * + * KMSAN unpoisons the contents of the passed pt_regs, preventing potential + * false positive reports. 
Unlike kmsan_unpoison_memory(), + * kmsan_unpoison_entry_regs() can be called from the regions where + * kmsan_in_runtime() returns true, which is the case in early entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs); + #else static inline void kmsan_init_shadow(void) @@ -310,6 +321,10 @@ static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) { } +static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From 16bc1b0f0269b6110f6d25880b52947d354e2980 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 15 Sep 2022 19:33:41 +0800 Subject: mm/damon: use 'struct damon_target *' instead of 'void *' in target_valid() We could use 'struct damon_target *' directly instead of 'void *' in the target_valid() operation to make the code simpler. Link: https://lkml.kernel.org/r/1663241621-13293-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index c5dc0c77c772..1dda8d0068e5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -346,7 +346,7 @@ struct damon_operations { unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - bool (*target_valid)(void *target); + bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From cc713520bdc1b84fc5394f6ac8649b93ad2c5dde Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 16 Sep 2022 23:20:35 +0800 Subject: mm/damon: return void from damon_set_schemes() There is no point in returning an int from damon_set_schemes(). It always returns 0, which is meaningless for the caller, so change it to return void directly. Link: https://lkml.kernel.org/r/1663341635-12675-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1dda8d0068e5..e7808a84675f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -541,7 +541,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); -int damon_set_schemes(struct damon_ctx *ctx, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); -- cgit v1.2.3 From 638a9ae97ab596f1f7b7522dad709e69cb5b4e9d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:44 +0800 Subject: mm: remove obsolete macro NR_PCP_ORDER_MASK and NR_PCP_ORDER_WIDTH Since commit 8b10b465d0e1 ("mm/page_alloc: free pages in a single pass during bulk free"), they're not used anymore. Remove them.
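For context, the encoding that these macros supported looked roughly like this (reconstructed for illustration; the helpers below are hypothetical):

	/* Order lived in the low 8 bits of the value, migratetype above it. */
	static inline int encode_pcp(int migratetype, unsigned int order)
	{
		return (migratetype << NR_PCP_ORDER_WIDTH) | order;
	}

	static inline unsigned int decode_pcp_order(int val)
	{
		return val & NR_PCP_ORDER_MASK;
	}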
Link: https://lkml.kernel.org/r/20220916072257.9639-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c69c08156822..3ff1e757d5aa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -564,13 +564,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -/* - * Shift to encode migratetype and order in the same integer, with order - * in the least significant bits. - */ -#define NR_PCP_ORDER_WIDTH 8 -#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1) - #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) -- cgit v1.2.3 From 5749fcc5f04cef4091dea0c2ba6b5c5f5e05a549 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:46 +0800 Subject: mm/page_alloc: add __init annotations to init_mem_debugging_and_hardening() It's only called by mm_init(). Add __init annotations to it. Link: https://lkml.kernel.org/r/20220916072257.9639-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a37c8a29c49b..8bbcccbc5565 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3092,7 +3092,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -extern void init_mem_debugging_and_hardening(void); +extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); -- cgit v1.2.3 From 30e3b5d7c82f78c63c53197b5d8b99636bb60d56 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:48 +0800 Subject: mm: remove obsolete pgdat_is_empty() There's no caller. Remove it.
Link: https://lkml.kernel.org/r/20220916072257.9639-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ff1e757d5aa..4c8510f26b02 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1241,11 +1241,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) return pgdat->node_start_pfn + pgdat->node_spanned_pages; } -static inline bool pgdat_is_empty(pg_data_t *pgdat) -{ - return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; -} - #include void build_all_zonelists(pg_data_t *pgdat); -- cgit v1.2.3 From f774a6a6fd39e1b5677bdf71f6813b382faddeeb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:51 +0800 Subject: mm, memory_hotplug: remove obsolete generic_free_nodedata() Commit 390511e1476e ("mm, memory_hotplug: drop arch_free_nodedata") drops the last caller of generic_free_nodedata(). Remove it too. Link: https://lkml.kernel.org/r/20220916072257.9639-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51052969dbfe..9fcbf5706595 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -43,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); ({ \ memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) extern pg_data_t *node_data[]; static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) @@ -63,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid) BUG(); return NULL; } -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) { } -- cgit v1.2.3 From def76fd549c513bb90278a8d6d0fe3ef3faa20a7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:56 +0800 Subject: mm/page_alloc: remove obsolete gfpflags_normal_context() Since commit dacb5d8875cc ("tcp: fix page frag corruption on page fault"), there's no caller of gfpflags_normal_context(). Remove it as this helper is strictly tied to the sk page frag usage and there won't be other users in the future.
[linmiaohe@huawei.com: fix htmldocs] Link: https://lkml.kernel.org/r/1bc55727-9b66-0e9e-c306-f10c4716ea89@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/gfp.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ea6cb9399152..ef4aea3b356e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } -/** - * gfpflags_normal_context - is gfp_flags a normal sleepable context? - * @gfp_flags: gfp_flags to test - * - * Test whether @gfp_flags indicates that the allocation is from the - * %current context and allowed to sleep. - * - * An allocation being allowed to block doesn't mean it owns the %current - * context. When direct reclaim path tries to allocate memory, the - * allocation context is nested inside whatever %current was doing at the - * time of the original allocation. The nested allocation may be allowed - * to block but modifying anything %current owns can corrupt the outer - * context's expectations. - * - * %true result from this function indicates that the allocation context - * can sleep and use anything that's associated with %current. - */ -static inline bool gfpflags_normal_context(const gfp_t gfp_flags) -{ - return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == - __GFP_DIRECT_RECLAIM; -} - #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else -- cgit v1.2.3 From 233f0b31bd9503ce2be7be0bde69c67287c8a741 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Sep 2022 16:53:22 +0000 Subject: mm/damon: deduplicate damon_{reclaim,lru_sort}_apply_parameters() The bodies of damon_{reclaim,lru_sort}_apply_parameters() contain duplicates. This commit adds a common function damon_set_region_biggest_system_ram_default() to remove the duplicates. Link: https://lkml.kernel.org/r/6329f00d.a70a0220.9bb29.3678SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Kaixu Xia Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e7808a84675f..ed5470f50bab 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -557,7 +557,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); #endif /* CONFIG_DAMON */ -- cgit v1.2.3 From 2eb989195d9a361d13d66ffb8738847649e080ad Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:33 +0800 Subject: mm: memcontrol: use memcg_kmem_enabled in count_objcg_event Patch series "mm: memcontrol: cleanup and optimize for two accounting params", v2. 
This patch (of 2):

There are currently two helpers for checking if cgroup kmem accounting is enabled:

- mem_cgroup_kmem_disabled
- memcg_kmem_enabled

mem_cgroup_kmem_disabled is a simple helper that returns true if cgroup.memory=nokmem is specified, otherwise returns false.

memcg_kmem_enabled is a bit different: it returns true if cgroup.memory=nokmem is not specified and at least one non-root, memory-control-enabled cgroup has ever been created. This helps improve performance when kmem accounting is not actually activated, and it is optimized with a static branch.

mem_cgroup_kmem_disabled is meant for sub-systems that need to preallocate data for kmem accounting, since they could be initialized before kmem accounting is activated. But count_objcg_event doesn't need that, so using memcg_kmem_enabled is better here.

Link: https://lkml.kernel.org/r/20220919180634.45958-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20220919180634.45958-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc7d40e575d5..ef479e554253 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1778,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (mem_cgroup_kmem_disabled()) + if (!memcg_kmem_enabled()) return; rcu_read_lock(); -- cgit v1.2.3

From 7c6c6cc4d3a213e7303ef06ff40f6193df01839c Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:37 -0700 Subject: mm/shmem: add flag to enforce shmem THP in hugepage_vma_check()

Patch series "mm: add file/shmem support to MADV_COLLAPSE", v4.

This series builds on top of the previous "mm: userspace hugepage collapse" series, which introduced the MADV_COLLAPSE madvise mode and added support for private, anonymous mappings[2], by adding support for file- and shmem-backed memory to CONFIG_READ_ONLY_THP_FOR_FS=y kernels.

File and shmem support have been added with effort to align with existing MADV_COLLAPSE semantics and policy decisions[3]. Collapse of shmem-backed memory ignores kernel-guiding directives and heuristics, including all sysfs settings (transparent_hugepage/shmem_enabled) and tmpfs huge= mount options (shmem always supports large folios). Like anonymous mappings, on successful return of MADV_COLLAPSE on file/shmem memory, the contents of memory mapped by the addresses provided will be synchronously pmd-mapped THPs.

This functionality unlocks two important uses:

(1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system, which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevent page sharing and demand paging, both of which increase steady-state memory footprint. Now, we can have the best of both worlds: peak upfront performance and lower RAM footprints.

(2) userfaultfd-based live migration of virtual machines satisfies UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage).
However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. khugepaged has received a small improvement by association and can now detect and collapse pte-mapped THPs. However, there is still work to be done along the file collapse path. Compound pages of arbitrary order still needs to be supported and THP collapse needs to be converted to using folios in general. Eventually, we'd like to move away from the read-only and executable-mapped constraints currently imposed on eligible files and support any inode claiming huge folio support. That said, I think the series as-is covers enough to claim that MADV_COLLAPSE supports file/shmem memory. Patches 1-3 Implement the guts of the series. Patch 4 Is a tracepoint for debugging. Patches 5-9 Refactor existing khugepaged selftests to work with new memory types + new collapse tests. Patch 10 Adds a userfaultfd selftest mode to mimic a functional test of UFFDIO_REGISTER_MODE_MINOR+MADV_COLLAPSE live migration. (v4 note: "userfaultfd shmem" selftest is failing as of Sep 22 mm-unstable) [1] https://lore.kernel.org/linux-mm/YyiK8YvVcrtZo0z3@google.com/ [2] https://lore.kernel.org/linux-mm/20220706235936.2197195-1-zokeefe@google.com/ [3] https://lore.kernel.org/linux-mm/YtBmhaiPHUTkJml8@google.com/ [4] https://lore.kernel.org/linux-mm/20220922222731.1124481-1-zokeefe@google.com/ [5] https://lore.kernel.org/linux-mm/20220922184651.1016461-1-zokeefe@google.com/ This patch (of 10): Extend 'mm/thp: add flag to enforce sysfs THP in hugepage_vma_check()' to shmem, allowing callers to ignore /sys/kernel/transparent_hugepage/shmem_enabled and tmpfs huge= mount. This is intended to be used by MADV_COLLAPSE, and the rationale is analogous to the anon/file case: MADV_COLLAPSE is not coupled to directives that advise the kernel's decisions on when THPs should be considered eligible. shmem/tmpfs always claims large folio support, regardless of sysfs or mount options. [shy828301@gmail.com: test shmem_huge_force explicitly] Link: https://lore.kernel.org/linux-mm/CAHbLzko3A5-TpS0BgBeKkx5cuOkWgLvWXQH=TdgW-baO4rPtdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922224046.1143204-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-2-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-2-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f24071e3c826..d500ea967dc7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,11 +92,13 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma, + bool shmem_huge_force) { - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, + shmem_huge_force); } extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, -- cgit v1.2.3 From 34488399fa08faaf664743fa54b271eb6f9e1321 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:39 -0700 Subject: mm/madvise: add file and shmem support to MADV_COLLAPSE Add support for MADV_COLLAPSE to collapse shmem-backed and file-backed memory into THPs (requires CONFIG_READ_ONLY_THP_FOR_FS=y). On success, the backing memory will be a hugepage. For the memory range and process provided, the page tables will synchronously have a huge pmd installed, mapping the THP. Other mappings of the file extent mapped by the memory range may be added to a set of entries that khugepaged will later process and attempt update their page tables to map the THP by a pmd. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Since khugepaged is single threaded, this change now introduces possibility of collapse contexts racing in file collapse path. There a important few places to consider: (1) hpage_collapse_scan_file(), when we xas_pause() and drop RCU. We could have the memory collapsed out from under us, but the next xas_for_each() iteration will correctly pick up the hugepage. 
The hugepage might not be up to date (insofar as copying of small page contents might not have completed - the page still may be locked), but regardless what small page index we were iterating over, we'll find the hugepage and identify it as a suitably aligned compound page of order HPAGE_PMD_ORDER. In khugepaged path, we locklessly check the value of the pmd, and only add it to deferred collapse array if we find pmd mapping pte table. This is fine, since other values that could have raced in right afterwards denote failure, or that the memory was successfully collapsed, so we don't need further processing. In madvise path, we'll take mmap_lock() in write to serialize against page table updates and will know what to do based on the true value of the pmd: recheck all ptes if we point to a pte table, directly install the pmd, if the pmd has been cleared, but memory not yet faulted, or nothing at all if we find a huge pmd. It's worth putting emphasis here on how we treat the none pmd here. If khugepaged has processed this mm's page tables already, it will have left the pmd cleared (ready for refault by the process). Depending on the VMA flags and sysfs settings, amount of RAM on the machine, and the current load, could be a relatively common occurrence - and as such is one we'd like to handle successfully in MADV_COLLAPSE. When we see the none pmd in collapse_pte_mapped_thp(), we've locked mmap_lock in write and checked (a) huepaged_vma_check() to see if the backing memory is appropriate still, along with VMA sizing and appropriate hugepage alignment within the file, and (b) we've found a hugepage head of order HPAGE_PMD_ORDER at the offset in the file mapped by our hugepage-aligned virtual address. Even though the common-case is likely race with khugepaged, given these checks (regardless how we got here - we could be operating on a completely different file than originally checked in hpage_collapse_scan_file() for all we know) it should be safe to directly make the pmd a huge pmd pointing to this hugepage. (2) collapse_file() is mostly serialized on the same file extent by lock sequence: | lock hupepage | lock mapping->i_pages | lock 1st page | unlock mapping->i_pages | | lock mapping->i_pages | page_ref_freeze(3) | xas_store(hugepage) | unlock mapping->i_pages | page_ref_unfreeze(1) | unlock 1st page V unlock hugepage Once a context (who already has their fresh hugepage locked) locks mapping->i_pages exclusively, it will hold said lock until it locks the first page, and it will hold that lock until the after the hugepage has been added to the page cache (and will unlock the hugepage after page table update, though that isn't important here). A racing context that loses the race for mapping->i_pages will then lose the race to locking the first page. Here - depending on how far the other racing context has gotten - we might find the new hugepage (in which case we'll exit cleanly when we check PageTransCompound()), or we'll find the "old" 1st small page (in which we'll exit cleanly when we discover unexpected refcount of 2 after isolate_lru_page()). This is assuming we are able to successfully lock the page we find - in shmem path, we could just fail the trylock and exit cleanly anyways. Failure path in collapse_file() is similar: once we hold lock on 1st small page, we are serialized against other collapse contexts. Before the 1st small page is unlocked, we add it back to the pagecache and unfreeze the refcount appropriately. 
Contexts who lost the race to the 1st small page will then find the same 1st small page with the correct refcount and will be able to proceed. [zokeefe@google.com: don't check pmd value twice in collapse_pte_mapped_thp()] Link: https://lkml.kernel.org/r/20220927033854.477018-1-zokeefe@google.com [shy828301@gmail.com: Delete hugepage_vma_revalidate_anon(), remove check for multi-add in khugepaged_add_pte_mapped_thp()] Link: https://lore.kernel.org/linux-mm/CAHbLzkrtpM=ic7cYAHcqkubah5VTR8N5=k5RT8MTvv5rN1Y91w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 384f034ae947..70162d707caf 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM -extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); #else -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } #endif @@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } static inline void khugepaged_min_free_kbytes_update(void) -- cgit v1.2.3 From 6b91e5dfb3c7ef485587e7ab494dcb47bcdadce3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 19:09:35 +0800 Subject: mm: remove unused inline functions from include/linux/mm_inline.h Remove the following unused inline functions from mm_inline.h: 1. All uses of add_page_to_lru_list_tail() have been removed since commit 7a3dbfe8a52b ("mm/swap: convert lru_deactivate_file to a folio_batch"), and it can be replaced by lruvec_add_folio_tail(). 2. All uses of __clear_page_lru_flags() have been removed since commit 188e8caee968 ("mm/swap: convert __page_cache_release() to use a folio"), and it can be replaced by __folio_clear_lru_flags(). They are useless, so remove them. 
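For reference, the folio-based replacements named above are drop-in: a caller that still holds a struct page first resolves the owning folio. A short hedged sketch (the caller is hypothetical, not taken from the tree):

/* Hypothetical caller, for illustration only. */
static void move_page_to_lru_tail(struct lruvec *lruvec, struct page *page)
{
	struct folio *folio = page_folio(page);	/* page -> owning folio */

	/* replaces the removed add_page_to_lru_list_tail(page, lruvec) */
	lruvec_add_folio_tail(lruvec, folio);
}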
Link: https://lkml.kernel.org/r/20220922110935.1495099-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4949eda9a9a2..e8ed225d8f7c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -76,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) __folio_clear_unevictable(folio); } -static __always_inline void __clear_page_lru_flags(struct page *page) -{ - __folio_clear_lru_flags(page_folio(page)); -} - /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. @@ -348,12 +343,6 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) list_add_tail(&folio->lru, &lruvec->lists[lru]); } -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec) -{ - lruvec_add_folio_tail(lruvec, page_folio(page)); -} - static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { -- cgit v1.2.3 From e55b9f96860f6c6026cff97966a740576285e07b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:04 -0400 Subject: mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol Since 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), CONFIG_MEMCG_SWAP hasn't been a user-visible config option anymore, it just means CONFIG_MEMCG && CONFIG_SWAP. Update the sites accordingly and drop the symbol. [ While touching the docs, remove two references to CONFIG_MEMCG_KMEM, which hasn't been a user-visible symbol for over half a decade. ] Link: https://lkml.kernel.org/r/20220926135704.400818-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- include/linux/swap_cgroup.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index fc8d98660326..a18cf4b7c724 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -666,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) cgroup_throttle_swaprate(&folio->page, gfp); } -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index a12dd1c3966c..ae73a87775b3 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); @@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif /* CONFIG_MEMCG_SWAP */ +#endif #endif /* __LINUX_SWAP_CGROUP_H */ -- cgit v1.2.3 From 8f824b4abd31c5ea32ae1d6725c47bdb247d18da Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 10:10:34 +0800 Subject: init.h: fix spelling typo in comment Fix spelling typo in comment. 
Link: https://lkml.kernel.org/r/20220905021034.947701-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010..3254766ebbf2 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -134,7 +134,7 @@ static inline initcall_t initcall_from_entry(initcall_entry_t *entry) extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; -/* Used for contructor calls. */ +/* Used for constructor calls. */ typedef void (*ctor_fn_t)(void); struct file_system_type; -- cgit v1.2.3

From 5ca14835dc429c09fefb290f60343fe266382760 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Sep 2022 13:57:41 -0700 Subject: fs: uninline inode_maybe_inc_iversion()

It has many callsites and is large.

	text	data	bss	dec	hex	filename
	91796	15984	512	108292	1a704	mm/shmem.o-before
	91180	15984	512	107676	1a49c	mm/shmem.o-after

Acked-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/iversion.h | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iversion.h b/include/linux/iversion.h index eb5a15810169..e27bd4f55d84 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -172,51 +172,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) I_VERSION_QUERIED); } -/** - * inode_maybe_inc_iversion - increments i_version - * @inode: inode with the i_version that should be updated - * @force: increment the counter even if it's not necessary? - * - * Every time the inode is modified, the i_version field must be seen to have - * changed by any observer. - * - * If "force" is set or the QUERIED flag is set, then ensure that we increment - * the value, and clear the queried flag. - * - * In the common case where neither is set, then we can return "false" without - * updating i_version. - * - * If this function returns false, and no other metadata has changed, then we - * can avoid logging the metadata. - */ -static inline bool -inode_maybe_inc_iversion(struct inode *inode, bool force) -{ - u64 cur, new; - - /* - * The i_version field is not strictly ordered with any other inode - * information, but the legacy inode_inc_iversion code used a spinlock - * to serialize increments. - * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. - * - * This barrier pairs with the barrier in inode_query_iversion() - */ - smp_mb(); - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is clear then we needn't do anything */ - if (!force && !(cur & I_VERSION_QUERIED)) - return false; - - /* Since lowest bit is flag, add 2 to avoid it */ - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return true; -} - +bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version -- cgit v1.2.3

From 5d0ce3595ab75330a15cec914096efbbb8b41e4a Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:37 +0800 Subject: percpu: add percpu_counter_add_local and percpu_counter_sub_local

Patch series "ipc/msg: mitigate the lock contention in ipc/msg", v6.

Here are two patches to mitigate the lock contention in ipc/msg.
The 1st patch is to add the new interfaces percpu_counter_add_local and percpu_counter_sub_local. The batch size in percpu_counter_add_batch should be very large in the heavy-write, rare-read case. Add the "_local" version; it will mostly do local adds, reducing the global updates and mitigating lock contention on writes.

The 2nd patch is to use percpu_counter instead of atomic updates in ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently updated when the IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Changing them to percpu_counter greatly improves the performance. Since there is one percpu struct per namespace, the additional memory cost is minimal. Reading of the count is done in the msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent.

This patch (of 2):

The batch size in percpu_counter_add_batch should be very large in the heavy-write, rare-read case. Add the "_local" version; it will mostly do local adds, reducing the global updates and mitigating lock contention on writes.

Link: https://lkml.kernel.org/r/20220913192538.3023708-1-jiebin.sun@intel.com Link: https://lkml.kernel.org/r/20220913192538.3023708-2-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79..8ed5fba6d156 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. + */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ -- cgit v1.2.3

From 72d1e611082eda18689106a0c192f2827072713c Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:38 +0800 Subject: ipc/msg: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently updated when the IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Changing them to percpu_counter greatly improves the performance. Since there is one percpu struct per namespace, the additional memory cost is minimal. Reading of the count is done in the msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent.

Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads).

Score gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads)

[akpm@linux-foundation.org: coding-style cleanups] [jiebin.sun@intel.com: avoid negative value by overflow in msginfo] Link: https://lkml.kernel.org/r/20220920150809.4014944-1-jiebin.sun@intel.com [akpm@linux-foundation.org: fix min() warnings] Link: https://lkml.kernel.org/r/20220913192538.3023708-3-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Signed-off-by: Andrew Morton --- include/linux/ipc_namespace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e3e8c8662b49..e8240cf2611a 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include struct user_namespace; @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; -- cgit v1.2.3

From 168723c1f8d6e1e823d4c6ad3cf64478cf58330a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 1 Oct 2022 21:56:22 -0700 Subject: net/mlx5e: xsk: Use umr_mode to calculate striding RQ parameters

Instead of passing the unaligned flag, pass an enum that indicates the UMR mode.
The next commit will add the third mode (KLM for certain configurations of XSK), which will be added to this enum instead of adding another bool flag everywhere. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f8ecb33105d3..285f301a6390 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1288,4 +1288,8 @@ static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) return mlx5_is_roce_on(dev); } +enum { + MLX5_OCTWORD = 16, +}; + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 16ab85e78439bab1201ff26ba430231d1574b4ae Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sat, 1 Oct 2022 21:56:27 -0700 Subject: net/mlx5e: Expose rx_oversize_pkts_buffer counter Add the rx_oversize_pkts_buffer counter to ethtool statistics. This counter exposes the number of dropped received packets due to length which arrived to RQ and exceed software buffer size allocated by the device for incoming traffic. It might imply that the device MTU is larger than the software buffers size. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1ad762e22d86..06574d430ff5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1491,7 +1491,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_120[0xa]; u8 log_max_ra_req_dc[0x6]; - u8 reserved_at_130[0x9]; + u8 reserved_at_130[0x2]; + u8 eth_wqe_too_small[0x1]; + u8 reserved_at_133[0x6]; u8 vnic_env_cq_overrun[0x1]; u8 log_max_ra_res_dc[0x6]; @@ -3537,7 +3539,9 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 cq_overrun[0x20]; - u8 reserved_at_220[0xde0]; + u8 eth_wqe_too_small[0x20]; + + u8 reserved_at_220[0xdc0]; }; struct mlx5_ifc_traffic_counter_bits { -- cgit v1.2.3 From 9b98d395b85dd042fe83fb696b1ac02e6c93a520 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 1 Oct 2022 21:56:28 -0700 Subject: net/mlx5: Start health poll at earlier stage of driver load Start health poll at earlier stage, so if fw fatal issue occurred before or during initialization commands such as init_hca or set_hca_cap the poll health can detect and indicate that the driver is already in error state. 
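A hedged sketch of the resulting load-time ordering; the function body and error handling are invented, and only the health-poll entry points plus the init_hca step are taken from this series:

/* Illustrative ordering only, not the actual mlx5 load path. */
static int hypothetical_mlx5_load(struct mlx5_core_dev *dev)
{
	int err;

	/* Poll for fw fatal conditions before issuing init commands ... */
	mlx5_start_health_poll(dev);

	err = init_hca(dev);	/* a fw fatal error here is now detected */
	if (err)
		goto err_stop_poll;

	/* ... and arm fw-driven reporting once the device is fully up. */
	mlx5_start_health_fw_log_up(dev);
	return 0;

err_stop_poll:
	mlx5_stop_health_poll(dev, false);
	return err;
}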
Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 285f301a6390..a12929bc31b2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); +void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, -- cgit v1.2.3 From 3114b075eb2531dea31a961944309485d6a53040 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:51:57 +0200 Subject: net: add framework to support Ethernet PSE and PDs devices This framework was create with intention to provide support for Ethernet PSE (Power Sourcing Equipment) and PDs (Powered Device). At current step this patch implements generic PSE support for PoDL (Power over Data Lines 802.3bu) specification with reserving name space for PD devices as well. This framework can be extended to support 802.3af and 802.3at "Power via the Media Dependent Interface" (or PoE/Power over Ethernet) Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 include/linux/pse-pd/pse.h (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h new file mode 100644 index 000000000000..3ba787a48b15 --- /dev/null +++ b/include/linux/pse-pd/pse.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* +// Copyright (c) 2022 Pengutronix, Oleksij Rempel + */ +#ifndef _LINUX_PSE_CONTROLLER_H +#define _LINUX_PSE_CONTROLLER_H + +#include +#include +#include + +struct module; +struct device_node; +struct of_phandle_args; +struct pse_control; + +/** + * struct pse_controller_dev - PSE controller entity that might + * provide multiple PSE controls + * @ops: a pointer to device specific struct pse_controller_ops + * @owner: kernel module of the PSE controller driver + * @list: internal list of PSE controller devices + * @pse_control_head: head of internal list of requested PSE controls + * @dev: corresponding driver model device struct + * @of_pse_n_cells: number of cells in PSE line specifiers + * @of_xlate: translation function to translate from specifier as found in the + * device tree to id as given to the PSE control ops + * @nr_lines: number of PSE controls in this controller device + * @lock: Mutex for serialization access to the PSE controller + */ +struct pse_controller_dev { + const struct pse_controller_ops *ops; + struct module *owner; + struct list_head list; + struct list_head pse_control_head; + struct device *dev; + int of_pse_n_cells; + int (*of_xlate)(struct pse_controller_dev *pcdev, + const struct of_phandle_args *pse_spec); + unsigned int nr_lines; + struct mutex lock; +}; + +#if IS_ENABLED(CONFIG_PSE_CONTROLLER) +int pse_controller_register(struct pse_controller_dev *pcdev); +void pse_controller_unregister(struct pse_controller_dev *pcdev); +struct device; +int devm_pse_controller_register(struct device 
*dev, + struct pse_controller_dev *pcdev); + +struct pse_control *of_pse_control_get(struct device_node *node); +void pse_control_put(struct pse_control *psec); + +#else + +static inline struct pse_control *of_pse_control_get(struct device_node *node) +{ + return ERR_PTR(-ENOENT); +} + +static inline void pse_control_put(struct pse_control *psec) +{ +} + +#endif + +#endif -- cgit v1.2.3 From 5e82147de1cbd758bb280908daa39d95ed467538 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:51:59 +0200 Subject: net: mdiobus: search for PSE nodes by parsing PHY nodes. Some PHYs can be linked with PSE (Power Sourcing Equipment), so search for related nodes and attach it to the phydev. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index d65fc76fe0ae..ddf66198f751 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -597,6 +597,7 @@ struct macsec_ops; * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks + * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @shared: Pointer to private data shared by phys in one package @@ -715,6 +716,7 @@ struct phy_device { struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; + struct pse_control *psec; u8 mdix; u8 mdix_ctrl; -- cgit v1.2.3 From 18ff0bcda6d1dd3d53b4ce3f03e61bf1a648f960 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:52:00 +0200 Subject: ethtool: add interface to interact with Ethernet Power Equipment Add interface to support Power Sourcing Equipment. At current step it provides generic way to address all variants of PSE devices as defined in IEEE 802.3-2018 but support only objects specified for IEEE 802.3-2018 104.4 PoDL Power Sourcing Equipment (PSE). Currently supported and mandatory objects are: IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl This is minimal interface needed to control PSE on each separate ethernet port but it provides not all mandatory objects specified in IEEE 802.3-2018. Since "PoDL PSE" and "PSE" have similar names, but some different values I decide to not merge them and keep separate naming schema. This should allow as to be as close to IEEE 802.3 spec as possible and avoid name conflicts in the future. This implementation is connected to PHYs instead of MACs because PSE auto classification can potentially interfere with PHY auto negotiation. So, may be some extra PHY related initialization will be needed. With WIP version of ethtools interaction with PSE capable link looks as following: $ ip l ... 5: t1l1@eth0: .. ... 
$ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: disabled PoDL PSE Power Detection Status: disabled $ ethtool --set-pse t1l1 podl-pse-admin-control enable $ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: enabled PoDL PSE Power Detection Status: delivering power Signed-off-by: kernel test robot Signed-off-by: Oleksij Rempel Reviewed-by: Bagas Sanjaya Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 3ba787a48b15..fd1a916eeeba 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -9,6 +9,47 @@ #include #include +struct phy_device; +struct pse_controller_dev; + +/** + * struct pse_control_config - PSE control/channel configuration. + * + * @admin_cotrol: set PoDL PSE admin control as described in + * IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl + */ +struct pse_control_config { + enum ethtool_podl_pse_admin_state admin_cotrol; +}; + +/** + * struct pse_control_status - PSE control/channel status. + * + * @podl_admin_state: operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @podl_pw_status: power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + */ +struct pse_control_status { + enum ethtool_podl_pse_admin_state podl_admin_state; + enum ethtool_podl_pse_pw_d_status podl_pw_status; +}; + +/** + * struct pse_controller_ops - PSE controller driver callbacks + * + * @ethtool_get_status: get PSE control status for ethtool interface + * @ethtool_set_config: set PSE control configuration over ethtool interface + */ +struct pse_controller_ops { + int (*ethtool_get_status)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + struct pse_control_status *status); + int (*ethtool_set_config)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + const struct pse_control_config *config); +}; + struct module; struct device_node; struct of_phandle_args; @@ -51,6 +92,13 @@ int devm_pse_controller_register(struct device *dev, struct pse_control *of_pse_control_get(struct device_node *node); void pse_control_put(struct pse_control *psec); +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status); +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config); + #else static inline struct pse_control *of_pse_control_get(struct device_node *node) @@ -62,6 +110,20 @@ static inline void pse_control_put(struct pse_control *psec) { } +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) +{ + return -ENOTSUPP; +} + +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) +{ + return -ENOTSUPP; +} + #endif #endif -- cgit v1.2.3 From 2a4187f4406ec3236f8b9d0d5150d2bf8d021b68 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 3 Oct 2022 20:14:13 +0200 Subject: once: rename _SLOW to _SLEEPABLE The _SLOW designation wasn't really descriptive of anything. This is meant to be called from process context when it's possible to sleep. 
So name this more aptly _SLEEPABLE, which better fits its intended use. Fixes: 62c07983bef9 ("once: add DO_ONCE_SLOW() for sleepable contexts") Cc: Christophe Leroy Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221003181413.1221968-1-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- include/linux/once.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index 176ab75b42df..bc714d414448 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -13,9 +13,9 @@ void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod); /* Variant for process contexts only. */ -bool __do_once_slow_start(bool *done); -void __do_once_slow_done(bool *done, struct static_key_true *once_key, - struct module *mod); +bool __do_once_sleepable_start(bool *done); +void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, + struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -61,26 +61,26 @@ void __do_once_slow_done(bool *done, struct static_key_true *once_key, }) /* Variant of DO_ONCE() for process/sleepable contexts. */ -#define DO_ONCE_SLOW(func, ...) \ - ({ \ - bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ - static DEFINE_STATIC_KEY_TRUE(___once_key); \ - if (static_branch_unlikely(&___once_key)) { \ - ___ret = __do_once_slow_start(&___done); \ - if (unlikely(___ret)) { \ - func(__VA_ARGS__); \ - __do_once_slow_done(&___done, &___once_key, \ - THIS_MODULE); \ - } \ - } \ - ___ret; \ +#define DO_ONCE_SLEEPABLE(func, ...) \ + ({ \ + bool ___ret = false; \ + static bool __section(".data.once") ___done = false; \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ + ___ret = __do_once_sleepable_start(&___done); \ + if (unlikely(___ret)) { \ + func(__VA_ARGS__); \ + __do_once_sleepable_done(&___done, &___once_key,\ + THIS_MODULE); \ + } \ + } \ + ___ret; \ }) #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) -#define get_random_slow_once(buf, nbytes) \ - DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes)) +#define get_random_sleepable_once(buf, nbytes) \ + DO_ONCE_SLEEPABLE(get_random_bytes, (buf), (nbytes)) #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From d7915651fe9a6474e06161c34c637e6aeb811e9d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Wed, 14 Sep 2022 16:47:42 +0200 Subject: clk: introduce (devm_)hw_register_mux_parent_data_table API Introduce (devm_)hw_register_mux_parent_data_table new API. We have basic support for clk_register_mux using parent_data but we lack any API to provide a custom parent_map. Add these 2 new API to correctly handle these special configuration instead of using the generic __(devm_)clk_hw_register_mux API. 
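A hedged usage sketch of the new helper; the clock names, register offset, and mux field values below are invented, and the argument order follows the macro added by this patch:

/* Parents described via clk_parent_data instead of global string names. */
static const struct clk_parent_data mux_parents[] = {
	{ .fw_name = "xo",    .name = "xo_board" },
	{ .fw_name = "gpll0", .name = "gpll0" },
};

/* HW field values that do not match the array indices, hence the table. */
static const u32 mux_parent_map[] = { 0x0, 0x3 };

hw = devm_clk_hw_register_mux_parent_data_table(dev, "sys_mux",
		mux_parents, ARRAY_SIZE(mux_parents), 0 /* flags */,
		base + 0x10 /* reg */, 0 /* shift */, 2 /* width */,
		0 /* clk_mux_flags */, mux_parent_map, &mux_lock);
if (IS_ERR(hw))
	return PTR_ERR(hw);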
Signed-off-by: Christian Marangi Link: https://lore.kernel.org/r/20220914144743.17369-1-ansuelsmth@gmail.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..65b70f0d62c5 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -974,6 +974,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, __clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, NULL, \ (parent_data), (flags), (reg), (shift), \ BIT((width)) - 1, (clk_mux_flags), NULL, (lock)) +#define clk_hw_register_mux_parent_data_table(dev, name, parent_data, \ + num_parents, flags, reg, shift, \ + width, clk_mux_flags, table, \ + lock) \ + __clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, NULL, \ + (parent_data), (flags), (reg), (shift), \ + BIT((width)) - 1, (clk_mux_flags), table, (lock)) #define devm_clk_hw_register_mux(dev, name, parent_names, num_parents, flags, reg, \ shift, width, clk_mux_flags, lock) \ __devm_clk_hw_register_mux((dev), NULL, (name), (num_parents), \ @@ -987,6 +994,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, (parent_hws), NULL, (flags), (reg), \ (shift), BIT((width)) - 1, \ (clk_mux_flags), NULL, (lock)) +#define devm_clk_hw_register_mux_parent_data_table(dev, name, parent_data, \ + num_parents, flags, reg, shift, \ + width, clk_mux_flags, table, \ + lock) \ + __devm_clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, \ + NULL, (parent_data), (flags), (reg), (shift), \ + BIT((width)) - 1, (clk_mux_flags), table, (lock)) int clk_mux_val_to_index(struct clk_hw *hw, const u32 *table, unsigned int flags, unsigned int val); -- cgit v1.2.3 From 681bf011b9b5989c6e9db6beb64494918aab9a43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Oct 2022 21:03:27 -0700 Subject: eth: pse: add missing static inlines build bot reports missing 'static inline' qualifiers in the header. 
Reported-by: kernel test robot Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Oleksij Rempel Link: https://lore.kernel.org/r/20221004040327.2034878-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index fd1a916eeeba..fb724c65c77b 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -110,16 +110,16 @@ static inline void pse_control_put(struct pse_control *psec) { } -int pse_ethtool_get_status(struct pse_control *psec, - struct netlink_ext_ack *extack, - struct pse_control_status *status) +static inline int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) { return -ENOTSUPP; } -int pse_ethtool_set_config(struct pse_control *psec, - struct netlink_ext_ack *extack, - const struct pse_control_config *config) +static inline int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) { return -ENOTSUPP; } -- cgit v1.2.3 From 5d10f480f77b67332e4835ad565bfe4cb8528159 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 18 Aug 2022 10:23:16 +0200 Subject: thermal/of: Remove the thermal_zone_of_get_sensor_id() function The function thermal_zone_of_get_sensor_id() is no longer used anywhere, remove it. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220818082316.2717095-2-daniel.lezcano@linaro.org --- include/linux/thermal.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6f1ec4fb7ef8..9ecc128944a1 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -308,9 +308,6 @@ void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_dev void thermal_of_zone_unregister(struct thermal_zone_device *tz); -int thermal_zone_of_get_sensor_id(struct device_node *tz_np, - struct device_node *sensor_np, - u32 *id); #else static inline struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, @@ -334,13 +331,6 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) { } - -static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np, - struct device_node *sensor_np, - u32 *id) -{ - return -ENOENT; -} #endif #ifdef CONFIG_THERMAL -- cgit v1.2.3 From 0ce38047e82a02017839b6cae837f13a1383a3a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 10:46:58 +0200 Subject: perf: Fix lockdep_assert_event_ctx() I'm a flamin' moron; because even after Mark told me it should be '&&' I still got it wrong in the final commit. 
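The intended invariant is that a caller either runs with hardirqs disabled (e.g. interrupt context) or holds ctx->mutex; the assertion should therefore warn only when hardirqs are enabled and the mutex is not held. A standalone model of the two conditions:

#include <stdbool.h>
#include <stdio.h>

/* true means the WARN_ON_ONCE() would fire */
static bool warn_buggy(bool hardirqs_enabled, bool ctx_mutex_held)
{
	return hardirqs_enabled || !ctx_mutex_held;	/* the '||' version */
}

static bool warn_fixed(bool hardirqs_enabled, bool ctx_mutex_held)
{
	return hardirqs_enabled && !ctx_mutex_held;	/* warn only when the
							   mutex is missing */
}

int main(void)
{
	/* A legitimate caller: process context holding ctx->mutex. */
	printf("buggy=%d fixed=%d\n",
	       warn_buggy(true, true), warn_fixed(true, true)); /* 1 vs 0 */
	return 0;
}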
Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Borislav Petkov Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov Link: https://lkml.kernel.org/r/YvvIWmDBWdIUCMZj@FVFF77S0Q05N --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e9b151cde491..853f64b6c8c2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -644,7 +644,7 @@ struct pmu_event_list { #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ - (this_cpu_read(hardirqs_enabled) || \ + (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) -- cgit v1.2.3 From 4c99256013fa4e0fe9733ca1bab2b5684ccc02a1 Mon Sep 17 00:00:00 2001 From: Raul E Rangel Date: Thu, 29 Sep 2022 10:19:09 -0600 Subject: gpiolib: acpi: Add wake_capable variants of acpi_dev_gpio_irq_get The ACPI spec defines the SharedAndWake and ExclusiveAndWake share type keywords. This is an indication that the GPIO IRQ can also be used as a wake source. This change exposes the wake_capable bit so drivers can correctly enable wake functionality instead of making an assumption. Signed-off-by: Raul E Rangel Reviewed-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..cd7371a5f283 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1202,7 +1202,8 @@ bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); -int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index); +int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index, + bool *wake_capable); #else static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) @@ -1214,16 +1215,28 @@ static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares, { return false; } -static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, - const char *name, int index) +static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, + int index, bool *wake_capable) { return -ENXIO; } #endif +static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index, + bool *wake_capable) +{ + return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable); +} + +static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, + int index) +{ + return acpi_dev_gpio_irq_wake_get_by(adev, name, index, NULL); +} + static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { - return acpi_dev_gpio_irq_get_by(adev, NULL, index); + return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL); } /* Device properties */ -- cgit v1.2.3 From 5ff811604f93bdd2650beed80b48c2ca16c6fba6 Mon Sep 17 00:00:00 2001 From: Raul E Rangel Date: Thu, 29 Sep 2022 10:19:10 -0600 Subject: ACPI: resources: Add wake_capable parameter to acpi_dev_irq_flags ACPI IRQ/Interrupt resources contain a bit that describes if the interrupt should wake the system. 
This change exposes that bit via a new IORESOURCE_IRQ_WAKECAPABLE flag. Drivers should check this flag before arming an IRQ to wake the system. Signed-off-by: Raul E Rangel Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 +- include/linux/ioport.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index cd7371a5f283..ea2efbdbeee5 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -495,7 +495,7 @@ bool acpi_dev_resource_address_space(struct acpi_resource *ares, struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, struct resource_win *win); -unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); +unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable); unsigned int acpi_dev_get_irq_type(int triggering, int polarity); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 616b683563a9..3baeea4d903b 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -79,7 +79,8 @@ struct resource { #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) #define IORESOURCE_IRQ_SHAREABLE (1<<4) -#define IORESOURCE_IRQ_OPTIONAL (1<<5) +#define IORESOURCE_IRQ_OPTIONAL (1<<5) +#define IORESOURCE_IRQ_WAKECAPABLE (1<<6) /* PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) -- cgit v1.2.3 From e7fd8b68404f3d8bc03f85bc3c078d67096be06f Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 29 Sep 2022 15:05:23 +0800 Subject: kernel/reboot: Add SYS_OFF_MODE_RESTART_PREPARE mode Add SYS_OFF_MODE_RESTART_PREPARE callbacks to be invoked before a system restart. Suggested-by: Dmitry Osipenko Reviewed-by: Dmitry Osipenko Signed-off-by: Kai-Heng Feng [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e5d9ef886179..2b6bb593be5b 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -105,6 +105,14 @@ enum sys_off_mode { */ SYS_OFF_MODE_POWER_OFF, + /** + * @SYS_OFF_MODE_RESTART_PREPARE: + * + * Handlers prepare system to be restarted. Handlers are + * allowed to sleep. + */ + SYS_OFF_MODE_RESTART_PREPARE, + /** * @SYS_OFF_MODE_RESTART: * -- cgit v1.2.3 From 0e0abad2a71bcd7ba0f30e7975f5b4199ade4e60 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Oct 2022 14:39:10 +0200 Subject: io_uring: Add missing inline to io_uring_cmd_import_fixed() dummy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_IO_URING is not set: include/linux/io_uring.h:65:12: error: ‘io_uring_cmd_import_fixed’ defined but not used [-Werror=unused-function] 65 | static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, | ^~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by adding the missing "inline" keyword. 
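The underlying rule: a plain static function defined in a header becomes a separate, file-local definition in every translation unit that includes it, and -Wunused-function fires in each unit that never calls it; static inline functions are exempt from that warning. A minimal reproduction sketch with hypothetical file names:

/* stub.h - mimics the !CONFIG_IO_URING stub */
#ifndef STUB_H
#define STUB_H

static int stub_plain(void)		/* warns: defined but not used */
{
	return -1;
}

static inline int stub_inline(void)	/* no warning */
{
	return -1;
}

#endif

/*
 * user.c - includes the header but calls neither function;
 * "gcc -Wall -c user.c" warns about stub_plain only.
 */
#include "stub.h"

int main(void)
{
	return 0;
}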
Fixes: a9216fac3ed8819c ("io_uring: add io_uring_cmd_import_fixed") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/7404b4a696f64e33e5ef3c5bd3754d4f26d13e50.1664887093.git.geert+renesas@glider.be Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e10c5cc81082..43bc8a2edccf 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -62,7 +62,7 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else -static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) { return -EOPNOTSUPP; -- cgit v1.2.3 From da4ab869e37cf81f93333ba74b16e0ea6d322e15 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 25 May 2022 06:11:00 -0400 Subject: libceph: drop last_piece flag from ceph_msg_data_cursor ceph_msg_data_next is always passed a NULL pointer for this field. Some of the "next" operations look at it in order to determine the length, but we can just take the min of the data on the page or cursor->resid. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e7f2fb2fc207..99c1726be6ee 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -207,7 +207,6 @@ struct ceph_msg_data_cursor { struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ - bool last_piece; /* current is last piece */ bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK @@ -498,8 +497,7 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, struct ceph_msg *msg, size_t length); struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece); + size_t *page_offset, size_t *length); void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, -- cgit v1.2.3 From bdef2b7896df293736330eb6eb0f43947049b828 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:41 +0200 Subject: vfio/mdev: make mdev.h standalone includable Include and so that users of this headers don't need to do that and remove those includes that aren't needed any more. 
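Judging by the types mdev.h itself uses (an embedded struct device and a guid_t), the two includes added below are presumably linux/device.h and linux/uuid.h; that is an inference, not part of the patch text. The general pattern of a standalone header, as a sketch:

/* foo.h - a self-contained header pulls in what its own types need. */
#ifndef FOO_H
#define FOO_H

#include <linux/device.h>	/* for the embedded struct device */
#include <linux/uuid.h>		/* for guid_t */

struct foo_device {
	struct device dev;
	guid_t uuid;
};

#endif /* FOO_H */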
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-4-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 47ad3b104d9e..a5d8ae6132a2 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,6 +10,9 @@ #ifndef MDEV_H #define MDEV_H +#include <linux/device.h> +#include <linux/uuid.h> + struct mdev_type; struct mdev_device { -- cgit v1.2.3 From 89345d5177aa0f6d678251e1e0870b0eeb1ab510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:42 +0200 Subject: vfio/mdev: embed struct mdev_parent in the parent data structure Simplify mdev_{un}register_device by requiring the caller to pass in a structure allocated as part of the parent device structure. This removes the need for a list of parents and the separate mdev_parent refcount, as we can simply rely on the reference to the parent device. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-5-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index a5d8ae6132a2..262512c2a8ff 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,6 +23,16 @@ struct mdev_device { bool active; }; +/* embedded into the struct device that the mdev devices hang off */ +struct mdev_parent { + struct device *dev; + struct mdev_driver *mdev_driver; + struct kset *mdev_types_kset; + struct list_head type_list; + /* Synchronize device creation/removal with parent unregistration */ + struct rw_semaphore unreg_sem; +}; + static inline struct mdev_device *to_mdev_device(struct device *dev) { return container_of(dev, struct mdev_device, dev); @@ -70,8 +80,9 @@ struct mdev_driver { extern struct bus_type mdev_bus_type; -int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver); -void mdev_unregister_device(struct device *dev); +int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver); +void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); -- cgit v1.2.3 From da44c340c4fe9d9653ae84fa6a60f406bafcffce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:43 +0200 Subject: vfio/mdev: simplify mdev_type handling Instead of abusing struct attribute_group to control initialization of struct mdev_type, just define the actual attributes in the mdev_driver, allocate the mdev_type structures in the caller and pass them to mdev_register_parent. This allows the caller to use container_of to get at the containing structure and thus significantly simplify the code.
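As a rough sketch of the resulting pattern (my_device and my_mdev_driver are made-up names, not from the patch), a driver now embeds the parent state and recovers its own structure with container_of():

    struct my_device {
        struct device *dev;
        struct mdev_parent parent;   /* embedded, no separate allocation */
    };

    static struct my_device *to_my_device(struct mdev_parent *p)
    {
        return container_of(p, struct my_device, parent);
    }

    /* registration per the prototype added above */
    err = mdev_register_parent(&mydev->parent, mydev->dev, &my_mdev_driver);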
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-6-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 262512c2a8ff..19bc93c10e8c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,14 +23,27 @@ struct mdev_device { bool active; }; +struct mdev_type { + /* set by the driver before calling mdev_register parent: */ + const char *sysfs_name; + + /* set by the core, can be used drivers */ + struct mdev_parent *parent; + + /* internal only */ + struct kobject kobj; + struct kobject *devices_kobj; +}; + /* embedded into the struct device that the mdev devices hang off */ struct mdev_parent { struct device *dev; struct mdev_driver *mdev_driver; struct kset *mdev_types_kset; - struct list_head type_list; /* Synchronize device creation/removal with parent unregistration */ struct rw_semaphore unreg_sem; + struct mdev_type **types; + unsigned int nr_types; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -38,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -unsigned int mdev_get_type_group_id(struct mdev_device *mdev); -unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); /* interface for exporting mdev supported type attributes */ @@ -66,22 +77,21 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ * struct mdev_driver - Mediated device driver * @probe: called when new device created * @remove: called when device removed - * @supported_type_groups: Attributes to define supported types. It is mandatory - * to provide supported types. + * @types_attrs: attributes to the type kobjects. * @driver: device driver structure - * **/ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); - struct attribute_group **supported_type_groups; + const struct attribute * const *types_attrs; struct device_driver driver; }; extern struct bus_type mdev_bus_type; int mdev_register_parent(struct mdev_parent *parent, struct device *dev, - struct mdev_driver *mdev_driver); + struct mdev_driver *mdev_driver, struct mdev_type **types, + unsigned int nr_types); void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); -- cgit v1.2.3 From cbf3bb28aaeaee425ca7b9c537a3efff1f8c98ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:44 +0200 Subject: vfio/mdev: remove mdev_from_dev Just open code it in the only caller. 
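The removed helper (its body is visible in the diff below) boils down to a bus check that the sole caller can perform directly; conceptually:

    struct mdev_device *mdev = NULL;

    if (dev->bus == &mdev_bus_type)
        mdev = to_mdev_device(dev);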
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-7-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 19bc93c10e8c..4f558de52fd9 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -102,9 +102,5 @@ static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; } -static inline struct mdev_device *mdev_from_dev(struct device *dev) -{ - return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL; -} #endif /* MDEV_H */ -- cgit v1.2.3 From 2815fe149ffa8e1a022b2830ab62999135c00a4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:45 +0200 Subject: vfio/mdev: unexport mdev_bus_type mdev_bus_type is only used in mdev.ko now, so unexport it. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-8-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 4f558de52fd9..6c179d2b8927 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -87,8 +87,6 @@ struct mdev_driver { struct device_driver driver; }; -extern struct bus_type mdev_bus_type; - int mdev_register_parent(struct mdev_parent *parent, struct device *dev, struct mdev_driver *mdev_driver, struct mdev_type **types, unsigned int nr_types); -- cgit v1.2.3 From 062e720cd209d8091c4f3d118d93973f02209aca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:46 +0200 Subject: vfio/mdev: remove mdev_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-9-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 6c179d2b8927..bbedffcb38d4 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -95,7 +95,6 @@ void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); -struct device *mdev_parent_dev(struct mdev_device *mdev); static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; -- cgit v1.2.3 From c7c1f38f6cba7e3249866c06639ea62755f0a24e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:47 +0200 Subject: vfio/mdev: remove mtype_get_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Jason J. 
Herne Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-10-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index bbedffcb38d4..e445f809ceca 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -51,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -struct device *mtype_get_parent_dev(struct mdev_type *mtype); - /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; -- cgit v1.2.3 From 290aac5df88a83e264b3a73ec146e5e5b3c45793 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Sep 2022 11:26:48 +0200 Subject: vfio/mdev: consolidate all the device_api sysfs into the core code Every driver just emits a static string, simply feed it through the ops and provide a standard sysfs show function. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-11-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index e445f809ceca..af1ff0165b8d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -61,11 +61,6 @@ struct mdev_type_attribute { size_t count); }; -#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ -struct mdev_type_attribute mdev_type_attr_##_name = \ - __ATTR(_name, _mode, _show, _store) -#define MDEV_TYPE_ATTR_RW(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name) #define MDEV_TYPE_ATTR_RO(_name) \ struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) #define MDEV_TYPE_ATTR_WO(_name) \ @@ -73,12 +68,14 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ /** * struct mdev_driver - Mediated device driver + * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @types_attrs: attributes to the type kobjects. * @driver: device driver structure **/ struct mdev_driver { + const char *device_api; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); const struct attribute * const *types_attrs; -- cgit v1.2.3 From 0bc79069ccbdbe26492493dd0c4e38b7cadf8ad5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:49 +0200 Subject: vfio/mdev: consolidate all the name sysfs into the core code Every driver just emits a static string, simply add a field to the mdev_type for the driver to fill out or fall back to the sysfs name and provide a standard sysfs show function. 
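A plausible shape of the consolidated core show function, assuming the mdev_type_attribute interface still present at this point in the series (a sketch, not the literal patch):

    static ssize_t name_show(struct mdev_type *mtype,
                             struct mdev_type_attribute *attr, char *buf)
    {
        /* fall back to the sysfs name when no pretty name is set */
        return sysfs_emit(buf, "%s\n",
                          mtype->pretty_name ?: mtype->sysfs_name);
    }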
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-12-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index af1ff0165b8d..4bb8a58b577b 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -26,6 +26,7 @@ struct mdev_device { struct mdev_type { /* set by the driver before calling mdev_register parent: */ const char *sysfs_name; + const char *pretty_name; /* set by the core, can be used drivers */ struct mdev_parent *parent; -- cgit v1.2.3 From f2fbc72e6da4f8e01fe5fe3d6871a791e76271c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:50 +0200 Subject: vfio/mdev: consolidate all the available_instance sysfs into the core code Every driver just prints a number; simply add a method to the mdev_driver to return it and provide a standard sysfs show function.
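A hypothetical driver-side callback for the new method (field names invented for illustration):

    static unsigned int my_mdev_get_available(struct mdev_type *mtype)
    {
        struct my_device *md = to_my_device(mtype->parent);

        return md->total_instances - md->used_instances;
    }

    /* wired up via: .get_available = my_mdev_get_available, */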
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-14-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index d39e08a1824c..33674cb5ed5d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -52,28 +52,13 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -/* interface for exporting mdev supported type attributes */ -struct mdev_type_attribute { - struct attribute attr; - ssize_t (*show)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf); - ssize_t (*store)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, const char *buf, - size_t count); -}; - -#define MDEV_TYPE_ATTR_RO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) -#define MDEV_TYPE_ATTR_WO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) - /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created - * @types_attrs: attributes to the type kobjects. + * @show_description: Print a description of the mtype * @driver: device driver structure **/ struct mdev_driver { @@ -81,7 +66,7 @@ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); - const struct attribute * const *types_attrs; + ssize_t (*show_description)(struct mdev_type *mtype, char *buf); struct device_driver driver; }; -- cgit v1.2.3 From 9c799c224d6ebc5be51065bd3217a2d7eea23b8f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Sep 2022 11:26:52 +0200 Subject: vfio/mdev: add mdev available instance checking to the core Many of the mdev drivers use a simple counter for keeping track of the available instances. Move this code to the core code and store the counter in the mdev_parent. Implement it using correct locking, fixing mdpy. Drivers just provide the value in the mdev_driver at registration time and the core code takes care of maintaining it and exposing the value in sysfs. 
[hch: count instances per-parent instead of per-type, use an atomic_t to avoid taking mdev_list_lock in the show method] Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-15-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 33674cb5ed5d..139d05b26f82 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -45,6 +45,7 @@ struct mdev_parent { struct rw_semaphore unreg_sem; struct mdev_type **types; unsigned int nr_types; + atomic_t available_instances; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -55,6 +56,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs + * @max_instances: maximum number of instances supported (optional) * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created @@ -63,6 +65,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) **/ struct mdev_driver { const char *device_api; + unsigned int max_instances; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); -- cgit v1.2.3 From 34e1ed189fab1b7533befb266b96051104c1deb6 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 8 Aug 2022 19:40:38 +0200 Subject: PM: Improve EXPORT_*_DEV_PM_OPS macros Update the _EXPORT_DEV_PM_OPS() internal macro. It was not used anywhere outside pm.h and pm_runtime.h, so it is safe to update it. Before, this macro would take a few parameters to be used as sleep and runtime callbacks. This made it unsuitable to use with different callbacks, for instance the "noirq" ones. It is now semantically different: instead of creating a conditionally exported dev_pm_ops structure, it only contains part of the definition. This macro should however never be used directly (hence the trailing underscore). Instead, the following four macros are provided: - EXPORT_DEV_PM_OPS(name) - EXPORT_GPL_DEV_PM_OPS(name) - EXPORT_NS_DEV_PM_OPS(name, ns) - EXPORT_NS_GPL_DEV_PM_OPS(name, ns) For instance, it is now possible to conditionally export noirq suspend/resume PM functions like this: EXPORT_GPL_DEV_PM_OPS(foo_pm_ops) = { NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) }; The existing helper macros EXPORT_*_SIMPLE_DEV_PM_OPS() and EXPORT_*_RUNTIME_DEV_PM_OPS() have been updated to use these new macros. Signed-off-by: Paul Cercueil Reviewed-by: Jonathan Cameron Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 37 +++++++++++++++++++++++-------------- include/linux/pm_runtime.h | 20 ++++++++++++-------- 2 files changed, 35 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 871c9c49ec9d..93cd34f00822 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -375,19 +375,20 @@ const struct dev_pm_ops name = { \ } #ifdef CONFIG_PM -#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec, ns) \ - _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn); \ - __EXPORT_SYMBOL(name, sec, ns) +#define _EXPORT_DEV_PM_OPS(name, sec, ns) \ + const struct dev_pm_ops name; \ + __EXPORT_SYMBOL(name, sec, ns); \ + const struct dev_pm_ops name #else -#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec, ns) \ -static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ - resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn) +#define _EXPORT_DEV_PM_OPS(name, sec, ns) \ + static __maybe_unused const struct dev_pm_ops __static_##name #endif +#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "") +#define EXPORT_GPL_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "_gpl", "") +#define EXPORT_NS_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "", #ns) +#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "_gpl", #ns) + /* * Use this if you want to use the same suspend and resume callbacks for suspend * to RAM and hibernation. @@ -399,13 +400,21 @@ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL) #define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", "") + EXPORT_DEV_PM_OPS(name) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } #define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", "") + EXPORT_GPL_DEV_PM_OPS(name) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } #define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", #ns) + EXPORT_NS_DEV_PM_OPS(name, ns) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } #define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", #ns) + EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } /* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. 
*/ #define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0a41b2dcccad..9a8151a2bdea 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -40,17 +40,21 @@ resume_fn, idle_fn) #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "", "") + EXPORT_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "_gpl", "") + EXPORT_GPL_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "", #ns) + EXPORT_NS_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "_gpl", #ns) + EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; -- cgit v1.2.3 From a9cfee0ef98e99c8b1951dfd1d57a88580354d0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Sep 2022 23:38:53 +0800 Subject: f2fs: support recording stop_checkpoint reason into super_block This patch adds support for recording the stop_checkpoint reason into f2fs_super_block.s_stop_reason[]. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d445150c5350..5dd1e52b8997 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -73,6 +73,20 @@ struct f2fs_device { __le32 total_segments; } __packed; +/* reason of stop_checkpoint */ +enum stop_cp_reason { + STOP_CP_REASON_SHUTDOWN, + STOP_CP_REASON_FAULT_INJECT, + STOP_CP_REASON_META_PAGE, + STOP_CP_REASON_WRITE_FAIL, + STOP_CP_REASON_CORRUPTED_SUMMARY, + STOP_CP_REASON_UPDATE_INODE, + STOP_CP_REASON_FLUSH_FAIL, + STOP_CP_REASON_MAX, +}; + +#define MAX_STOP_REASON 32 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -116,7 +130,8 @@ struct f2fs_super_block { __u8 hot_ext_count; /* # of hot file extension */ __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __u8 reserved[306]; /* valid reserved region */ + __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ + __u8 reserved[274]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; -- cgit v1.2.3 From 95fa90c9e5a7f14c2497d5b032544478c9377c3a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Sep 2022 23:38:54 +0800 Subject: f2fs: support recording errors into superblock This patch adds support for recording the detailed reason of an FSCORRUPTED error into f2fs_super_block.s_errors[].
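A minimal sketch of how such an error might be latched into the on-disk field, assuming a byte-wise bitmap encoding of enum f2fs_error; the actual f2fs helper and persistence path may differ:

    static void my_record_error(struct f2fs_super_block *sb,
                                enum f2fs_error err)
    {
        sb->s_errors[err / 8] |= 1 << (err % 8);
        /* the superblock must then be written back, e.g. at checkpoint */
    }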
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5dd1e52b8997..ee0d75d9a302 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -87,6 +87,28 @@ enum stop_cp_reason { #define MAX_STOP_REASON 32 +/* detail reason for EFSCORRUPTED */ +enum f2fs_error { + ERROR_CORRUPTED_CLUSTER, + ERROR_FAIL_DECOMPRESSION, + ERROR_INVALID_BLKADDR, + ERROR_CORRUPTED_DIRENT, + ERROR_CORRUPTED_INODE, + ERROR_INCONSISTENT_SUMMARY, + ERROR_INCONSISTENT_FOOTER, + ERROR_INCONSISTENT_SUM_TYPE, + ERROR_CORRUPTED_JOURNAL, + ERROR_INCONSISTENT_NODE_COUNT, + ERROR_INCONSISTENT_BLOCK_COUNT, + ERROR_INVALID_CURSEG, + ERROR_INCONSISTENT_SIT, + ERROR_CORRUPTED_VERITY_XATTR, + ERROR_CORRUPTED_XATTR, + ERROR_MAX, +}; + +#define MAX_F2FS_ERRORS 16 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -131,7 +153,8 @@ struct f2fs_super_block { __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ - __u8 reserved[274]; /* valid reserved region */ + __u8 s_errors[MAX_F2FS_ERRORS]; /* reason of image corrupts */ + __u8 reserved[258]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; -- cgit v1.2.3 From f1375ec1df09fb09fecfdf9418d3784e7ba12469 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Wed, 17 Aug 2022 11:46:27 +0800 Subject: cpufreq: amd-pstate: Expose struct amd_cpudata Expose struct amd_cpudata to AMD P-State unit test module. This data struct will be used by the following AMD P-State unit test (amd-pstate-ut) module. The amd-pstate-ut module can get some AMD information from this data struct. For example: highest perf, nominal perf, boost supported, etc. Signed-off-by: Meng Li Acked-by: Huang Rui Acked-by: Shuah Khan Signed-off-by: Shuah Khan --- include/linux/amd-pstate.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/linux/amd-pstate.h (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h new file mode 100644 index 000000000000..1c4b8659f171 --- /dev/null +++ b/include/linux/amd-pstate.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/include/linux/amd-pstate.h + * + * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ * + * Author: Meng Li + */ + +#ifndef _LINUX_AMD_PSTATE_H +#define _LINUX_AMD_PSTATE_H + +#include <linux/pm_qos.h> + +/********************************************************************* + * AMD P-state INTERFACE * + *********************************************************************/ +/** + * struct amd_aperf_mperf + * @aperf: actual performance frequency clock count + * @mperf: maximum performance frequency clock count + * @tsc: time stamp counter + */ +struct amd_aperf_mperf { + u64 aperf; + u64 mperf; + u64 tsc; +}; + +/** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number + * @req: constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor + * @max_freq: the frequency that mapped to highest_perf + * @min_freq: the frequency that mapped to lowest_perf + * @nominal_freq: the frequency that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf + * @cur: Difference of Aperf/Mperf/tsc count between last and current sample + * @prev: Last Aperf/Mperf/tsc count value read from register + * @freq: current cpu frequency value + * @boost_supported: check whether the Processor or SBIOS supports boost mode + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. + */ +struct amd_cpudata { + int cpu; + + struct freq_qos_request req[2]; + u64 cppc_req_cached; + + u32 highest_perf; + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; + + u32 max_freq; + u32 min_freq; + u32 nominal_freq; + u32 lowest_nonlinear_freq; + + struct amd_aperf_mperf cur; + struct amd_aperf_mperf prev; + + u64 freq; + bool boost_supported; +}; + +#endif /* _LINUX_AMD_PSTATE_H */ -- cgit v1.2.3 From 07d2872bf4c864eb83d034263c155746a2fb7a3b Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Wed, 28 Sep 2022 12:57:44 +0300 Subject: mmc: core: Add SD card quirk for broken discard Some SD-cards from Sandisk that are SDA-6.0 compliant report that they support discard, while they actually don't. This might cause mke2fs to fail while trying to format the card and revert it to read-only mode. To fix this problem, let's add a card quirk (MMC_QUIRK_BROKEN_SD_DISCARD) to indicate that we shall fall back to using the legacy erase command instead.
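An illustrative gate in the discard path; SD_ERASE_ARG/SD_DISCARD_ARG and card->erase_arg come from the mmc core, but treat this as a sketch rather than the exact upstream change:

    unsigned int arg = card->erase_arg;

    if ((card->quirks & MMC_QUIRK_BROKEN_SD_DISCARD) &&
        arg == SD_DISCARD_ARG)
        arg = SD_ERASE_ARG;   /* fall back to legacy erase */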
Signed-off-by: Avri Altman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220928095744.16455-1-avri.altman@wdc.com [Ulf: Updated the commit message] Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 8a30de08e913..c726ea781255 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -293,6 +293,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_IRQ_POLLING (1<<11) /* Polling SDIO_CCCR_INTx could create a fake interrupt */ #define MMC_QUIRK_TRIM_BROKEN (1<<12) /* Skip trim */ #define MMC_QUIRK_BROKEN_HPI (1<<13) /* Disable broken HPI support */ +#define MMC_QUIRK_BROKEN_SD_DISCARD (1<<14) /* Disable broken SD discard support */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From 90d482908eedd56f01a707325aa541cf9c40f936 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Oct 2022 16:34:17 +0100 Subject: lib/find_bit: Introduce find_next_andnot_bit() In preparation for introducing for_each_cpu_andnot(), add a variant of find_next_bit() that negates the bits in @addr2 when ANDing them with the bits in @addr1. Signed-off-by: Valentin Schneider --- include/linux/find.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 0cdfab9734a6..2235af19e7e1 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -12,6 +12,8 @@ unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); +unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); @@ -91,6 +93,37 @@ unsigned long find_next_and_bit(const unsigned long *addr1, } #endif +#ifndef find_next_andnot_bit +/** + * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits + * in *addr2 + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_andnot_bit(addr1, addr2, size, offset); +} +#endif + #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region -- cgit v1.2.3 From 5f75ff295c662c1c8fb9e1737e9dc3b9a1e7fb29 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Oct 2022 16:34:18 +0100 Subject: cpumask: Introduce for_each_cpu_andnot() for_each_cpu_and() is very convenient as it saves having to allocate a temporary cpumask to store the result of cpumask_and().
The same issue applies to cpumask_andnot() which doesn't actually need temporary storage for iteration purposes. Following what has been done for for_each_cpu_and(), introduce for_each_cpu_andnot(). Signed-off-by: Valentin Schneider --- include/linux/cpumask.h | 18 ++++++++++++++++++ include/linux/find.h | 5 +++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 286804bfe3b7..2f065ad97541 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -306,6 +306,24 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits) +/** + * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding + * those present in another. + * @cpu: the (optionally unsigned) integer iterator + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer + * + * This saves a temporary CPU mask in many places. It is equivalent to: + * struct cpumask tmp; + * cpumask_andnot(&tmp, &mask1, &mask2); + * for_each_cpu(cpu, &tmp) + * ... + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_andnot(cpu, mask1, mask2) \ + for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits) + /** * cpumask_any_but - return a "random" in a cpumask, but not this one. * @mask: the cpumask to search diff --git a/include/linux/find.h b/include/linux/find.h index 2235af19e7e1..ccaf61a0f5fd 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -498,6 +498,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) +#define for_each_andnot_bit(bit, addr1, addr2, size) \ + for ((bit) = 0; \ + (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ + (bit)++) + /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) -- cgit v1.2.3 From 39494194f93bed7926d4b3bd03a6a76ba23e612b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Oct 2022 15:57:35 -0400 Subject: SUNRPC: Fix races with rpc_killall_tasks() Ensure that we immediately call rpc_exit_task() after waking up, and that the tk_rpc_status cannot get clobbered by some other function. 
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index acc62647317c..647247040ef9 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -209,6 +209,7 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); +bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); -- cgit v1.2.3 From f8423909ecca208834a9d704e58409800f8b5f21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Oct 2022 15:57:36 -0400 Subject: SUNRPC: Add a helper to allow pNFS drivers to selectively cancel RPC calls Add the helper rpc_cancel_tasks(), which uses a caller-defined selection function to define a set of in-flight RPC calls to cancel. This is mainly intended for pNFS drivers which are subject to a layout recall, and which may therefore want to cancel all pending I/O using that layout in order to redrive it after the layout recall has been satisfied. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 647247040ef9..cdcf0fe56a6f 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -210,11 +210,16 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); +void rpc_task_try_cancel(struct rpc_task *task, int error); void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); void rpc_release_calldata(const struct rpc_call_ops *, void *); void rpc_killall_tasks(struct rpc_clnt *); +unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, + bool (*fnmatch)(const struct rpc_task *, + const void *), + const void *data); void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); -- cgit v1.2.3 From dc4c4304855a5721d214e2a53e17df5152dd5f34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Oct 2022 15:57:37 -0400 Subject: SUNRPC: Add API to force the client to disconnect Allow the caller to force a disconnection of the RPC client so that we can clear any pending requests that are buffered in the socket. 
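A hypothetical pNFS-style use of the helper introduced above, cancelling every task whose calldata points at a given layout segment (the matching criterion is purely illustrative):

    static bool my_match_lseg(const struct rpc_task *task, const void *data)
    {
        return task->tk_calldata == data;
    }

    rpc_cancel_tasks(clnt, -EAGAIN, my_match_lseg, lseg);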
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 75eea5ebb179..770ef2cb5775 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -246,6 +246,7 @@ void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); +void rpc_clnt_disconnect(struct rpc_clnt *clnt); void rpc_cleanup_clids(void); static inline int rpc_reply_expected(struct rpc_task *task) -- cgit v1.2.3 From a890d1c657ecba73a7b28591c92587aef1be1888 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 12:54:38 +0200 Subject: random: clear new batches when bringing new CPUs online The commit that added the new get_random_{u8,u16}() functions neglected to update the code that clears the batches when bringing up a new CPU. It also forgot a few comments and helper defines, so add those in too. Fixes: 585cd5fe9f73 ("random: add 8-bit and 16-bit batches") Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 2c130f8f18e5..08322f700cdc 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -96,6 +96,8 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) *out = get_random_ ## name(); \ return 0; \ } +declare_get_random_var_wait(u8, u8) +declare_get_random_var_wait(u16, u16) declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) declare_get_random_var_wait(int, unsigned int) -- cgit v1.2.3 From e3e6e1d16a4cf7b63159ec71774e822194071954 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Tue, 27 Sep 2022 07:34:59 +0800 Subject: wifi: wext: use flex array destination for memcpy() Syzkaller reports buffer overflow false positive as follows: ------------[ cut here ]------------ memcpy: detected field-spanning write (size 8) of single field "&compat_event->pointer" at net/wireless/wext-core.c:623 (size 4) WARNING: CPU: 0 PID: 3607 at net/wireless/wext-core.c:623 wireless_send_event+0xab5/0xca0 net/wireless/wext-core.c:623 Modules linked in: CPU: 1 PID: 3607 Comm: syz-executor659 Not tainted 6.0.0-rc6-next-20220921-syzkaller #0 [...] Call Trace: ioctl_standard_call+0x155/0x1f0 net/wireless/wext-core.c:1022 wireless_process_ioctl+0xc8/0x4c0 net/wireless/wext-core.c:955 wext_ioctl_dispatch net/wireless/wext-core.c:988 [inline] wext_ioctl_dispatch net/wireless/wext-core.c:976 [inline] wext_handle_ioctl+0x26b/0x280 net/wireless/wext-core.c:1049 sock_ioctl+0x285/0x640 net/socket.c:1220 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] Wireless events will be sent on the appropriate channels in wireless_send_event(). Different wireless events may have different payload structure and size, so kernel uses **len** and **cmd** field in struct __compat_iw_event as wireless event common LCP part, uses **pointer** as a label to mark the position of remaining different part. 
Yet the problem is that **pointer** is a compat_caddr_t type, which may be smaller than the relative structure at the same position. So when wireless_send_event() tries to parse the wireless event payload, it may trigger the memcpy() run-time destination buffer bounds checking when the relative structure's data is copied to the position marked by **pointer**. This patch solves it by introducing the flexible-array field **ptr_bytes** to mark the position of the wireless event's remaining part next to the LCP part. What's more, this patch also adds a **ptr_len** variable in wireless_send_event() to improve its maintainability. Reported-and-tested-by: syzbot+473754e5af963cf014cf@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/00000000000070db2005e95a5984@google.com/ Suggested-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: Hawkins Jiawei Signed-off-by: Johannes Berg --- include/linux/wireless.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wireless.h b/include/linux/wireless.h index 2d1b54556eff..e6e34d74dda0 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -26,7 +26,15 @@ struct compat_iw_point { struct __compat_iw_event { __u16 len; /* Real length of this stuff */ __u16 cmd; /* Wireless IOCTL */ - compat_caddr_t pointer; + + union { + compat_caddr_t pointer; + + /* we need ptr_bytes to make memcpy() run-time destination + * buffer bounds checking happy, nothing special + */ + DECLARE_FLEX_ARRAY(__u8, ptr_bytes); + }; }; #define IW_EV_COMPAT_LCP_LEN offsetof(struct __compat_iw_event, pointer) #define IW_EV_COMPAT_POINT_OFF offsetof(struct compat_iw_point, length) -- cgit v1.2.3 From cdbd952bb7b5fca36676b3d318796b196b127397 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 15 Aug 2022 18:04:20 -0400 Subject: virtio: drop vp_legacy_set_queue_size There's actually no way to set queue size on legacy virtio pci. Signed-off-by: Michael S. Tsirkin Message-Id: <20220815220447.155860-1-mst@redhat.com> --- include/linux/virtio_pci_legacy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_pci_legacy.h b/include/linux/virtio_pci_legacy.h index e5d665faf00e..a8dc757d0367 100644 --- a/include/linux/virtio_pci_legacy.h +++ b/include/linux/virtio_pci_legacy.h @@ -32,8 +32,6 @@ void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev, u16 index, u32 queue_pfn); bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev, u16 idx); -void vp_legacy_set_queue_size(struct virtio_pci_legacy_device *ldev, - u16 idx, u16 size); u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev, u16 idx); int vp_legacy_probe(struct virtio_pci_legacy_device *ldev); -- cgit v1.2.3 From 90fea5a800c3dd80fb8ad9a02929bcef5fde42b8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 27 Sep 2022 15:48:08 +0800 Subject: vdpa: device feature provisioning This patch allows the device features to be provisioned through netlink. A new attribute is introduced to allow userspace to pass 64-bit device features when adding a device. This provides several advantages: - Allow provisioning a subset of the features to ease cross-vendor live migration. - Better debuggability for the vDPA framework and parent drivers. Reviewed-by: Eli Cohen Signed-off-by: Jason Wang Message-Id: <20220927074810.28627-2-jasowang@redhat.com> Signed-off-by: Michael S.
Tsirkin --- include/linux/vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index d282f464d2f1..6d0f5e4e82c2 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -104,6 +104,7 @@ struct vdpa_iova_range { }; struct vdpa_dev_set_config { + u64 device_features; struct { u8 mac[ETH_ALEN]; u16 mtu; -- cgit v1.2.3 From 4b22ef042d6f54a6e5899555f2db71749133eca8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Oct 2022 11:04:39 -0300 Subject: vfio: Add vfio_file_is_group() This replaces uses of vfio_file_iommu_group() which were only detecting if the file is a VFIO file with no interest in the actual group. The only remaining user of vfio_file_iommu_group() is in KVM for the SPAPR stuff. It passes the iommu_group into the arch code through kvm for some reason. Tested-by: Matthew Rosato Tested-by: Christian Borntraeger Tested-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ee399a768070..e7cebeb875dd 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -199,6 +199,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, * External user API */ struct iommu_group *vfio_file_iommu_group(struct file *file); +bool vfio_file_is_group(struct file *file); bool vfio_file_enforced_coherent(struct file *file); void vfio_file_set_kvm(struct file *file, struct kvm *kvm); bool vfio_file_has_dev(struct file *file, struct vfio_device *device); -- cgit v1.2.3 From 9afd20a5a1b3558950b5e12f3ed057ef93c64a05 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Oct 2022 09:52:37 +0530 Subject: clk: spear: Move prototype to accessible header Fixes the following W=1 kernel build warning(s): drivers/clk/spear/spear6xx_clock.c:116:13: warning: no previous prototype for function 'spear6xx_clk_init' [-Wmissing-prototypes] Reported-by: kernel test robot Signed-off-by: Viresh Kumar Signed-off-by: Arnd Bergmann --- include/linux/clk/spear.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/spear.h b/include/linux/clk/spear.h index a64d034ceddd..eaf95ca656f8 100644 --- a/include/linux/clk/spear.h +++ b/include/linux/clk/spear.h @@ -8,6 +8,20 @@ #ifndef __LINUX_CLK_SPEAR_H #define __LINUX_CLK_SPEAR_H +#ifdef CONFIG_ARCH_SPEAR3XX +void __init spear3xx_clk_init(void __iomem *misc_base, + void __iomem *soc_config_base); +#else +static inline void __init spear3xx_clk_init(void __iomem *misc_base, + void __iomem *soc_config_base) {} +#endif + +#ifdef CONFIG_ARCH_SPEAR6XX +void __init spear6xx_clk_init(void __iomem *misc_base); +#else +static inline void __init spear6xx_clk_init(void __iomem *misc_base) {} +#endif + #ifdef CONFIG_MACH_SPEAR1310 void __init spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base); #else -- cgit v1.2.3 From ca5eebda3e1c1a58a1c5a337da393ed6734593e3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 3 Oct 2022 09:35:34 -0400 Subject: block: avoid sign extend problem with default queue flags mask request_queue->queue_flags is unsigned long, which is 8-bytes on 64-bit architectures. Most queue flag modifications occur through bit field helpers, but default flags can be logically OR'd via the QUEUE_FLAG_MQ_DEFAULT mask.
If this mask happens to include bit 31, the assignment can sign extend the field and set all upper 32 bits. This exact problem has been observed on a downstream kernel that happens to use bit 31 for QUEUE_FLAG_NOWAIT. This is not an immediate problem for current upstream because bit 31 is not included in the default flag assignment (and is not used at all, actually). Regardless, fix up the QUEUE_FLAG_MQ_DEFAULT mask definition to avoid the landmine in the future. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221003133534.1075582-1-bfoster@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49373d002631..e4fb47753177 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -580,9 +580,9 @@ struct request_queue { #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_SAME_COMP) | \ - (1 << QUEUE_FLAG_NOWAIT)) +#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ + (1UL << QUEUE_FLAG_SAME_COMP) | \ + (1UL << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -- cgit v1.2.3 From 81895a65ec63ee1daec3255dc1a06675d2fbe915 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 16:43:38 +0200 Subject: treewide: use prandom_u32_max() when possible, part 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done mechanically with this coccinelle script: @basic@ expression E; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u64; @@ ( - ((T)get_random_u32() % (E)) + prandom_u32_max(E) | - ((T)get_random_u32() & ((E) - 1)) + prandom_u32_max(E * XXX_MAKE_SURE_E_IS_POW2) | - ((u64)(E) * get_random_u32() >> 32) + prandom_u32_max(E) | - ((T)get_random_u32() & ~PAGE_MASK) + prandom_u32_max(PAGE_SIZE) ) @multi_line@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; identifier RAND; expression E; @@ - RAND = get_random_u32(); ... when != RAND - RAND %= (E); + RAND = prandom_u32_max(E); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Add one to the literal. 
@script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value == 2**32 - 1 or value == 2**31 - 1 or value == 2**24 - 1 or value == 2**16 - 1 or value == 2**8 - 1: print("Skipping 0x%x for cleanup elsewhere" % (value)) cocci.include_match(False) elif value & (value + 1) != 0: print("Skipping 0x%x because it's not a power of two minus one" % (value)) cocci.include_match(False) elif literal.startswith('0x'): coccinelle.RESULT = cocci.make_expr("0x%x" % (value + 1)) else: coccinelle.RESULT = cocci.make_expr("%d" % (value + 1)) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; expression add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + prandom_u32_max(RESULT) @collapse_ret@ type T; identifier VAR; expression E; @@ { - T VAR; - VAR = (E); - return VAR; + return E; } @drop_var@ type T; identifier VAR; @@ { - T VAR; ... when != VAR } Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kees Cook Reviewed-by: Yury Norov Reviewed-by: KP Singh Reviewed-by: Jan Kara # for ext4 and sbitmap Reviewed-by: Christoph Böhmwalder # for drbd Acked-by: Jakub Kicinski Acked-by: Heiko Carstens # for s390 Acked-by: Ulf Hansson # for mmc Acked-by: Darrick J. Wong # for xfs Signed-off-by: Jason A. Donenfeld --- include/linux/nodemask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 378956c93c94..efef68c9352a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -516,7 +516,7 @@ static inline int node_random(const nodemask_t *maskp) bit = first_node(*maskp); break; default: - bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() % w); + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, prandom_u32_max(w)); break; } return bit; -- cgit v1.2.3 From de492c83cae0af72de370b9404aacda93dafcad5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 17:50:20 +0200 Subject: prandom: remove unused functions With no callers left of prandom_u32() and prandom_bytes(), as well as get_random_int(), remove these deprecated wrappers, in favor of get_random_u32() and get_random_bytes(). Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kees Cook Reviewed-by: Yury Norov Acked-by: Jakub Kicinski Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 12 ------------ include/linux/random.h | 5 ----- 2 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 78db003bc290..e0a0759dd09c 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -12,18 +12,6 @@ #include #include -/* Deprecated: use get_random_u32 instead. */ -static inline u32 prandom_u32(void) -{ - return get_random_u32(); -} - -/* Deprecated: use get_random_bytes instead. 
*/ -static inline void prandom_bytes(void *buf, size_t nbytes) -{ - return get_random_bytes(buf, nbytes); -} - struct rnd_state { __u32 s1, s2, s3, s4; }; diff --git a/include/linux/random.h b/include/linux/random.h index 08322f700cdc..147a5e0d0b8e 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -42,10 +42,6 @@ u8 get_random_u8(void); u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); -static inline unsigned int get_random_int(void) -{ - return get_random_u32(); -} static inline unsigned long get_random_long(void) { #if BITS_PER_LONG == 64 @@ -100,7 +96,6 @@ declare_get_random_var_wait(u8, u8) declare_get_random_var_wait(u16, u16) declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(int, unsigned int) declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var -- cgit v1.2.3 From 6a961bffd1c3505c13b4d33bbb8385fe08239cb8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 2 Sep 2022 11:41:46 +0800 Subject: include/linux/entry-common.h: remove has_signal comment of arch_do_signal_or_restart() prototype The argument has_signal of arch_do_signal_or_restart() has been removed in commit 8ba62d37949e ("task_work: Call tracehook_notify_signal from get_signal on all architectures"), let us remove the related comment. Link: https://lkml.kernel.org/r/1662090106-5545-1-git-send-email-yangtiezhu@loongson.cn Fixes: 8ba62d37949e ("task_work: Call tracehook_notify_signal from get_signal on all architectures") Signed-off-by: Tiezhu Yang Reviewed-by: Kees Cook Signed-off-by: Andrew Morton --- include/linux/entry-common.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 84a466b176cf..d95ab85f96ba 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -253,7 +253,6 @@ static __always_inline void arch_exit_to_user_mode(void) { } /** * arch_do_signal_or_restart - Architecture specific signal delivery function * @regs: Pointer to currents pt_regs - * @has_signal: actual signal to handle * * Invoked from exit_to_user_mode_loop(). */ -- cgit v1.2.3 From fac35ba763ed07ba93154c95ffc0c4a55023707f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 1 Sep 2022 18:41:31 +0800 Subject: mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size specified. So when looking up a CONT-PTE size hugetlb page by follow_page(), it will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size hugetlb in follow_page_pte(). However this pte entry lock is incorrect for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get the correct lock, which is mm->page_table_lock. That means the pte entry of the CONT-PTE size hugetlb under current pte lock is unstable in follow_page_pte(), we can continue to migrate or poison the pte entry of the CONT-PTE size hugetlb, which can cause some potential race issues, even though they are under the 'pte lock'. 
For example, suppose thread A is trying to look up a CONT-PTE size hugetlb page by the move_pages() syscall under the lock, while another thread B migrates the CONT-PTE hugetlb page at the same time, which will cause thread A to get an incorrect page. If thread A then also does page migration, a data inconsistency error occurs. Moreover, we have the same issue for CONT-PMD size hugetlb in follow_huge_pmd().

To fix the above issues, rename follow_huge_pmd() to follow_huge_pmd_pte() to handle both PMD and PTE level size hugetlb, and use huge_pte_lock() to get the correct pte entry lock to make the pte entry stable.

Mike said:

Support for CONT_PMD/_PTE was added with bb9dd3df8ee9 ("arm64: hugetlb: refactor find_num_contig()"). Patch series "Support for contiguous pte hugepages", v4. However, I do not believe these code paths were executed until migration support was added with 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages"). I would go with 5480280d3f2d for the Fixes: target.

Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.1662017562.git.baolin.wang@linux.alibaba.com
Fixes: 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages")
Signed-off-by: Baolin Wang
Suggested-by: Mike Kravetz
Reviewed-by: Mike Kravetz
Cc: David Hildenbrand
Cc: Muchun Song
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3ec981a0d8b3..67c88b82fc32 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -207,8 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int flags);
+struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
+				 int flags);
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
			     pud_t *pud, int flags);
 struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
@@ -312,8 +312,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
	return NULL;
 }

-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
-		unsigned long address, pmd_t *pmd, int flags)
+static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
+		unsigned long address, int flags)
 {
	return NULL;
 }
-- cgit v1.2.3
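The shape of the fix is easiest to see as a sketch (illustrative only; the real follow_huge_pmd_pte() lives in mm/hugetlb.c, and example_lookup() is a hypothetical name):

	/* Minimal sketch, not the upstream implementation: the point is
	 * that the lookup must take huge_pte_lock(), which resolves to
	 * mm->page_table_lock for CONT-PTE/PMD sizes, so the entry cannot
	 * be migrated or poisoned underneath us, instead of the per-PMD
	 * lock returned by pte_offset_map_lock(). */
	static struct page *example_lookup(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
	{
		struct hstate *h = hstate_vma(vma);
		struct page *page = NULL;
		spinlock_t *ptl;
		pte_t pte;

		ptl = huge_pte_lock(h, vma->vm_mm, ptep);	/* correct lock */
		pte = huge_ptep_get(ptep);
		if (pte_present(pte))
			page = pte_page(pte);
		spin_unlock(ptl);
		return page;
	}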
From 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 3 Oct 2022 13:59:47 +0100
Subject: io_uring/af_unix: defer registered files gc to io_uring release

Instead of putting io_uring's registered files in unix_gc(), we want it to be done by io_uring itself. The trick here is to consider io_uring registered files for cycle detection but not actually put them down. Because io_uring can't register other ring instances, this will remove all refs to the ring file, triggering the ->release path and cleaning up with io_ring_ctx_free().

Cc: stable@vger.kernel.org
Fixes: 6b06314c47e1 ("io_uring: add file set registration")
Reported-and-tested-by: David Bouman
Signed-off-by: Pavel Begunkov
Signed-off-by: Thadeu Lima de Souza Cascardo
[axboe: add kerneldoc comment to skb, fold in skb leak fix]
Signed-off-by: Jens Axboe
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9fcf534f2d92..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t;
  * @csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
  * @dst_pending_confirm: need to confirm neighbour
  * @decrypted: Decrypted SKB
  * @slow_gro: state present at GRO time, slower prepare step required
@@ -982,6 +983,7 @@ struct sk_buff {
 #endif
	__u8			slow_gro:1;
	__u8			csum_not_inet:1;
+	__u8			scm_io_uring:1;

 #ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
-- cgit v1.2.3
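The header hunk above only adds the flag; as a sketch of how the two sides use it, based on the commit message (illustrative functions with hypothetical names; the real hunks are in io_uring/rsrc.c and net/unix/garbage.c):

	/* io_uring side (sketch): tag the skb carrying registered files
	 * before queueing it on io_uring's internal AF_UNIX socket. */
	static void example_queue_registered_files(struct sock *sk,
						   struct sk_buff *skb)
	{
		skb->scm_io_uring = 1;
		skb_queue_tail(&sk->sk_receive_queue, skb);
	}

	/* unix_gc() side (sketch): tagged skbs take part in cycle
	 * detection, but instead of being purged with the hitlist they
	 * are handed back to their socket; io_uring's own ->release
	 * path puts the registered files later. */
	static void example_spare_io_uring_skbs(struct sk_buff_head *hitlist)
	{
		struct sk_buff *skb, *next_skb;

		skb_queue_walk_safe(hitlist, skb, next_skb) {
			if (skb->scm_io_uring) {
				__skb_unlink(skb, hitlist);
				skb_queue_tail(&skb->sk->sk_receive_queue, skb);
			}
		}
	}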
From b7a817752efc850603c4c23ed78da2b990a6a34a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 4 Oct 2022 03:19:25 +0100
Subject: io_uring: remove notif leftovers

Notifications were killed, but a couple of fields and struct declarations were left behind; remove them.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/8df8877d677be5a2b43afd936d600e60105ea960.1664849941.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa4d90a53866..f5b687a787a3 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -34,9 +34,6 @@ struct io_file_table {
	unsigned int alloc_hint;
 };

-struct io_notif;
-struct io_notif_slot;
-
 struct io_hash_bucket {
	spinlock_t		lock;
	struct hlist_head	list;
@@ -242,8 +239,6 @@ struct io_ring_ctx {
	unsigned		nr_user_files;
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	**user_bufs;
-	struct io_notif_slot	*notif_slots;
-	unsigned		nr_notif_slots;

	struct io_submit_state	submit_state;
-- cgit v1.2.3

From 7be1c1a3c7b13fb259bb5159662a7b83622013b8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Tue, 11 Oct 2022 20:55:31 +0300
Subject: mm: more vma cache removal

Link: https://lkml.kernel.org/r/Y0WuE3Riv4iy5Jx8@localhost.localdomain
Fixes: 7964cf8caa4d ("mm: remove vmacache")
Signed-off-by: Alexey Dobriyan
Acked-by: Liam Howlett
Signed-off-by: Andrew Morton
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 88a043f7235e..e0bb85cf8bdd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -861,8 +861,6 @@ struct task_struct {
	struct mm_struct		*mm;
	struct mm_struct		*active_mm;

-	/* Per-thread vma caching: */
-
 #ifdef SPLIT_RSS_COUNTING
	struct task_rss_stat		rss_stat;
 #endif
-- cgit v1.2.3

From 652e04464d3944226052c827bdaaf5113b072870 Mon Sep 17 00:00:00 2001
From: Xin Hao
Date: Tue, 27 Sep 2022 08:19:45 +0800
Subject: mm/damon: move sz_damon_region to damon_sz_region

Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because it can be used in many places.

Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com
Signed-off-by: Xin Hao
Suggested-by: SeongJae Park
Reviewed-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ed5470f50bab..620ada094c3b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t)
	return list_first_entry(&t->regions_list, struct damon_region, list);
 }

+static inline unsigned long damon_sz_region(struct damon_region *r)
+{
+	return r->ar.end - r->ar.start;
+}
+
+
 #define damon_for_each_region(r, t) \
	list_for_each_entry(r, &t->regions_list, list)
-- cgit v1.2.3
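A sketch of the kind of caller this helper cleans up (hypothetical function; the real conversions live under mm/damon/):

	/* Hypothetical example: open-coded region sizes collapse into the
	 * new helper. */
	static unsigned long example_total_sz(struct damon_target *t)
	{
		struct damon_region *r;
		unsigned long sz = 0;

		damon_for_each_region(r, t)
			sz += damon_sz_region(r);	/* was: r->ar.end - r->ar.start */
		return sz;
	}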
From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:15 +1000
Subject: mm/memory.c: fix race when faulting a device private page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Fix several device private page reference counting issues", v2

This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed, or from accessing fields within the struct page which are no longer valid because the page has been freed.

During normal usage it is unlikely these will cause any problems. However, without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory.

This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those, so any help there would be appreciated. The changes mimic what is done for both Nouveau and hmm-tests, though, so I doubt they will cause problems.

This patch (of 8):

When the CPU tries to access a device private page, the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages().

Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure, pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not.

[mpe@ellerman.id.au: fix build]
Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au
Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com
Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Acked-by: Felix Kuehling
Cc: Jason Gunthorpe
Cc: John Hubbard
Cc: Ralph Campbell
Cc: Michael Ellerman
Cc: Lyude Paul
Cc: Alex Deucher
Cc: Alex Sierra
Cc: Ben Skeggs
Cc: Christian König
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 include/linux/migrate.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 704a04f5a074..52090d1f9230 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION

 extern void putback_movable_pages(struct list_head *l);
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode, int extra_count);
 int migrate_folio(struct address_space *mapping, struct folio *dst,
		struct folio *src, enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
@@ -197,6 +199,12 @@ struct migrate_vma {
	 */
	void			*pgmap_owner;
	unsigned long		flags;
+
+	/*
+	 * Set to vmf->page if this is being called to migrate a page as part of
+	 * a migrate_to_ram() callback.
+	 */
+	struct page		*fault_page;
 };

 int migrate_vma_setup(struct migrate_vma *args);
-- cgit v1.2.3
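For driver authors, the new field slots into a migrate_to_ram() callback roughly like this (hypothetical sketch; example_migrate_to_ram() and example_drv are invented names, and Nouveau and lib/test_hmm.c carry the real versions):

	/* Sketch of a migrate_to_ram() callback using .fault_page so that
	 * migrate_vma_setup() treats the faulting page's elevated refcount
	 * (the reference taken while holding the ptl) as expected. */
	static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
	{
		unsigned long src_pfn = 0, dst_pfn = 0;
		struct migrate_vma args = {
			.vma		= vmf->vma,
			.start		= vmf->address,
			.end		= vmf->address + PAGE_SIZE,
			.src		= &src_pfn,
			.dst		= &dst_pfn,
			.pgmap_owner	= example_drv,	/* hypothetical owner cookie */
			.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
			.fault_page	= vmf->page,	/* new in this patch */
		};

		if (migrate_vma_setup(&args))
			return VM_FAULT_SIGBUS;
		/* ... allocate a system page, copy the data, fill dst_pfn ... */
		migrate_vma_pages(&args);
		migrate_vma_finalize(&args);
		return 0;
	}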
From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:16 +1000
Subject: mm: free device private pages have zero refcount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount"), device private pages no longer have an extra reference count while the page is in use. However, before handing them back to the owning device driver, we add an extra reference count such that free pages have a reference count of one.

This makes it difficult to tell if a page is free or not, because both free and in-use pages will have a non-zero refcount. Instead we should return pages to the driver's page allocator with a zero reference count. Kernel code can then safely use kernel functions such as get_page_unless_zero().

Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Acked-by: Felix Kuehling
Cc: Jason Gunthorpe
Cc: Michael Ellerman
Cc: Alex Deucher
Cc: Christian König
Cc: Ben Skeggs
Cc: Lyude Paul
Cc: Ralph Campbell
Cc: Alex Sierra
Cc: John Hubbard
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 include/linux/memremap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c3b4cc84877b..7fcaf3180a5b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
 }

 #ifdef CONFIG_ZONE_DEVICE
+void zone_device_page_init(struct page *page);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
-- cgit v1.2.3
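Under the new convention, a driver-side allocator looks roughly like this (hypothetical sketch; struct example_dmem and the free-list layout are invented, while lib/test_hmm.c and the Nouveau DMEM allocator show the real pattern):

	/* Hypothetical sketch: free device private pages now sit at
	 * refcount 0, and zone_device_page_init() takes the first
	 * reference on allocation. */
	static struct page *example_alloc_device_page(struct example_dmem *dmem)
	{
		struct page *page;

		spin_lock(&dmem->lock);
		page = dmem->free_pages;		/* refcount == 0 while free */
		if (page)
			dmem->free_pages = page->zone_device_data;
		spin_unlock(&dmem->lock);
		if (!page)
			return NULL;

		zone_device_page_init(page);		/* refcount 0 -> 1 */
		return page;
	}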
From e778406b40dbb1342a1888cd751ca9d2982a12e2 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:19 +1000
Subject: mm/migrate_device.c: add migrate_device_range()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Device drivers can use the migrate_vma family of functions to migrate existing private anonymous mappings to device private pages. These pages are backed by memory on the device, with drivers being responsible for copying data to and from device memory.

Device private pages are freed via the pgmap->page_free() callback when they are unmapped and their refcount drops to zero. Alternatively they may be freed indirectly via migration back to CPU memory in response to a pgmap->migrate_to_ram() callback, called whenever the CPU accesses an address mapped to a device private page.

In other words, drivers cannot control the lifetime of data allocated on the devices and must wait until these pages are freed from userspace. This causes issues when memory needs to be reclaimed on the device, either because the device is going away due to a ->release() callback or because another user needs to use the memory.

Drivers could use the existing migrate_vma functions to migrate data off the device. However this would require them to track the mappings of each page, which is both complicated and not always possible. Instead drivers need to be able to migrate device pages directly so they can free up device memory.

To allow that, this patch introduces the migrate_device family of functions, which are functionally similar to migrate_vma but which skip the initial lookup based on mapping.

Link: https://lkml.kernel.org/r/868116aab70b0c8ee467d62498bb2cf0ef907295.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Cc: "Huang, Ying"
Cc: Zi Yan
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: David Hildenbrand
Cc: Ralph Campbell
Cc: John Hubbard
Cc: Alex Deucher
Cc: Alex Sierra
Cc: Ben Skeggs
Cc: Christian König
Cc: Dan Williams
Cc: Felix Kuehling
Cc: Jason Gunthorpe
Cc: Lyude Paul
Cc: Michael Ellerman
Signed-off-by: Andrew Morton
---
 include/linux/migrate.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 52090d1f9230..3ef77f52a4f0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -210,6 +210,13 @@ struct migrate_vma {
 int migrate_vma_setup(struct migrate_vma *args);
 void migrate_vma_pages(struct migrate_vma *migrate);
 void migrate_vma_finalize(struct migrate_vma *migrate);
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+			unsigned long npages);
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+			unsigned long npages);
+void migrate_device_finalize(unsigned long *src_pfns,
+			unsigned long *dst_pfns, unsigned long npages);
+
 #endif /* CONFIG_MIGRATION */

 #endif /* _LINUX_MIGRATE_H */
-- cgit v1.2.3
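A sketch of the intended use at device teardown (hypothetical driver code; it assumes the driver knows the first pfn and size of its device-private range, and the data-copy step is elided):

	/* Hypothetical sketch: evict everything in a device-private pfn
	 * range, e.g. from a driver's ->release() path, without any
	 * per-vma mapping lookup. */
	static int example_evict_all(unsigned long base_pfn, unsigned long npages)
	{
		unsigned long *src_pfns, *dst_pfns;
		int ret;

		src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
		dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
		if (!src_pfns || !dst_pfns) {
			ret = -ENOMEM;
			goto out;
		}

		/* Collect and isolate the device pages directly by pfn. */
		ret = migrate_device_range(src_pfns, base_pfn, npages);
		if (ret)
			goto out;

		/* ... allocate system pages into dst_pfns[] and copy ... */

		migrate_device_pages(src_pfns, dst_pfns, npages);
		migrate_device_finalize(src_pfns, dst_pfns, npages);
	out:
		kfree(dst_pfns);
		kfree(src_pfns);
		return ret;
	}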
From 57d849636a04a12713dd3a10a97cb9658ec7edf6 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 12 Oct 2022 11:06:35 +0800
Subject: clk: at91: fix the build with binutils 2.27

There is an issue when building with older versions of binutils (2.27.0):

  arch/arm/mach-at91/pm_suspend.S: Assembler messages:
  arch/arm/mach-at91/pm_suspend.S:1086: Error: garbage following instruction -- `ldr tmp1,=0x00020010UL'

Use the UL() macro to fix the issue in the assembly file.

Fixes: 4fd36e458392 ("ARM: at91: pm: add plla disable/enable support for sam9x60")
Signed-off-by: Kefeng Wang
Link: https://lore.kernel.org/r/20221012030635.13140-1-wangkefeng.wang@huawei.com
Signed-off-by: Stephen Boyd
---
 include/linux/clk/at91_pmc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
index 3484309b59bf..7af499bdbecb 100644
--- a/include/linux/clk/at91_pmc.h
+++ b/include/linux/clk/at91_pmc.h
@@ -12,6 +12,8 @@
 #ifndef AT91_PMC_H
 #define AT91_PMC_H

+#include
+
 #define AT91_PMC_V1		(1)			/* PMC version 1 */
 #define AT91_PMC_V2		(2)			/* PMC version 2 [SAM9X60] */

@@ -45,8 +47,8 @@
 #define AT91_PMC_PCSR		0x18			/* Peripheral Clock Status Register */

 #define AT91_PMC_PLL_ACR	0x18			/* PLL Analog Control Register [for SAM9X60] */
-#define AT91_PMC_PLL_ACR_DEFAULT_UPLL	0x12020010UL	/* Default PLL ACR value for UPLL */
-#define AT91_PMC_PLL_ACR_DEFAULT_PLLA	0x00020010UL	/* Default PLL ACR value for PLLA */
+#define AT91_PMC_PLL_ACR_DEFAULT_UPLL	UL(0x12020010)	/* Default PLL ACR value for UPLL */
+#define AT91_PMC_PLL_ACR_DEFAULT_PLLA	UL(0x00020010)	/* Default PLL ACR value for PLLA */
 #define AT91_PMC_PLL_ACR_UTMIVR		(1 << 12)	/* UPLL Voltage regulator Control */
 #define AT91_PMC_PLL_ACR_UTMIBG		(1 << 13)	/* UPLL Bandgap Control */
-- cgit v1.2.3
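Why UL() helps: it is built on _AC() from the const.h headers, which drops the C suffix when the header is pulled into assembly, so the same literal works in both contexts (abridged and lightly reflowed from include/uapi/linux/const.h and include/linux/const.h):

	/* In assembly the suffix disappears; in C the literal keeps its
	 * unsigned-long type. */
	#ifdef __ASSEMBLY__
	#define _AC(X, Y)	X
	#else
	#define __AC(X, Y)	(X##Y)
	#define _AC(X, Y)	__AC(X, Y)
	#endif
	#define _UL(x)		(_AC(x, UL))

	/* include/linux/const.h then wraps it: */
	#define UL(x)		(_UL(x))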
From e36ce448a08d43de69e7449eb225805a7a8addf8 Mon Sep 17 00:00:00 2001
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Date: Sat, 15 Oct 2022 13:34:29 +0900
Subject: mm/slab: use kmalloc_node() for off slab freelist_idx_t array allocation

After commit d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator"), SLAB passes large (> PAGE_SIZE * 2) requests to the buddy allocator, like SLUB does.

SLAB has been using kmalloc caches to allocate the freelist_idx_t array for off-slab caches. But after the commit, freelist_size can be bigger than KMALLOC_MAX_CACHE_SIZE.

Instead of keeping a pointer to the kmalloc cache, use kmalloc_node() and only check whether the kmalloc cache is off-slab during calculate_slab_order(). If freelist_size > KMALLOC_MAX_CACHE_SIZE, no looping condition can arise, as the freelist_idx_t array is then allocated directly from the buddy allocator.

Link: https://lore.kernel.org/all/20221014205818.GA1428667@roeck-us.net/
Reported-and-tested-by: Guenter Roeck
Fixes: d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator")
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka
---
 include/linux/slab_def.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e24c9aff6fed..f0ffad6a3365 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -33,7 +33,6 @@ struct kmem_cache {

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
-	struct kmem_cache *freelist_cache;
	unsigned int freelist_size;

	/* constructor func */
-- cgit v1.2.3

From 80493877d7d0ae0cbe62921d748682811c58026f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Sun, 16 Oct 2022 00:53:51 +0900
Subject: Revert "cpumask: fix checking valid cpu range".

This reverts commit 78e5a3399421 ("cpumask: fix checking valid cpu range").

syzbot is hitting the WARN_ON_ONCE(cpu >= nr_cpumask_bits) warning at cpu_max_bits_warn() [1], because commit 78e5a3399421 ("cpumask: fix checking valid cpu range") is broken. Obviously that patch hits WARN_ON_ONCE() when e.g. reading /proc/cpuinfo, because passing "cpu + 1" instead of "cpu" will trivially hit the cpu == nr_cpumask_bits condition.

Although syzbot found this problem in linux-next.git on 2022/09/27 [2], this problem was not fixed immediately. As a result, that patch was sent to linux.git before the patch author recognized this problem, and syzbot started failing to test changes in linux.git since 2022/10/10 [3].

Andrew Jones proposed a fix for the x86 and riscv architectures [4]. But [2] and [5] indicate that the affected locations are not limited to arch code. More delay in finding and fixing the affected locations means a less tested kernel (and one that is more difficult to bisect and fix) before release.

We should have inspected and fixed basically all cpumask users before applying that patch. We should not crash kernels in order to ask existing cpumask users to update their code, even if limited to the CONFIG_DEBUG_PER_CPU_MAPS=y case.

Link: https://syzkaller.appspot.com/bug?extid=d0fd2bf0dd6da72496dd [1]
Link: https://syzkaller.appspot.com/bug?extid=21da700f3c9f0bc40150 [2]
Link: https://syzkaller.appspot.com/bug?extid=51a652e2d24d53e75734 [3]
Link: https://lkml.kernel.org/r/20221014155845.1986223-1-ajones@ventanamicro.com [4]
Link: https://syzkaller.appspot.com/bug?extid=4d46c43d81c3bd155060 [5]
Reported-by: Andrew Jones
Reported-by: syzbot+d0fd2bf0dd6da72496dd@syzkaller.appspotmail.com
Signed-off-by: Tetsuo Handa
Cc: Yury Norov
Cc: Borislav Petkov
Signed-off-by: Linus Torvalds
---
 include/linux/cpumask.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2f065ad97541..c2aa0aa26b45 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -174,8 +174,9 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)

 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
 }

@@ -188,8 +189,9 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
  */
 static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 {
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }

@@ -229,8 +231,9 @@ static inline unsigned int cpumask_next_and(int n,
					    const struct cpumask *src1p,
					    const struct cpumask *src2p)
 {
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
		nr_cpumask_bits, n + 1);
 }

@@ -260,8 +263,8 @@ static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
					     int start, bool wrap)
 {
	cpumask_check(start);
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	if (n != -1)
+		cpumask_check(n);

	/*
	 * Return the first available CPU when wrapping, or when starting before cpu0,
-- cgit v1.2.3
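To see why the reverted check fired, consider a typical iteration (illustrative snippet; do_something() is a placeholder):

	/* With the reverted patch, cpumask_next() ran cpumask_check(n + 1).
	 * On the final pass n is the last valid cpu, so n + 1 equals
	 * nr_cpumask_bits and cpu_max_bits_warn() triggered, even though
	 * the call is perfectly legal and merely terminates the loop. */
	unsigned int cpu;

	for (cpu = cpumask_first(cpu_online_mask);
	     cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, cpu_online_mask))
		do_something(cpu);	/* hypothetical per-cpu work */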