From de399236e240743ad2dd10d719c37b97ddf31996 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Wed, 18 May 2022 19:17:30 +0200 Subject: ucounts: Split rlimit and ucount values and max values Since the semantics of maximum rlimit values are different from those of ucounts, it is better not to mix ucount and rlimit values. This will prevent the error of using inc_ucount/dec_ucount for rlimit parameters. This patch also renames the functions to emphasize the lack of connection between rlimit and ucount. v3: - Fix BUG:KASAN:use-after-free_in_dec_ucount. v2: - Fix the array-index-out-of-bounds that was found by the lkp project. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Link: https://lkml.kernel.org/r/20220518171730.l65lmnnjtnxnftpq@example.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 33a4240e6a6f..45f09bec02c4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -54,15 +54,17 @@ enum ucount_type { UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif + UCOUNT_COUNTS, +}; + +enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, - UCOUNT_COUNTS, + UCOUNT_RLIMIT_COUNTS, }; -#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC - struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -99,6 +101,7 @@ struct user_namespace { #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; + long rlimit_max[UCOUNT_RLIMIT_COUNTS]; } __randomize_layout; struct ucounts { @@ -107,6 +110,7 @@ struct ucounts { kuid_t uid; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; + atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; @@ -120,21 +124,26 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); -static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type) +static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { - return atomic_long_read(&ucounts->ucount[type]); + return atomic_long_read(&ucounts->rlimit[type]); } -long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); -bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type); -void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type); -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); +long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type); +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); +bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); + +static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) +{ + return READ_ONCE(ns->rlimit_max[type]); +} -static inline void set_rlimit_ucount_max(struct user_namespace *ns, - enum ucount_type type, unsigned long max) +static inline void set_userns_rlimit_max(struct user_namespace *ns, + enum rlimit_type type, unsigned long max) { - ns->ucount_max[type] = max <= LONG_MAX ? max : LONG_MAX; + ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS -- cgit v1.2.3
From d80f7d7b2c75c5954d335dffbccca62a5002c3e0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 21 Jun 2022 14:38:52 -0500 Subject: signal: Guarantee that SIGNAL_GROUP_EXIT is set on process exit Track how many threads have not started exiting and when the last thread starts exiting set SIGNAL_GROUP_EXIT. This guarantees that SIGNAL_GROUP_EXIT will get set when a process exits. In practice this achieves nothing as glibc's implementation of _exit calls sys_group_exit then sys_exit, while glibc's implementation of pthread_exit calls exit (which cleans up and calls _exit) if it is the last thread, and sys_exit if it is not the last thread. This means the only way the kernel might observe a process that does not call exit_group is if the language runtime does not use glibc. With more cleanups I hope to move the decrement of quick_threads earlier. Link: https://lkml.kernel.org/r/87bkukd4tc.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index cafbe03eed01..20099268fa25 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -94,6 +94,7 @@ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; + int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ -- cgit v1.2.3
From fa96b24204af42274ec13dfb2f2e6990d7510e55 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 5 Aug 2022 14:48:14 -0700 Subject: btf: Add a new kfunc flag which allows to mark a function to be sleepable This allows declaring a kfunc as sleepable and prevents its use in a non-sleepable program. Signed-off-by: Benjamin Tissoires Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220805214821.1058337-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cdb376d53238..976cbdd2981f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,6 +49,7 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ struct btf; struct btf_member; -- cgit v1.2.3
From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time-sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping.
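As a rough sketch of the intended usage (illustrative only; the program type, section name and printout are assumptions, not part of this patch), a tc BPF program could stamp events with TAI time:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    /* Stamp each packet seen on egress with CLOCK_TAI so that
     * PTP-synchronized hosts can correlate the event network-wide. */
    SEC("tc")
    int stamp_egress(struct __sk_buff *skb)
    {
            __u64 tai_ns = bpf_ktime_get_tai_ns();

            bpf_printk("egress at %llu ns TAI", tai_ns);
            return 0; /* TC_ACT_OK */
    }

    char LICENSE[] SEC("license") = "GPL";

On the kernel side the helper is a thin wrapper around the fast accessor introduced by the commit referenced above.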
Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -- cgit v1.2.3 From 4dd48c6f1f83290d4bc61b43e61d86f8bc6c310e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:03 +0200 Subject: bpf: add destructive kfunc flag Add KF_DESTRUCTIVE flag for destructive functions. Functions with this flag set will require CAP_SYS_BOOT capabilities. Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-2-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 976cbdd2981f..ad93c2d9cc1c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,7 +49,8 @@ * for this case. 
*/ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ -#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ struct btf; struct btf_member; -- cgit v1.2.3
From fb12ad5e3179c9667dfcbafe2c5da91f9e700e53 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 11 Aug 2022 16:11:36 -0700 Subject: Input: bma150 - fix a typo in some comments Remove some extra '0' s/BMA0150_RANGE_xxx/BMA150_RANGE_xxx/ s/BMA0150_BW_xxx/BMA150_BW_xxx Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/a331a6244a1dfbf34dc85f1be6995fa91500c801.1659802757.git.christophe.jaillet@wanadoo.fr Signed-off-by: Dmitry Torokhov --- include/linux/bma150.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bma150.h b/include/linux/bma150.h index 31c9e323a391..4d4a62d49341 100644 --- a/include/linux/bma150.h +++ b/include/linux/bma150.h @@ -33,8 +33,8 @@ struct bma150_cfg { unsigned char lg_hyst; /* Low-G hysterisis */ unsigned char lg_dur; /* Low-G duration */ unsigned char lg_thres; /* Low-G threshold */ - unsigned char range; /* one of BMA0150_RANGE_xxx */ - unsigned char bandwidth; /* one of BMA0150_BW_xxx */ + unsigned char range; /* one of BMA150_RANGE_xxx */ + unsigned char bandwidth; /* one of BMA150_BW_xxx */ }; struct bma150_platform_data { -- cgit v1.2.3
From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:22:25 -0400 Subject: NFS: Fix another fsync() issue after a server reboot Currently, when the writeback code detects a server reboot, it redirties any pages that were not committed to disk, and it sets the flag NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor that dirtied the file. While this allows the file descriptor in question to redrive its own writes, it violates the fsync() requirement that we should be synchronising all writes to disk. While the problem is infrequent, we do see corner cases where an untimely server reboot causes the fsync() call to abandon its attempt to sync data to disk, causing data corruption issues due to missed error conditions or similar. In order to tighten up the client's ability to deal with this situation without introducing livelocks, add a counter that records the number of times pages are redirtied due to a server reboot-like condition, and use that in fsync() to redrive the sync to disk.
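A sketch of the redrive idea described above (a simplified illustration, not the actual fs/nfs change; example_write_and_commit() is a made-up helper):

    #include <linux/nfs_fs.h>

    static int example_fsync(struct inode *inode)
    {
            struct nfs_inode *nfsi = NFS_I(inode);
            long before;
            int ret;

            do {
                    /* Snapshot the counter before pushing writes and commits. */
                    before = atomic_long_read(&nfsi->redirtied_pages);
                    ret = example_write_and_commit(inode); /* hypothetical */
                    /* Go around again if a reboot-like event redirtied pages
                     * behind our back in the meantime. */
            } while (!ret && atomic_long_read(&nfsi->redirtied_pages) != before);

            return ret;
    }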
Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b32ed68e7dc4..f08e581f0161 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -182,6 +182,7 @@ struct nfs_inode { /* Regular file */ struct { atomic_long_t nrequests; + atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; -- cgit v1.2.3 From 5f6277a0c15e1ea54b6fd3d78c9fff7bfe42556c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:51:45 -0400 Subject: NFS: Cleanup to remove unused flag NFS_CONTEXT_RESEND_WRITES Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f08e581f0161..7931fa472561 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -83,7 +83,6 @@ struct nfs_open_context { fmode_t mode; unsigned long flags; -#define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) -- cgit v1.2.3 From 93ce557679e1cf7742ad327d40a1499e7d8535b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Aug 2022 23:33:59 +0300 Subject: regmap: mmio: Introduce IO accessors that can talk to IO port Some users may use regmap MMIO for IO ports, and this can be done by assigning ioreadXX()/iowriteXX() and their Big Endian counterparts to the regmap context. Add IO port support with a corresponding flag added. While doing that, make sure that user won't select relaxed MMIO access along with IO port because the latter have no relaxed variants. Signed-off-by: Andy Shevchenko Acked-by: William Breathitt Gray Link: https://lore.kernel.org/r/20220808203401.35153-4-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 7cf2157134ac..8cccc247cd37 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -311,6 +311,8 @@ typedef void (*regmap_unlock)(void *); * This field is a duplicate of a similar file in * 'struct regmap_bus' and serves exact same purpose. * Use it only for "no-bus" cases. + * @io_port: Support IO port accessors. Makes sense only when MMIO vs. IO port + * access can be distinguished. * @max_register: Optional, specifies the maximum valid register address. * @wr_table: Optional, points to a struct regmap_access_table specifying * valid ranges for write access. @@ -399,6 +401,7 @@ struct regmap_config { size_t max_raw_write; bool fast_io; + bool io_port; unsigned int max_register; const struct regmap_access_table *wr_table; -- cgit v1.2.3 From 0a90ed8d0cfa29735a221eba14d9cb6c735d35b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:31 +0300 Subject: platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask On Intel hardware the SLP_TYPx bitfield occupies bits 10-12 as per ACPI specification (see Table 4.13 "PM1 Control Registers Fixed Hardware Feature Control Bits" for the details). Fix the mask and other related definitions accordingly. 
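Note that the old constant 0xFFFFECFF was an inverted clear-style mask whose complement, 0x1300, covers bits 8, 9 and 12 rather than the architectural 12:10 field, which is what the GENMASK() form fixes. A sketch of what the corrected definitions imply for users of the register (illustrative helper, not from the patch):

    #include <linux/platform_data/x86/pmc_atom.h>

    /* Request S5: program the SLP_TYPx field (bits 12:10) and set the
     * sleep enable bit (bit 13). */
    static u32 example_request_s5(u32 pm1_cnt)
    {
            pm1_cnt &= ~SLEEP_TYPE_MASK;    /* clear bits 12:10, GENMASK(12, 10) */
            pm1_cnt |= SLEEP_TYPE_S5;       /* 0x1C00, i.e. type 0x7 << 10 */
            pm1_cnt |= SLEEP_ENABLE;        /* BIT(13) */
            return pm1_cnt;
    }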
Fixes: 93e5eadd1f6e ("x86/platform: New Intel Atom SOC power management controller driver") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 3edfb6d4e67a..dd81f510e4cf 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -7,6 +7,8 @@ #ifndef PMC_ATOM_H #define PMC_ATOM_H +#include + /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C /* CherryTrail Power Control Unit PCI Device ID */ @@ -139,9 +141,9 @@ #define ACPI_MMIO_REG_LEN 0x100 #define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_MASK GENMASK(12, 10) #define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 +#define SLEEP_ENABLE BIT(13) extern int pmc_atom_read(int offset, u32 *value); -- cgit v1.2.3 From 7f203bc89eb66d6afde7eae91347fc0352090cc3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Jul 2022 13:10:16 -1000 Subject: cgroup: Replace cgroup->ancestor_ids[] with ->ancestors[] Every cgroup knows all its ancestors through its ->ancestor_ids[]. There's no advantage to remembering the IDs instead of the pointers directly and this makes the array useless for finding an actual ancestor cgroup forcing cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up from cgroup_ancestor(). While at it, improve comments around cgroup_root->cgrp_ancestor_storage. This patch shouldn't cause user-visible behavior differences. v2: Update cgroup_ancestor() to use ->ancestors[]. v3: cgroup_root->cgrp_ancestor_storage's type is updated to match cgroup->ancestors[]. Better comments. Signed-off-by: Tejun Heo Acked-by: Namhyung Kim --- include/linux/cgroup-defs.h | 16 ++++++++++------ include/linux/cgroup.h | 8 +++----- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..1283993d7ea8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -384,7 +384,7 @@ struct cgroup { /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with - * ancestor_ids[] can determine whether a given cgroup is a + * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; @@ -504,8 +504,8 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; - /* ids of the ancestors at each level including self */ - u64 ancestor_ids[]; + /* All ancestors including self */ + struct cgroup *ancestors[]; }; /* @@ -522,11 +522,15 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* The root cgroup. Root is destroyed on its release. */ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the + * following field. cgrp_ancestor_storage must immediately follow. 
+ */ struct cgroup cgrp; - /* for cgrp->ancestor_ids[0] */ - u64 cgrp_ancestor_id_storage; + /* must follow cgrp for cgrp->ancestors[0], see above */ + struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..4d143729b246 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); + return cgrp->ancestors[ancestor->level] == ancestor; } /** @@ -591,11 +591,9 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { - if (cgrp->level < ancestor_level) + if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; - while (cgrp && cgrp->level > ancestor_level) - cgrp = cgroup_parent(cgrp); - return cgrp; + return cgrp->ancestors[ancestor_level]; } /** -- cgit v1.2.3
From 1e64b9c5f9a01f1a752438724bc83180c451e1c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Fri, 15 Jul 2022 14:28:53 +0200 Subject: iio: inkern: move to fwnode properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves the IIO in kernel interface to use fwnode properties and thus be firmware agnostic. Note that the interface is not yet fully firmware agnostic: at this point we have both OF and fwnode interfaces so that we don't break any user. On top of this we also want a per-driver conversion, and that is the main reason we have both of_xlate() and fwnode_xlate() support. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220715122903.332535-6-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 36 ++++++++++++++++++++++------------------ include/linux/iio/iio.h | 5 +++++ 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 5fa5957586cf..2adb1306da3e 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -7,6 +7,7 @@ #ifndef _IIO_INKERN_CONSUMER_H_ #define _IIO_INKERN_CONSUMER_H_ +#include #include #include @@ -14,6 +15,7 @@ struct iio_dev; struct iio_chan_spec; struct device; struct device_node; +struct fwnode_handle; /** * struct iio_channel - everything needed for a consumer to use a channel @@ -99,26 +101,20 @@ void iio_channel_release_all(struct iio_channel *chan); struct iio_channel *devm_iio_channel_get_all(struct device *dev); /** - * of_iio_channel_get_by_name() - get description of all that is needed to access channel. - * @np: Pointer to consumer device tree node + * fwnode_iio_channel_get_by_name() - get description of all that is needed to access channel. + * @fwnode: Pointer to consumer Firmware node * @consumer_channel: Unique name to identify the channel on the consumer * side. This typically describes the channels use within * the consumer. E.g. 'battery_voltage' */ -#ifdef CONFIG_OF -struct iio_channel *of_iio_channel_get_by_name(struct device_node *np, const char *name); -#else -static inline struct iio_channel * -of_iio_channel_get_by_name(struct device_node *np, const char *name) -{ - return NULL; -} -#endif +struct iio_channel *fwnode_iio_channel_get_by_name(struct fwnode_handle *fwnode, + const char *name); /** - * devm_of_iio_channel_get_by_name() - Resource managed version of of_iio_channel_get_by_name(). + * devm_fwnode_iio_channel_get_by_name() - Resource managed version of + * fwnode_iio_channel_get_by_name(). * @dev: Pointer to consumer device. - * @np: Pointer to consumer device tree node + * @fwnode: Pointer to consumer Firmware node * @consumer_channel: Unique name to identify the channel on the consumer * side. This typically describes the channels use within * the consumer. E.g. 'battery_voltage' @@ -129,9 +125,17 @@ of_iio_channel_get_by_name(struct device_node *np, const char *name) * The allocated iio channel is automatically released when the device is * unbound. */ -struct iio_channel *devm_of_iio_channel_get_by_name(struct device *dev, - struct device_node *np, - const char *consumer_channel); +struct iio_channel *devm_fwnode_iio_channel_get_by_name(struct device *dev, + struct fwnode_handle *fwnode, + const char *consumer_channel); + +static inline struct iio_channel +*devm_of_iio_channel_get_by_name(struct device *dev, struct device_node *np, + const char *consumer_channel) +{ + return devm_fwnode_iio_channel_get_by_name(dev, of_fwnode_handle(np), + consumer_channel); +} struct iio_cb_buffer; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 5dfbfc991c69..6002f8a0baf1 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -18,6 +18,7 @@ */ struct of_phandle_args; +struct fwnode_reference_args; enum iio_shared_by { IIO_SEPARATE, @@ -429,6 +430,8 @@ struct iio_trigger; /* forward declaration */ * provide a custom of_xlate function that reads the * *args* and returns the appropriate index in registered * IIO channels array. + * @fwnode_xlate: fwnode based function pointer to obtain channel specifier index. * Functionally the same as @of_xlate. * @hwfifo_set_watermark: function pointer to set the current hardware * fifo watermark level; see hwfifo_* entries in * Documentation/ABI/testing/sysfs-bus-iio for details on @@ -510,6 +513,8 @@ struct iio_info { unsigned *readval); int (*of_xlate)(struct iio_dev *indio_dev, const struct of_phandle_args *iiospec); + int (*fwnode_xlate)(struct iio_dev *indio_dev, + const struct fwnode_reference_args *iiospec); int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned val); int (*hwfifo_flush_to_buffer)(struct iio_dev *indio_dev, unsigned count); -- cgit v1.2.3
From b22bc4d6072e74b768c4c19e2ff3585ba5927904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Fri, 15 Jul 2022 14:29:02 +0200 Subject: iio: inkern: remove OF dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since all users of the OF dependent API are now converted to use the firmware-agnostic alternative, we can drop OF dependencies from the IIO in kernel interface.
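A sketch of a consumer on the fwnode-based API (the driver skeleton and channel name are invented for illustration; the signature comes from the header above):

    #include <linux/iio/consumer.h>
    #include <linux/platform_device.h>
    #include <linux/property.h>

    static int example_probe(struct platform_device *pdev)
    {
            struct iio_channel *chan;

            chan = devm_fwnode_iio_channel_get_by_name(&pdev->dev,
                                                       dev_fwnode(&pdev->dev),
                                                       "battery_voltage");
            if (IS_ERR(chan))
                    return PTR_ERR(chan);

            /* ... use iio_read_channel_raw(chan, &val) etc. as before ... */
            return 0;
    }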
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220715122903.332535-15-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 10 ---------- include/linux/iio/iio.h | 3 --- 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 2adb1306da3e..6802596b017c 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -7,14 +7,12 @@ #ifndef _IIO_INKERN_CONSUMER_H_ #define _IIO_INKERN_CONSUMER_H_ -#include #include #include struct iio_dev; struct iio_chan_spec; struct device; -struct device_node; struct fwnode_handle; /** @@ -129,14 +127,6 @@ struct iio_channel *devm_fwnode_iio_channel_get_by_name(struct device *dev, struct fwnode_handle *fwnode, const char *consumer_channel); -static inline struct iio_channel -*devm_of_iio_channel_get_by_name(struct device *dev, struct device_node *np, - const char *consumer_channel) -{ - return devm_fwnode_iio_channel_get_by_name(dev, of_fwnode_handle(np), - consumer_channel); -} - struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 6002f8a0baf1..f0ec8a5e5a7a 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -17,7 +17,6 @@ * Currently assumes nano seconds. */ -struct of_phandle_args; struct fwnode_reference_args; enum iio_shared_by { @@ -511,8 +510,6 @@ struct iio_info { int (*debugfs_reg_access)(struct iio_dev *indio_dev, unsigned reg, unsigned writeval, unsigned *readval); - int (*of_xlate)(struct iio_dev *indio_dev, - const struct of_phandle_args *iiospec); int (*fwnode_xlate)(struct iio_dev *indio_dev, const struct fwnode_reference_args *iiospec); int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned val); int (*hwfifo_flush_to_buffer)(struct iio_dev *indio_dev, unsigned count); -- cgit v1.2.3
From 5c64990b99aa91622cf044a9a2f7a78de8f770a2 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 26 Jun 2022 13:29:32 +0100 Subject: iio: core: Introduce _zeropoint for differential channels Address an ABI gap for devices where the offset of both lines in a differential pair may be controlled so as to allow a wider range of inputs, but without any direct effect on the differential measurement. _offset cannot be used because, to remain in line with existing usage, userspace would be expected to apply it as (_raw + _offset) * _scale, whereas _zeropoint is not applied. I.e. if we were computing the differential in software it would be: ((positive_raw + _zeropoint) - (negative_raw + _zeropoint) + _offset) * _scale = ((positive_raw - negative_raw) + _offset) * _scale = (differential_raw + _offset) * _scale Similarly calibbias is expected to tweak the measurement seen, not to adjust the two lines of the differential pair. Needed for in_capacitanceX-capacitanceY_zeropoint for the AD7746 CDC driver.
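A quick worked example with invented numbers: take positive_raw = 800, negative_raw = 300, _zeropoint = 100, _offset = 10 and _scale = 0.5. The hardware sees (800 + 100) - (300 + 100) = 500, so userspace computes (500 + 10) * 0.5 = 255 - exactly the value it would get with _zeropoint = 0, which is why _zeropoint, unlike _offset, does not appear in the userspace scaling formula.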
Signed-off-by: Jonathan Cameron Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220626122938.582107-12-jic23@kernel.org --- include/linux/iio/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index a7aa91f3a8dc..27143b03909d 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -63,6 +63,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_OVERSAMPLING_RATIO, IIO_CHAN_INFO_THERMOCOUPLE_TYPE, IIO_CHAN_INFO_CALIBAMBIENT, + IIO_CHAN_INFO_ZEROPOINT, }; #endif /* _IIO_TYPES_H_ */ -- cgit v1.2.3 From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:09 +0800 Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create() psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- include/linux/psi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index 89784763d19e..dd74411ac21d 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -27,7 +27,7 @@ void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res); + char *buf, enum psi_res res); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, -- cgit v1.2.3 From d7ae5818c3fa3007dee13f9d99832e7f26b8bc44 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:10 +0800 Subject: sched/psi: Remove redundant cgroup_psi() when !CONFIG_CGROUPS cgroup_psi() is only called under CONFIG_CGROUPS. We don't need cgroup_psi() when !CONFIG_CGROUPS, so we can remove it in this case. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..ac5d0515680e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -734,11 +734,6 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return NULL; -} - static inline bool cgroup_psi_enabled(void) { return false; -- cgit v1.2.3 From 6a8f359c3132e4f51bdb263ad74ec632c65e55fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 15 Aug 2022 10:02:29 +0200 Subject: gpio: pca953x: Make platform teardown callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All platforms that provide a teardown callback return 0. New users are supposed to not make use of platform support, so there is no functionality lost. This patch is a preparation for making i2c remove callbacks return void. 
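For illustration, board code using the changed callback would now look roughly like this (the names are made up; with a void return, errors in teardown can only be logged, not propagated):

    #include <linux/i2c.h>
    #include <linux/platform_data/pca953x.h>

    static void example_teardown(struct i2c_client *client, unsigned gpio,
                                 unsigned ngpio, void *context)
    {
            /* Undo whatever example_setup() did; no return value to check. */
    }

    static struct pca953x_platform_data example_pdata = {
            .teardown = example_teardown,
    };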
Reviewed-by: Andy Shevchenko Signed-off-by: Uwe Kleine-König Acked-by: Bartosz Golaszewski Signed-off-by: Wolfram Sang --- include/linux/platform_data/pca953x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/pca953x.h b/include/linux/platform_data/pca953x.h index 4eb53e023997..96c1a14ab365 100644 --- a/include/linux/platform_data/pca953x.h +++ b/include/linux/platform_data/pca953x.h @@ -22,7 +22,7 @@ struct pca953x_platform_data { int (*setup)(struct i2c_client *client, unsigned gpio, unsigned ngpio, void *context); - int (*teardown)(struct i2c_client *client, + void (*teardown)(struct i2c_client *client, unsigned gpio, unsigned ngpio, void *context); const char *const *names; -- cgit v1.2.3
From ed5c2f5fd10dda07263f79f338a512c0f49f76f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 15 Aug 2022 10:02:30 +0200 Subject: i2c: Make remove callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value returned by an i2c driver's remove function is mostly ignored. (If the value is non-zero, only an error message is printed and the error is then ignored.) So change the prototype of the remove function to return no value. This way driver authors are not tempted to assume that passing an error to the upper layer is a good idea. All drivers are adapted accordingly. There is no intended change of behaviour, all callbacks were prepared to return 0 before. Reviewed-by: Peter Senna Tschudin Reviewed-by: Jeremy Kerr Reviewed-by: Benjamin Mugnier Reviewed-by: Javier Martinez Canillas Reviewed-by: Crt Mori Reviewed-by: Heikki Krogerus Acked-by: Greg Kroah-Hartman Acked-by: Marek Behún # for leds-turris-omnia Acked-by: Andy Shevchenko Reviewed-by: Petr Machata # for mlxsw Reviewed-by: Maximilian Luz # for surface3_power Acked-by: Srinivas Pandruvada # for bmc150-accel-i2c + kxcjk-1013 Reviewed-by: Hans Verkuil # for media/* + staging/media/* Acked-by: Miguel Ojeda # for auxdisplay/ht16k33 + auxdisplay/lcd2s Reviewed-by: Luca Ceresoli # for versaclock5 Reviewed-by: Ajay Gupta # for ucsi_ccg Acked-by: Jonathan Cameron # for iio Acked-by: Peter Rosin # for i2c-mux-*, max9860 Acked-by: Adrien Grassein # for lontium-lt8912b Reviewed-by: Jean Delvare # for hwmon, i2c-core and i2c/muxes Acked-by: Corey Minyard # for IPMI Reviewed-by: Vladimir Oltean Acked-by: Dmitry Torokhov Acked-by: Sebastian Reichel # for drivers/power Acked-by: Krzysztof Hałasa Signed-off-by: Uwe Kleine-König Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 8eab5017bff3..f7c49bbdb8a1 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -273,7 +273,7 @@ struct i2c_driver { /* Standard driver model interfaces */ int (*probe)(struct i2c_client *client, const struct i2c_device_id *id); - int (*remove)(struct i2c_client *client); + void (*remove)(struct i2c_client *client); /* New driver model interface to aid the seamless removal of the * current probe()'s, more commonly unused than used second parameter. 
-- cgit v1.2.3 From 680f8666baf6b4a6cc368dfff6614010ec23c51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 18 Jul 2022 14:14:08 +0200 Subject: interconnect: Make icc_provider_del() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users ignore the return value of icc_provider_del(). Consequently make it not return an error code. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220718121409.171773-8-u.kleine-koenig@pengutronix.de Signed-off-by: Georgi Djakov --- include/linux/interconnect-provider.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h index 6bd01f7159c6..cd5c5a27557f 100644 --- a/include/linux/interconnect-provider.h +++ b/include/linux/interconnect-provider.h @@ -123,7 +123,7 @@ void icc_node_add(struct icc_node *node, struct icc_provider *provider); void icc_node_del(struct icc_node *node); int icc_nodes_remove(struct icc_provider *provider); int icc_provider_add(struct icc_provider *provider); -int icc_provider_del(struct icc_provider *provider); +void icc_provider_del(struct icc_provider *provider); struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec); void icc_sync_state(struct device *dev); @@ -172,9 +172,8 @@ static inline int icc_provider_add(struct icc_provider *provider) return -ENOTSUPP; } -static inline int icc_provider_del(struct icc_provider *provider) +static inline void icc_provider_del(struct icc_provider *provider) { - return -ENOTSUPP; } static inline struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec) -- cgit v1.2.3
From 7cd4c5c2101cb092db00f61f69d24380cf7a0ee8 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:25 -0500 Subject: security, lsm: Introduce security_create_user_ns() User namespaces are an effective tool to allow programs to run with privilege without requiring the program to run as root. User namespaces may also be used as a sandboxing technique. However, attackers sometimes leverage user namespaces as an initial attack vector to perform some exploit. [1,2,3] While it is not the unprivileged user namespace functionality itself that causes the kernel to be exploitable, users/administrators might want to more granularly limit or at least monitor how various processes use this functionality, while vulnerable kernel subsystems are being patched. Preventing user namespace creation already comes in a few forms, in order of granularity: 1. /proc/sys/user/max_user_namespaces sysctl 2. Distro specific patch(es) 3. CONFIG_USER_NS To block a task based on its attributes, the LSM hook cred_prepare is a decent candidate for use because it provides more granular control, and it is called before create_user_ns(): cred = prepare_creds() security_prepare_creds() call_int_hook(cred_prepare, ... if (cred) create_user_ns(cred) Since security_prepare_creds() is meant for LSMs to copy and prepare credentials, access control is an unintended use of the hook. [4] Further, security_prepare_creds() will always return ENOMEM if the hook returns any non-zero error code. This hook also does not handle the clone3 case which requires us to access a user space pointer to know if we're in the CLONE_NEW_USER call path, which may be subject to a TOCTTOU attack.
Lastly, cred_prepare is called in many call paths, and a targeted hook further limits the frequency of calls which is a beneficial outcome. Therefore introduce a new function security_create_user_ns() with an accompanying userns_create LSM hook. With the new userns_create hook, users will have more control over the observability and access control over user namespace creation. Users should expect that normal operation of user namespaces will behave as usual, and only be impacted when controls are implemented by users or administrators. This hook takes the prepared creds for LSM authors to write policy against. On success, the new namespace is applied to credentials, otherwise an error is returned. Links: 1. https://nvd.nist.gov/vuln/detail/CVE-2022-0492 2. https://nvd.nist.gov/vuln/detail/CVE-2022-25636 3. https://nvd.nist.gov/vuln/detail/CVE-2022-34918 4. https://lore.kernel.org/all/1c4b1c0d-12f6-6e9e-a6a3-cdce7418110c@schaufler-ca.com/ Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: KP Singh Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 4 ++++ include/linux/security.h | 6 ++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..aa7272e83626 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -224,6 +224,7 @@ LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) +LSM_HOOK(int, 0, userns_create, const struct cred *cred) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, u32 *secid) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 84a0d7e02176..2e11a2a22ed1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -806,6 +806,10 @@ * security attributes, e.g. for /proc/pid inodes. * @p contains the task_struct for the task. * @inode contains the inode structure for the inode. + * @userns_create: + * Check permission prior to creating a new user namespace. + * @cred points to prepared creds. + * Return 0 if successful, otherwise < 0 error code. * * Security hooks for Netlink messaging. 
* diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..767802fe9bfa 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -437,6 +437,7 @@ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); +int security_create_user_ns(const struct cred *cred); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); int security_msg_msg_alloc(struct msg_msg *msg); @@ -1194,6 +1195,11 @@ static inline int security_task_prctl(int option, unsigned long arg2, static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) { } +static inline int security_create_user_ns(const struct cred *cred) +{ + return 0; +} + static inline int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { -- cgit v1.2.3 From 0630f64d25a0f0a8c6a9ce9fde8750b3b561e6f5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 15 Aug 2022 12:07:47 -0700 Subject: net: phy: broadcom: Implement suspend/resume for AC131 and BCM5241 Implement the suspend/resume procedure for the Broadcom AC131 and BCM5241 type of PHYs (10/100 only) by entering the standard power down followed by the proprietary standby mode in the auxiliary mode 4 shadow register. On resume, the PHY software reset is enough to make it come out of standby mode so we can utilize brcm_fet_config_init() as the resume hook. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6ff567ece34a..9e77165f3ef6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -293,6 +293,7 @@ #define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ #define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ +#define MII_BRCM_FET_SHDW_AM4_STANDBY 0x0008 /* Standby enable */ #define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 #define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 -- cgit v1.2.3 From c20cc099b30abd50f563e422aa72edcd7f92da55 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 16 Aug 2022 22:48:31 +0200 Subject: regmap: Support accelerated noinc operations Several architectures have accelerated operations for MMIO operations writing to a single register, such as writesb, writesw, writesl, writesq, readsb, readsw, readsl and readsq but regmap currently cannot use them because we have no hooks for providing an accelerated noinc back-end for MMIO. Solve this by providing reg_[read/write]_noinc callbacks for the bus abstraction, so that the regmap-mmio bus can use this. Currently I do not see a need to support this for custom regmaps so it is only added to the bus. Callbacks are passed a void * with the array of values and a count which is the number of items of the byte chunk size for the specific register width. 
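To make the new hooks concrete, here is a sketch of what an MMIO-style bus back-end might plug into them for 32-bit registers (the context structure is invented, and the real regmap-mmio code also handles other register widths and endianness):

    #include <linux/io.h>

    struct example_mmio_ctx {
            void __iomem *regs;
    };

    static int example_reg_noinc_read(void *context, unsigned int reg,
                                      void *val, size_t val_count)
    {
            struct example_mmio_ctx *ctx = context;

            /* Accelerated FIFO read: val_count 32-bit items from one register. */
            readsl(ctx->regs + reg, val, val_count);
            return 0;
    }

    static int example_reg_noinc_write(void *context, unsigned int reg,
                                       const void *val, size_t val_count)
    {
            struct example_mmio_ctx *ctx = context;

            /* Accelerated FIFO write to the same register address. */
            writesl(ctx->regs + reg, val, val_count);
            return 0;
    }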
Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220816204832.265837-1-linus.walleij@linaro.org Signed-off-by: Mark Brown --- include/linux/regmap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 8cccc247cd37..ca3434dca3a0 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -492,8 +492,12 @@ typedef int (*regmap_hw_read)(void *context, void *val_buf, size_t val_size); typedef int (*regmap_hw_reg_read)(void *context, unsigned int reg, unsigned int *val); +typedef int (*regmap_hw_reg_noinc_read)(void *context, unsigned int reg, + void *val, size_t val_count); typedef int (*regmap_hw_reg_write)(void *context, unsigned int reg, unsigned int val); +typedef int (*regmap_hw_reg_noinc_write)(void *context, unsigned int reg, + const void *val, size_t val_count); typedef int (*regmap_hw_reg_update_bits)(void *context, unsigned int reg, unsigned int mask, unsigned int val); typedef struct regmap_async *(*regmap_hw_async_alloc)(void); @@ -514,6 +518,8 @@ typedef void (*regmap_hw_free_context)(void *context); * must serialise with respect to non-async I/O. * @reg_write: Write a single register value to the given register address. This * write operation has to complete when returning from the function. + * @reg_write_noinc: Write multiple register value to the same register. This + * write operation has to complete when returning from the function. * @reg_update_bits: Update bits operation to be used against volatile * registers, intended for devices supporting some mechanism * for setting clearing bits without having to @@ -541,9 +547,11 @@ struct regmap_bus { regmap_hw_gather_write gather_write; regmap_hw_async_write async_write; regmap_hw_reg_write reg_write; + regmap_hw_reg_noinc_write reg_noinc_write; regmap_hw_reg_update_bits reg_update_bits; regmap_hw_read read; regmap_hw_reg_read reg_read; + regmap_hw_reg_noinc_read reg_noinc_read; regmap_hw_free_context free_context; regmap_hw_async_alloc async_alloc; u8 read_flag_mask; -- cgit v1.2.3 From 874de459488b8741afc0e9888d39f2e15a962b3d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 14 Jul 2022 09:10:40 +0800 Subject: soundwire: add read_ping_status helper definition in manager ops The existing manager ops provide callbacks to transfer read/write commands, but don't allow for direct access to PING status register. This is accessible in all existing IP, and would help diagnose timeouts or resume issues by reporting the 'true' status instead of the internal status reported by the IP. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20220714011043.46059-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 39058c841469..eb840a07c25c 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -839,6 +839,8 @@ struct sdw_defer { * @set_bus_conf: Set the bus configuration * @pre_bank_switch: Callback for pre bank switch * @post_bank_switch: Callback for post bank switch + * @read_ping_status: Read status from PING frames, reported with two bits per Device. + * Bits 31:24 are reserved. 
*/ struct sdw_master_ops { int (*read_prop)(struct sdw_bus *bus); @@ -855,6 +857,7 @@ struct sdw_master_ops { struct sdw_bus_params *params); int (*pre_bank_switch)(struct sdw_bus *bus); int (*post_bank_switch)(struct sdw_bus *bus); + u32 (*read_ping_status)(struct sdw_bus *bus); }; -- cgit v1.2.3
From 79fe02cb7547fcc09e83b850cfd32896d7dc6289 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 14 Jul 2022 09:10:42 +0800 Subject: soundwire: add sdw_show_ping_status() helper This helper provides an optional delay parameter to wait for devices to resync in case of errors, and checks that devices are indeed attached on the bus. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20220714011043.46059-4-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index eb840a07c25c..822599957b35 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -922,6 +922,8 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, struct fwnode_handle *fwnode); void sdw_bus_master_delete(struct sdw_bus *bus); +void sdw_show_ping_status(struct sdw_bus *bus, bool sync_delay); + /** * sdw_port_config: Master or Slave Port configuration * -- cgit v1.2.3
From 3fd6d6e2b4e80fe45bfd1c8f01dff7d30a0f9b53 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 5 Aug 2022 00:43:17 +0200 Subject: thermal/of: Rework the thermal device tree initialization The following changes rework the thermal device tree initialization entirely. The old version is kept until the different drivers using it are converted to the new API. The old approach creates the different actors independently. This approach is the source of the code duplication in the thermal OF code, because a thermal zone is created first but the sensor is only registered afterwards. The thermal zones are created unconditionally with a fake sensor at init time, thus forcing the core to provide fake ops and store all the thermal zone related information in duplicated structures. Then the sensor is initialized and the code looks up the thermal zone name using the device tree. Then the sensor is associated with the thermal zone, and the sensor specific ops are called with a second level of indirection from the thermal zone ops. When a sensor is removed (with a module unload), the thermal zone stays there with the fake sensor. The cooling device associated with a thermal zone and a trip point is stored in a list, again duplicating information, using the node name of the device tree to match the cooling devices afterwards. The new approach is simpler: it creates a thermal zone when the sensor is registered and destroys it when the sensor is removed. All the matching between the cooling devices, trip points and thermal zones is done using the device tree, as well as the bindings. The ops are no longer specific but use the generic ones provided by the thermal framework. Once the old code no longer has any users, it can be removed and the remaining thermal OF code will be much simpler.
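A sketch of what a converted sensor driver looks like on the new interface (the sensor structure and readout function are invented; the registration signature is the one added below):

    #include <linux/thermal.h>

    static int example_get_temp(struct thermal_zone_device *tz, int *temp)
    {
            struct example_sensor *s = tz->devdata;  /* hypothetical driver data */

            *temp = example_read_mcelsius(s);        /* hypothetical readout */
            return 0;
    }

    static const struct thermal_zone_device_ops example_ops = {
            .get_temp = example_get_temp,
    };

    /* in probe(): the zone now lives and dies with the sensor */
    tzd = devm_thermal_of_zone_register(dev, 0, sensor, &example_ops);
    if (IS_ERR(tzd))
            return PTR_ERR(tzd);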
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220804224349.1926752-2-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1386c713885d..e2ac9d473bd6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -325,6 +325,16 @@ struct thermal_zone_of_device_ops { /* Function declarations */ #ifdef CONFIG_THERMAL_OF +struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, + const struct thermal_zone_device_ops *ops); + +struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data, + const struct thermal_zone_device_ops *ops); + +void thermal_of_zone_unregister(struct thermal_zone_device *tz); + +void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz); + int thermal_zone_of_get_sensor_id(struct device_node *tz_np, struct device_node *sensor_np, u32 *id); @@ -366,6 +376,14 @@ static inline struct thermal_zone_device *devm_thermal_zone_of_sensor_register( return ERR_PTR(-ENODEV); } +static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz) +{ +} + +static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) +{ +} + static inline void devm_thermal_zone_of_sensor_unregister(struct device *dev, struct thermal_zone_device *tz) -- cgit v1.2.3 From f59ac19b7f44cab23df84810e2da5f57bdd3a7e7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 5 Aug 2022 00:43:49 +0200 Subject: thermal/of: Remove old OF code All the drivers are converted to the new OF API, remove the old OF code. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220804224349.1926752-34-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 77 ++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index e2ac9d473bd6..86c24ddd5985 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -296,33 +296,6 @@ struct thermal_zone_params { int offset; }; -/** - * struct thermal_zone_of_device_ops - callbacks for handling DT based zones - * - * Mandatory: - * @get_temp: a pointer to a function that reads the sensor temperature. - * - * Optional: - * @get_trend: a pointer to a function that reads the sensor temperature trend. - * @set_trips: a pointer to a function that sets a temperature window. When - * this window is left the driver must inform the thermal core via - * thermal_zone_device_update. - * @set_emul_temp: a pointer to a function that sets sensor emulated - * temperature. - * @set_trip_temp: a pointer to a function that sets the trip temperature on - * hardware. - * @change_mode: a pointer to a function that notifies the thermal zone - * mode change. 
- */ -struct thermal_zone_of_device_ops { - int (*get_temp)(void *, int *); - int (*get_trend)(void *, int, enum thermal_trend *); - int (*set_trips)(void *, int, int); - int (*set_emul_temp)(void *, int); - int (*set_trip_temp)(void *, int, int); - int (*change_mode) (void *, enum thermal_device_mode); -}; - /* Function declarations */ #ifdef CONFIG_THERMAL_OF struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, @@ -335,61 +308,41 @@ void thermal_of_zone_unregister(struct thermal_zone_device *tz); void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz); +void thermal_of_zone_unregister(struct thermal_zone_device *tz); + int thermal_zone_of_get_sensor_id(struct device_node *tz_np, struct device_node *sensor_np, u32 *id); -struct thermal_zone_device * -thermal_zone_of_sensor_register(struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops); -void thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz); -struct thermal_zone_device *devm_thermal_zone_of_sensor_register( - struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops); -void devm_thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz); #else - -static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np, - struct device_node *sensor_np, - u32 *id) -{ - return -ENOENT; -} -static inline struct thermal_zone_device * -thermal_zone_of_sensor_register(struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops) -{ - return ERR_PTR(-ENODEV); -} - static inline -void thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz) +struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, + const struct thermal_zone_device_ops *ops) { + return ERR_PTR(-ENOTSUPP); } -static inline struct thermal_zone_device *devm_thermal_zone_of_sensor_register( - struct device *dev, int id, void *data, - const struct thermal_zone_of_device_ops *ops) +static inline +struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data, + const struct thermal_zone_device_ops *ops) { - return ERR_PTR(-ENODEV); + return ERR_PTR(-ENOTSUPP); } static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz) { } -static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) +static inline void devm_thermal_of_zone_unregister(struct device *dev, + struct thermal_zone_device *tz) { } -static inline -void devm_thermal_zone_of_sensor_unregister(struct device *dev, - struct thermal_zone_device *tz) +static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np, + struct device_node *sensor_np, + u32 *id) { + return -ENOENT; } - #endif #ifdef CONFIG_THERMAL -- cgit v1.2.3 From 25885a35a72007cf28ec5f9ba7169c5c798f7167 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Aug 2022 11:57:56 -0400 Subject: Change calling conventions for filldir_t filldir_t instances (directory iterators callbacks) used to return 0 for "OK, keep going" or -E... for "stop". Note that it's *NOT* how the error values are reported - the rules for those are callback-dependent and ->iterate{,_shared}() instances only care about zero vs. non-zero (look at emit_dir() and friends). So let's just return bool ("should we keep going?") - it's less confusing that way. 
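For illustration, a minimal actor under the new convention (a made-up consumer, not from this patch) counts entries and asks the iterator to continue:

    #include <linux/fs.h>

    struct example_count_ctx {
            struct dir_context ctx;
            unsigned long count;
    };

    static bool example_filldir(struct dir_context *ctx, const char *name,
                                int namelen, loff_t offset, u64 ino,
                                unsigned int d_type)
    {
            struct example_count_ctx *buf =
                    container_of(ctx, struct example_count_ctx, ctx);

            buf->count++;
            return true;    /* keep going; returning false stops iteration */
    }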
The choice between "true means keep going" and "true means stop" is bikesheddable; we have two groups of callbacks - "do something for everything in the directory, until we run into a problem" and "find an entry in the directory and do something to it". The former tended to use 0/-E... conventions - -E on failure. The latter tended to use 0/1, 1 being "stop, we are done". The callers treated anything non-zero as "stop", ignoring which non-zero value they got. "true means stop" would be more natural for the second group; "true means keep going" - for the first one. I tried both variants, and things like "if allocation failed: something = -ENOMEM; return true;" just looked unnatural and asking for trouble. [folded suggestion from Matthew Wilcox ] Acked-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- include/linux/fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..c40f68f62941 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2038,9 +2038,10 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns, * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. + * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; -typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, +typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { @@ -3540,17 +3541,17 @@ static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { - return ctx->actor(ctx, name, namelen, ctx->pos, ino, type) == 0; + return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, - file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0; + file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, - parent_ino(file->f_path.dentry), DT_DIR) == 0; + parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { -- cgit v1.2.3
From e34cfee65ec891a319ce79797dda18083af33a76 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Wed, 17 Aug 2022 14:43:24 +0800 Subject: stmmac: intel: remove unused 'has_crossts' flag The 'has_crossts' flag was not used anywhere in the stmmac driver, so remove it from both the header file and the dwmac-intel driver.
Signed-off-by: Wong Vee Khee Reviewed-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220817064324.10025-1-veekhee@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8df475db88c0..fb2e88614f5d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -257,7 +257,6 @@ struct plat_stmmacenet_data { u8 vlan_fail_q; unsigned int eee_usecs_rate; struct pci_dev *pdev; - bool has_crossts; int int_snapshot_num; int ext_snapshot_num; bool int_snapshot_en; -- cgit v1.2.3 From da279e6965b3838e99e5c0ab8f76b87bf86b31a5 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Fri, 12 Aug 2022 13:10:37 +0300 Subject: regulator: Add devm helpers for get and enable A few regulator consumer drivers seem to be just getting a regulator, enabling it and registering a devm-action to disable the regulator at the driver detach and then forget about it. We can simplify this a bit by adding a devm-helper for this pattern. Add devm_regulator_get_enable() and devm_regulator_get_enable_optional() Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/ed7b8841193bb9749d426f3cb3b199c9460794cd.1660292316.git.mazziesaccount@gmail.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index bc6cda706d1f..ee3b4a014611 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -207,6 +207,8 @@ struct regulator *__must_check regulator_get_optional(struct device *dev, const char *id); struct regulator *__must_check devm_regulator_get_optional(struct device *dev, const char *id); +int devm_regulator_get_enable(struct device *dev, const char *id); +int devm_regulator_get_enable_optional(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); @@ -244,12 +246,15 @@ int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); +void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); int __must_check devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, struct regulator_bulk_data **out_consumers); int __must_check regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers); +int devm_regulator_bulk_get_enable(struct device *dev, int num_consumers, + const char * const *id); int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers); int regulator_bulk_force_disable(int num_consumers, @@ -354,6 +359,17 @@ devm_regulator_get_exclusive(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } +static inline int devm_regulator_get_enable(struct device *dev, const char *id) +{ + return -ENODEV; +} + +static inline int devm_regulator_get_enable_optional(struct device *dev, + const char *id) +{ + return -ENODEV; +} + static inline struct regulator *__must_check regulator_get_optional(struct device *dev, const char *id) { @@ -375,6 +391,10 @@ static inline void devm_regulator_put(struct regulator *regulator) { } +static inline void devm_regulator_bulk_put(struct 
regulator_bulk_data *consumers) +{ +} + static inline int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, @@ -465,6 +485,13 @@ static inline int regulator_bulk_enable(int num_consumers, return 0; } +static inline int devm_regulator_bulk_get_enable(struct device *dev, + int num_consumers, + const char * const *id) +{ + return 0; +} + static inline int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers) { -- cgit v1.2.3 From b5a5b9d5f28d23b84f06b45c61dcad95b07d41bc Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Aug 2022 15:38:58 +0200 Subject: serial: document start_rx member at struct uart_ops Fix this doc build warning: ./include/linux/serial_core.h:397: warning: Function parameter or member 'start_rx' not described in 'uart_ops' Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/5d07ae2eec8fbad87e623160f9926b178bef2744.1660829433.git.mchehab@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aef3145f2032..6e4f4765d209 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -141,6 +141,14 @@ struct gpio_desc; * Locking: none. * Interrupts: caller dependent. * + * @start_rx: ``void ()(struct uart_port *port)`` + * + * Start receiving characters. + * + * Locking: @port->lock taken. + * Interrupts: locally disabled. + * This call must not sleep + * * @stop_rx: ``void ()(struct uart_port *port)`` * * Stop receiving characters; the @port is in the process of being closed. -- cgit v1.2.3 From c1e5c2f0cb8a22ec2e14af92afc7006491bebabb Mon Sep 17 00:00:00 2001 From: Pablo Sun Date: Thu, 4 Aug 2022 11:48:03 +0800 Subject: usb: typec: altmodes/displayport: correct pin assignment for UFP receptacles Fix incorrect pin assignment values when connecting to a monitor with a Type-C receptacle instead of a plug. According to the specification, a UFP_D receptacle's pin assignments should come from the UFP_D pin assignments field (bits 23:16), while a UFP_D plug's assignments are described in the DFP_D pin assignments field (bits 15:8) during Mode Discovery. For example, the LG 27 UL850-W is a monitor with a Type-C receptacle. The monitor responds to the MODE DISCOVERY command with the following DisplayPort Capability flag: dp->alt->vdo=0x140045 The existing logic only takes care of the UFP_D plug case, and would take bits 15:8 for this 0x140045 case. This results in a non-existent pin assignment of 0x0 in dp_altmode_configure. To fix this problem, a new set of macros is introduced to take plug/receptacle differences into consideration.
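As an illustrative sketch (not part of the patch; it assumes DP_CAP_RECEPTACLE is BIT(6) as defined elsewhere in typec_dp.h), the new macro resolves the correct field for the example VDO above:

	u32 vdo = 0x140045;	/* bit 6 (receptacle) set, UFP_D field = 0x14 */

	/* The old code unconditionally read bits 15:8 here and got 0x0.  */
	/* The new macro sees the receptacle bit and reads bits 23:16:    */
	u8 pins = DP_CAP_PIN_ASSIGN_UFP_D(vdo);	/* 0x14: assignments C and E */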
Fixes: 0e3bb7d6894d ("usb: typec: Add driver for DisplayPort alternate mode") Cc: stable@vger.kernel.org Co-developed-by: Pablo Sun Co-developed-by: Macpaul Lin Reviewed-by: Guillaume Ranquet Reviewed-by: Heikki Krogerus Signed-off-by: Pablo Sun Signed-off-by: Macpaul Lin Link: https://lore.kernel.org/r/20220804034803.19486-1-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_dp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h index cfb916cccd31..8d09c2f0a9b8 100644 --- a/include/linux/usb/typec_dp.h +++ b/include/linux/usb/typec_dp.h @@ -73,6 +73,11 @@ enum { #define DP_CAP_USB BIT(7) #define DP_CAP_DFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(15, 8)) >> 8) #define DP_CAP_UFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(23, 16)) >> 16) +/* Get pin assignment taking plug & receptacle into consideration */ +#define DP_CAP_PIN_ASSIGN_UFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ + DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_)) +#define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ + DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_)) /* DisplayPort Status Update VDO bits */ #define DP_STATUS_CONNECTION(_status_) ((_status_) & 3) -- cgit v1.2.3 From 77947238dad3b83bbe085ebcd576ea55afb70425 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 16 Aug 2022 21:48:29 +0000 Subject: platform/chrome: Add Type-C mux set command definitions Copy EC header definitions for the USB Type-C Mux control command from the EC code base. Also pull in "TBT_UFP_REPLY" definitions, since that is the prior entry in the enum. These headers are already present in the EC code base. [1] [1] https://chromium.googlesource.com/chromiumos/platform/ec/+/b80f85a94a423273c1638ef7b662c56931a138dd/include/ec_commands.h Signed-off-by: Prashant Malani Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20220816214857.2088914-2-pmalani@chromium.org --- include/linux/platform_data/cros_ec_commands.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 8b1b795867a1..5744a2d746aa 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5724,8 +5724,21 @@ enum typec_control_command { TYPEC_CONTROL_COMMAND_EXIT_MODES, TYPEC_CONTROL_COMMAND_CLEAR_EVENTS, TYPEC_CONTROL_COMMAND_ENTER_MODE, + TYPEC_CONTROL_COMMAND_TBT_UFP_REPLY, + TYPEC_CONTROL_COMMAND_USB_MUX_SET, }; +/* Replies the AP may specify to the TBT EnterMode command as a UFP */ +enum typec_tbt_ufp_reply { + TYPEC_TBT_UFP_REPLY_NAK, + TYPEC_TBT_UFP_REPLY_ACK, +}; + +struct typec_usb_mux_set { + uint8_t mux_index; /* Index of the mux to set in the chain */ + uint8_t mux_flags; /* USB_PD_MUX_*-encoded USB mux state to set */ +} __ec_align1; + struct ec_params_typec_control { uint8_t port; uint8_t command; /* enum typec_control_command */ @@ -5739,6 +5752,8 @@ struct ec_params_typec_control { union { uint32_t clear_events_mask; uint8_t mode_to_enter; /* enum typec_mode */ + uint8_t tbt_ufp_reply; /* enum typec_tbt_ufp_reply */ + struct typec_usb_mux_set mux_params; uint8_t placeholder[128]; }; } __ec_align1; @@ -5817,6 +5832,9 @@ enum tcpc_cc_polarity { #define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) #define PD_STATUS_EVENT_HARD_RESET BIT(2) 
+#define PD_STATUS_EVENT_DISCONNECTED BIT(3) +#define PD_STATUS_EVENT_MUX_0_SET_DONE BIT(4) +#define PD_STATUS_EVENT_MUX_1_SET_DONE BIT(5) struct ec_params_typec_status { uint8_t port; -- cgit v1.2.3 From 24426654ed3ae83d1127511891fb782c54f49203 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:17 -0700 Subject: bpf: net: Avoid sk_setsockopt() taking sk lock when called from bpf Most of the code in bpf_setsockopt(SOL_SOCKET) is duplicated from sk_setsockopt(). The number of supported optnames keeps increasing, and so does the duplicated code. One issue in reusing sk_setsockopt() is that the bpf prog has already acquired the sk lock. This patch adds a has_current_bpf_ctx() to tell if sk_setsockopt() is called from a bpf prog. The bpf prog calling bpf_setsockopt() is either running in_task() or in_serving_softirq(). Both cases have the current->bpf_ctx initialized. Thus, has_current_bpf_ctx() only needs to test !!current->bpf_ctx. This patch also adds sockopt_{lock,release}_sock() helpers for sk_setsockopt() to use. These helpers will test has_current_bpf_ctx() before acquiring/releasing the lock. They are exported with EXPORT_SYMBOL for the ipv6 module to use in a later patch. Note on the change in sock_setbindtodevice(): sockopt_lock_sock() is done in sock_setbindtodevice() instead of doing the lock_sock in sock_bindtoindex(..., lock_sk = true). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061717.4175589-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a627a02cf8ab..39bd36359c1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1966,6 +1966,15 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } +/* Not all bpf prog type has the bpf_ctx. + * For the bpf prog type that has initialized the bpf_ctx, + * this function can be used to decide if a kernel function + * is called by a bpf program.
+ */ +static inline bool has_current_bpf_ctx(void) +{ + return !!current->bpf_ctx; +} #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2175,6 +2184,10 @@ static inline bool unprivileged_ebpf_enabled(void) return false; } +static inline bool has_current_bpf_ctx(void) +{ + return false; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -- cgit v1.2.3 From 2c8cc0946c14c39b02748fba34325ecae636530a Mon Sep 17 00:00:00 2001 From: Gene Chen Date: Fri, 5 Aug 2022 15:17:12 +0800 Subject: usb: typec: tcpci: Move function "tcpci_to_typec_cc" to common Move transition function "tcpci_to_typec_cc" to common header Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Gene Chen Link: https://lore.kernel.org/r/20220805071714.150882-7-gene.chen.richtek@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 20c0bedb8ec8..17657451c762 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -167,6 +167,11 @@ /* I2C_WRITE_BYTE_COUNT + 1 when TX_BUF_BYTE_x is only accessible I2C_WRITE_BYTE_COUNT */ #define TCPC_TRANSMIT_BUFFER_MAX_LEN 31 +#define tcpc_presenting_rd(reg, cc) \ + (!(TCPC_ROLE_CTRL_DRP & (reg)) && \ + (((reg) & (TCPC_ROLE_CTRL_## cc ##_MASK << TCPC_ROLE_CTRL_## cc ##_SHIFT)) == \ + (TCPC_ROLE_CTRL_CC_RD << TCPC_ROLE_CTRL_## cc ##_SHIFT))) + struct tcpci; /* @@ -207,4 +212,21 @@ irqreturn_t tcpci_irq(struct tcpci *tcpci); struct tcpm_port; struct tcpm_port *tcpci_get_tcpm_port(struct tcpci *tcpci); + +static inline enum typec_cc_status tcpci_to_typec_cc(unsigned int cc, bool sink) +{ + switch (cc) { + case 0x1: + return sink ? TYPEC_CC_RP_DEF : TYPEC_CC_RA; + case 0x2: + return sink ? TYPEC_CC_RP_1_5 : TYPEC_CC_RD; + case 0x3: + if (sink) + return TYPEC_CC_RP_3_0; + fallthrough; + case 0x0: + default: + return TYPEC_CC_OPEN; + } +} #endif /* __LINUX_USB_TCPCI_H */ -- cgit v1.2.3 From 77bfa0fc7536e8fa7dc6f12081827e0edd75b0f9 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Tue, 16 Aug 2022 16:23:52 +0800 Subject: phy: tegra: xusb: add utmi pad power on/down ops Add utmi_pad_power_on/down ops for each SOC instead of exporting tegra_phy_xusb_utmi_pad_power_on/down directly for Tegra186 chip. Signed-off-by: BH Hsieh Signed-off-by: Jim Lin Link: https://lore.kernel.org/r/20220816082353.13390-2-jilin@nvidia.com Signed-off-by: Greg Kroah-Hartman --- include/linux/phy/tegra/xusb.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy/tegra/xusb.h b/include/linux/phy/tegra/xusb.h index 3a35e74cdc61..70998e6dd6fd 100644 --- a/include/linux/phy/tegra/xusb.h +++ b/include/linux/phy/tegra/xusb.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
*/ #ifndef PHY_TEGRA_XUSB_H @@ -21,6 +21,8 @@ int tegra_xusb_padctl_usb3_set_lfps_detect(struct tegra_xusb_padctl *padctl, unsigned int port, bool enable); int tegra_xusb_padctl_set_vbus_override(struct tegra_xusb_padctl *padctl, bool val); +void tegra_phy_xusb_utmi_pad_power_on(struct phy *phy); +void tegra_phy_xusb_utmi_pad_power_down(struct phy *phy); int tegra_phy_xusb_utmi_port_reset(struct phy *phy); int tegra_xusb_padctl_get_usb3_companion(struct tegra_xusb_padctl *padctl, unsigned int port); -- cgit v1.2.3 From 36cb6494429bd64b27b7ff8b4af56f8e526da2b4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Jul 2022 18:22:20 +0800 Subject: hwrng: core - let sleep be interrupted when unregistering hwrng MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two deadlock scenarios that need addressing, which cause problems when the computer goes to sleep, the interface is set down, and hwrng_unregister() is called. When the deadlock is hit, sleep is delayed for tens of seconds, causing it to fail. These scenarios are: 1) The hwrng kthread can't be stopped while it's sleeping, because it uses msleep_interruptible() which does not react to kthread_stop. 2) A normal user thread can't be interrupted by hwrng_unregister() while it's sleeping, because hwrng_unregister() is called from elsewhere. We solve both issues by adding a completion object called dying that fulfils waiters once we have started the process in hwrng_unregister. At the same time, we clean up a common and useless dmesg splat in the same area. Cc: Reported-by: Gregory Erwin Fixes: fcd09c90c3c5 ("ath9k: use hw_random API instead of directly dumping into random.c") Link: https://lore.kernel.org/all/CAO+Okf6ZJC5-nTE_EJUGQtd8JiCkiEHytGgDsFGTEjs0c00giw@mail.gmail.com/ Link: https://lore.kernel.org/lkml/CAO+Okf5k+C+SE6pMVfPf-d8MfVPVq4PO7EY8Hys_DVXtent3HA@mail.gmail.com/ Link: https://bugs.archlinux.org/task/75138 Signed-off-by: Jason A.
Donenfeld Signed-off-by: Herbert Xu Acked-by: Toke Høiland-Jørgensen Acked-by: Kalle Valo Signed-off-by: Herbert Xu --- include/linux/hw_random.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index aa1d4da03538..77c2885c4c13 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -50,6 +50,7 @@ struct hwrng { struct list_head list; struct kref ref; struct completion cleanup_done; + struct completion dying; }; struct device; @@ -61,4 +62,6 @@ extern int devm_hwrng_register(struct device *dev, struct hwrng *rng); extern void hwrng_unregister(struct hwrng *rng); extern void devm_hwrng_unregister(struct device *dve, struct hwrng *rng); +extern long hwrng_msleep(struct hwrng *rng, unsigned int msecs); + #endif /* LINUX_HWRANDOM_H_ */ -- cgit v1.2.3 From 0f60d28828dd94779c6527440289e1c36a05115a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 30 Jan 2022 15:03:49 -0500 Subject: dynamic_dname(): drop unused dentry argument Signed-off-by: Al Viro --- include/linux/dcache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c78ed02b54..54d46518c481 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -287,8 +287,8 @@ static inline unsigned d_count(const struct dentry *dentry) /* * helper function for dentry_operations.d_dname() members */ -extern __printf(4, 5) -char *dynamic_dname(struct dentry *, char *, int, const char *, ...); +extern __printf(3, 4) +char *dynamic_dname(char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); -- cgit v1.2.3 From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Aug 2022 22:56:40 +0200 Subject: mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know that FOLL_FORCE can be possibly dangerous, especially if there are races that can be exploited by user space. Right now, it would be sufficient to have some code that sets a PTE of a R/O-mapped shared page dirty, in order for it to erroneously become writable by FOLL_FORCE. The implications of setting a write-protected PTE dirty might not be immediately obvious to everyone. And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map a shmem page R/O while marking the pte dirty. This can be used by unprivileged user space to modify tmpfs/shmem file content even if the user does not have write permissions to the file, and to bypass memfd write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590). To fix such security issues for good, the insight is that we really only need that fancy retry logic (FOLL_COW) for COW mappings that are not writable (!VM_WRITE). And in a COW mapping, we really only broke COW if we have an exclusive anonymous page mapped. If we have something else mapped, or the mapped anonymous page might be shared (!PageAnonExclusive), we have to trigger a write fault to break COW. If we don't find an exclusive anonymous page when we retry, we have to trigger COW breaking once again because something intervened. 
Let's move away from this mandatory-retry + dirty handling and rely on our PageAnonExclusive() flag for making a similar decision, to use the same COW logic as in other kernel parts here as well. In case we stumble over a PTE in a COW mapping that does not map an exclusive anonymous page, COW was not properly broken and we have to trigger a fake write-fault to break COW. Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection") and commit 76aefad628aa ("mm/mprotect: fix soft-dirty check in can_change_pte_writable()"), take care of softdirty and uffd-wp manually. For example, a write() via /proc/self/mem to a uffd-wp-protected range has to fail instead of silently granting write access and bypassing the userspace fault handler. Note that FOLL_FORCE is not only used for debug access, but also triggered by applications without debug intentions, for example, when pinning pages via RDMA. This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So let's just get rid of it. Thanks to Nadav Amit for pointing out that the pte_dirty() check in FOLL_FORCE code is problematic and might be exploitable. Note 1: We don't check for the PTE being dirty because it doesn't matter for making a "was COWed" decision anymore, and whoever modifies the page has to set the page dirty either way. Note 2: Kernels before extended uffd-wp support and before PageAnonExclusive (< 5.19) can simply revert the problematic commit instead and be safe regarding UFFDIO_CONTINUE. A backport to v5.19 requires minor adjustments due to lack of vma_soft_dirty_enabled(). Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte") Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Axel Rasmussen Cc: Nadav Amit Cc: Peter Xu Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: John Hubbard Cc: Jason Gunthorpe Cc: David Laight Cc: [5.16] Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bedc449c14d..982f2607180b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -- cgit v1.2.3 From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sun, 7 Aug 2022 15:44:42 +0000 Subject: mm: add DEVICE_ZONE to FOR_ALL_ZONES FOR_ALL_ZONES should be consistent with enum zone_type. Otherwise, __count_zid_vm_events() has the potential to add the count to the wrong item when zid is ZONE_DEVICE.
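As an illustration (not part of the patch), on a configuration with all zone types enabled, the reworked macro in the diff below now expands as follows; each per-zone macro carries its own trailing comma, which is why the enum body lists the FOR_ALL_ZONES() entries without separating commas:

	/* FOR_ALL_ZONES(PGALLOC) expands to:
	 *   PGALLOC_DMA, PGALLOC_DMA32, PGALLOC_NORMAL, PGALLOC_HIGH,
	 *   PGALLOC_MOVABLE, PGALLOC_DEVICE,
	 * matching enum zone_type, with the device zone no longer missing.
	 */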
Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io Signed-off-by: Hao Lee Cc: David Hildenbrand Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 404024486fa5..f3fc36cd2276 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -20,12 +20,19 @@ #define HIGHMEM_ZONE(xx) #endif -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE +#ifdef CONFIG_ZONE_DEVICE +#define DEVICE_ZONE(xx) xx##_DEVICE, +#else +#define DEVICE_ZONE(xx) +#endif + +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \ + HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx) enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, - FOR_ALL_ZONES(PGALLOC), - FOR_ALL_ZONES(ALLOCSTALL), - FOR_ALL_ZONES(PGSCAN_SKIP), + FOR_ALL_ZONES(PGALLOC) + FOR_ALL_ZONES(ALLOCSTALL) + FOR_ALL_ZONES(PGSCAN_SKIP) PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, -- cgit v1.2.3 From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 16:13:40 -0400 Subject: mm/uffd: reset write protection when unregister with wp-mode The motivation for this patch comes from a recent report and patch fix from David Hildenbrand on the hugetlb shared handling of wr-protected pages [1]. With the reproducer provided in the commit message of [1], one can leverage the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect not only the attacker process, but also the whole system. The lazy-reset mechanism of uffd-wp was used to make unregister faster, but it carries an assumption that any leftover pgtable entries should only affect the process itself: not only should the user be aware of anything it does, but it should also not affect anything outside the process. It seems that this is not true, and the mechanism can also be utilized to make some exploits easier. So far there's no indication that the lazy-reset is important to any userfaultfd users, because normally the unregister will only happen once for a specific range of memory over the lifecycle of the process. Considering all of the above, what this patch proposes is to do explicit pte resets when unregistering an uffd region with wr-protect mode enabled. It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false) right before ioctl(UFFDIO_UNREGISTER) for the user. So potentially it'll make the unregister slower. From that point of view it's a very slight ABI change, but hopefully nothing should break with this change either. Regarding the change itself - the core of the uffd write [un]protect operation is moved into a separate function (uffd_wp_range()) and it is reused in the unregister code path. Note that the new function will not check for anything, e.g. ranges or memory types, because they should have been checked during the previous UFFDIO_REGISTER or it should have failed already. It also doesn't check mmap_changing because we hold the mmap write lock anyway. I added a Fixes tag on the commit introducing uffd-wp for shmem+hugetlbfs, because that's the only issue reported so far and that's the commit with which David's reproducer starts working (v5.19+). But the whole idea actually applies not only to file-backed memory but also to anonymous memory. It's just that we don't need to fix anonymous memory prior to v5.19 because there's no known way to exploit it.
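For reference, a user-space sketch of the equivalence described above (assuming the usual uffd setup, with uffd, start and len from an earlier wp-mode registration; error handling omitted):

	struct uffdio_writeprotect wp = {
		.range = { .start = start, .len = len },
		.mode  = 0,	/* wp=false: clear write protection */
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);

	struct uffdio_range unreg = { .start = start, .len = len };
	ioctl(uffd, UFFDIO_UNREGISTER, &unreg);

After this patch, the UFFDIO_UNREGISTER call alone behaves like this sequence for wp-mode registrations.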
IOW, this patch can also fix the issue reported in [1], as patch 2 of that series does. [1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/ Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: David Hildenbrand Cc: Mike Rapoport Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 732b522bacb7..e1b8a915e9e9 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); +extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, + unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, -- cgit v1.2.3 From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:51:09 -0700 Subject: mm/shmem: fix chattr fsflags support in tmpfs ext[234] have always allowed unimplemented chattr flags to be set, but other filesystems have tended to be stricter. Follow the stricter approach for tmpfs: I don't want to have to explain why csu attributes don't actually work, and we won't need to update the chattr(1) manpage; and it's never wrong to start off strict, relaxing later if persuaded. Allow only 'a' (append only), 'i' (immutable), 'A' (no atime) and 'd' (no dump). Although lsattr showed 'A' inherited, the NOATIME behavior was not being inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME. Add shmem_set_inode_flags() to sync the flags, using inode_set_flags() to avoid that instant of lost immutability during fileattr_set(). But that change switched generic/079 from passing to failing: because FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the INHERITED fsflags: remove them and generic/079 is back to passing. Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs") Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Radoslaw Burny Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1b6c4013f691..ff0b990de83d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -29,15 +29,10 @@ struct shmem_inode_info { struct inode vfs_inode; }; -#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE -#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE -#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -/* Flags that are appropriate for non-directories/regular files.
*/ -#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE \ + (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ -- cgit v1.2.3 From 5e61fe157a27afc7c0d4f7bcbceefdca536c015f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:52 +0200 Subject: net: phy: Introduce QUSGMII PHY mode The QUSGMII mode is a derivative of Cisco's USXGMII standard. This standard is pretty similar to SGMII, but allows for faster speeds, and has the built-in bits for Quad and Octa variants (like QSGMII). The main difference with SGMII/QSGMII is that USXGMII/QUSGMII re-uses the preamble to carry various information, named 'Extensions'. As of today, the USXGMII standard only mentions the "PCH" extension, which is used to convey timestamps, allowing in-band signaling of PTP timestamps without having to modify the frame itself. This commit adds support for that mode. When no extension is in use, it behaves exactly like QSGMII, although it's not compatible with QSGMII. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 87638c55d844..9eeab9b9a74c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -115,6 +115,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -152,6 +153,7 @@ typedef enum { PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, + PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -267,6 +269,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; + case PHY_INTERFACE_MODE_QUSGMII: + return "qusgmii"; default: return "unknown"; } -- cgit v1.2.3 From c04ade27cb7b952b6b9b9a0efa0a6129cc63f2ae Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:54 +0200 Subject: net: phy: Add helper to derive the number of ports from a phy mode Some phy modes such as QSGMII multiplex several MAC<->PHY links on one single physical interface. QSGMII used to be the only one supported, but other modes such as QUSGMII also carry multiple links. This helper allows getting the number of links that are multiplexed on a given interface. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9eeab9b9a74c..7c49ab95441b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -968,6 +968,8 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +int phy_interface_num_ports(phy_interface_t interface); + /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ -- cgit v1.2.3 From 1202cdd665315c525b5237e96e0bedc76d7e754f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 17 Aug 2022 17:43:21 -0700 Subject: Remove DECnet support from kernel DECnet is an obsolete network protocol that receives more attention from kernel janitors than users. It belongs in a computer protocol history museum, not in the Linux kernel. It has been "Orphaned" in the kernel since 2010. The iproute2 support for DECnet was dropped in the 5.0 release. The documentation link on Sourceforge says it is abandoned there as well. Leave the UAPI alone to keep userspace programs compiling. This means that there is still an empty neighbour table for AF_DECNET. The table of /proc/sys/net entries was updated to match current directories and reformatted to be alphabetical. Signed-off-by: Stephen Hemminger Acked-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ---- include/linux/netfilter.h | 5 ----- include/linux/netfilter_defs.h | 8 -------- 3 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..64e8662632f8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1837,7 +1837,6 @@ enum netdev_ml_priv_type { * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data - * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering @@ -2133,9 +2132,6 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct dn_dev __rcu *dn_ptr; -#endif #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c2c6f332fb90..d8817d381c14 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,11 +243,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); - break; -#endif default: WARN_ON_ONCE(1); break; } diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h index 8dddfb151f00..a5f7bef1b3a4 100644 --- a/include/linux/netfilter_defs.h +++ b/include/linux/netfilter_defs.h @@ -7,14 +7,6 @@ /* in/out/forward only */ #define NF_ARP_NUMHOOKS 3 -/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */ -#define NF_DN_NUMHOOKS 7 -#if IS_ENABLED(CONFIG_DECNET) /* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ -#define NF_MAX_HOOKS NF_DN_NUMHOOKS -#else #define NF_MAX_HOOKS NF_INET_NUMHOOKS -#endif #endif -- cgit v1.2.3 From c6ea70604249bc357ce09e9f8e16c29df0fb2fa2 Mon Sep 17 00:00:00 2001 From: "dougmill@linux.vnet.ibm.com" Date: Tue, 16 Aug 2022 15:07:13 +0100
Subject: block: sed-opal: Add ioctl to return device status Provide a mechanism to retrieve basic status information about the device, including the "supported" flag indicating whether SED-OPAL is supported. The information returned is from the various feature descriptors received during the discovery0 step, and so this ioctl does nothing more than perform the discovery0 step and then save the information received. See "struct opal_status" and OPAL_FL_* bits for the status information currently returned. This is necessary to be able to check whether a device is OPAL enabled, set up, locked or unlocked from userspace programs like systemd-cryptsetup and libcryptsetup. Right now we just have to assume the user 'knows' or blindly attempt setup/lock/unlock operations. Signed-off-by: Douglas Miller Tested-by: Luca Boccassi Reviewed-by: Scott Bauer Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816140713.84893-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1ac0d712a9c3..6f837bb6c715 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -43,6 +43,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: case IOC_OPAL_GENERIC_TABLE_RW: + case IOC_OPAL_GET_STATUS: return true; } return false; -- cgit v1.2.3 From a4e1d0b76e7b32c0839e72679c530445172a2564 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 15 Aug 2022 10:00:43 -0700 Subject: block: Change the return type of blk_mq_map_queues() into void Since blk_mq_map_queues() and the .map_queues() callbacks always return 0, change their return type into void. Most callers ignore the returned value anyway. Cc: Christoph Hellwig Cc: Jason Wang Cc: Keith Busch Cc: Martin K. Petersen Cc: Doug Gilbert Cc: Michael S. 
Tsirkin Signed-off-by: Bart Van Assche Reviewed-by: John Garry Acked-by: Md Haris Iqbal Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220815170043.19489-3-bvanassche@acm.org [axboe: fold in fix from Bart] Signed-off-by: Jens Axboe --- include/linux/blk-mq-pci.h | 4 ++-- include/linux/blk-mq-rdma.h | 2 +- include/linux/blk-mq-virtio.h | 2 +- include/linux/blk-mq.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h index 0b1f45c62623..ca544e1d3508 100644 --- a/include/linux/blk-mq-pci.h +++ b/include/linux/blk-mq-pci.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct pci_dev; -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset); #endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h index 5cc5f0f36218..53b58c610e76 100644 --- a/include/linux/blk-mq-rdma.h +++ b/include/linux/blk-mq-rdma.h @@ -5,7 +5,7 @@ struct blk_mq_tag_set; struct ib_device; -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec); #endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h index 687ae287e1dc..13226e9b22dd 100644 --- a/include/linux/blk-mq-virtio.h +++ b/include/linux/blk-mq-virtio.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct virtio_device; -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec); #endif /* _LINUX_BLK_MQ_VIRTIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92294a5fb083..c38575209d51 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -630,7 +630,7 @@ struct blk_mq_ops { * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ - int (*map_queues)(struct blk_mq_tag_set *set); + void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** @@ -880,7 +880,7 @@ void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); -- cgit v1.2.3 From f5d632d15e9e0a037339601680d82bb840f85d10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2022 16:39:04 -0600 Subject: block: shrink rq_map_data a bit We don't need full ints for several of these members. Change the page_order and nr_entries to unsigned shorts, and the true/false from_user and null_mapped to booleans. This shrinks the struct from 32 to 24 bytes on 64-bit archs. 
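A layout sketch of the size change (illustrative only; exact padding depends on the ABI, assuming typical 64-bit alignment rules):

	/* before: pages(8) + page_order(4) + nr_entries(4) + offset(8) +
	 *         null_mapped(4) + from_user(4)                   = 32 bytes
	 * after:  pages(8) + offset(8) + page_order(2) + nr_entries(2) +
	 *         null_mapped(1) + from_user(1) + 2 bytes padding = 24 bytes
	 */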
Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c38575209d51..74b99d716b0b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -963,11 +963,11 @@ blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; - int page_order; - int nr_entries; unsigned long offset; - int null_mapped; - int from_user; + unsigned short page_order; + unsigned short nr_entries; + bool null_mapped; + bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, -- cgit v1.2.3 From 1ecb7d27b1af6705e9a4e94415b4d8cc8cf2fbfb Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 17 Aug 2022 18:27:27 +0100 Subject: firmware: arm_scmi: Improve checks in the info_get operations SCMI protocols abstract and expose a number of protocol specific resources like clocks, sensors and so on. Information about such specific domain resources are generally exposed via an `info_get` protocol operation. Improve the sanity check on these operations where needed. Link: https://lore.kernel.org/r/20220817172731.1185305-3-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index a193884ecf2b..4f765bc788ff 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -84,7 +84,7 @@ struct scmi_protocol_handle; struct scmi_clk_proto_ops { int (*count_get)(const struct scmi_protocol_handle *ph); - const struct scmi_clock_info *(*info_get) + const struct scmi_clock_info __must_check *(*info_get) (const struct scmi_protocol_handle *ph, u32 clk_id); int (*rate_get)(const struct scmi_protocol_handle *ph, u32 clk_id, u64 *rate); @@ -466,7 +466,7 @@ enum scmi_sensor_class { */ struct scmi_sensor_proto_ops { int (*count_get)(const struct scmi_protocol_handle *ph); - const struct scmi_sensor_info *(*info_get) + const struct scmi_sensor_info __must_check *(*info_get) (const struct scmi_protocol_handle *ph, u32 sensor_id); int (*trip_point_config)(const struct scmi_protocol_handle *ph, u32 sensor_id, u8 trip_id, u64 trip_value); -- cgit v1.2.3 From d59b73a66e5e0682442b6d7b4965364e57078b80 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 3 Aug 2022 10:49:23 +0300 Subject: net/mlx5: Avoid false positive lockdep warning by adding lock_class_key Add a lock_class_key per mlx5 device to avoid a false positive "possible circular locking dependency" warning by lockdep, on flows which lock more than one mlx5 device, such as adding SF. kernel log: ====================================================== WARNING: possible circular locking dependency detected 5.19.0-rc8+ #2 Not tainted ------------------------------------------------------ kworker/u20:0/8 is trying to acquire lock: ffff88812dfe0d98 (&dev->intf_state_mutex){+.+.}-{3:3}, at: mlx5_init_one+0x2e/0x490 [mlx5_core] but task is already holding lock: ffff888101aa7898 (&(¬ifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&(¬ifier->n_head)->rwsem){++++}-{3:3}: down_write+0x90/0x150 blocking_notifier_chain_register+0x53/0xa0 mlx5_sf_table_init+0x369/0x4a0 [mlx5_core] mlx5_init_one+0x261/0x490 [mlx5_core] probe_one+0x430/0x680 [mlx5_core] local_pci_probe+0xd6/0x170 work_for_cpu_fn+0x4e/0xa0 process_one_work+0x7c2/0x1340 worker_thread+0x6f6/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 -> #0 (&dev->intf_state_mutex){+.+.}-{3:3}: __lock_acquire+0x2fc7/0x6720 lock_acquire+0x1c1/0x550 __mutex_lock+0x12c/0x14b0 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 bus_for_each_drv+0x123/0x1a0 __device_attach+0x1a3/0x460 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 __auxiliary_device_add+0x88/0xc0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] process_one_work+0x7c2/0x1340 worker_thread+0x59d/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(¬ifier->n_head)->rwsem); lock(&dev->intf_state_mutex); lock(&(¬ifier->n_head)->rwsem); lock(&dev->intf_state_mutex); *** DEADLOCK *** 4 locks held by kworker/u20:0/8: #0: ffff888150612938 ((wq_completion)mlx5_events){+.+.}-{0:0}, at: process_one_work+0x6e2/0x1340 #1: ffff888100cafdb8 ((work_completion)(&work->work)#3){+.+.}-{0:0}, at: process_one_work+0x70f/0x1340 #2: ffff888101aa7898 (&(¬ifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 #3: ffff88813682d0e8 (&dev->mutex){....}-{3:3}, at:__device_attach+0x76/0x460 stack backtrace: CPU: 6 PID: 8 Comm: kworker/u20:0 Not tainted 5.19.0-rc8+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core] Call Trace: dump_stack_lvl+0x57/0x7d check_noncircular+0x278/0x300 ? print_circular_bug+0x460/0x460 ? lock_chain_count+0x20/0x20 ? register_lock_class+0x1880/0x1880 __lock_acquire+0x2fc7/0x6720 ? register_lock_class+0x1880/0x1880 ? register_lock_class+0x1880/0x1880 lock_acquire+0x1c1/0x550 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? lockdep_hardirqs_on_prepare+0x400/0x400 __mutex_lock+0x12c/0x14b0 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? _raw_read_unlock+0x1f/0x30 ? mutex_lock_io_nested+0x1320/0x1320 ? __ioremap_caller.constprop.0+0x306/0x490 ? mlx5_sf_dev_probe+0x269/0x370 [mlx5_core] ? iounmap+0x160/0x160 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] ? mlx5_sf_dev_remove+0x130/0x130 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 ? auxiliary_match_id+0xe9/0x140 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 ? driver_allows_async_probing+0x140/0x140 bus_for_each_drv+0x123/0x1a0 ? bus_for_each_dev+0x1a0/0x1a0 ? lockdep_hardirqs_on_prepare+0x286/0x400 ? trace_hardirqs_on+0x2d/0x100 __device_attach+0x1a3/0x460 ? device_driver_attach+0x1e0/0x1e0 ? kobject_uevent_env+0x22d/0xf10 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 ? dev_set_name+0xab/0xe0 ? __fw_devlink_link_to_suppliers+0x260/0x260 ? memset+0x20/0x40 ? lockdep_init_map_type+0x21a/0x7d0 __auxiliary_device_add+0x88/0xc0 ? 
auxiliary_device_init+0x86/0xa0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] ? mlx5_vhca_event_arm+0x100/0x100 [mlx5_core] ? lock_downgrade+0x6e0/0x6e0 ? lockdep_hardirqs_on_prepare+0x286/0x400 process_one_work+0x7c2/0x1340 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? pwq_dec_nr_in_flight+0x230/0x230 ? rwlock_bug.part.0+0x90/0x90 worker_thread+0x59d/0xec0 ? process_one_work+0x1340/0x1340 kthread+0x28f/0x330 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: 6a3273217469 ("net/mlx5: SF, Port function state change support") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96b16fbe1aa4..7b7ce602c808 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -779,6 +779,7 @@ struct mlx5_core_dev { enum mlx5_device_state state; /* sync interface state */ struct mutex intf_state_mutex; + struct lock_class_key lock_key; unsigned long intf_state; struct mlx5_priv priv; struct mlx5_profile profile; -- cgit v1.2.3 From 272ac1500372183ffd54b0c9f43f52afc482e610 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Aug 2022 11:45:01 -0700 Subject: fscrypt: remove fscrypt_set_test_dummy_encryption() Now that all its callers have been converted to fscrypt_parse_test_dummy_encryption() and fscrypt_add_test_dummy_key() instead, fscrypt_set_test_dummy_encryption() can be removed. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220513231605.175121-6-ebiggers@kernel.org --- include/linux/fscrypt.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 7d2f1e0f23b1..b95b8601b9c1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -295,8 +295,6 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool -- cgit v1.2.3 From a9da7251ac8bcc2f2358513868f1903ac2809b3d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Aug 2022 17:14:58 -0700 Subject: Input: gameport - move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. 
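The conversion is safe here because the return value is ignored; a brief sketch of the difference, based on the documented semantics of the two helpers:

	/* strlcpy returns strlen(src), which forces a read of the whole
	 * source string even when it exceeds the destination; strscpy
	 * returns the number of bytes copied, or -E2BIG on truncation,
	 * and never reads past what it needs. With the result unused,
	 * the call is a 1:1 swap:
	 */
	strscpy(gameport->name, name, sizeof(gameport->name));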
Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220818210156.8143-1-wsa+renesas@sang-engineering.com Signed-off-by: Dmitry Torokhov --- include/linux/gameport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 69081d899492..8c2f00018e89 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -110,7 +110,7 @@ static inline void gameport_free_port(struct gameport *gameport) static inline void gameport_set_name(struct gameport *gameport, const char *name) { - strlcpy(gameport->name, name, sizeof(gameport->name)); + strscpy(gameport->name, name, sizeof(gameport->name)); } /* -- cgit v1.2.3 From 13a8e0f6b01b14b2e28ba144e112c883f03a3db2 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 19 Aug 2022 15:16:11 -0700 Subject: Revert "driver core: Delete driver_deferred_probe_check_state()" This reverts commit 9cbffc7a59561be950ecc675d19a3d2b45202b2b. There are a few more issues to fix that have been reported in the thread for the original series [1]. We'll need to fix those before this will work. So, revert it for now. [1] - https://lore.kernel.org/lkml/20220601070707.3946847-1-saravanak@google.com/ Fixes: 9cbffc7a5956 ("driver core: Delete driver_deferred_probe_check_state()") Tested-by: Tony Lindgren Tested-by: Peng Fan Tested-by: Douglas Anderson Tested-by: Alexander Stein Reviewed-by: Tony Lindgren Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20220819221616.2107893-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 7acaabde5396..2114d65b862f 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -242,6 +242,7 @@ driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) extern int driver_deferred_probe_timeout; void driver_deferred_probe_add(struct device *dev); +int driver_deferred_probe_check_state(struct device *dev); void driver_init(void); /** -- cgit v1.2.3 From 7997eff82828304b780dc0a39707e1946d6f1ebf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Aug 2022 17:38:37 +0200 Subject: netfilter: ebtables: reject blobs that don't provide all entry points Harshit Mogalapalli says: In ebt_do_table() function dereferencing 'private->hook_entry[hook]' can lead to NULL pointer dereference. [..] Kernel panic: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [..] RIP: 0010:ebt_do_table+0x1dc/0x1ce0 Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 5c 16 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b 6c df 08 48 8d 7d 2c 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 88 [..] Call Trace: nf_hook_slow+0xb1/0x170 __br_forward+0x289/0x730 maybe_deliver+0x24b/0x380 br_flood+0xc6/0x390 br_dev_xmit+0xa2e/0x12c0 For some reason ebtables rejects blobs that provide entry points that are not supported by the table, but what it should instead reject is the opposite: blobs that DO NOT provide an entry point supported by the table. t->valid_hooks is the bitmask of hooks (input, forward ...) that will see packets. 
Providing an entry point that is not supported is harmless (never called/used), but the inverse isn't: it results in a crash because the ebtables traverser doesn't expect a NULL blob for a location it's receiving packets for. Instead of fixing all the individual checks, do what iptables is doing and reject all blobs that differ from the expected hooks. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Harshit Mogalapalli Reported-by: syzkaller Signed-off-by: Florian Westphal --- include/linux/netfilter_bridge/ebtables.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a13296d6c7ce..fd533552a062 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -94,10 +94,6 @@ struct ebt_table { struct ebt_replace_kernel *table; unsigned int valid_hooks; rwlock_t lock; - /* e.g. could be the table explicitly only allows certain - * matches, targets, ... 0 == let it in */ - int (*check)(const struct ebt_table_info *info, - unsigned int valid_hooks); /* the data used by the kernel */ struct ebt_table_info *private; struct nf_hook_ops *ops; -- cgit v1.2.3 From 12cda13cfd5310bbfefdfe32a82489228e2e0381 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Aug 2022 15:43:25 -0400 Subject: fs: dlm: remove DLM_LSFL_FS from uapi The DLM_LSFL_FS flag is set in lockspaces created directly for a kernel user, as opposed to those lockspaces created for user space applications. The user space libdlm allowed this flag to be set for lockspaces created from user space, but then used by a kernel user. No kernel user has ever used this method, so remove the ability to do it. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- include/linux/dlm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dlm.h b/include/linux/dlm.h index ff951e9f6f20..f5f55c2138ae 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -56,9 +56,6 @@ struct dlm_lockspace_ops { * DLM_LSFL_TIMEWARN * The dlm should emit netlink messages if locks have been waiting * for a configurable amount of time. (Unused.) - * DLM_LSFL_FS - * The lockspace user is in the kernel (i.e. filesystem). Enables - * direct bast/cast callbacks. * DLM_LSFL_NEWEXCL * dlm_new_lockspace() should return -EEXIST if the lockspace exists. * -- cgit v1.2.3 From 56171e0db23a5e0edce1596dd2792b95ffe57bd3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Aug 2022 15:43:27 -0400 Subject: fs: dlm: const void resource name parameter The resource name parameter should never be changed by DLM, so we declare it as const. At some point it was handled as a char pointer, but a resource name can be a non-printable ASCII string as well. This patch changes it to be handled as a void pointer, as it is offered by the DLM API.
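A hypothetical caller sketch (the lockspace, lksb and callback names are made up; the argument order follows the dlm_lock() prototype in dlm.h) showing that binary, non-printable names can now be passed without casts:

	static const unsigned char resname[] = { 0x01, 0x00, 0xff, 0x42 };

	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0,
			 resname, sizeof(resname), 0,
			 my_lockast, my_astarg, my_bast);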
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- include/linux/dlm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dlm.h b/include/linux/dlm.h index f5f55c2138ae..c6bc2b5ee7e6 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -131,7 +131,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, int mode, struct dlm_lksb *lksb, uint32_t flags, - void *name, + const void *name, unsigned int namelen, uint32_t parent_lkid, void (*lockast) (void *astarg), -- cgit v1.2.3 From 0ba985024ae7db226776725d9aa436b5c1c9fca2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:16 +0300 Subject: flow_dissector: Make 'bpf_flow_dissect' return the bpf program retcode Let 'bpf_flow_dissect' callers know the BPF program's retcode and act accordingly. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-2-shmulik.ladkani@gmail.com --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..87921996175c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1460,8 +1460,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); struct bpf_flow_dissector; -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +	       __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, -- cgit v1.2.3 From dea6a4e17013382b20717664ebf3d7cc405e0952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:51 -0700 Subject: bpf: Introduce cgroup_{common,current}_func_proto Split cgroup_base_func_proto into the following: * cgroup_common_func_proto - common helpers for all cgroup hooks * cgroup_current_func_proto - common helpers for all cgroup hooks running in the process context (== have meaningful 'current'). Move bpf_{g,s}et_retval and other cgroup-related helpers into kernel/bpf/cgroup.c so they are closer to where they are being used.
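The intended chaining pattern for a per-hook func_proto handler then looks roughly like this (illustrative sketch, not a literal excerpt from the patch):

    static const struct bpf_func_proto *
    example_cg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
    {
            const struct bpf_func_proto *func_proto;

            /* helpers valid for every cgroup hook */
            func_proto = cgroup_common_func_proto(func_id, prog);
            if (func_proto)
                    return func_proto;

            /* helpers that need a meaningful 'current' */
            func_proto = cgroup_current_func_proto(func_id, prog);
            if (func_proto)
                    return func_proto;

            /* ... hook-specific helpers, then fall back to the base set ... */
            return bpf_base_func_proto(func_id);
    }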
Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220823222555.523590-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2bd1b5f8de9b..57e9e109257e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); + +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + +static inline const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( -- cgit v1.2.3 From bed89185af0de0d417e29ca1798df50f161b0231 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:52 -0700 Subject: bpf: Use cgroup_{common,current}_func_proto in more hooks The following hooks are per-cgroup hooks but they are not using cgroup_{common,current}_func_proto, fix it: * BPF_PROG_TYPE_CGROUP_SKB (cg_skb) * BPF_PROG_TYPE_CGROUP_SOCK_ADDR (cg_sock_addr) * BPF_PROG_TYPE_CGROUP_SOCK (cg_sock) * BPF_PROG_TYPE_LSM+BPF_LSM_CGROUP Also: * move common func_proto's into cgroup func_proto handlers * make sure bpf_{g,s}et_retval are not accessible from recvmsg, getpeername and getsockname (return/errno is ignored in these places) * as a side effect, expose get_current_pid_tgid, get_current_comm_proto, get_current_ancestor_cgroup_id, get_cgroup_classid to more cgroup hooks Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39bd36359c1e..99fc7a64564f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2375,6 +2375,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; -- cgit v1.2.3 From e8bf17d58a4db4b4f38617925414097f12e0d509 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Mon, 22 Aug 2022 14:40:40 -0700 Subject: platform/chrome: cros_ec: Expose 
suspend_timeout_ms in debugfs In modern Chromebooks, the embedded controller has a mechanism where it will watch a hardware-controlled line that toggles in suspend, and wake the system up if an expected sleep transition didn't occur. This can be very useful for detecting power management issues where the system appears to suspend, but doesn't actually reach its lowest expected power states. Sometimes it's useful in debug and test scenarios to be able to control the duration of that timeout, or even disable the EC timeout mechanism altogether. Add a debugfs control to set the timeout to values other than the EC-defined default, for more convenient debug and development iteration. Signed-off-by: Evan Green Reviewed-by: Prashant Malani Reviewed-by: Stephen Boyd Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220822144026.v3.1.Idd188ff3f9caddebc17ac357a13005f93333c21f@changeid [tzungbi: fix one nit in Documentation/ABI/testing/debugfs-cros-ec.] Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 408b29ca4004..e43107e0bee1 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -169,6 +169,7 @@ struct cros_ec_device { int event_size; u32 host_event_wake_mask; u32 last_resume_result; + u16 suspend_timeout_ms; ktime_t last_event_time; struct notifier_block notifier_ready; -- cgit v1.2.3 From c205cc7534a97f2d6fbd2a23a94ed7c036c6e2aa Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 21 Aug 2022 13:18:58 +0800 Subject: net: skb: prevent the split of kfree_skb_reason() by gcc Sometimes, gcc will optimize the function by splitting it into two or more functions. In this case, kfree_skb_reason() is split into kfree_skb_reason and kfree_skb_reason.part.0. However, the function/tracepoint trace_kfree_skb() in it needs the return address of kfree_skb_reason(). This split makes the call chain become: kfree_skb_reason() -> kfree_skb_reason.part.0 -> trace_kfree_skb() which makes the return address passed to trace_kfree_skb() be kfree_skb(). Therefore, introduce '__fix_address', which is the combination of '__noclone' and 'noinline', and apply it to kfree_skb_reason() to prevent it from being split or inlined. (Is it better to simply apply '__noclone noinline' to kfree_skb_reason? I'm thinking maybe other functions have the same problem.) Meanwhile, wrap 'skb_unref()' with 'unlikely()', as the compiler thinks it is likely to return true and splits kfree_skb_reason(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/compiler_attributes.h | 7 +++++++ include/linux/skbuff.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..fc93c9488c76 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,11 @@ */ #define __weak __attribute__((__weak__)) +/* + * Used by functions that use '__builtin_return_address'. These function + * don't want to be splited or made inline, which can make + * the '__builtin_return_address' get unexpected address.
+ */ +#define __fix_address noinline __noclone + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..b51d07a727c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1195,7 +1195,8 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason -- cgit v1.2.3 From af67508ea6cbf0e4ea27f8120056fa2efce127dd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:56 -0700 Subject: net: Fix data-races around sysctl_fb_tunnels_only_for_init_net. While reading sysctl_fb_tunnels_only_for_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..6d3a33fd0cdb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -640,9 +640,14 @@ extern int sysctl_devconf_inherit_init_net; */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return !IS_ENABLED(CONFIG_SYSCTL) || - !sysctl_fb_tunnels_only_for_init_net || - (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1); +#if IS_ENABLED(CONFIG_SYSCTL) + int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); + + return !fb_tunnels_only_for_init_net || + (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); +#else + return true; +#endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) -- cgit v1.2.3 From a5612ca10d1aa05624ebe72633e0c8c792970833 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:57 -0700 Subject: net: Fix data-races around sysctl_devconf_inherit_init_net. While reading sysctl_devconf_inherit_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 856c395cfa63 ("net: introduce a knob to control whether to inherit devconf config") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6d3a33fd0cdb..05d6f3facd5a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -650,6 +650,15 @@ static inline bool net_has_fallback_tunnels(const struct net *net) #endif } +static inline int net_inherit_devconf(void) +{ +#if IS_ENABLED(CONFIG_SYSCTL) + return READ_ONCE(sysctl_devconf_inherit_init_net); +#else + return 0; +#endif +} + static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) -- cgit v1.2.3 From f78a03f6e28be0283f73d3c18b54837b638a8ccf Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:12 +0900 Subject: mm/slab_common: remove CONFIG_NUMA ifdefs for common kmalloc functions Now that slab_alloc_node() is available for SLAB when CONFIG_NUMA=n, remove CONFIG_NUMA ifdefs for common kmalloc functions. 
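Callers consequently need no CONFIG_NUMA guards of their own: the node-aware entry points always exist, and NUMA_NO_NODE preserves the non-NUMA behaviour (hypothetical caller, for illustration):

    struct foo *f = kmalloc_node(sizeof(*f), GFP_KERNEL, NUMA_NO_NODE);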
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..4754c834b0e3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -456,38 +456,18 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; -#else -static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __kmalloc(size, flags); -} - -static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) -{ - return kmem_cache_alloc(s, flags); -} -#endif #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) __assume_slab_alignment __alloc_size(3); -#ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) __assume_slab_alignment __alloc_size(4); -#else -static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, size_t size) -{ - return kmem_cache_alloc_trace(s, gfpflags, size); -} -#endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, @@ -701,20 +681,12 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t } -#ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) -#else /* CONFIG_NUMA */ - -#define kmalloc_node_track_caller(size, flags, node) \ - kmalloc_track_caller(size, flags) - -#endif /* CONFIG_NUMA */ - /* * Shortcuts */ -- cgit v1.2.3 From c45248db04f8e3aca4798d67a394fb9cc2168118 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:13 +0900 Subject: mm/slab_common: cleanup kmalloc_track_caller() Make kmalloc_track_caller() wrapper of kmalloc_node_track_caller(). 
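The resulting expansion is purely mechanical, mirroring the macro in the diff below (illustrative):

    kmalloc_track_caller(size, flags)
        /* now expands to */
    __kmalloc_node_track_caller(size, flags, NUMA_NO_NODE, _RET_IP_)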
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4754c834b0e3..a0e57df3d5a4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -651,6 +651,12 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag return kmalloc_array(n, size, flags | __GFP_ZERO); } +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +#define kmalloc_node_track_caller(size, flags, node) \ + __kmalloc_node_track_caller(size, flags, node, \ + _RET_IP_) + /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead @@ -659,9 +665,9 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, _RET_IP_) + __kmalloc_node_track_caller(size, flags, \ + NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, int node) @@ -680,13 +686,6 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } - -extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node_track_caller(size, flags, node, \ - _RET_IP_) - /* * Shortcuts */ -- cgit v1.2.3 From e4c98d68959e51646c379e157bad36ef0d7bf467 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:15 +0900 Subject: mm/slab_common: fold kmalloc_order_trace() into kmalloc_large() There is no caller of kmalloc_order_trace() except kmalloc_large(). Fold it into kmalloc_large() and remove kmalloc_order{,_trace}(). Also add tracepoint in kmalloc_large() that was previously in kmalloc_order_trace(). 
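A simplified sketch of the folded helper's shape (illustrative only; the real body lives in mm/slab_common.c and also handles accounting and tracing):

    void *kmalloc_large(size_t size, gfp_t flags)
    {
            unsigned int order = get_order(size);
            struct page *page = alloc_pages(flags | __GFP_COMP, order);

            /* the kmalloc tracepoint formerly emitted by
             * kmalloc_order_trace() now fires here (call elided) */
            return page ? page_address(page) : NULL;
    }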
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index a0e57df3d5a4..15a4c59da59e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -489,26 +489,8 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __alloc_size(1); - -#ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __alloc_size(1); -#else -static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, - unsigned int order) -{ - return kmalloc_order(size, flags, order); -} -#endif - -static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) -{ - unsigned int order = get_order(size); - return kmalloc_order_trace(size, flags, order); -} - +void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment + __alloc_size(1); /** * kmalloc - allocate memory * @size: how many bytes of memory are required. -- cgit v1.2.3 From a0c3b940023eef3fa005b2bc37d9312712331dcb Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:16 +0900 Subject: mm/slub: move kmalloc_large_node() to slab_common.c In later patch SLAB will also pass requests larger than order-1 page to page allocator. Move kmalloc_large_node() to slab_common.c. Fold kmalloc_large_node_hook() into kmalloc_large_node() as there is no other caller. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a4c59da59e..082499306098 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -491,6 +491,10 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment __alloc_size(1); + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment + __alloc_size(1); + /** * kmalloc - allocate memory * @size: how many bytes of memory are required. -- cgit v1.2.3 From bf37d791022ecfb1279ac88c5448a53f1ae40a59 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:17 +0900 Subject: mm/slab_common: kmalloc_node: pass large requests to page allocator Now that kmalloc_large_node() is in common code, pass large requests to page allocator in kmalloc_node() using kmalloc_large_node(). One problem is that currently there is no tracepoint in kmalloc_large_node(). Instead of simply putting tracepoint in it, use kmalloc_large_node{,_notrace} depending on its caller to show useful address for both inlined kmalloc_node() and __kmalloc_node_track_caller() when large objects are allocated. 
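The effect on a hypothetical constant-size caller: a request above KMALLOC_MAX_CACHE_SIZE (two pages with SLUB) now compiles straight to the page-allocator path,

    buf = kmalloc_node(4 * PAGE_SIZE, GFP_KERNEL, nid);	/* -> kmalloc_large_node() */

while variable sizes still go through __kmalloc_node(), which performs the equivalent check at run time.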
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 082499306098..fd2e129fc813 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -571,23 +571,35 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } +#ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { -#ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE) { - unsigned int i = kmalloc_index(size); + if (__builtin_constant_p(size)) { + unsigned int index; - if (!i) + if (size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + index = kmalloc_index(size); + + if (!index) return ZERO_SIZE_PTR; return kmem_cache_alloc_node_trace( - kmalloc_caches[kmalloc_type(flags)][i], + kmalloc_caches[kmalloc_type(flags)][index], flags, node, size); } -#endif return __kmalloc_node(size, flags, node); } +#else +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + return __kmalloc_node(size, flags, node); +} +#endif /** * kmalloc_array - allocate memory for an array. -- cgit v1.2.3 From d6a71648dbc0ca5520cba16a8fdce8d37ae74218 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:19 +0900 Subject: mm/slab: kmalloc: pass requests larger than order-1 page to page allocator There is not much benefit for serving large objects in kmalloc(). Let's pass large requests to page allocator like SLUB for better maintenance of common code. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index fd2e129fc813..4ee5b2fed164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,27 +243,17 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLAB /* - * The largest kmalloc size supported by the SLAB allocators is - * 32 megabyte (2^25) or the maximum allocatable page order if that is - * less than 32 MB. - * - * WARNING: Its not easy to increase this value since the allocators have - * to do various tricks to work around compiler limitations in order to - * ensure proper constant folding. + * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ - (MAX_ORDER + PAGE_SHIFT - 1) : 25) -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif #endif #ifdef CONFIG_SLUB -/* - * SLUB directly allocates requests fitting in to an order-1 page - * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
- */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW @@ -415,10 +405,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size, if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; - if (size <= 4 * 1024 * 1024) return 22; - if (size <= 8 * 1024 * 1024) return 23; - if (size <= 16 * 1024 * 1024) return 24; - if (size <= 32 * 1024 * 1024) return 25; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); @@ -428,6 +414,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, /* Will never be reached. Needed because the compiler may complain */ return -1; } +static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ -- cgit v1.2.3 From ebc97a52b5d6cd5fb0c15a3fc9cdd6eb924646a1 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 23 Aug 2022 00:46:36 +0000 Subject: mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses. We keep track of several kernel memory stats (total kernel memory, page tables, stack, vmalloc, etc) on multiple levels (global, per-node, per-memcg, etc). These stats give users insight into how much memory is used by the kernel and for what purposes. Currently, memory used by the KVM mmu is not accounted in any of those kernel memory stats. This patch series accounts the memory pages used by KVM for page tables in those stats in a new NR_SECONDARY_PAGETABLE stat. This stat can later be extended to account for other types of secondary page tables (e.g. iommu page tables). KVM has a decent number of large allocations that aren't for page tables, but for most of them, the number/size of those allocations scales linearly with either the number of vCPUs or the amount of memory assigned to the VM. KVM's secondary page table allocations do not scale linearly, especially when nested virtualization is in use. From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's per-VM pages_{4k,2m,1g} stats unless the guest is doing something bizarre (e.g. accessing only 4KB chunks of 2MB pages so that KVM is forced to allocate a large number of page tables even though the guest isn't accessing that much memory). However, someone would need to either understand how KVM works to make that connection, or know (or be told) to go look at KVM's stats if they're running VMs to better decipher the stats. Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE is informative. For example, when backing a VM with THP vs. HugeTLB, NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order of magnitude higher with THP. So having this stat will at the very least prove to be useful for understanding tradeoffs between VM backing types, and likely even steer folks towards potential optimizations. The original discussion with more details about the rationale: https://lore.kernel.org/all/87ilqoi77b.wl-maz@kernel.org This stat will be used by subsequent patches to count KVM mmu memory usage.
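Accounting at an allocation site would then look roughly like this (sketch of what the follow-up KVM patches do, not part of this hunk; 'page' stands for a just-allocated secondary page-table page):

    /* charge one secondary page-table page to the node/memcg stats */
    mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, 1);

    /* ... and uncharge it again when the page is freed */
    mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -1);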
Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20220823004639.2387269-2-yosryahmed@google.com Signed-off-by: Sean Christopherson --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e24b40c52468..355d842d2731 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -216,6 +216,7 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif -- cgit v1.2.3 From 9d9d00ac29d0ef7ce426964de46fa6b380357d0a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:31:25 +0200 Subject: bpf: Fix reference state management for synchronous callbacks Currently, verifier verifies callback functions (sync and async) as if they will be executed once, (i.e. it explores execution state as if the function was being called once). The next insn to explore is set to start of subprog and the exit from nested frame is handled using curframe > 0 and prepare_func_exit. In case of async callback it uses a customized variant of push_stack simulating a kind of branch to set up custom state and execution context for the async callback. While this approach is simple and works when callback really will be executed only once, it is unsafe for all of our current helpers which are for_each style, i.e. they execute the callback multiple times. A callback releasing acquired references of the caller may do so multiple times, but currently verifier sees it as one call inside the frame, which then returns to caller. Hence, it thinks it released some reference that the cb e.g. got access through callback_ctx (register filled inside cb from spilled typed register on stack). Similarly, it may see that an acquire call is unpaired inside the callback, so the caller will copy the reference state of callback and then will have to release the register with new ref_obj_ids. But again, the callback may execute multiple times, but the verifier will only account for acquired references for a single symbolic execution of the callback, which will cause leaks. Note that for async callback case, things are different. While currently we have bpf_timer_set_callback which only executes it once, even for multiple executions it would be safe, as reference state is NULL and check_reference_leak would force program to release state before BPF_EXIT. The state is also unaffected by analysis for the caller frame. Hence async callback is safe. Since we want the reference state to be accessible, e.g. for pointers loaded from stack through callback_ctx's PTR_TO_STACK, we still have to copy caller's reference_state to callback's bpf_func_state, but we enforce that whatever references it adds to that reference_state has been released before it hits BPF_EXIT. This requires introducing a new callback_ref member in the reference state to distinguish between caller vs callee references. Hence, check_reference_leak now errors out if it sees we are in callback_fn and we have not released callback_ref refs. Since there can be multiple nested callbacks, like frame 0 -> cb1 -> cb2 etc. we need to also distinguish between whether this particular ref belongs to this callback frame or parent, and only error for our own, so we store state->frameno (which is always non-zero for callbacks). 
In short, callbacks can read parent reference_state, but cannot mutate it, to be able to use pointers acquired by the caller. They must only undo their changes (by releasing their own acquired_refs before BPF_EXIT) on top of caller reference_state before returning (at which point the caller and callback state will match anyway, so no need to copy it back to caller). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013125.24938-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bad8640dc..1fdddbf3546b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,17 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. + */ + int callback_ref; }; /* state of the program: -- cgit v1.2.3 From ea5cba269fb1fe22b84f4d01bb3d56320e6ffa3e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Aug 2022 11:26:23 +0200 Subject: wifi: cfg80211/mac80211: check EHT capability size correctly For AP/non-AP the EHT MCS/NSS subfield size differs, the 4-octet subfield is only used for 20 MHz-only non-AP STA. Pass an argument around everywhere to be able to parse it properly. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..6f70394417ac 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2886,7 +2886,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ static inline u8 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, - const struct ieee80211_eht_cap_elem_fixed *eht_cap) + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) { u8 count = 0; @@ -2907,7 +2908,10 @@ ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) count += 3; - return count ? count : 4; + if (count) + return count; + + return from_ap ? 
3 : 4; } /* 802.11be EHT PPE Thresholds */ @@ -2943,7 +2947,8 @@ ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) } static inline bool -ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) { const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); @@ -2952,7 +2957,8 @@ ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) return false; needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, - (const void *)data); + (const void *)data, + from_ap); if (len < needed) return false; -- cgit v1.2.3 From d8c04e27d93eb0afd750d5ea60119a92b146a755 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:31 +0300 Subject: platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask On Intel hardware the SLP_TYPx bitfield occupies bits 10-12 as per ACPI specification (see Table 4.13 "PM1 Control Registers Fixed Hardware Feature Control Bits" for the details). Fix the mask and other related definitions accordingly. Fixes: 93e5eadd1f6e ("x86/platform: New Intel Atom SOC power management controller driver") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 3edfb6d4e67a..dd81f510e4cf 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -7,6 +7,8 @@ #ifndef PMC_ATOM_H #define PMC_ATOM_H +#include + /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C /* CherryTrail Power Control Unit PCI Device ID */ @@ -139,9 +141,9 @@ #define ACPI_MMIO_REG_LEN 0x100 #define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_MASK GENMASK(12, 10) #define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 +#define SLEEP_ENABLE BIT(13) extern int pmc_atom_read(int offset, u32 *value); -- cgit v1.2.3 From 5a88ace44d0f99e2bac55d827f4cd6e8bc07e00b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:34 +0300 Subject: platform/x86: pmc_atom: Amend comment style and grammar The style of the comments is not uniform, make it so and fix a few grammar issues. While at it, update Copyright years. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-4-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index dd81f510e4cf..b8a701c77fd0 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Intel Atom SOC Power Management Controller Header File - * Copyright (c) 2014, Intel Corporation. + * Intel Atom SoC Power Management Controller Header File + * Copyright (c) 2014-2015,2022 Intel Corporation. */ #ifndef PMC_ATOM_H -- cgit v1.2.3 From 01ef026ab36357a818c7d8324a36dbb8beff6ff5 Mon Sep 17 00:00:00 2001 From: "Luke D. 
Jones" Date: Sat, 13 Aug 2022 21:26:24 +1200 Subject: platform/x86: asus-wmi: Support the hardware GPU MUX on some laptops Support the hardware GPU MUX switch available on some models. This switch can toggle the MUX between: - 0, Dedicated mode - 1, Optimus mode Optimus mode is the regular iGPU + dGPU available, while dedicated mode switches the system to have only the dGPU available. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220813092624.6228-1-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 98f2b2f20f3e..70d2347bf6ca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -99,6 +99,9 @@ /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 +/* gpu mux switch, 0 = dGPU, 1 = Optimus */ +#define ASUS_WMI_DEVID_GPU_MUX 0x00090016 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From e397c3c460bf3849384f2f55516d1887617cfca9 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 13 Aug 2022 21:27:53 +1200 Subject: platform/x86: asus-wmi: Add support for ROG X13 tablet mode Add quirk for ASUS ROG X13 Flow 2-in-1 to enable tablet mode with lid flip (all screen rotations). Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220813092753.6635-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 70d2347bf6ca..6e8a95c10d17 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -65,6 +65,7 @@ #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 +#define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 /* Storage */ #define ASUS_WMI_DEVID_CARDREADER 0x00080013 -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - process only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data bpf program intends to send to user space, the number of cgroups that can be walked is limited. 
For example, given that the current buffer size is 8 * PAGE_SIZE, if the program sends 64B of data for each cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. To work around this, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In the future, we may extend bpf_iter flags to allow customizing the buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, -- cgit v1.2.3 From 40bfe7a86d84cf08ac6a8fe2f0c8bf7a43edd110 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 24 Aug 2022 17:32:56 +0200 Subject: of/device: Fix up of_dma_configure_id() stub Since the stub version of of_dma_configure_id() was added in commit a081bd4af4ce ("of/device: Add input id to of_dma_configure()"), it has not matched the signature of the full function, leading to build failure reports when code using this function is built on !OF configurations. Fixes: a081bd4af4ce ("of/device: Add input id to of_dma_configure()") Cc: stable@vger.kernel.org Signed-off-by: Thierry Reding Reviewed-by: Frank Rowand Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220824153256.1437483-1-thierry.reding@gmail.com Signed-off-by: Rob Herring --- include/linux/of_device.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 1d7992a02e36..1a803e4335d3 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -101,8 +101,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } static inline int of_dma_configure_id(struct device *dev, - struct device_node *np, - bool force_dma) + struct device_node *np, + bool force_dma, + const u32 *id) { return 0; } -- cgit v1.2.3 From b9030780971b56c0c455c3b66244efd96608846d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:32:43 +0200 Subject: netdev: Use try_cmpxchg in napi_if_scheduled_mark_missed Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in napi_if_scheduled_mark_missed. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
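A condensed before/after of the loop (the early returns of the real function are omitted here; see the full diff below):

    /* before: reload and compare by hand on every iteration */
    do {
            val = READ_ONCE(n->state);
            new = val | NAPIF_STATE_MISSED;
    } while (cmpxchg(&n->state, val, new) != val);

    /* after: try_cmpxchg() refreshes 'val' itself on failure */
    val = READ_ONCE(n->state);
    do {
            new = val | NAPIF_STATE_MISSED;
    } while (!try_cmpxchg(&n->state, &val, new));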
Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20220822143243.2798-1-ubizjak@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64e8662632f8..8839abc1f941 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -546,8 +546,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; + val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); if (val & NAPIF_STATE_DISABLE) return true; @@ -555,7 +555,7 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) return false; new = val | NAPIF_STATE_MISSED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); return true; } -- cgit v1.2.3 From e00923c59e68b63c998a0fef4145b5279331968a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 9 Jun 2022 12:33:13 +0900 Subject: ata: libata: Rename ATA_DFLAG_NCQ_PRIO_ENABLE Rename ATA_DFLAG_NCQ_PRIO_ENABLE to ATA_DFLAG_NCQ_PRIO_ENABLED to match the fact that this flag indicates whether NCQ priority use is enabled by the user. Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 0269ff114f5a..a505cfb92ab3 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -101,7 +101,7 @@ enum { ATA_DFLAG_UNLOCK_HPA = (1 << 18), /* unlock HPA */ ATA_DFLAG_NCQ_SEND_RECV = (1 << 19), /* device supports NCQ SEND and RECV */ ATA_DFLAG_NCQ_PRIO = (1 << 20), /* device supports NCQ priority */ - ATA_DFLAG_NCQ_PRIO_ENABLE = (1 << 21), /* Priority cmds sent to dev */ + ATA_DFLAG_NCQ_PRIO_ENABLED = (1 << 21), /* Priority cmds sent to dev */ ATA_DFLAG_INIT_MASK = (1 << 24) - 1, ATA_DFLAG_DETACH = (1 << 24), -- cgit v1.2.3 From 12ff4c803d23218f9afea9f82019a5c9619f6744 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 26 Aug 2022 12:42:10 +1200 Subject: platform/x86: asus-wmi: Support the GPU fan on TUF laptops Add support for TUF laptops which have the ability to control the GPU fan. This will show as a second fan in hwmon, and has the ability to run as boost (full speed) or auto. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220826004210.356534-3-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 6e8a95c10d17..31d810e6569d 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -79,6 +79,7 @@ #define ASUS_WMI_DEVID_THERMAL_CTRL 0x00110011 #define ASUS_WMI_DEVID_FAN_CTRL 0x00110012 /* deprecated */ #define ASUS_WMI_DEVID_CPU_FAN_CTRL 0x00110013 +#define ASUS_WMI_DEVID_GPU_FAN_CTRL 0x00110014 #define ASUS_WMI_DEVID_CPU_FAN_CURVE 0x00110024 #define ASUS_WMI_DEVID_GPU_FAN_CURVE 0x00110025 -- cgit v1.2.3 From e305a71cea37a64c7558b8b979f6f08f657d0c3d Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 26 Aug 2022 11:22:50 +1200 Subject: platform/x86: asus-wmi: Implement TUF laptop keyboard LED modes Adds support for changing the laptop keyboard LED mode and colour.
The modes are visible effects such as static, rainbow, pulsing, and colour cycles. These sysfs attributes are added to asus::kbd_backlight: - kbd_rgb_mode - kbd_rgb_mode_index Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220825232251.345893-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 31d810e6569d..09e5477f1aea 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -104,6 +104,9 @@ /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 +/* TUF laptop RGB modes/colours */ +#define ASUS_WMI_DEVID_TUF_RGB_MODE 0x00100056 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 61f64515299e5f4885093656087ec1c0df8109c5 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Fri, 26 Aug 2022 11:22:51 +1200 Subject: platform/x86: asus-wmi: Implement TUF laptop keyboard power states Adds support for setting various power states of TUF keyboards. These states are combinations of: - boot, set if a boot animation is shown on keyboard - awake, set if the keyboard LEDs are visible while laptop is on - sleep, set if an animation is displayed while the laptop is suspended - keyboard (unknown effect) Adds two sysfs attributes to asus::kbd_backlight: - kbd_rgb_state - kbd_rgb_state_index Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20220825232251.345893-3-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 09e5477f1aea..28234dc9fa6a 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -107,6 +107,9 @@ /* TUF laptop RGB modes/colours */ #define ASUS_WMI_DEVID_TUF_RGB_MODE 0x00100056 +/* TUF laptop RGB power/state */ +#define ASUS_WMI_DEVID_TUF_RGB_STATE 0x00100057 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 2a5840124009f133bd09fd855963551fb2cefe22 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 15 Jul 2022 12:16:22 -0700 Subject: lsm,io_uring: add LSM hooks for the new uring_cmd file op io-uring cmd support was added through ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd"); this extended the struct file_operations to allow a new command which each subsystem can use to enable command passthrough. Add an LSM hook specific to the command passthrough, which enables LSMs to inspect the command details. This was discussed long ago without reaching a clear conclusion, so this enables LSMs to at least reject this new file operation.
[0] https://lkml.kernel.org/r/8adf55db-7bab-f59d-d612-ed906b948d19@schaufler-ca.com Cc: stable@vger.kernel.org Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Signed-off-by: Luis Chamberlain Acked-by: Jens Axboe Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 3 +++ include/linux/security.h | 5 +++++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..60fff133c0b1 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -407,4 +407,5 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #ifdef CONFIG_IO_URING LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) +LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 84a0d7e02176..3aa6030302f5 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1582,6 +1582,9 @@ * Check whether the current task is allowed to spawn a io_uring polling * thread (IORING_SETUP_SQPOLL). * + * @uring_cmd: + * Check whether the file_operations uring_cmd is allowed to run. + * */ union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..7bd0c490703d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2060,6 +2060,7 @@ static inline int security_perf_event_write(struct perf_event *event) #ifdef CONFIG_SECURITY extern int security_uring_override_creds(const struct cred *new); extern int security_uring_sqpoll(void); +extern int security_uring_cmd(struct io_uring_cmd *ioucmd); #else static inline int security_uring_override_creds(const struct cred *new) { @@ -2069,6 +2070,10 @@ static inline int security_uring_sqpoll(void) { return 0; } +static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_IO_URING */ -- cgit v1.2.3 From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 26 Aug 2022 09:17:08 -0400 Subject: wait_on_bit: add an acquire memory barrier There are several places in the kernel where wait_on_bit is not followed by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read). On architectures with weak memory ordering, it may happen that memory accesses that follow wait_on_bit are reordered before wait_on_bit and they may return invalid data. Fix this class of bugs by introducing a new function "test_bit_acquire" that works like test_bit, but has acquire memory ordering semantics. 
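A typical use mirrors the buffer_uptodate() conversion below (sketch; loads issued after the test cannot be reordered before it):

    /* pairs with smp_mb__before_atomic() + set_bit() on the writer side */
    if (test_bit_acquire(BH_Uptodate, &bh->b_state))
            memcpy(dst, bh->b_data, len);	/* sees the fully published data */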
Signed-off-by: Mikulas Patocka Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 1 + include/linux/buffer_head.h | 2 +- include/linux/wait_bit.h | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf9bf65039f2..3b89c64bcfd8 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w); #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) +#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index def8b8d30ccc..089c9ade4325 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh) * make it consistent with folio_test_uptodate * pairs with smp_mb__before_atomic in set_buffer_uptodate */ - return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0; + return test_bit_acquire(BH_Uptodate, &bh->b_state); } #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 7dec36aecbd9..7725b7579b78 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -71,7 +71,7 @@ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, @@ -96,7 +96,7 @@ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, @@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, @@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } -- cgit v1.2.3 From fa7e439cf90ba23ea473d0b7d85efd02ae6ccf94 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 26 Aug 2022 18:52:37 +0200 Subject: cgroup: Homogenize cgroup_get_from_id() return value A cgroup id is a user-provided datum, hence extend the function's return domain to include possible error reasons (similar to cgroup_get_from_fd()). This change also fixes commit d4ccaf58a847 ("bpf: Introduce cgroup iter"), which would use NULL instead of proper error handling. Additionally, none of fc_appid_store, bpf_iter_attach_cgroup, and mem_cgroup_get_from_ino (callers of cgroup_get_from_id) is built without CONFIG_CGROUPS (they depend on it via CONFIG_BLK_CGROUP, directly, and via CONFIG_MEMCG, respectively), so drop the stub definition that is not needed with !CONFIG_CGROUPS.
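Callers accordingly switch from NULL checks to ERR_PTR handling (illustrative sketch):

    struct cgroup *cgrp = cgroup_get_from_id(id);

    if (IS_ERR(cgrp))
            return PTR_ERR(cgrp);	/* e.g. -ENOENT for a stale id */
    /* ... use cgrp ... */
    cgroup_put(cgrp);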
Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4d143729b246..30ee8ce2665e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -750,11 +750,6 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} - -static inline struct cgroup *cgroup_get_from_id(u64 id) -{ - return NULL; -} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS -- cgit v1.2.3 From 93315e46b000fc80fff5d53c3f444417fb3df6de Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:00 +0530 Subject: perf/core: Add speculation info to branch entries Add a new "spec" bitfield to branch entries for providing speculation information. This will be populated using hints provided by branch sampling features on supported hardware. The following cases are covered: * No branch speculation information is available * Branch is speculative but taken on the wrong path * Branch is non-speculative but taken on the correct path * Branch is speculative and taken on the correct path Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/834088c302faf21c7b665031dd111f424e509a64.1660211399.git.sandipan.das@amd.com --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee8b9ecdc03b..ae30c61957d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1078,6 +1078,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } -- cgit v1.2.3 From d2a4cbcb8bdc0e3d1cf85bf47a670695da0bc27a Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Fri, 12 Aug 2022 16:52:26 +0000 Subject: units: complement the set of Hz units Currently, Hz units do not have milli, micro and nano Hz coefficients. Some drivers (IIO especially) use their analogues to calculate appropriate Hz values. This patch includes them to units.h definitions, so they can be used from different kernel places. Signed-off-by: Dmitry Rokosov Link: https://lore.kernel.org/r/20220812165243.22177-3-ddrokosov@sberdevices.ru Signed-off-by: Jonathan Cameron --- include/linux/units.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index 681fc652e3d7..2793a41e73a2 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -20,6 +20,9 @@ #define PICO 1000000000000ULL #define FEMTO 1000000000000000ULL +#define NANOHZ_PER_HZ 1000000000UL +#define MICROHZ_PER_HZ 1000000UL +#define MILLIHZ_PER_HZ 1000UL #define HZ_PER_KHZ 1000UL #define KHZ_PER_MHZ 1000UL #define HZ_PER_MHZ 1000000UL -- cgit v1.2.3 From 1f5d7ea73c4b630dbb2c90818cb9fc0be54d2fe3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 22 Aug 2022 17:49:23 +0000 Subject: lib/string_helpers: Add str_read_write() helper Add str_read_write() helper to return 'read' or 'write' string literal. 
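A typical call site simply replaces an open-coded ternary; a sketch (dev, is_read and the size_t len are hypothetical):

  dev_dbg(dev, "%s of %zu bytes failed\n", str_read_write(is_read), len);

That is, the helper stands in for is_read ? "read" : "write".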
Signed-off-by: Andy Shevchenko Signed-off-by: Dmitry Rokosov Link: https://lore.kernel.org/r/20220822175011.2886-2-ddrokosov@sberdevices.ru Signed-off-by: Jonathan Cameron --- include/linux/string_helpers.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 4d72258d42fd..9e22cd78f3b8 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -126,4 +126,9 @@ static inline const char *str_enabled_disabled(bool v) return v ? "enabled" : "disabled"; } +static inline const char *str_read_write(bool v) +{ + return v ? "read" : "write"; +} + #endif -- cgit v1.2.3 From fcab34b433e2c13e333b2f53c4a8409eadc432c7 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 10 Aug 2022 10:53:59 -0600 Subject: mm: re-allow pinning of zero pfns (again) The below referenced commit makes the same error as 1c563432588d ("mm: fix is_pinnable_page against a cma page"), re-interpreting the logic to exclude pinning of the zero page, which breaks device assignment with vfio. To avoid further subtle mistakes, split the logic into discrete tests. [akpm@linux-foundation.org: simplify comment, per John] Link: https://lkml.kernel.org/r/166015037385.760108.16881097713975517242.stgit@omen Link: https://lore.kernel.org/all/165490039431.944052.12458624139225785964.stgit@omen Fixes: f25cbb7a95a2 ("mm: add zone device coherent type memory support") Signed-off-by: Alex Williamson Suggested-by: Matthew Wilcox Suggested-by: Felix Kuehling Tested-by: Slawomir Laba Reviewed-by: John Hubbard Cc: Alex Sierra Cc: Christoph Hellwig Cc: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 982f2607180b..21f8b27bd9fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1544,9 +1544,16 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !(is_device_coherent_page(page) || - is_zone_movable_page(page) || - is_zero_pfn(page_to_pfn(page))); + /* The zero page may always be pinned */ + if (is_zero_pfn(page_to_pfn(page))) + return true; + + /* Coherent device memory must always allow eviction. */ + if (is_device_coherent_page(page)) + return false; + + /* Otherwise, non-movable zone pages can be pinned. */ + return !is_zone_movable_page(page); } #else static inline bool is_longterm_pinnable_page(struct page *page) -- cgit v1.2.3 From dbb16df6443c59e8a1ef21c2272fcf387d600ddf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 17 Aug 2022 17:21:39 +0000 Subject: Revert "memcg: cleanup racy sum avoidance code" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 96e51ccf1af33e82f429a0d6baebba29c6448d0f. Recently we started running the kernel with rstat infrastructure on production traffic and begin to see negative memcg stats values. Particularly the 'sock' stat is the one which we observed having negative value. $ grep "sock " /mnt/memory/job/memory.stat sock 253952 total_sock 18446744073708724224 Re-run after couple of seconds $ grep "sock " /mnt/memory/job/memory.stat sock 253952 total_sock 53248 For now we are only seeing this issue on large machines (256 CPUs) and only with 'sock' stat. I think the networking stack increase the stat on one cpu and decrease it on another cpu much more often. 
So, this negative sock is due to rstat flusher flushing the stats on the CPU that has seen the decrement of sock but missed the CPU that has increments. A typical race condition. For easy stable backport, revert is the most simple solution. For long term solution, I am thinking of two directions. First is just reduce the race window by optimizing the rstat flusher. Second is if the reader sees a negative stat value, force flush and restart the stat collection. Basically retry but limited. Link: https://lkml.kernel.org/r/20220817172139.3141101-1-shakeelb@google.com Fixes: 96e51ccf1af33e8 ("memcg: cleanup racy sum avoidance code") Signed-off-by: Shakeel Butt Cc: "Michal Koutný" Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: David Hildenbrand Cc: Yosry Ahmed Cc: Greg Thelen Cc: [5.15] Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4d31ce55b1c0..6257867fbf95 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -987,19 +987,30 @@ static inline void mod_memcg_page_state(struct page *page, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - return READ_ONCE(memcg->vmstats.state[idx]); + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return READ_ONCE(pn->lruvec_stats.state[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, -- cgit v1.2.3 From bc12b70f7d216b36bd87701349374a13e486f8eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Aug 2022 13:36:40 +0200 Subject: x86/earlyprintk: Clean up pciserial While working on a GRUB patch to support PCI-serial, a number of cleanups were suggested that apply to the code I took inspiration from. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Bjorn Helgaas # pci_ids.h Reviewed-by: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/YwdeyCEtW+wa+QhH@worktop.programming.kicks-ass.net --- include/linux/pci_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 6feade66efdb..41b3fffdbb8e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -75,6 +75,9 @@ #define PCI_CLASS_COMMUNICATION_MODEM 0x0703 #define PCI_CLASS_COMMUNICATION_OTHER 0x0780 +/* Interface for SERIAL/MODEM */ +#define PCI_SERIAL_16550_COMPATIBLE 0x02 + #define PCI_BASE_CLASS_SYSTEM 0x08 #define PCI_CLASS_SYSTEM_PIC 0x0800 #define PCI_CLASS_SYSTEM_PIC_IOAPIC 0x080010 -- cgit v1.2.3 From 9c5d03d362519f36cd551aec596388f895c93d2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Aug 2022 17:18:30 -0700 Subject: genetlink: start to validate reserved header bytes We had historically not checked that genlmsghdr.reserved is 0 on input which prevents us from using those precious bytes in the future. One use case would be to extend the cmd field, which is currently just 8 bits wide and 256 is not a lot of commands for some core families. 
To make sure that new families do the right thing by default put the onus of opting out of validation on existing families. Signed-off-by: Jakub Kicinski Acked-by: Paul Moore (NetLabel) Signed-off-by: David S. Miller --- include/linux/genl_magic_func.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 939b1a8f571b..4a4b387181ad 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -294,6 +294,7 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, + .resv_start_op = 42, /* drbd is currently the only user */ .n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps), .module = THIS_MODULE, }; -- cgit v1.2.3 From e8013f8edaa30e17fd583a326daff1fa86b75c82 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Thu, 25 Aug 2022 11:28:35 +0200 Subject: ethernet: Add helpers to recognize addresses mapped to IP multicast IP multicast must sometimes be discriminated from non-IP multicast, e.g. when determining the forwarding behavior of a given group in the presence of multicast router ports on an offloaded bridge. Therefore, provide helpers to identify these groups. Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 92b10e67d5f8..a541f0c4f146 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -428,6 +428,28 @@ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, return true; } +static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ip_mcast(const u8 *addr) +{ + return ether_addr_is_ipv4_mcast(addr) || + ether_addr_is_ipv6_mcast(addr); +} + /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3 From dcf8e5633e2e69ad60b730ab5905608b756a032f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 23 Aug 2022 12:59:25 -0700 Subject: tracing: Define the is_signed_type() macro once There are two definitions of the is_signed_type() macro: one in and a second definition in . As suggested by Linus, move the definition of the is_signed_type() macro into the header file. Change the definition of the is_signed_type() macro to make sure that it does not trigger any sparse warnings with future versions of sparse for bitwise types. 
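The macro's behavior can be pinned down with build-time assertions; an illustrative sketch (these asserts are not part of the patch and assume the usual static_assert() from <linux/build_bug.h>):

  static_assert(is_signed_type(int));
  static_assert(!is_signed_type(unsigned long));
  static_assert(!is_signed_type(bool));           /* (bool)-1 is 1, not < 1 */
  static_assert(!is_signed_type(char *));         /* pointers compare as unsigned */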
Link: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/ Link: https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/ Cc: Rasmus Villemoes Cc: Steven Rostedt Acked-by: Kees Cook Signed-off-by: Bart Van Assche Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 6 ++++++ include/linux/overflow.h | 1 - include/linux/trace_events.h | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 01ce94b58b42..7713d7bcdaea 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * Whether 'type' is a signed type or an unsigned type. Supports scalar types, + * bool and also pointer types. + */ +#define is_signed_type(type) (((type)(-1)) < (__force type)1) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index f1221d11f8e5..0eb3b192f07a 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -30,7 +30,6 @@ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ -#define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index b18759a673c6..8401dec93c15 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); -#define is_signed_type(type) (((type)(-1)) < (type)1) - int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, -- cgit v1.2.3 From ff6d365898d4d31bd557954c7fc53f38977b491c Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 22 Aug 2022 08:34:35 -0700 Subject: soc: qcom: qmi: use const for struct qmi_elem_info Currently all usage of struct qmi_elem_info, which is used to define the QMI message encoding/decoding rules, does not use const. This prevents clients from registering const arrays. Since these arrays are always pre-defined, they should be const, so add the const qualifier to all places in the QMI interface where struct qmi_elem_info is used. Once this patch is in place, clients can independently update their pre-defined arrays to be const, as demonstrated in the QMI sample code. 
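With const propagated through the API, a client can keep its encoding rules in read-only data; a minimal hypothetical message description (struct my_resp_msg is made up for illustration):

  static const struct qmi_elem_info my_resp_ei[] = {
          {
                  .data_type = QMI_STRUCT,
                  .elem_len  = 1,
                  .elem_size = sizeof(struct qmi_response_type_v01),
                  .tlv_type  = 0x02,
                  .offset    = offsetof(struct my_resp_msg, resp),
                  .ei_array  = qmi_response_type_v01_ei,
          },
          { .data_type = QMI_EOTI },
  };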
Signed-off-by: Jeff Johnson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220822153435.7856-1-quic_jjohnson@quicinc.com --- include/linux/soc/qcom/qmi.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h index b1f80e756d2a..469e02d2aa0d 100644 --- a/include/linux/soc/qcom/qmi.h +++ b/include/linux/soc/qcom/qmi.h @@ -75,7 +75,7 @@ struct qmi_elem_info { enum qmi_array_type array_type; u8 tlv_type; u32 offset; - struct qmi_elem_info *ei_array; + const struct qmi_elem_info *ei_array; }; #define QMI_RESULT_SUCCESS_V01 0 @@ -102,7 +102,7 @@ struct qmi_response_type_v01 { u16 error; }; -extern struct qmi_elem_info qmi_response_type_v01_ei[]; +extern const struct qmi_elem_info qmi_response_type_v01_ei[]; /** * struct qmi_service - context to track lookup-results @@ -173,7 +173,7 @@ struct qmi_txn { struct completion completion; int result; - struct qmi_elem_info *ei; + const struct qmi_elem_info *ei; void *dest; }; @@ -189,7 +189,7 @@ struct qmi_msg_handler { unsigned int type; unsigned int msg_id; - struct qmi_elem_info *ei; + const struct qmi_elem_info *ei; size_t decoded_size; void (*fn)(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, @@ -249,23 +249,23 @@ void qmi_handle_release(struct qmi_handle *qmi); ssize_t qmi_send_request(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, struct qmi_txn *txn, int msg_id, size_t len, - struct qmi_elem_info *ei, const void *c_struct); + const struct qmi_elem_info *ei, const void *c_struct); ssize_t qmi_send_response(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, struct qmi_txn *txn, int msg_id, size_t len, - struct qmi_elem_info *ei, const void *c_struct); + const struct qmi_elem_info *ei, const void *c_struct); ssize_t qmi_send_indication(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, - int msg_id, size_t len, struct qmi_elem_info *ei, + int msg_id, size_t len, const struct qmi_elem_info *ei, const void *c_struct); void *qmi_encode_message(int type, unsigned int msg_id, size_t *len, - unsigned int txn_id, struct qmi_elem_info *ei, + unsigned int txn_id, const struct qmi_elem_info *ei, const void *c_struct); int qmi_decode_message(const void *buf, size_t len, - struct qmi_elem_info *ei, void *c_struct); + const struct qmi_elem_info *ei, void *c_struct); int qmi_txn_init(struct qmi_handle *qmi, struct qmi_txn *txn, - struct qmi_elem_info *ei, void *c_struct); + const struct qmi_elem_info *ei, void *c_struct); int qmi_txn_wait(struct qmi_txn *txn, unsigned long timeout); void qmi_txn_cancel(struct qmi_txn *txn); -- cgit v1.2.3 From c13d7d261e361dbaf5adbdc216ee4a1204c48001 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 25 Aug 2022 10:08:56 +0530 Subject: soc: qcom: llcc: Pass LLCC version based register offsets to EDAC driver The LLCC EDAC register offsets varies between each SoCs. Until now, the EDAC driver used the hardcoded register offsets. But this caused crash on SM8450 SoC where the register offsets has been changed. So to avoid this crash and also to make it easy to accommodate changes for new SoCs, let's pass the LLCC version specific register offsets to the EDAC driver. Currently, two set of offsets are used. One is starting from LLCC version v1.0.0 used by all SoCs other than SM8450. For SM8450, LLCC version starting from v2.1.0 is used. 
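The EDAC driver then indexes registers through the version-specific table instead of compile-time constants; a rough sketch of the access pattern (simplified, not the literal qcom_edac code):

  const struct llcc_edac_reg_offset *off = drv->edac_reg_offset;
  u32 status;
  int ret;

  ret = regmap_read(drv->bcast_regmap, off->trp_ecc_error_status0, &status);
  if (ret)
          return ret;
  /* ... decode the TRP ECC status bits ... */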
Signed-off-by: Manivannan Sadhasivam Reviewed-by: Sai Prakash Ranjan Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220825043859.30066-3-manivannan.sadhasivam@linaro.org --- include/linux/soc/qcom/llcc-qcom.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 9ed5384c5ca1..bc2fb8343a94 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -78,11 +78,40 @@ struct llcc_edac_reg_data { u8 ways_shift; }; +struct llcc_edac_reg_offset { + /* LLCC TRP registers */ + u32 trp_ecc_error_status0; + u32 trp_ecc_error_status1; + u32 trp_ecc_sb_err_syn0; + u32 trp_ecc_db_err_syn0; + u32 trp_ecc_error_cntr_clear; + u32 trp_interrupt_0_status; + u32 trp_interrupt_0_clear; + u32 trp_interrupt_0_enable; + + /* LLCC Common registers */ + u32 cmn_status0; + u32 cmn_interrupt_0_enable; + u32 cmn_interrupt_2_enable; + + /* LLCC DRP registers */ + u32 drp_ecc_error_cfg; + u32 drp_ecc_error_cntr_clear; + u32 drp_interrupt_status; + u32 drp_interrupt_clear; + u32 drp_interrupt_enable; + u32 drp_ecc_error_status0; + u32 drp_ecc_error_status1; + u32 drp_ecc_sb_err_syn0; + u32 drp_ecc_db_err_syn0; +}; + /** * struct llcc_drv_data - Data associated with the llcc driver * @regmap: regmap associated with the llcc device * @bcast_regmap: regmap associated with llcc broadcast offset * @cfg: pointer to the data structure for slice configuration + * @edac_reg_offset: Offset of the LLCC EDAC registers * @lock: mutex associated with each slice * @cfg_size: size of the config data table * @max_slices: max slices as read from device tree @@ -96,6 +125,7 @@ struct llcc_drv_data { struct regmap *regmap; struct regmap *bcast_regmap; const struct llcc_slice_config *cfg; + const struct llcc_edac_reg_offset *edac_reg_offset; struct mutex lock; u32 cfg_size; u32 max_slices; -- cgit v1.2.3 From c60561014257699d81ab392eb6cb6389ad8907df Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 12:50:03 +0800 Subject: soundwire: bus: allow device number to be unique at system level The SoundWire specification allows the device number to be allocated at will. When a system includes multiple SoundWire links, the device number scope is limited to the link to which the device is attached. However, for integration/debug it can be convenient to have a unique device number across the system. This patch adds a 'dev_num_ida_min' field at the bus level, which when set will be used to allocate an IDA. The allocation happens when a hardware device reports as ATTACHED. If any error happens during the enumeration, the allocated IDA is not freed - the device number will be reused if/when the device re-joins the bus. The IDA is only freed when the Linux device is unregistered. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823045004.2670658-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 39058c841469..a2b31d25ea27 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -889,6 +889,9 @@ struct sdw_master_ops { * meaningful if multi_link is set. 
If set to 1, hardware-based * synchronization will be used even if a stream only uses a single * SoundWire segment. + * @dev_num_ida_min: if set, defines the minimum values for the IDA + * used to allocate system-unique device numbers. This value needs to be + * identical across all SoundWire bus in the system. */ struct sdw_bus { struct device *dev; @@ -913,6 +916,7 @@ struct sdw_bus { u32 bank_switch_timeout; bool multi_link; int hw_sync_min_links; + int dev_num_ida_min; }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, -- cgit v1.2.3 From c5b81449f915a28bb9c7725e53aebab3ba39b4a2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:07 +0200 Subject: perf/hw_breakpoint: Provide hw_breakpoint_is_used() and use in test Provide hw_breakpoint_is_used() to check if breakpoints are in use on the system. Use it in the KUnit test to verify the global state before and after a test case. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-3-elver@google.com --- include/linux/hw_breakpoint.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 78dd7035d1e5..a3fb846705eb 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -74,6 +74,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); +extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -121,6 +122,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } +static inline bool hw_breakpoint_is_used(void) { return false; } + static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } -- cgit v1.2.3 From 0370dc314df35579b751d1b77c9169f071444962 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:09 +0200 Subject: perf/hw_breakpoint: Optimize list of per-task breakpoints On a machine with 256 CPUs, running the recently added perf breakpoint benchmark results in: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 236.418 [sec] | | 123134.794271 usecs/op | 7880626.833333 usecs/op/cpu The benchmark tests inherited breakpoint perf events across many threads. Looking at a perf profile, we can see that the majority of the time is spent in various hw_breakpoint.c functions, which execute within the 'nr_bp_mutex' critical sections which then results in contention on that mutex as well: 37.27% [kernel] [k] osq_lock 34.92% [kernel] [k] mutex_spin_on_owner 12.15% [kernel] [k] toggle_bp_slot 11.90% [kernel] [k] __reserve_bp_slot The culprit here is task_bp_pinned(), which has a runtime complexity of O(#tasks) due to storing all task breakpoints in the same list and iterating through that list looking for a matching task. Clearly, this does not scale to thousands of tasks. 
Instead, make use of the "rhashtable" variant "rhltable" which stores multiple items with the same key in a list. This results in average runtime complexity of O(1) for task_bp_pinned(). With the optimization, the benchmark shows: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.208 [sec] | | 108.422396 usecs/op | 6939.033333 usecs/op/cpu On this particular setup that's a speedup of ~1135x. While one option would be to make task_struct a breakpoint list node, this would only further bloat task_struct for infrequently used data. Furthermore, after all optimizations in this series, there's no evidence it would result in better performance: later optimizations make the time spent looking up entries in the hash table negligible (we'll reach the theoretical ideal performance i.e. no constraints). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com --- include/linux/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae30c61957d2..1999408a9cbb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks { }; #ifdef CONFIG_HAVE_HW_BREAKPOINT +#include #include #endif @@ -178,7 +179,7 @@ struct hw_perf_event { * creation and event initalization. */ struct arch_hw_breakpoint info; - struct list_head bp_list; + struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ -- cgit v1.2.3 From 9caf87be118f4639537404eeb67dd444a3716e9a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:12 +0200 Subject: perf/hw_breakpoint: Make hw_breakpoint_weight() inlinable Due to being a __weak function, hw_breakpoint_weight() will cause the compiler to always emit a call to it. This generates unnecessarily bad code (register spills etc.) for no good reason; in fact it appears in profiles of `perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512`: ... 0.70% [kernel] [k] hw_breakpoint_weight ... While a small percentage, no architecture defines its own hw_breakpoint_weight() nor are there users outside hw_breakpoint.c, which makes the fact it is currently __weak a poor choice. Change hw_breakpoint_weight()'s definition to follow a similar protocol to hw_breakpoint_slots(), such that if defines hw_breakpoint_weight(), we'll use it instead. The result is that it is inlined and no longer shows up in profiles. 
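The resulting convention mirrors hw_breakpoint_slots(): the generic code supplies the default only when the architecture has not defined the symbol. Roughly (a sketch of the generic fallback; an architecture overrides it by defining hw_breakpoint_weight in its asm/hw_breakpoint.h):

  /* kernel/events/hw_breakpoint.c, sketch: */
  #ifndef hw_breakpoint_weight
  static inline int hw_breakpoint_weight(struct perf_event *bp)
  {
          return 1;
  }
  #endif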
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-8-elver@google.com --- include/linux/hw_breakpoint.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a3fb846705eb..f319bd26b030 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -80,7 +80,6 @@ extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); -int hw_breakpoint_weight(struct perf_event *bp); int arch_reserve_bp_slot(struct perf_event *bp); void arch_release_bp_slot(struct perf_event *bp); void arch_unregister_hw_breakpoint(struct perf_event *bp); -- cgit v1.2.3 From 01fe8a3f818e1074a9a95d624be4549ee7ea2b2b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:15 +0200 Subject: locking/percpu-rwsem: Add percpu_is_write_locked() and percpu_is_read_locked() Implement simple accessors to probe percpu-rwsem's locked state: percpu_is_write_locked(), percpu_is_read_locked(). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-11-elver@google.com --- include/linux/percpu-rwsem.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5fda40f97fe9..36b942b67b7d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) +{ + return atomic_read(&sem->block); +} + extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); -- cgit v1.2.3 From 25af7406df5915f04d5f1c8f081dabb0ead1cdcc Mon Sep 17 00:00:00 2001 From: Isaac Manjarres Date: Thu, 18 Aug 2022 18:28:51 +0100 Subject: ARM: 9229/1: amba: Fix use-after-free in amba_read_periphid() After commit f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition"), it became possible for amba_read_periphid() to be invoked concurrently from two threads for a particular AMBA device. Consider the case where a thread (T0) is registering an AMBA driver, and searching for all of the devices it can match with on the AMBA bus. Suppose that another thread (T1) is executing the deferred probe work, and is searching through all of the AMBA drivers on the bus for a driver that matches a particular AMBA device. Assume that both threads begin operating on the same AMBA device and the device's peripheral ID is still unknown. In this scenario, the amba_match() function will be invoked for the same AMBA device by both threads, which means amba_read_periphid() can also be invoked by both threads, and both threads will be able to manipulate the AMBA device's pclk pointer without any synchronization. 
It's possible that one thread will initialize the pclk pointer, then the other thread will re-initialize it, overwriting the previous value, and both will race to free the same pclk, resulting in a use-after-free for whichever thread frees the pclk last. Add a lock per AMBA device to synchronize the handling with detecting the peripheral ID to avoid the use-after-free scenario. The following KFENCE bug report helped detect this problem: ================================================================== BUG: KFENCE: use-after-free read in clk_disable+0x14/0x34 Use-after-free read at 0x(ptrval) (in kfence-#19): clk_disable+0x14/0x34 amba_read_periphid+0xdc/0x134 amba_match+0x3c/0x84 __driver_attach+0x20/0x158 bus_for_each_dev+0x74/0xc0 bus_add_driver+0x154/0x1e8 driver_register+0x88/0x11c do_one_initcall+0x8c/0x2fc kernel_init_freeable+0x190/0x220 kernel_init+0x10/0x108 ret_from_fork+0x14/0x3c 0x0 kfence-#19: 0x(ptrval)-0x(ptrval), size=36, cache=kmalloc-64 allocated by task 8 on cpu 0 at 11.629931s: clk_hw_create_clk+0x38/0x134 amba_get_enable_pclk+0x10/0x68 amba_read_periphid+0x28/0x134 amba_match+0x3c/0x84 __device_attach_driver+0x2c/0xc4 bus_for_each_drv+0x80/0xd0 __device_attach+0xb0/0x1f0 bus_probe_device+0x88/0x90 deferred_probe_work_func+0x8c/0xc0 process_one_work+0x23c/0x690 worker_thread+0x34/0x488 kthread+0xd4/0xfc ret_from_fork+0x14/0x3c 0x0 freed by task 8 on cpu 0 at 11.630095s: amba_read_periphid+0xec/0x134 amba_match+0x3c/0x84 __device_attach_driver+0x2c/0xc4 bus_for_each_drv+0x80/0xd0 __device_attach+0xb0/0x1f0 bus_probe_device+0x88/0x90 deferred_probe_work_func+0x8c/0xc0 process_one_work+0x23c/0x690 worker_thread+0x34/0x488 kthread+0xd4/0xfc ret_from_fork+0x14/0x3c 0x0 Cc: Saravana Kannan Cc: patches@armlinux.org.uk Fixes: f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Isaac J. Manjarres Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index e94cdf235f1d..5001e14c5c06 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -67,6 +67,7 @@ struct amba_device { struct clk *pclk; struct device_dma_parameters dma_parms; unsigned int periphid; + struct mutex periphid_lock; unsigned int cid; struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; -- cgit v1.2.3 From 690252f19f0e486abb8590b3a7a03d4e065d93d4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:31 -0700 Subject: netlink: add support for ext_ack missing attributes There is currently no way to report via extack in a structured way that an attribute is missing. This leads to families resorting to string messages. Add a pair of attributes - @offset and @type for machine-readable way of reporting missing attributes. The @offset points to the nest which should have contained the attribute, @type is the expected nla_type. The offset will be skipped if the attribute is missing at the message level rather than inside a nest. User space should be able to figure out which attribute enum (AKA attribute space AKA attribute set) the nest pointed to by @offset is using. 
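A request handler reports a missing top-level attribute like so (MYFAM_ATTR_PORT is a hypothetical attribute; pass the nest attribute instead of NULL when the miss is inside a nest). The follow-up patch below wraps this exact test-and-report pattern in NL_REQ_ATTR_CHECK():

  if (!tb[MYFAM_ATTR_PORT]) {
          NL_SET_ERR_ATTR_MISS(extack, NULL, MYFAM_ATTR_PORT);
          return -EINVAL;
  }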
Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index bda1c385cffb..1619221c415c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -71,6 +71,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute + * @miss_type: attribute type which was missing + * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ @@ -78,6 +80,8 @@ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; + const struct nlattr *miss_nest; + u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -126,6 +130,15 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) +#define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->miss_nest = (nest); \ + __extack->miss_type = (type); \ + } \ +} while (0) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From 45dca15759643806660e9285e6af8a1ba3c76c82 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:32 -0700 Subject: netlink: add helpers for extack attr presence checking Being able to check attribute presence and set extack if not on one line is handy, add helpers. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1619221c415c..d51e041d2242 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -139,6 +139,17 @@ struct netlink_ext_ack { } \ } while (0) +#define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ + struct nlattr **__tb = (tb); \ + u32 __attr = (type); \ + int __retval; \ + \ + __retval = !__tb[__attr]; \ + if (__retval) \ + NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ + __retval; \ +}) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From 87888fb9ac0c71cdc1edd0779382052475dadb84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:32 +0300 Subject: tty: Remove baudrate dead code & make ktermios params const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the architectures currently in-tree, either: 1) CBAUDEX is zero 2) The earlier BOTHER if check covers cbaud < 1 case 3) All CBAUD bits are covered by the baud_table Thus, the check for cbaud being out-of-range for CBAUDEX case cannot ever be true. The ktermios parameters can now be made const. 
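The const parameter also lets helpers accept read-only termios data; a trivial sketch:

  static speed_t current_baud(const struct tty_struct *tty)
  {
          /* Legal now that tty_termios_baud_rate() takes const. */
          return tty_termios_baud_rate(&tty->termios);
  }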
Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 7b0a5d478ef6..cf5ab26de73d 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -434,7 +434,7 @@ int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); -speed_t tty_termios_baud_rate(struct ktermios *termios); +speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, -- cgit v1.2.3 From d15f89d997d995c9a0bf5be5de4c1c405886f21c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:35 +0300 Subject: tty: Make tty_termios_copy_hw() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-5-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index cf5ab26de73d..ae41893f8653 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -458,7 +458,7 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); -void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); +void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); -- cgit v1.2.3 From 8b7d2d95cf82f8ca034ac65d0f39a2b3359b680f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:36 +0300 Subject: tty: Make ldisc ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-6-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index ede6f2157f32..dcb61ec11424 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -130,7 +130,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * a pointer to wordsize-sensitive structure belongs here, but most of * ldiscs will happily leave it %NULL. * - * @set_termios: [TTY] ``void ()(struct tty_struct *tty, struct ktermios *old)`` + * @set_termios: [TTY] ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This function notifies the line discpline that a change has been made * to the termios structure. 
@@ -227,7 +227,7 @@ struct tty_ldisc_ops { unsigned long arg); int (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); - void (*set_termios)(struct tty_struct *tty, struct ktermios *old); + void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); __poll_t (*poll)(struct tty_struct *tty, struct file *file, struct poll_table_struct *wait); void (*hangup)(struct tty_struct *tty); -- cgit v1.2.3 From bec5b814d46c2a704c3c8148752e62a33e9fa6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:37 +0300 Subject: serial: Make ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-7-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 4 ++-- include/linux/serial_core.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 8c7b793aa4d7..43edd10e8c96 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -32,7 +32,7 @@ struct plat_serial8250_port { void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, - struct ktermios *old); + const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); @@ -157,7 +157,7 @@ extern int early_serial8250_setup(struct earlycon_device *device, extern void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk); extern void serial8250_do_set_termios(struct uart_port *port, - struct ktermios *termios, struct ktermios *old); + struct ktermios *termios, const struct ktermios *old); extern void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios); extern unsigned int serial8250_do_get_mctrl(struct uart_port *port); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aef3145f2032..0e9bc943bfa8 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -379,7 +379,7 @@ struct uart_ops { void (*shutdown)(struct uart_port *); void (*flush_buffer)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, - struct ktermios *old); + const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); @@ -425,7 +425,7 @@ struct uart_port { void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, - struct ktermios *old); + const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); @@ -644,7 +644,7 @@ void uart_write_wakeup(struct uart_port *port); void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud); unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, - struct ktermios *old, unsigned int min, + const struct ktermios *old, unsigned int min, unsigned int max); unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud); -- cgit v1.2.3 From f6d47fe5921a6d3f5a1a3d0b9a3dd34b8e295722 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:38 +0300 Subject: usb: serial: Make ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-8-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/serial.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 8ea319f89e1f..f7bfedb740f5 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -276,8 +276,8 @@ struct usb_serial_driver { unsigned int cmd, unsigned long arg); void (*get_serial)(struct tty_struct *tty, struct serial_struct *ss); int (*set_serial)(struct tty_struct *tty, struct serial_struct *ss); - void (*set_termios)(struct tty_struct *tty, - struct usb_serial_port *port, struct ktermios *old); + void (*set_termios)(struct tty_struct *tty, struct usb_serial_port *port, + const struct ktermios *old); void (*break_ctl)(struct tty_struct *tty, int break_state); unsigned int (*chars_in_buffer)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, long timeout); -- cgit v1.2.3 From a8c11c1520347be74b02312d10ef686b01b525f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 16 Aug 2022 14:57:39 +0300 Subject: tty: Make ->set_termios() old ktermios const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There should be no reason to adjust old ktermios which is going to get discarded anyway. Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220816115739.10928-9-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 4841d8069c07..b2456b545ba0 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -141,7 +141,7 @@ struct serial_struct; * * Optional. * - * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)`` + * @set_termios: ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's * termios settings have changed. New settings are in @tty->termios. 
@@ -365,7 +365,7 @@ struct tty_operations { unsigned int cmd, unsigned long arg); long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); - void (*set_termios)(struct tty_struct *tty, struct ktermios * old); + void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); void (*throttle)(struct tty_struct * tty); void (*unthrottle)(struct tty_struct * tty); void (*stop)(struct tty_struct *tty); -- cgit v1.2.3 From 9c6d778800b921bde3bff3cff5003d1650f942d1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 26 Aug 2022 15:31:32 -0400 Subject: USB: core: Prevent nested device-reset calls Automatic kernel fuzzing revealed a recursive locking violation in usb-storage: ============================================ WARNING: possible recursive locking detected 5.18.0 #3 Not tainted -------------------------------------------- kworker/1:3/1205 is trying to acquire lock: ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at: usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 but task is already holding lock: ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at: usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 ... stack backtrace: CPU: 1 PID: 1205 Comm: kworker/1:3 Not tainted 5.18.0 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2988 [inline] check_deadlock kernel/locking/lockdep.c:3031 [inline] validate_chain kernel/locking/lockdep.c:3816 [inline] __lock_acquire.cold+0x152/0x3ca kernel/locking/lockdep.c:5053 lock_acquire kernel/locking/lockdep.c:5665 [inline] lock_acquire+0x1ab/0x520 kernel/locking/lockdep.c:5630 __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0x14f/0x1610 kernel/locking/mutex.c:747 usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 usb_reset_device+0x37d/0x9a0 drivers/usb/core/hub.c:6109 r871xu_dev_remove+0x21a/0x270 drivers/staging/rtl8712/usb_intf.c:622 usb_unbind_interface+0x1bd/0x890 drivers/usb/core/driver.c:458 device_remove drivers/base/dd.c:545 [inline] device_remove+0x11f/0x170 drivers/base/dd.c:537 __device_release_driver drivers/base/dd.c:1222 [inline] device_release_driver_internal+0x1a7/0x2f0 drivers/base/dd.c:1248 usb_driver_release_interface+0x102/0x180 drivers/usb/core/driver.c:627 usb_forced_unbind_intf+0x4d/0xa0 drivers/usb/core/driver.c:1118 usb_reset_device+0x39b/0x9a0 drivers/usb/core/hub.c:6114 This turned out not to be an error in usb-storage but rather a nested device reset attempt. That is, as the rtl8712 driver was being unbound from a composite device in preparation for an unrelated USB reset (that driver does not have pre_reset or post_reset callbacks), its ->remove routine called usb_reset_device() -- thus nesting one reset call within another. Performing a reset as part of disconnect processing is a questionable practice at best. However, the bug report points out that the USB core does not have any protection against nested resets. Adding a reset_in_progress flag and testing it will prevent such errors in the future. 
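The guard itself is simple; a simplified sketch of the logic in usb_reset_device(), not the verbatim hub.c code:

  if (udev->reset_in_progress)
          return -EINVAL;
  udev->reset_in_progress = 1;
  /* ... unbind interfaces, reset the port, rebind ... */
  udev->reset_in_progress = 0;
  return ret;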
Link: https://lore.kernel.org/all/CAB7eexKUpvX-JNiLzhXBDWgfg2T9e9_0Tw4HQ6keN==voRbP0g@mail.gmail.com/ Cc: stable@vger.kernel.org Reported-and-tested-by: Rondreis Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/YwkflDxvg0KWqyZK@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index f7a9914fc97f..9ff1ad4dfad1 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -575,6 +575,7 @@ struct usb3_lpm_parameters { * @devaddr: device address, XHCI: assigned by HW, others: same as devnum * @can_submit: URBs may be submitted * @persist_enabled: USB_PERSIST enabled for this device + * @reset_in_progress: the device is being reset * @have_langid: whether string_langid is valid * @authorized: policy has said we can use it; * (user space) policy determines if we authorize this device to be @@ -662,6 +663,7 @@ struct usb_device { unsigned can_submit:1; unsigned persist_enabled:1; + unsigned reset_in_progress:1; unsigned have_langid:1; unsigned authorized:1; unsigned authenticated:1; -- cgit v1.2.3 From 43a063cab325ee7cc50349967e536b3cd4e57f03 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 23 Aug 2022 00:46:37 +0000 Subject: KVM: x86/mmu: count KVM mmu usage in secondary pagetable stats. Count the pages used by KVM mmu on x86 in memory stats under secondary pagetable stats (e.g. "SecPageTables" in /proc/meminfo) to give better visibility into the memory consumption of KVM mmu in a similar way to how normal user page tables are accounted. Add the inner helper in common KVM, ARM will also use it to count stats in a future commit. Signed-off-by: Yosry Ahmed Reviewed-by: Sean Christopherson Acked-by: Marc Zyngier # generic KVM changes Link: https://lore.kernel.org/r/20220823004639.2387269-3-yosryahmed@google.com Link: https://lore.kernel.org/r/20220823004639.2387269-4-yosryahmed@google.com [sean: squash x86 usage to workaround modpost issues] Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f4519d3689e1..04c7e5f2f727 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2247,6 +2247,19 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +/* + * If more than one page is being (un)accounted, @virt must be the address of + * the first page of a block of pages what were allocated together (i.e + * accounted together). + * + * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state() + * is thread-safe. + */ +static inline void kvm_account_pgtable_pages(void *virt, int nr) +{ + mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr); +} + /* * This defines how many reserved entries we want to keep before we * kick the vcpu to the userspace to avoid dirty ring full. This -- cgit v1.2.3 From 4e508b259ed02f5fa608cdd83b817a7f49c22271 Mon Sep 17 00:00:00 2001 From: "Chengci.Xu" Date: Wed, 17 Aug 2022 20:46:07 +0800 Subject: memory: mtk-smi: Add enable IOMMU SMC command for MM master For concerns about security, the register to enable/disable IOMMU of SMI LARB should only be configured in secure world. Thus, we add some SMC command for multimedia master to enable/disable MM IOMMU in ATF by setting the register of SMI LARB. This function is prepared for MT8188. 
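A kernel-side caller would issue the command through the arm_smccc helpers; a hedged sketch (the argument layout -- a sub-command, LARB id and enable flag -- is an assumption for illustration, as is 'dev'):

  struct arm_smccc_res res;

  arm_smccc_smc(MTK_SIP_KERNEL_IOMMU_CONTROL, cmd, larb_id, enable,
                0, 0, 0, 0, &res);
  if (res.a0)
          dev_err(dev, "IOMMU SMC command failed: %lu\n", res.a0);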
Signed-off-by: Chengci.Xu Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220817124608.10062-4-chengci.xu@mediatek.com --- include/linux/soc/mediatek/mtk_sip_svc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_sip_svc.h b/include/linux/soc/mediatek/mtk_sip_svc.h index 082398e0cfb1..0761128b4354 100644 --- a/include/linux/soc/mediatek/mtk_sip_svc.h +++ b/include/linux/soc/mediatek/mtk_sip_svc.h @@ -22,4 +22,7 @@ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, MTK_SIP_SMC_CONVENTION, \ ARM_SMCCC_OWNER_SIP, fn_id) +/* IOMMU related SMC call */ +#define MTK_SIP_KERNEL_IOMMU_CONTROL MTK_SIP_SMC_CMD(0x514) + #endif -- cgit v1.2.3 From 9d2b2e83ef277b9c7b8852e8717140daa373ccf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 20:54:10 -0700 Subject: Input: adp5588-keys - support gpi key events as 'gpio keys' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change replaces the support for GPIs as key event generators. Instead of reporting the events directly, we add a gpio based irqchip so that these events can be consumed by keys defined in the gpio-keys driver (as it's goal is indeed for keys on GPIOs capable of generating interrupts). With this, the gpio-adp5588 driver can also be dropped. The basic idea is that all the pins that are not being used as part of the keymap matrix can be possibly requested as GPIOs by gpio-keys (it's also fine to use these pins as plain interrupts though that's not really the point). Since the gpiochip now also has irqchip capabilities, we should only remove it after we free the device interrupt (otherwise we could, in theory, be handling GPIs interrupts while the gpiochip is concurrently removed). Thus the call 'adp5588_gpio_add()' is moved and since the setup phase also needs to come before making the gpios visible, we also need to move 'adp5588_setup()'. While at it, always select GPIOLIB so that we don't need to use #ifdef guards. Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-2-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/adp5588.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h index 6d3f7d911a92..82170ec8c266 100644 --- a/include/linux/platform_data/adp5588.h +++ b/include/linux/platform_data/adp5588.h @@ -147,8 +147,6 @@ struct adp5588_kpad_platform_data { unsigned en_keylock:1; /* Enable Key Lock feature */ unsigned short unlock_key1; /* Unlock Key 1 */ unsigned short unlock_key2; /* Unlock Key 2 */ - const struct adp5588_gpi_map *gpimap; - unsigned short gpimapsize; const struct adp5588_gpio_platform_data *gpio_data; }; -- cgit v1.2.3 From 6704a86283b7e79ff7ae36d388466428f6672962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 30 Aug 2022 21:00:14 -0700 Subject: Input: adp5588-keys - add support for fw properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use firmware properties (eg: OF) to get the device specific configuration. This change just replaces the platform data since there was no platform using it and so, it makes no sense having both. Special note to the PULL-UP disable setting that is now supported as part of the gpio subsystem (using 'set_config()' callback). 
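For the pull-up disable path, the set_config() callback would take roughly this shape (adp5588_kpad, pull_dis and adp5588_write are assumptions about the driver's internals; GPIO_PULL1 is the register from the platform header removed below, now private to the driver):

  static int adp5588_gpio_set_config(struct gpio_chip *chip,
                                     unsigned int off, unsigned long config)
  {
          struct adp5588_kpad *kpad = gpiochip_get_data(chip);
          unsigned int bank = off / 8, bit = off % 8;

          /* The device only supports disabling its internal pull-ups. */
          if (pinconf_to_config_param(config) != PIN_CONFIG_BIAS_DISABLE)
                  return -ENOTSUPP;

          kpad->pull_dis[bank] |= BIT(bit);
          return adp5588_write(kpad->client, GPIO_PULL1 + bank,
                               kpad->pull_dis[bank]);
  }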
Signed-off-by: Nuno Sá Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829131553.690063-5-nuno.sa@analog.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/adp5588.h | 169 ---------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 include/linux/platform_data/adp5588.h (limited to 'include/linux') diff --git a/include/linux/platform_data/adp5588.h b/include/linux/platform_data/adp5588.h deleted file mode 100644 index 82170ec8c266..000000000000 --- a/include/linux/platform_data/adp5588.h +++ /dev/null @@ -1,169 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Analog Devices ADP5588 I/O Expander and QWERTY Keypad Controller - * - * Copyright 2009-2010 Analog Devices Inc. - */ - -#ifndef _ADP5588_H -#define _ADP5588_H - -#define DEV_ID 0x00 /* Device ID */ -#define CFG 0x01 /* Configuration Register1 */ -#define INT_STAT 0x02 /* Interrupt Status Register */ -#define KEY_LCK_EC_STAT 0x03 /* Key Lock and Event Counter Register */ -#define Key_EVENTA 0x04 /* Key Event Register A */ -#define Key_EVENTB 0x05 /* Key Event Register B */ -#define Key_EVENTC 0x06 /* Key Event Register C */ -#define Key_EVENTD 0x07 /* Key Event Register D */ -#define Key_EVENTE 0x08 /* Key Event Register E */ -#define Key_EVENTF 0x09 /* Key Event Register F */ -#define Key_EVENTG 0x0A /* Key Event Register G */ -#define Key_EVENTH 0x0B /* Key Event Register H */ -#define Key_EVENTI 0x0C /* Key Event Register I */ -#define Key_EVENTJ 0x0D /* Key Event Register J */ -#define KP_LCK_TMR 0x0E /* Keypad Lock1 to Lock2 Timer */ -#define UNLOCK1 0x0F /* Unlock Key1 */ -#define UNLOCK2 0x10 /* Unlock Key2 */ -#define GPIO_INT_STAT1 0x11 /* GPIO Interrupt Status */ -#define GPIO_INT_STAT2 0x12 /* GPIO Interrupt Status */ -#define GPIO_INT_STAT3 0x13 /* GPIO Interrupt Status */ -#define GPIO_DAT_STAT1 0x14 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_STAT2 0x15 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_STAT3 0x16 /* GPIO Data Status, Read twice to clear */ -#define GPIO_DAT_OUT1 0x17 /* GPIO DATA OUT */ -#define GPIO_DAT_OUT2 0x18 /* GPIO DATA OUT */ -#define GPIO_DAT_OUT3 0x19 /* GPIO DATA OUT */ -#define GPIO_INT_EN1 0x1A /* GPIO Interrupt Enable */ -#define GPIO_INT_EN2 0x1B /* GPIO Interrupt Enable */ -#define GPIO_INT_EN3 0x1C /* GPIO Interrupt Enable */ -#define KP_GPIO1 0x1D /* Keypad or GPIO Selection */ -#define KP_GPIO2 0x1E /* Keypad or GPIO Selection */ -#define KP_GPIO3 0x1F /* Keypad or GPIO Selection */ -#define GPI_EM1 0x20 /* GPI Event Mode 1 */ -#define GPI_EM2 0x21 /* GPI Event Mode 2 */ -#define GPI_EM3 0x22 /* GPI Event Mode 3 */ -#define GPIO_DIR1 0x23 /* GPIO Data Direction */ -#define GPIO_DIR2 0x24 /* GPIO Data Direction */ -#define GPIO_DIR3 0x25 /* GPIO Data Direction */ -#define GPIO_INT_LVL1 0x26 /* GPIO Edge/Level Detect */ -#define GPIO_INT_LVL2 0x27 /* GPIO Edge/Level Detect */ -#define GPIO_INT_LVL3 0x28 /* GPIO Edge/Level Detect */ -#define Debounce_DIS1 0x29 /* Debounce Disable */ -#define Debounce_DIS2 0x2A /* Debounce Disable */ -#define Debounce_DIS3 0x2B /* Debounce Disable */ -#define GPIO_PULL1 0x2C /* GPIO Pull Disable */ -#define GPIO_PULL2 0x2D /* GPIO Pull Disable */ -#define GPIO_PULL3 0x2E /* GPIO Pull Disable */ -#define CMP_CFG_STAT 0x30 /* Comparator Configuration and Status Register */ -#define CMP_CONFG_SENS1 0x31 /* Sensor1 Comparator Configuration Register */ -#define CMP_CONFG_SENS2 0x32 /* L2 Light Sensor Reference Level, Output Falling for Sensor 1 */ -#define 
CMP1_LVL2_TRIP 0x33 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 1 */ -#define CMP1_LVL2_HYS 0x34 /* L3 Light Sensor Reference Level, Output Falling For Sensor 1 */ -#define CMP1_LVL3_TRIP 0x35 /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 1 */ -#define CMP1_LVL3_HYS 0x36 /* Sensor 2 Comparator Configuration Register */ -#define CMP2_LVL2_TRIP 0x37 /* L2 Light Sensor Reference Level, Output Falling for Sensor 2 */ -#define CMP2_LVL2_HYS 0x38 /* L2 Light Sensor Hysteresis (Active when Output Rising) for Sensor 2 */ -#define CMP2_LVL3_TRIP 0x39 /* L3 Light Sensor Reference Level, Output Falling For Sensor 2 */ -#define CMP2_LVL3_HYS 0x3A /* L3 Light Sensor Hysteresis (Active when Output Rising) For Sensor 2 */ -#define CMP1_ADC_DAT_R1 0x3B /* Comparator 1 ADC data Register1 */ -#define CMP1_ADC_DAT_R2 0x3C /* Comparator 1 ADC data Register2 */ -#define CMP2_ADC_DAT_R1 0x3D /* Comparator 2 ADC data Register1 */ -#define CMP2_ADC_DAT_R2 0x3E /* Comparator 2 ADC data Register2 */ - -#define ADP5588_DEVICE_ID_MASK 0xF - - /* Configuration Register1 */ -#define ADP5588_AUTO_INC (1 << 7) -#define ADP5588_GPIEM_CFG (1 << 6) -#define ADP5588_OVR_FLOW_M (1 << 5) -#define ADP5588_INT_CFG (1 << 4) -#define ADP5588_OVR_FLOW_IEN (1 << 3) -#define ADP5588_K_LCK_IM (1 << 2) -#define ADP5588_GPI_IEN (1 << 1) -#define ADP5588_KE_IEN (1 << 0) - -/* Interrupt Status Register */ -#define ADP5588_CMP2_INT (1 << 5) -#define ADP5588_CMP1_INT (1 << 4) -#define ADP5588_OVR_FLOW_INT (1 << 3) -#define ADP5588_K_LCK_INT (1 << 2) -#define ADP5588_GPI_INT (1 << 1) -#define ADP5588_KE_INT (1 << 0) - -/* Key Lock and Event Counter Register */ -#define ADP5588_K_LCK_EN (1 << 6) -#define ADP5588_LCK21 0x30 -#define ADP5588_KEC 0xF - -#define ADP5588_MAXGPIO 18 -#define ADP5588_BANK(offs) ((offs) >> 3) -#define ADP5588_BIT(offs) (1u << ((offs) & 0x7)) - -/* Put one of these structures in i2c_board_info platform_data */ - -#define ADP5588_KEYMAPSIZE 80 - -#define GPI_PIN_ROW0 97 -#define GPI_PIN_ROW1 98 -#define GPI_PIN_ROW2 99 -#define GPI_PIN_ROW3 100 -#define GPI_PIN_ROW4 101 -#define GPI_PIN_ROW5 102 -#define GPI_PIN_ROW6 103 -#define GPI_PIN_ROW7 104 -#define GPI_PIN_COL0 105 -#define GPI_PIN_COL1 106 -#define GPI_PIN_COL2 107 -#define GPI_PIN_COL3 108 -#define GPI_PIN_COL4 109 -#define GPI_PIN_COL5 110 -#define GPI_PIN_COL6 111 -#define GPI_PIN_COL7 112 -#define GPI_PIN_COL8 113 -#define GPI_PIN_COL9 114 - -#define GPI_PIN_ROW_BASE GPI_PIN_ROW0 -#define GPI_PIN_ROW_END GPI_PIN_ROW7 -#define GPI_PIN_COL_BASE GPI_PIN_COL0 -#define GPI_PIN_COL_END GPI_PIN_COL9 - -#define GPI_PIN_BASE GPI_PIN_ROW_BASE -#define GPI_PIN_END GPI_PIN_COL_END - -#define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1) - -struct adp5588_gpi_map { - unsigned short pin; - unsigned short sw_evt; -}; - -struct adp5588_kpad_platform_data { - int rows; /* Number of rows */ - int cols; /* Number of columns */ - const unsigned short *keymap; /* Pointer to keymap */ - unsigned short keymapsize; /* Keymap size */ - unsigned repeat:1; /* Enable key repeat */ - unsigned en_keylock:1; /* Enable Key Lock feature */ - unsigned short unlock_key1; /* Unlock Key 1 */ - unsigned short unlock_key2; /* Unlock Key 2 */ - const struct adp5588_gpio_platform_data *gpio_data; -}; - -struct i2c_client; /* forward declaration */ - -struct adp5588_gpio_platform_data { - int gpio_start; /* GPIO Chip base # */ - const char *const *names; - unsigned irq_base; /* interrupt base # */ - unsigned pullup_dis_mask; /* Pull-Up 
Disable Mask */ - int (*setup)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - int (*teardown)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - void *context; -}; - -#endif -- cgit v1.2.3 From 6b70fe0601adb1396ad0b85cdf05d217500b49e7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Aug 2022 14:38:42 +0200 Subject: acl: add vfs_set_acl_prepare() Various filesystems store POSIX ACLs on the backing store in their uapi format. Such filesystems need to translate from the uapi POSIX ACL format into the VFS format during i_op->get_acl(). The VFS provides the posix_acl_from_xattr() helper for this task. But the usage of posix_acl_from_xattr() is currently ambiguous. It is intended to transform from a uapi POSIX ACL to the VFS representation. For example, when retrieving POSIX ACLs for permission checking during lookup or when calling getxattr() to retrieve system.posix_acl_{access,default}. Calling posix_acl_from_xattr() during i_op->get_acl() will map the raw {g,u}id values stored as ACL_{GROUP,USER} entries in the uapi POSIX ACL format into k{g,u}id_t in the filesystem's idmapping and return a struct posix_acl ready to be returned to the VFS for caching and to perform permission checks on. However, posix_acl_from_xattr() is also called during setxattr() for all filesystems that rely on the VFS-provided posix_acl_{access,default}_xattr_handler. The posix_acl_xattr_set() handler, which is used for the ->set() method of posix_acl_{access,default}_xattr_handler, uses posix_acl_from_xattr() to translate from the uapi POSIX ACL format to the VFS format so that it can be passed to the i_op->set_acl() handler of the filesystem or used for direct caching in case no i_op->set_acl() handler is defined. During setxattr() the {g,u}id values stored as ACL_{GROUP,USER} entries in the uapi POSIX ACL format aren't raw {g,u}id values that need to be mapped according to the filesystem's idmapping. Instead they are {g,u}id values in the caller's idmapping which have been generated during posix_acl_fix_xattr_from_user(). In other words, they are k{g,u}id_t which are passed as raw {g,u}id values abusing the uapi POSIX ACL format (Please note that this type safety violation has existed since the introduction of k{g,u}id_t. Please see [1] for more details.). So when posix_acl_from_xattr() is called in posix_acl_xattr_set() the filesystem idmapping is completely irrelevant. Instead, we abuse the initial idmapping to recover the k{g,u}id_t based on the values stored as raw {g,u}ids in ACL_{GROUP,USER} entries in the uapi POSIX ACL format. We need to clearly distinguish between these two operations, as they are really easy for filesystems to confuse, as can be seen in ntfs3. In order to do this, we factor out make_posix_acl() which takes callbacks allowing callers to pass dedicated methods to generate the correct k{g,u}id_t. This is just an internal static helper which is not exposed to any filesystems, but it neatly encapsulates the basic logic of walking through a uapi POSIX ACL and returning an allocated VFS POSIX ACL with the correct k{g,u}id_t values. The posix_acl_from_xattr() helper can then be implemented as a simple call to make_posix_acl() with callbacks that generate the correct k{g,u}id_t from the raw {g,u}id values in ACL_{GROUP,USER} entries in the uapi POSIX ACL format as read from the backing store.
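To make the getxattr()-side behavior described above concrete, here is a minimal sketch of the i_op->get_acl() pattern; it is not taken from any real filesystem, and the backing-store read helper is a hypothetical placeholder:

#include <linux/err.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>

static struct posix_acl *example_get_acl(struct inode *inode, int type)
{
	struct posix_acl *acl;
	void *value = NULL;
	int size;

	/* Placeholder: read the raw system.posix_acl_* blob from disk. */
	size = example_read_raw_acl(inode, type, &value);
	if (size <= 0)
		return size ? ERR_PTR(size) : NULL;

	/* Map raw {g,u}ids to k{g,u}id_t in the filesystem's idmapping. */
	acl = posix_acl_from_xattr(inode->i_sb->s_user_ns, value, size);
	kfree(value);
	return acl;
}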
For setxattr(), we add a new helper vfs_set_acl_prepare() which has callbacks to map the POSIX ACLs from the uapi format, with the k{g,u}id_t values stored in raw {g,u}id format in ACL_{GROUP,USER} entries, into the correct k{g,u}id_t values in the filesystem idmapping. In contrast to posix_acl_from_xattr(), the vfs_set_acl_prepare() helper needs to take the mount idmapping into account. The differences are explained in more detail in the kernel doc for the new functions. In follow-up patches we will remove all abuses of posix_acl_from_xattr() for setxattr() operations and replace it with calls to vfs_set_acl_prepare(). The new vfs_set_acl_prepare() helper allows us to deal with the ambiguity in how the POSIX ACL uapi struct stores {g,u}id values depending on whether this is a getxattr() or setxattr() operation. This also allows us to remove the posix_acl_setxattr_idmapped_mnt() helper, reducing the abuse of the POSIX ACL uapi format to pass values that should really be distinct types as raw {g,u}id values stored in ACL_{GROUP,USER} entries. The removal of posix_acl_setxattr_idmapped_mnt() in turn allows us to re-constify the value parameter of vfs_setxattr(), which in turn allows us to avoid the nasty cast from a const void pointer to a non-const void pointer in ovl_do_setxattr(). Ultimately, the plan is to get rid of the type violations completely and never pass the values from k{g,u}id_t as raw {g,u}ids in ACL_{GROUP,USER} entries in the uapi POSIX ACL format. But that's a longer way to go and this is a preparatory step. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Co-Developed-by: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) --- include/linux/posix_acl_xattr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index b6bd3eac2bcc..47eca15fd842 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -66,6 +66,9 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, size_t size); int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size); +struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const void *value, size_t size); extern const struct xattr_handler posix_acl_access_xattr_handler; extern const struct xattr_handler posix_acl_default_xattr_handler; -- cgit v1.2.3 From 66d1c8021e1d9c97101d58fa09c33c002d96747a Mon Sep 17 00:00:00 2001 From: Piyush Mehta Date: Mon, 22 Aug 2022 11:10:51 +0530 Subject: usb: chipidea: Add support for VBUS control with PHY Some platforms make use of VBUS control through the PHY, which means the controller driver has to access PHY registers to turn the VBUS line on/off. This patch adds support for such platforms in chipidea. The flag 'CI_HDRC_PHY_VBUS_CONTROL' is added to support the VBUS control feature.
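As a hedged illustration of how a platform might opt in to this behavior, glue code could set the new bit in its ci_hdrc_platform_data; the surrounding field values here are assumptions about a typical setup, not taken from a real board file:

#include <linux/usb/chipidea.h>

static struct ci_hdrc_platform_data example_ci_pdata = {
	.name		= "ci_hdrc_example",	/* hypothetical name */
	.capoffset	= DEF_CAPOFFSET,
	.flags		= CI_HDRC_PHY_VBUS_CONTROL,
};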
Acked-by: Peter Chen Signed-off-by: Piyush Mehta Signed-off-by: Piyush Mehta Link: https://lore.kernel.org/r/20220822054051.2941282-1-piyush.mehta@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/chipidea.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h index edf3342507f1..ee38835ed77c 100644 --- a/include/linux/usb/chipidea.h +++ b/include/linux/usb/chipidea.h @@ -62,6 +62,7 @@ struct ci_hdrc_platform_data { #define CI_HDRC_REQUIRES_ALIGNED_DMA BIT(13) #define CI_HDRC_IMX_IS_HSIC BIT(14) #define CI_HDRC_PMQOS BIT(15) +#define CI_HDRC_PHY_VBUS_CONTROL BIT(16) enum usb_dr_mode dr_mode; #define CI_HDRC_CONTROLLER_RESET_EVENT 0 #define CI_HDRC_CONTROLLER_STOPPED_EVENT 1 -- cgit v1.2.3 From 66df18b3bd74107dd7c196e75ce00d64d7553152 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 31 Aug 2022 10:47:50 +0200 Subject: gpio: ucb1400: Use proper header The UCB1400 implements a GPIO driver, so it needs to include the <linux/gpio/driver.h> header, not the legacy <linux/gpio.h> header. Compile tested on pxa_defconfig. Signed-off-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/ucb1400.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h index 22345391350b..2516082cd3a9 100644 --- a/include/linux/ucb1400.h +++ b/include/linux/ucb1400.h @@ -23,7 +23,7 @@ #include #include #include -#include <linux/gpio.h> +#include <linux/gpio/driver.h> /* * UCB1400 AC-link registers -- cgit v1.2.3 From d8f3f5834febb74d18b5b2098f67d9db740f3e30 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Aug 2022 10:36:32 -0700 Subject: rcu: Update rcu_access_pointer() header for rcu_dereference_protected() The rcu_access_pointer() docbook header correctly notes that it may be used during post-grace-period teardown. However, it is usually better to use rcu_dereference_protected() for this purpose. This commit therefore calls out this preferred usage. Reported-by: Maxim Mikityanskiy Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f527f27e6438..61a1a85c720c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -496,13 +496,21 @@ do { \ * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. + * Within an RCU read-side critical section, there is little reason to + * use rcu_access_pointer(). + * + * It is usually best to test the rcu_access_pointer() return value + * directly in order to avoid accidental dereferences being introduced + * by later inattentive changes. In other words, assigning the + * rcu_access_pointer() return value to a local variable results in an + * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side - * access to the pointer was removed at least one grace period ago, as - * is the case in the context of the RCU callback that is freeing up - * the data, or after a synchronize_rcu() returns. This can be useful - * when tearing down multi-linked structures after a grace period - * has elapsed.
+ access to the pointer was removed at least one grace period ago, as is + the case in the context of the RCU callback that is freeing up the data, + or after a synchronize_rcu() returns. This can be useful when tearing + down multi-linked structures after a grace period has elapsed. However, + rcu_dereference_protected() is normally preferred for this use case. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) -- cgit v1.2.3 From 91a967fd6934abc0c7e4b0d26728e38674278707 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jul 2022 15:37:05 -0700 Subject: rcu: Add full-sized polling for get_completed*() and poll_state*() The get_completed_synchronize_rcu() and poll_state_synchronize_rcu() APIs compress the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds the first members of the full-state RCU grace-period polling API, namely the get_completed_synchronize_rcu_full() and poll_state_synchronize_rcu_full() functions. These use up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but are guaranteed not to miss grace periods, at least in situations where the single-CPU grace-period optimization does not apply. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 +++ include/linux/rcutiny.h | 9 +++++++++ include/linux/rcutree.h | 8 ++++++++ 3 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f527f27e6438..faaa174dfb27 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -42,7 +42,10 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); + +struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); #ifdef CONFIG_PREEMPT_RCU diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 62815c0a2dce..1fbff8600d92 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -14,10 +14,19 @@ #include <asm/param.h> /* for HZ */ +struct rcu_gp_oldstate { + unsigned long rgos_norm; +}; + unsigned long get_state_synchronize_rcu(void); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); +static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + return poll_state_synchronize_rcu(rgosp->rgos_norm); +} + static inline void cond_synchronize_rcu(unsigned long oldstate) { might_sleep(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 47eaa4cb0df7..4ccbc3aa9dc2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,11 +40,19 @@ bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); + +struct rcu_gp_oldstate { + unsigned long rgos_norm; + unsigned long rgos_exp; + unsigned long rgos_polled; +}; + unsigned long start_poll_synchronize_rcu_expedited(void); void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); unsigned long
start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); +bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); bool rcu_is_idle_cpu(int cpu); -- cgit v1.2.3 From 3fdefca9b42c8bebe3beea5c1a067c9718ca0fc7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jul 2022 19:58:13 -0700 Subject: rcu: Add full-sized polling for get_state() The get_state_synchronize_rcu() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds the next member of the full-state RCU grace-period polling API, namely the get_state_synchronize_rcu_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 ++++++ include/linux/rcutree.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1fbff8600d92..6e299955c4e9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -19,6 +19,12 @@ struct rcu_gp_oldstate { }; unsigned long get_state_synchronize_rcu(void); + +static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = get_state_synchronize_rcu(); +} + unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 4ccbc3aa9dc2..7b769f1b417a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -50,6 +50,7 @@ struct rcu_gp_oldstate { unsigned long start_poll_synchronize_rcu_expedited(void); void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); +void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); -- cgit v1.2.3 From 76ea364161e72b1878126edf8d507d2a62fb47b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Aug 2022 17:04:54 -0700 Subject: rcu: Add full-sized polling for start_poll() The start_poll_synchronize_rcu() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds the next member of the full-state RCU grace-period polling API, namely the start_poll_synchronize_rcu_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 6 ++++++ include/linux/rcutree.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6e299955c4e9..6bc30e46a819 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -26,6 +26,12 @@ static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) } unsigned long start_poll_synchronize_rcu(void); + +static inline void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = start_poll_synchronize_rcu(); +} + bool poll_state_synchronize_rcu(unsigned long oldstate); static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 7b769f1b417a..8f2e0f0b26f6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -52,6 +52,7 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); +void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); -- cgit v1.2.3 From 6c502b14ba66da0670a59e20354469fa56eab26c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Aug 2022 12:38:51 -0700 Subject: rcu: Add full-sized polling for start_poll_expedited() The start_poll_synchronize_rcu_expedited() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds yet another member of the full-state RCU grace-period polling API, which is the start_poll_synchronize_rcu_expedited_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. [ paulmck: Apply feedback from kernel test robot and Julia Lawall. ] Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6bc30e46a819..653e35777a99 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -49,6 +49,11 @@ static inline unsigned long start_poll_synchronize_rcu_expedited(void) return start_poll_synchronize_rcu(); } +static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = start_poll_synchronize_rcu_expedited(); +} + static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) { cond_synchronize_rcu(oldstate); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 8f2e0f0b26f6..7151fd861736 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -48,6 +48,7 @@ struct rcu_gp_oldstate { }; unsigned long start_poll_synchronize_rcu_expedited(void); +void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); -- cgit v1.2.3 From b6fe4917ae4353b397079902cb024ae01f20dfb2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Aug 2022 13:46:05 -0700 Subject: rcu: Add full-sized polling for cond_sync_full() The cond_synchronize_rcu() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds yet another member of the full-state RCU grace-period polling API, which is the cond_synchronize_rcu_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. [ paulmck: Apply feedback from kernel test robot and Julia Lawall. ] Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 653e35777a99..3bee97f76bf4 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -44,6 +44,11 @@ static inline void cond_synchronize_rcu(unsigned long oldstate) might_sleep(); } +static inline void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + cond_synchronize_rcu(rgosp->rgos_norm); +} + static inline unsigned long start_poll_synchronize_rcu_expedited(void) { return start_poll_synchronize_rcu(); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 7151fd861736..1b44288c027d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -57,6 +57,7 @@ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); +void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool rcu_is_idle_cpu(int cpu); -- cgit v1.2.3 From 8df13f01608ea48712956c0b1afce35bdba5a1c5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 4 Aug 2022 15:23:26 -0700 Subject: rcu: Add full-sized polling for cond_sync_exp_full() The cond_synchronize_rcu_expedited() API compresses the combined expedited and normal grace-period states into a single unsigned long, which conserves storage, but can miss grace periods in certain cases involving overlapping normal and expedited grace periods. Missing the occasional grace period is usually not a problem, but there are use cases that care about each and every grace period. This commit therefore adds yet another member of the full-state RCU grace-period polling API, which is the cond_synchronize_rcu_expedited_full() function. This uses up to three times the storage (rcu_gp_oldstate structure instead of unsigned long), but is guaranteed not to miss grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 3bee97f76bf4..4405e9112cee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -64,6 +64,11 @@ static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) cond_synchronize_rcu(oldstate); } +static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + cond_synchronize_rcu_expedited(rgosp->rgos_norm); +} + extern void rcu_barrier(void); static inline void synchronize_rcu_expedited(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 1b44288c027d..755b082f4ec6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -50,6 +50,7 @@ struct rcu_gp_oldstate { unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); +void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); -- cgit v1.2.3 From 7ecef0871dd9a879038dbe8a681ab48bd0c92988 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Aug 2022 17:54:53 -0700 Subject: rcu: Remove ->rgos_polled field from rcu_gp_oldstate structure Because both normal and expedited grace periods increment their respective counters on their pre-scheduler early boot fastpaths, the rcu_gp_oldstate structure no longer needs its ->rgos_polled field. This commit therefore removes this field, shrinking this structure so that it is the same size as an rcu_head structure. Signed-off-by: Paul E. McKenney --- include/linux/rcutree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 755b082f4ec6..455a03bdce15 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -44,7 +44,6 @@ bool rcu_gp_might_be_stalled(void); struct rcu_gp_oldstate { unsigned long rgos_norm; unsigned long rgos_exp; - unsigned long rgos_polled; }; unsigned long start_poll_synchronize_rcu_expedited(void); -- cgit v1.2.3 From 18538248e5486b0f0e8581083de275176674cd1f Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Wed, 24 Aug 2022 15:39:09 -0700 Subject: rcu: Add functions to compare grace-period state values This commit adds same_state_synchronize_rcu() and same_state_synchronize_rcu_full() functions to compare grace-period state values, for example, those obtained from get_state_synchronize_rcu() and get_state_synchronize_rcu_full(). These functions allow small structures to omit these state values by placing them in list headers for lists containing structures with the same token value. Presumably the per-structure list pointers are the same ones used to link the structures into whatever reader-accessible data structure was used. This commit also adds both NUM_ACTIVE_RCU_POLL_OLDSTATE and NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, which define the maximum number of distinct unsigned long values and rcu_gp_oldstate values, respectively, corresponding to not-yet-completed grace periods. These values can be used to size arrays of the list headers described above. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 21 +++++++++++++++++++++ include/linux/rcutiny.h | 14 ++++++++++++++ include/linux/rcutree.h | 28 ++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index faaa174dfb27..9941d5c3d5e1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,27 @@ struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); +// Maximum number of unsigned long values corresponding to +// not-yet-completed RCU grace periods. +#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 + +/** + * same_state_synchronize_rcu - Are two old-state values identical? + * @oldstate1: First old-state value. + * @oldstate2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or + * get_completed_synchronize_rcu(). Returns @true if the two values are + * identical and @false otherwise. This allows structures whose lifetimes + * are tracked by old-state values to push these values to a list header, + * allowing those structures to be slightly smaller. + */ +static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) +{ + return oldstate1 == oldstate2; +} + #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4405e9112cee..768196a5f39d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -18,6 +18,20 @@ struct rcu_gp_oldstate { unsigned long rgos_norm; }; +// Maximum number of rcu_gp_oldstate values corresponding to +// not-yet-completed RCU grace periods. +#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 2 + +/* + * Are the two oldstate values the same? See the Tree RCU version for + * docbook header. 
+ */ +static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, + struct rcu_gp_oldstate *rgosp2) +{ + return rgosp1->rgos_norm == rgosp2->rgos_norm; +} + unsigned long get_state_synchronize_rcu(void); static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 455a03bdce15..5efb51486e8a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -46,6 +46,34 @@ struct rcu_gp_oldstate { unsigned long rgos_exp; }; +// Maximum number of rcu_gp_oldstate values corresponding to +// not-yet-completed RCU grace periods. +#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4 + +/** + * same_state_synchronize_rcu_full - Are two old-state values identical? + * @rgosp1: First old-state value. + * @rgosp2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or get_completed_synchronize_rcu_full(). Returns @true if the two + * values are identical and @false otherwise. This allows structures + * whose lifetimes are tracked by old-state values to push these values + * to a list header, allowing those structures to be slightly smaller. + * + * Note that equality is judged on a bitwise basis, so that an + * @rcu_gp_oldstate structure with an already-completed state in one field + * will compare not-equal to a structure with an already-completed state + * in the other field. After all, the @rcu_gp_oldstate structure is opaque + * so how did such a situation come to pass in the first place? + */ +static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, + struct rcu_gp_oldstate *rgosp2) +{ + return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp; +} + unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); -- cgit v1.2.3 From d66e4cf974a53c1195f1f5a96387ee5dbad2bdf2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Aug 2022 10:18:23 -0700 Subject: srcu: Add GP and maximum requested GP to Tiny SRCU rcutorture output This commit adds the ->srcu_idx and ->srcu_max_idx fields to the Tiny SRCU rcutorture output for additional diagnostics. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 6cfaa0a9a9b9..4fcec6f5af90 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -82,10 +82,12 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, int idx; idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %hu->%hu\n", tt, tf, idx, data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])), - data_race(READ_ONCE(ssp->srcu_lock_nesting[idx]))); + data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])), + data_race(READ_ONCE(ssp->srcu_idx)), + data_race(READ_ONCE(ssp->srcu_idx_max))); } #endif -- cgit v1.2.3 From 5fe89191e43fb37e874b8e5177fb2a5c72379b06 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 2 Aug 2022 15:32:47 -0700 Subject: srcu: Make Tiny SRCU use full-sized grace-period counters This commit makes Tiny SRCU use full-sized grace-period counters to further avoid counter-wrap issues when using polled grace-period APIs. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4fcec6f5af90..5aa5e0faf6a1 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -15,10 +15,10 @@ struct srcu_struct { short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ - unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ - unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */ u8 srcu_gp_running; /* GP workqueue running? */ u8 srcu_gp_waiting; /* GP waiting for readers? */ + unsigned long srcu_idx; /* Current reader array element in bit 0x2. */ + unsigned long srcu_idx_max; /* Furthest future srcu_idx request. */ struct swait_queue_head srcu_wq; /* Last srcu_read_unlock() wakes GP. */ struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ @@ -82,7 +82,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, int idx; idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %hu->%hu\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %lu->%lu\n", tt, tf, idx, data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])), data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])), -- cgit v1.2.3 From f9cad07b840ec8a8eb54928908d694b6e262631c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 30 Aug 2022 18:32:47 +0300 Subject: thunderbolt: Show link type for XDomain connections too Following what we do for routers already, extend this to XDomain connections as well. This will show in sysfs whether the link is in USB4 or Thunderbolt mode. Signed-off-by: Mika Westerberg Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9f442d73f3df..90cd08ab2f5d 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -187,6 +187,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); * @device_name: Name of the device (or %NULL if not known) * @link_speed: Speed of the link in Gb/s * @link_width: Width of the link (1 or 2) + * @link_usb4: Downstream link is USB4 * @is_unplugged: The XDomain is unplugged * @needs_uuid: If the XDomain does not have @remote_uuid it will be * queried first @@ -234,6 +235,7 @@ struct tb_xdomain { const char *device_name; unsigned int link_speed; unsigned int link_width; + bool link_usb4; bool is_unplugged; bool needs_uuid; struct ida service_ids; -- cgit v1.2.3 From ec1bd37123c607ca6485beb4542a792a4db765aa Mon Sep 17 00:00:00 2001 From: Khalid Masum Date: Thu, 18 Aug 2022 10:07:38 +0600 Subject: fscache: fix misdocumented parameter This patch fixes two warnings generated by make docs. The functions fscache_use_cookie and fscache_unuse_cookie both have a parameter named cookie, but they are documented with the name "object" and an unclear description, which generates the warnings when the docs are created. This commit replaces the misdocumented parameter names with the correct ones and adds proper descriptions.
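Since the corrected kernel-doc now names the @cookie parameter explicitly, a minimal usage sketch of the pair may be helpful; this is an illustrative fragment using the signatures shown in the diff below, not code from the patch:

#include <linux/fscache.h>

static void example_write_to_cache(struct fscache_cookie *cookie,
				   loff_t new_size)
{
	/* Pin the cookie; will_modify == true because we will write. */
	fscache_use_cookie(cookie, true);

	/* ... write data to the cache object here ... */

	/* Drop the use count, passing the revised object size. */
	fscache_unuse_cookie(cookie, NULL, &new_size);
}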
CC: Randy Dunlap Signed-off-by: Khalid Masum Signed-off-by: David Howells Link: https://lore.kernel.org/r/20220521142446.4746-1-khalid.masum.92@gmail.com/ # v1 Link: https://lore.kernel.org/r/20220818040738.12036-1-khalid.masum.92@gmail.com/ # v2 Link: https://lore.kernel.org/r/880d7d25753fb326ee17ac08005952112fcf9bdb.1657360984.git.mchehab@kernel.org/ # Mauro's version --- include/linux/fscache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 720874e6ee94..36e5dd84cf59 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -258,7 +258,7 @@ struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume, /** * fscache_use_cookie - Request usage of cookie attached to an object - * @object: Object description + * @cookie: The cookie representing the cache object * @will_modify: If cache is expected to be modified locally * * Request usage of the cookie attached to an object. The caller should tell @@ -274,7 +274,7 @@ static inline void fscache_use_cookie(struct fscache_cookie *cookie, /** * fscache_unuse_cookie - Cease usage of cookie attached to an object - * @object: Object description + * @cookie: The cookie representing the cache object * @aux_data: Updated auxiliary data (or NULL) * @object_size: Revised size of the object (or NULL) * -- cgit v1.2.3 From 52edb4080eb9606536c34d5d642ccd9d35ad5d08 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Aug 2022 14:38:43 +0200 Subject: acl: move idmapping handling into posix_acl_xattr_set() The uapi POSIX ACL struct passed through the value argument during setxattr() contains {g,u}id values encoded via ACL_{GROUP,USER} entries that should actually be stored in the form of k{g,u}id_t (See [1] for a long explanation of the issue.). In 0c5fd887d2bb ("acl: move idmapped mount fixup into vfs_{g,s}etxattr()") we took the mount's idmapping into account in order to let overlayfs handle POSIX ACLs on idmapped layers correctly. The fixup is currently performed directly in vfs_setxattr(), which piles on top of the earlier hackiness by handling the mount's idmapping and stuffing the vfs{g,u}id_t values into the uapi struct as well. While that is all correct and works fine, it's just ugly. Now that we have introduced vfs_make_posix_acl() earlier, move the handling of idmapped mounts out of vfs_setxattr() and into the POSIX ACL handler where it belongs. Note that we also need to call vfs_make_posix_acl() for EVM, which interprets POSIX ACLs during security_inode_setxattr(). Leave a longer comment there for future reference. All filesystems that support idmapped mounts via FS_ALLOW_IDMAP use the standard POSIX ACL xattr handlers and are covered by this change. This includes overlayfs, which simply calls vfs_{g,s}etxattr(). The following filesystems use custom POSIX ACL xattr handlers: 9p, cifs, ecryptfs, and ntfs3 (and overlayfs, but we've covered that in the paragraph above), and none of them support idmapped mounts yet.
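To visualize where the fixup now lives, here is a simplified sketch of the shape of such a ->set() handler after this change; it is not the kernel's actual posix_acl_xattr_set() and it omits the removal, caching, and permission-checking details:

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>

static int example_posix_acl_xattr_set(const struct xattr_handler *handler,
				       struct user_namespace *mnt_userns,
				       struct dentry *dentry,
				       struct inode *inode, const char *name,
				       const void *value, size_t size,
				       int flags)
{
	struct posix_acl *acl = NULL;
	int ret;

	if (value) {
		/* Mount and filesystem idmappings are handled here now. */
		acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
					  value, size);
		if (IS_ERR(acl))
			return PTR_ERR(acl);
	}

	ret = inode->i_op->set_acl(mnt_userns, inode, acl, handler->flags);
	posix_acl_release(acl);
	return ret;
}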
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org/ [1] Signed-off-by: Christian Brauner (Microsoft) Reviewed-by: Seth Forshee (DigitalOcean) --- include/linux/posix_acl_xattr.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 47eca15fd842..8163dd48c430 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -38,9 +38,6 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size); void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, const struct inode *inode, void *value, size_t size); -void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size); #else static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) { @@ -54,12 +51,6 @@ posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, size_t size) { } -static inline void -posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, void *value, - size_t size) -{ -} #endif struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, -- cgit v1.2.3 From 6344e66970c619a1623f457910e78819076e9104 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Aug 2022 14:38:45 +0200 Subject: xattr: constify value argument in vfs_setxattr() Now that we don't perform translations directly in vfs_setxattr() anymore we can constify the @value argument in vfs_setxattr(). This also allows us to remove the hack to cast from a const in ovl_do_setxattr(). Signed-off-by: Christian Brauner (Microsoft) Reviewed-by: Seth Forshee (DigitalOcean) --- include/linux/xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 979a9d3e5bfb..4c379d23ec6e 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct user_namespace *, struct dentry *, const char *, - void *, size_t, int); + const void *, size_t, int); int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); int __vfs_removexattr_locked(struct user_namespace *, struct dentry *, const char *, struct inode **); -- cgit v1.2.3 From b6df1cbb415e543f2908f9c59a8fb20714b86879 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 30 Aug 2022 18:26:10 +0100 Subject: coresight: Simplify sysfs accessors by using csdev_access abstraction The coresight_device struct is available in the sysfs accessor, and this contains a csdev_access struct which can be used to access registers. Use this instead of passing in the type of each drvdata so that a common function can be shared between all the cs drivers. No functional changes. 
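For a sense of what the shared abstraction buys, a sysfs show routine can now read a LO/HI register pair without knowing the drvdata type; the offsets below are placeholders, not real CoreSight register addresses:

#include <linux/coresight.h>

#define EXAMPLE_CNTR_LO	0x100	/* placeholder offset */
#define EXAMPLE_CNTR_HI	0x104	/* placeholder offset */

static u64 example_read_counter(struct coresight_device *csdev)
{
	/* csdev->access works for both MMIO and system-register devices. */
	return csdev_access_relaxed_read_pair(&csdev->access,
					      EXAMPLE_CNTR_LO,
					      EXAMPLE_CNTR_HI);
}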
Signed-off-by: James Clark Reviewed-by: Mike Leach Link: https://lore.kernel.org/r/20220830172614.340962-3-james.clark@arm.com Signed-off-by: Mathieu Poirier --- include/linux/coresight.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 9f445f09fcfe..a47dd1f62216 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -372,6 +372,24 @@ static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, return csa->read(offset, true, false); } +static inline u64 csdev_access_relaxed_read_pair(struct csdev_access *csa, + s32 lo_offset, s32 hi_offset) +{ + u64 val; + + if (likely(csa->io_mem)) { + val = readl_relaxed(csa->base + lo_offset); + val |= (hi_offset < 0) ? 0 : + (u64)readl_relaxed(csa->base + hi_offset) << 32; + return val; + } + + val = csa->read(lo_offset, true, false); + val |= (hi_offset < 0) ? 0 : + (u64)csa->read(hi_offset, true, false) << 32; + return val; +} + static inline u32 csdev_access_read32(struct csdev_access *csa, u32 offset) { if (likely(csa->io_mem)) -- cgit v1.2.3 From 0a98181f805058773961c5ab3172ecf1bf1ed0e1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 30 Aug 2022 18:26:13 +0100 Subject: coresight: Make new csdev_access offsets unsigned New csdev_access functions were added as part of the previous refactor. In order to make them more consistent with the existing ones, change any signed offset types to be unsigned. Now that they are unsigned, stop using hi_off = -1 to signify a single 32bit access. Instead just call the existing 32bit accessors. This is also applied to other parts of the codebase, and the coresight_{read,write}_reg_pair() functions can be deleted. Signed-off-by: James Clark Reviewed-by: Mike Leach Link: https://lore.kernel.org/r/20220830172614.340962-6-james.clark@arm.com Signed-off-by: Mathieu Poirier --- include/linux/coresight.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a47dd1f62216..1554021231f9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -373,21 +373,26 @@ static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, } static inline u64 csdev_access_relaxed_read_pair(struct csdev_access *csa, - s32 lo_offset, s32 hi_offset) + u32 lo_offset, u32 hi_offset) { - u64 val; - if (likely(csa->io_mem)) { - val = readl_relaxed(csa->base + lo_offset); - val |= (hi_offset < 0) ? 0 : - (u64)readl_relaxed(csa->base + hi_offset) << 32; - return val; + return readl_relaxed(csa->base + lo_offset) | + ((u64)readl_relaxed(csa->base + hi_offset) << 32); } - val = csa->read(lo_offset, true, false); - val |= (hi_offset < 0) ? 
0 : - (u64)csa->read(hi_offset, true, false) << 32; - return val; + return csa->read(lo_offset, true, false) | (csa->read(hi_offset, true, false) << 32); +} + +static inline void csdev_access_relaxed_write_pair(struct csdev_access *csa, u64 val, + u32 lo_offset, u32 hi_offset) +{ + if (likely(csa->io_mem)) { + writel_relaxed((u32)val, csa->base + lo_offset); + writel_relaxed((u32)(val >> 32), csa->base + hi_offset); + } else { + csa->write((u32)val, lo_offset, true, false); + csa->write((u32)(val >> 32), hi_offset, true, false); + } } static inline u32 csdev_access_read32(struct csdev_access *csa, u32 offset) -- cgit v1.2.3 From 92d23c6e94157739b997cacce151586a0d07bb8a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Aug 2022 09:21:16 -0700 Subject: overflow, tracing: Define the is_signed_type() macro once There are two definitions of the is_signed_type() macro: one in <linux/overflow.h> and a second definition in <linux/trace_events.h>. As suggested by Linus Torvalds, move the definition of the is_signed_type() macro into the <linux/compiler.h> header file. Change the definition of the is_signed_type() macro to make sure that it does not trigger any sparse warnings with future versions of sparse for bitwise types. See also: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/ https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/ Cc: Andrew Morton Cc: Arnd Bergmann Cc: Dan Williams Cc: Eric Dumazet Cc: Ingo Molnar Cc: Isabella Basso Cc: "Jason A. Donenfeld" Cc: Josh Poimboeuf Cc: Luc Van Oostenryck Cc: Masami Hiramatsu Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Sander Vanheule Cc: Steven Rostedt Cc: Vlastimil Babka Cc: Yury Norov Signed-off-by: Bart Van Assche Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220826162116.1050972-3-bvanassche@acm.org --- include/linux/compiler.h | 6 ++++++ include/linux/overflow.h | 1 - include/linux/trace_events.h | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 01ce94b58b42..7713d7bcdaea 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * Whether 'type' is a signed type or an unsigned type. Supports scalar types, + * bool and also pointer types. + */ +#define is_signed_type(type) (((type)(-1)) < (__force type)1) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index f1221d11f8e5..0eb3b192f07a 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -30,7 +30,6 @@ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere.
*/ -#define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index b18759a673c6..8401dec93c15 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); -#define is_signed_type(type) (((type)(-1)) < (type)1) - int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, -- cgit v1.2.3 From bd8092def983f567800f764a5d23b2dca98f078c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:01:56 +0200 Subject: PM: suspend: move from strlcpy() with unused retval to strscpy() Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 70f2921e2e70..23a253df7f6b 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -75,7 +75,7 @@ extern struct suspend_stats suspend_stats; static inline void dpm_save_failed_dev(const char *name) { - strlcpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], + strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], name, sizeof(suspend_stats.failed_devs[0])); suspend_stats.last_failed_dev++; -- cgit v1.2.3 From 21370ecddfe1ff6fb826faedb601cfbb7adcf4ff Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Thu, 1 Sep 2022 01:21:51 +0800 Subject: soc: mediatek: mutex: Add mt8186 mutex mod settings for mdp3 Add mt8186 mutex mod settings for mdp3. Co-developed-by: Xiandong Wang Signed-off-by: Xiandong Wang Signed-off-by: Allen-KH Cheng Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220831172151.10215-3-allen-kh.cheng@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mutex.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mutex.h b/include/linux/soc/mediatek/mtk-mutex.h index a0f4f51a3b45..b335c2837cd8 100644 --- a/include/linux/soc/mediatek/mtk-mutex.h +++ b/include/linux/soc/mediatek/mtk-mutex.h @@ -20,6 +20,8 @@ enum mtk_mutex_mod_index { MUTEX_MOD_IDX_MDP_WDMA, MUTEX_MOD_IDX_MDP_AAL0, MUTEX_MOD_IDX_MDP_CCORR0, + MUTEX_MOD_IDX_MDP_HDR0, + MUTEX_MOD_IDX_MDP_COLOR0, MUTEX_MOD_IDX_MAX /* ALWAYS keep at the end */ }; -- cgit v1.2.3 From 2555283eb40df89945557273121e9393ef9b542b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 31 Aug 2022 19:06:00 +0200 Subject: mm/rmap: Fix anon_vma->degree ambiguity leading to double-reuse anon_vma->degree tracks the combined number of child anon_vmas and VMAs that use the anon_vma as their ->anon_vma. 
anon_vma_clone() then assumes that for any anon_vma attached to src->anon_vma_chain other than src->anon_vma, it is impossible for it to be a leaf node of the VMA tree, so for such VMAs ->degree is elevated by 1 because of a child anon_vma, and therefore a ->degree of 1 implies that there are no VMAs that use the anon_vma as their ->anon_vma. This assumption is wrong because the ->degree optimization leads to leaf nodes being abandoned on anon_vma_clone() - an existing anon_vma is reused and no new parent-child relationship is created. So it is possible to reuse an anon_vma for one VMA while it is still tied to another VMA. This is an issue because is_mergeable_anon_vma() and its callers assume that if two VMAs have the same ->anon_vma, the list of anon_vmas attached to the VMAs is guaranteed to be the same. When this assumption is violated, vma_merge() can merge pages into a VMA that is not attached to the corresponding anon_vma, leading to dangling page->mapping pointers that will be dereferenced during rmap walks. Fix it by separately tracking the number of child anon_vmas and the number of VMAs using the anon_vma as their ->anon_vma. Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Cc: stable@kernel.org Acked-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf80adca980b..b89b4b86951f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -41,12 +41,15 @@ struct anon_vma { atomic_t refcount; /* - * Count of child anon_vmas and VMAs which points to this anon_vma. + * Count of child anon_vmas. Equals to the count of all anon_vmas that + * have ->parent pointing to this one, including itself. * * This counter is used for making decision about reusing anon_vma * instead of forking new one. See comments in function anon_vma_clone. */ - unsigned degree; + unsigned long num_children; + /* Count of VMAs whose ->anon_vma pointer points to this object. */ + unsigned long num_active_vmas; struct anon_vma *parent; /* Parent of this anon_vma */ -- cgit v1.2.3 From 12198d9179aaa53d0a4026318d8b73b146d89729 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 20 Jul 2022 10:29:34 +0200 Subject: clk: davinci: remove PLL and PSC clocks for DaVinci DM644x and DM646x Commit 7dd33764486d ("ARM: davinci: Delete DM644x board files") and commit b4aed01de486 ("ARM: davinci: Delete DM646x board files") remove the support for DaVinci DM644x and DM646x boards. Hence, remove the PLL and PSC clock descriptions for those boards as well.
Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20220720082934.17741-1-lukas.bulwahn@gmail.com Reviewed-by: David Lechner Reviewed-by: Linus Walleij Signed-off-by: Stephen Boyd --- include/linux/clk/davinci.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/davinci.h b/include/linux/clk/davinci.h index 8a7b5cd7eac0..f6ebab6228c2 100644 --- a/include/linux/clk/davinci.h +++ b/include/linux/clk/davinci.h @@ -28,13 +28,5 @@ int dm365_pll1_init(struct device *dev, void __iomem *base, struct regmap *cfgch int dm365_pll2_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); int dm365_psc_init(struct device *dev, void __iomem *base); #endif -#ifdef CONFIG_ARCH_DAVINCI_DM644x -int dm644x_pll1_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); -int dm644x_psc_init(struct device *dev, void __iomem *base); -#endif -#ifdef CONFIG_ARCH_DAVINCI_DM646x -int dm646x_pll1_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); -int dm646x_psc_init(struct device *dev, void __iomem *base); -#endif #endif /* __LINUX_CLK_DAVINCI_PLL_H___ */ -- cgit v1.2.3 From 9a1043d43a9ab75d28b8ec54512ea13ec33bc910 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 22 Aug 2022 22:07:22 +0300 Subject: EDAC/mc: Replace spaces with tabs in memtype flags definition Currently, the memory type macros are partly defined with multiple spaces between the macro name and its definition. Replace the spaces with tabs as the kernel coding style requires. Signed-off-by: Serge Semin Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220822190730.27277-13-Sergey.Semin@baikalelectronics.ru --- include/linux/edac.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index e730b3468719..fa4bda2a70f6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -231,21 +231,21 @@ enum mem_type { #define MEM_FLAG_DDR BIT(MEM_DDR) #define MEM_FLAG_RDDR BIT(MEM_RDDR) #define MEM_FLAG_RMBS BIT(MEM_RMBS) -#define MEM_FLAG_DDR2 BIT(MEM_DDR2) -#define MEM_FLAG_FB_DDR2 BIT(MEM_FB_DDR2) -#define MEM_FLAG_RDDR2 BIT(MEM_RDDR2) -#define MEM_FLAG_XDR BIT(MEM_XDR) -#define MEM_FLAG_DDR3 BIT(MEM_DDR3) -#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) -#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) -#define MEM_FLAG_DDR4 BIT(MEM_DDR4) -#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) -#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) -#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) -#define MEM_FLAG_DDR5 BIT(MEM_DDR5) -#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) -#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) -#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_DDR2 BIT(MEM_DDR2) +#define MEM_FLAG_FB_DDR2 BIT(MEM_FB_DDR2) +#define MEM_FLAG_RDDR2 BIT(MEM_RDDR2) +#define MEM_FLAG_XDR BIT(MEM_XDR) +#define MEM_FLAG_DDR3 BIT(MEM_DDR3) +#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) +#define MEM_FLAG_LPDDR3 BIT(MEM_LPDDR3) +#define MEM_FLAG_DDR4 BIT(MEM_DDR4) +#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) +#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) +#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) +#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) +#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2) -- cgit v1.2.3 From 26a40990ba052e6f553256f9d0f112452b992a38 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:22 +0900 Subject: 
mm/sl[au]b: cleanup kmem_cache_alloc[_node]_trace() Despite its name, kmem_cache_alloc[_node]_trace() is a hook for inlined kmalloc. So rename it to kmalloc[_node]_trace(). Move its implementation to slab_common.c by using __kmem_cache_alloc_node(), but keep CONFIG_TRACING=n variants to save a function call when CONFIG_TRACING=n. Use __assume_kmalloc_alignment for kmalloc[_node]_trace instead of __assume_slab_alignment. Generally kmalloc has larger alignment requirements. Suggested-by: Vlastimil Babka Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4ee5b2fed164..c8e485ce8815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -449,16 +449,16 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assum __malloc; #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __alloc_size(3); - -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment - __alloc_size(4); +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_kmalloc_alignment + __alloc_size(4); #else /* CONFIG_TRACING */ -static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +/* Save a function call when CONFIG_TRACING=n */ +static __always_inline __alloc_size(3) +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -466,8 +466,9 @@ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_ return ret; } -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +static __always_inline __alloc_size(4) +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -550,7 +551,7 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace( + return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); #endif @@ -572,9 +573,9 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace( + return kmalloc_node_trace( kmalloc_caches[kmalloc_type(flags)][index], - flags, node, size); + flags, node, size); } return __kmalloc_node(size, flags, node); } -- cgit v1.2.3 From 7f8170686d1733321841b21da8c2cd09ddb922b7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:36 +0800 Subject: soundwire: intel: cleanup definition of LCOUNT Add the definition in the header file rather than hiding it in the code.
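For illustration, a minimal sketch of how a caller might decode the link count now that the field has a named mask; sdw_intel_link_count() is a hypothetical helper, not part of the patch:

#include <linux/bitfield.h>
#include <linux/io.h>

/* hypothetical helper: read LCAP and extract the 3-bit link count */
static inline int sdw_intel_link_count(void __iomem *shim_base)
{
	u32 lcap = readl(shim_base + SDW_SHIM_LCAP);

	return FIELD_GET(SDW_SHIM_LCAP_LCOUNT_MASK, lcap);
}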
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index ec16ae49e6a4..49a3c265529b 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -15,7 +15,10 @@ #define SDW_LINK_SIZE 0x10000 /* Intel SHIM Registers Definition */ +/* LCAP */ #define SDW_SHIM_LCAP 0x0 +#define SDW_SHIM_LCAP_LCOUNT_MASK GENMASK(2, 0) + #define SDW_SHIM_LCTL 0x4 #define SDW_SHIM_IPPTR 0x8 #define SDW_SHIM_SYNC 0xC -- cgit v1.2.3 From feaa24aa408f117dbf074b580d38131dc819505a Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:37 +0800 Subject: soundwire: intel: regroup definitions for LCTL No functionality change, just regroup offset and bitfield definitions. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 49a3c265529b..d9f51f43e42c 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -19,7 +19,14 @@ #define SDW_SHIM_LCAP 0x0 #define SDW_SHIM_LCAP_LCOUNT_MASK GENMASK(2, 0) +/* LCTL */ #define SDW_SHIM_LCTL 0x4 + +#define SDW_SHIM_LCTL_SPA BIT(0) +#define SDW_SHIM_LCTL_SPA_MASK GENMASK(3, 0) +#define SDW_SHIM_LCTL_CPA BIT(8) +#define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) + #define SDW_SHIM_IPPTR 0x8 #define SDW_SHIM_SYNC 0xC @@ -39,11 +46,6 @@ #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 -#define SDW_SHIM_LCTL_SPA BIT(0) -#define SDW_SHIM_LCTL_SPA_MASK GENMASK(3, 0) -#define SDW_SHIM_LCTL_CPA BIT(8) -#define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) - #define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) #define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) #define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) -- cgit v1.2.3 From c36b610047463a37203e1158aeaf90f0a82f45dc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:38 +0800 Subject: soundwire: intel: remove IPPTR unused definition This read-only register only defines an offset which is known already and a version which isn't used. Remove unused definition. 
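As an aside, a hedged sketch of how the SPA/CPA bits regrouped above are typically used, assuming (as the masks suggest) one bit per link; the helper name and the poll interval/timeout are illustrative only:

#include <linux/io.h>
#include <linux/iopoll.h>

static int sdw_intel_link_power_up(void __iomem *shim, unsigned int link_id)
{
	u32 lctl = readl(shim + SDW_SHIM_LCTL);

	lctl |= SDW_SHIM_LCTL_SPA << link_id;	/* request power-up */
	writel(lctl, shim + SDW_SHIM_LCTL);

	/* wait for the controller to report the link as active */
	return readl_poll_timeout(shim + SDW_SHIM_LCTL, lctl,
				  lctl & (SDW_SHIM_LCTL_CPA << link_id),
				  10, 10000);
}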
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index d9f51f43e42c..581a9ba32f82 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -27,7 +27,6 @@ #define SDW_SHIM_LCTL_CPA BIT(8) #define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) -#define SDW_SHIM_IPPTR 0x8 #define SDW_SHIM_SYNC 0xC #define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) -- cgit v1.2.3 From ca33a58d12d3c75a3b28cc97037563bb6e03be7c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:39 +0800 Subject: soundwire: intel: cleanup SHIM SYNC Regroup offset and bitfields, no functionality change Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 581a9ba32f82..66503cf29f48 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -27,8 +27,17 @@ #define SDW_SHIM_LCTL_CPA BIT(8) #define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) +/* SYNC */ #define SDW_SHIM_SYNC 0xC +#define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) +#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) +#define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) +#define SDW_SHIM_SYNC_SYNCCPU BIT(15) +#define SDW_SHIM_SYNC_CMDSYNC_MASK GENMASK(19, 16) +#define SDW_SHIM_SYNC_CMDSYNC BIT(16) +#define SDW_SHIM_SYNC_SYNCGO BIT(24) + #define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) #define SDW_SHIM_CTLS0CM(x) (0x012 + 0x60 * (x)) #define SDW_SHIM_CTLS1CM(x) (0x014 + 0x60 * (x)) @@ -45,14 +54,6 @@ #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 -#define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) -#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) -#define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) -#define SDW_SHIM_SYNC_SYNCCPU BIT(15) -#define SDW_SHIM_SYNC_CMDSYNC_MASK GENMASK(19, 16) -#define SDW_SHIM_SYNC_CMDSYNC BIT(16) -#define SDW_SHIM_SYNC_SYNCGO BIT(24) - #define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) #define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) #define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) -- cgit v1.2.3 From c27ce5c9dd178f7218de6fd29651ad04bd496d5c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:40 +0800 Subject: soundwire: intel: remove unused PDM capabilities We removed PDM support a long time ago but kept the definitions. 
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-6-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 66503cf29f48..34af5bc010b3 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -47,7 +47,6 @@ #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_PDMSCAP(x) (0x062 + 0x60 * (x)) #define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) #define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) @@ -63,11 +62,6 @@ #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) #define SDW_SHIM_PCMSYCM_DIR BIT(15) -#define SDW_SHIM_PDMSCAP_ISS GENMASK(3, 0) -#define SDW_SHIM_PDMSCAP_OSS GENMASK(7, 4) -#define SDW_SHIM_PDMSCAP_BSS GENMASK(12, 8) -#define SDW_SHIM_PDMSCAP_CPSS GENMASK(15, 13) - #define SDW_SHIM_IOCTL_MIF BIT(0) #define SDW_SHIM_IOCTL_CO BIT(1) #define SDW_SHIM_IOCTL_COE BIT(2) -- cgit v1.2.3 From bd45a65dad8e13ba47f8fb3d6a6c68adecc96db7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:41 +0800 Subject: soundwire: intel: add comment for control stream cap/chmap add comment and newline to mark out control stream capabilities and chmap. These registers are unused by the driver, only dumped in debugfs. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-7-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 34af5bc010b3..7c1d111c5f3f 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -38,11 +38,13 @@ #define SDW_SHIM_SYNC_CMDSYNC BIT(16) #define SDW_SHIM_SYNC_SYNCGO BIT(24) +/* Control stream capabililities and channel mask */ #define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) #define SDW_SHIM_CTLS0CM(x) (0x012 + 0x60 * (x)) #define SDW_SHIM_CTLS1CM(x) (0x014 + 0x60 * (x)) #define SDW_SHIM_CTLS2CM(x) (0x016 + 0x60 * (x)) #define SDW_SHIM_CTLS3CM(x) (0x018 + 0x60 * (x)) + #define SDW_SHIM_PCMSCAP(x) (0x020 + 0x60 * (x)) #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) -- cgit v1.2.3 From 40f7a3ddf4e4eff8868d0b0e4ed072f41ab6a528 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:42 +0800 Subject: soundwire: intel: cleanup PCM stream capabilities Regroup offset and bitfields Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 7c1d111c5f3f..958628e936ea 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -45,8 +45,13 @@ #define SDW_SHIM_CTLS2CM(x) (0x016 + 0x60 * (x)) #define SDW_SHIM_CTLS3CM(x) (0x018 + 0x60 * (x)) +/* PCM Stream capabilities */ #define 
SDW_SHIM_PCMSCAP(x) (0x020 + 0x60 * (x)) +#define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) +#define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) +#define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) + #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) @@ -55,10 +60,6 @@ #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 -#define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) -#define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) -#define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) - #define SDW_SHIM_PCMSYCM_LCHN GENMASK(3, 0) #define SDW_SHIM_PCMSYCM_HCHN GENMASK(7, 4) #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) -- cgit v1.2.3 From 5c0d256201ec0e47e42ca8cafd3bee54f1c923f0 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:43 +0800 Subject: soundwire: intel: cleanup PCM Stream channel map and channel count Regroup offset and bitfield definitions. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-9-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 958628e936ea..a1810701642e 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -52,19 +52,23 @@ #define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) #define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) +/* PCM Stream Channel Map */ #define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) -#define SDW_SHIM_WAKEEN 0x190 -#define SDW_SHIM_WAKESTS 0x192 +/* PCM Stream Channel Count */ +#define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) #define SDW_SHIM_PCMSYCM_LCHN GENMASK(3, 0) #define SDW_SHIM_PCMSYCM_HCHN GENMASK(7, 4) #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) #define SDW_SHIM_PCMSYCM_DIR BIT(15) +#define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + +#define SDW_SHIM_WAKEEN 0x190 +#define SDW_SHIM_WAKESTS 0x192 + #define SDW_SHIM_IOCTL_MIF BIT(0) #define SDW_SHIM_IOCTL_CO BIT(1) #define SDW_SHIM_IOCTL_COE BIT(2) -- cgit v1.2.3 From 3ea29d33651db069080d1e3b12aea5ef36d766d1 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:44 +0800 Subject: soundwire: intel: cleanup IO control Regroup offset and bitfield definitions. 
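A hedged sketch of how the channel-map fields regrouped above fit together when programming one PDI; the helper and its parameters are illustrative, not taken from the driver:

#include <linux/bitfield.h>
#include <linux/io.h>

static void sdw_intel_program_chmap(void __iomem *shim, unsigned int link_id,
				    unsigned int pdi, u8 lchn, u8 hchn,
				    u8 stream_id, bool dir)
{
	u16 map = FIELD_PREP(SDW_SHIM_PCMSYCM_LCHN, lchn) |
		  FIELD_PREP(SDW_SHIM_PCMSYCM_HCHN, hchn) |
		  FIELD_PREP(SDW_SHIM_PCMSYCM_STREAM, stream_id) |
		  FIELD_PREP(SDW_SHIM_PCMSYCM_DIR, dir);

	writew(map, shim + SDW_SHIM_PCMSYCHM(link_id, pdi));
}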
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-10-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index a1810701642e..a2e1927ac8ac 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -63,11 +63,8 @@ #define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) #define SDW_SHIM_PCMSYCM_DIR BIT(15) +/* IO control */ #define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) - -#define SDW_SHIM_WAKEEN 0x190 -#define SDW_SHIM_WAKESTS 0x192 #define SDW_SHIM_IOCTL_MIF BIT(0) #define SDW_SHIM_IOCTL_CO BIT(1) @@ -79,6 +76,11 @@ #define SDW_SHIM_IOCTL_CIBD BIT(8) #define SDW_SHIM_IOCTL_DIBD BIT(9) +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + +#define SDW_SHIM_WAKEEN 0x190 +#define SDW_SHIM_WAKESTS 0x192 + #define SDW_SHIM_CTMCTL_DACTQE BIT(0) #define SDW_SHIM_CTMCTL_DODS BIT(1) #define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) -- cgit v1.2.3 From bc7b9595392407549158651bc760c9bda2ab8b5d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:45 +0800 Subject: soundwire: intel: cleanup AC Timing Control Regroup offset and bitfield definitions Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-11-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index a2e1927ac8ac..3a56fd5a6331 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -76,11 +76,12 @@ #define SDW_SHIM_IOCTL_CIBD BIT(8) #define SDW_SHIM_IOCTL_DIBD BIT(9) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) - #define SDW_SHIM_WAKEEN 0x190 #define SDW_SHIM_WAKESTS 0x192 +/* AC Timing control */ +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + #define SDW_SHIM_CTMCTL_DACTQE BIT(0) #define SDW_SHIM_CTMCTL_DODS BIT(1) #define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) -- cgit v1.2.3 From 279e46bc298629b26436c90bd4b5d104cc1e0fb2 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 23 Aug 2022 13:38:46 +0800 Subject: soundwire: intel: cleanup WakeEnable and WakeStatus Regroup offset and bitfield definitions. 
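For context, a sketch of how the wake bits regrouped in the diff below tend to be used; the per-link shift is an assumption based on how the other SHIM bitfields are laid out, and the helper name is made up:

static void sdw_intel_shim_wake(void __iomem *shim, unsigned int link_id,
				bool enable)
{
	u16 wake_en = readw(shim + SDW_SHIM_WAKEEN);

	if (enable)
		wake_en |= SDW_SHIM_WAKEEN_ENABLE << link_id;
	else
		wake_en &= ~(SDW_SHIM_WAKEEN_ENABLE << link_id);

	writew(wake_en, shim + SDW_SHIM_WAKEEN);
}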
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20220823053846.2684635-12-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 3a56fd5a6331..2e9fd91572d4 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -76,9 +76,16 @@ #define SDW_SHIM_IOCTL_CIBD BIT(8) #define SDW_SHIM_IOCTL_DIBD BIT(9) +/* Wake Enable*/ #define SDW_SHIM_WAKEEN 0x190 + +#define SDW_SHIM_WAKEEN_ENABLE BIT(0) + +/* Wake Status */ #define SDW_SHIM_WAKESTS 0x192 +#define SDW_SHIM_WAKESTS_STATUS BIT(0) + /* AC Timing control */ #define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) @@ -86,9 +93,6 @@ #define SDW_SHIM_CTMCTL_DODS BIT(1) #define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) -#define SDW_SHIM_WAKEEN_ENABLE BIT(0) -#define SDW_SHIM_WAKESTS_STATUS BIT(0) - /* Intel ALH Register definitions */ #define SDW_ALH_STRMZCFG(x) (0x000 + (0x4 * (x))) #define SDW_ALH_NUM_STREAMS 64 -- cgit v1.2.3 From 8dfa9d554061873f96335730fb1d403698b2b1b4 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:25 +0900 Subject: mm/slab_common: move declaration of __ksize() to mm/slab.h __ksize() is only called by KASAN. Remove the export symbol and move the declaration to mm/slab.h as we don't want to grow its callers. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index c8e485ce8815..9b592e611cb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -187,7 +187,6 @@ int kmem_cache_shrink(struct kmem_cache *s); void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); -size_t __ksize(const void *objp); size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); -- cgit v1.2.3 From ac56a0b48da86fd1b4389632fb7c4c8a5d86eefa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Aug 2022 15:39:28 +0100 Subject: rxrpc: Fix ICMP/ICMP6 error handling Because rxrpc pretends to be a tunnel on top of a UDP/UDP6 socket, allowing it to siphon off UDP packets early in the handling of received UDP packets thereby avoiding the packet going through the UDP receive queue, it doesn't get ICMP packets through the UDP ->sk_error_report() callback. In fact, it doesn't appear that there's any usable option for getting hold of ICMP packets. Fix this by adding a new UDP encap hook to distribute error messages for UDP tunnels. If the hook is set, then the tunnel driver will be able to see ICMP packets. The hook provides the offset into the packet of the UDP header of the original packet that caused the notification. An alternative would be to call the ->error_handler() hook - but that requires that the skbuff be cloned (as ip_icmp_error() or ipv6_icmp_error() do, though that isn't really necessary or desirable in rxrpc's case, as we want to parse them there and then, not queue them). Changes ======= ver #3) - Fixed an uninitialised variable. ver #2) - Fixed some missing CONFIG_AF_RXRPC_IPV6 conditionals.
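To make the new hook concrete, a sketch of what a tunnel driver might install; the handler name and body are illustrative only, and a real tunnel would normally wire this up during its UDP socket setup alongside the existing encap_rcv assignment:

/* called for ICMP/ICMP6 errors on the encap socket; udp_offset is the
 * offset of the original packet's UDP header within skb */
static void my_encap_err_rcv(struct sock *sk, struct sk_buff *skb,
			     unsigned int udp_offset)
{
	/* parse the ICMP payload and map it back to a tunnel session */
}

and, at socket setup time:

	udp_sk(usk)->encap_err_rcv = my_encap_err_rcv;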
Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Signed-off-by: David Howells --- include/linux/udp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 254a2654400f..e96da4157d04 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,6 +70,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); -- cgit v1.2.3 From 4e55e22d3d9aa50ef1ba059bf3a53aa61109c179 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 31 Aug 2022 15:50:52 +0300 Subject: USB: hcd-pci: Drop the unused id parameter from usb_hcd_pci_probe() Since the HC driver is now passed to the function with its own parameter (it used to be picked from id->driver_data), the id parameter serves no purpose. Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20220831125052.71584-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 67f8713d3fa3..78cd566ee238 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -479,7 +479,6 @@ static inline int ehset_single_step_set_feature(struct usb_hcd *hcd, int port) struct pci_dev; struct pci_device_id; extern int usb_hcd_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id, const struct hc_driver *driver); extern void usb_hcd_pci_remove(struct pci_dev *dev); extern void usb_hcd_pci_shutdown(struct pci_dev *dev); -- cgit v1.2.3 From a97126265dfe10d3321c0fde4708a6cea49b19ed Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Thu, 25 Aug 2022 12:44:20 +0200 Subject: leds: simatic-ipc-leds-gpio: add new model 227G This adds support for the Siemens Simatic IPC227G. Its LEDs are connected to GPIO pins provided by the gpio-f7188x module. We make sure that module gets loaded; if it is not enabled in the kernel config, no LED support will be available.
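For orientation, an illustrative sketch of how the new IDs added below are expected to be tied together by the platform code; SIMATIC_IPC_DEVICE_NONE is assumed here to be the existing fallback value:

static u8 simatic_ipc_devmode_from_station(u8 *dmi_data, int max_len)
{
	u32 station_id = simatic_ipc_get_station_id(dmi_data, max_len);

	switch (station_id) {
	case SIMATIC_IPC_IPC227G:
		return SIMATIC_IPC_DEVICE_227G;
	default:
		return SIMATIC_IPC_DEVICE_NONE;	/* assumed fallback */
	}
}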
Reviewed-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Henning Schild Link: https://lore.kernel.org/r/20220825104422.14156-6-henning.schild@siemens.com Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/simatic-ipc-base.h | 1 + include/linux/platform_data/x86/simatic-ipc.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/simatic-ipc-base.h b/include/linux/platform_data/x86/simatic-ipc-base.h index 39fefd48cf4d..57d6a10dfc9e 100644 --- a/include/linux/platform_data/x86/simatic-ipc-base.h +++ b/include/linux/platform_data/x86/simatic-ipc-base.h @@ -19,6 +19,7 @@ #define SIMATIC_IPC_DEVICE_427E 2 #define SIMATIC_IPC_DEVICE_127E 3 #define SIMATIC_IPC_DEVICE_227E 4 +#define SIMATIC_IPC_DEVICE_227G 5 struct simatic_ipc_platform { u8 devmode; diff --git a/include/linux/platform_data/x86/simatic-ipc.h b/include/linux/platform_data/x86/simatic-ipc.h index f3b76b39776b..7a2e79f3be0b 100644 --- a/include/linux/platform_data/x86/simatic-ipc.h +++ b/include/linux/platform_data/x86/simatic-ipc.h @@ -31,6 +31,7 @@ enum simatic_ipc_station_ids { SIMATIC_IPC_IPC427E = 0x00000A01, SIMATIC_IPC_IPC477E = 0x00000A02, SIMATIC_IPC_IPC127E = 0x00000D01, + SIMATIC_IPC_IPC227G = 0x00000F01, }; static inline u32 simatic_ipc_get_station_id(u8 *data, int max_len) -- cgit v1.2.3 From 8f5c9858c5db129359b5de2f60f5f034bf5d56c0 Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Thu, 25 Aug 2022 12:44:22 +0200 Subject: platform/x86: simatic-ipc: add new model 427G This adds support for the Siemens Simatic IPC427G, a board which basically works like the 227G added in a previous patch. So all there is to do is to add the station_id and make it take all the 227G branches. Reviewed-by: Andy Shevchenko Signed-off-by: Henning Schild Link: https://lore.kernel.org/r/20220825104422.14156-8-henning.schild@siemens.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/simatic-ipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/simatic-ipc.h b/include/linux/platform_data/x86/simatic-ipc.h index 7a2e79f3be0b..632320ec8f08 100644 --- a/include/linux/platform_data/x86/simatic-ipc.h +++ b/include/linux/platform_data/x86/simatic-ipc.h @@ -32,6 +32,7 @@ enum simatic_ipc_station_ids { SIMATIC_IPC_IPC477E = 0x00000A02, SIMATIC_IPC_IPC127E = 0x00000D01, SIMATIC_IPC_IPC227G = 0x00000F01, + SIMATIC_IPC_IPC427G = 0x00001001, }; static inline u32 simatic_ipc_get_station_id(u8 *data, int max_len) -- cgit v1.2.3 From 0a64ce6e5442bbd96cbe9057d9ba1edab244f25b Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 30 Aug 2022 16:50:04 +0200 Subject: kernel/panic: Drop unblank_screen call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit console_unblank() does this too (called in both places right after), and with a much more confidence-inspiring approach to locking. Reconstructing this story is very strange: In b61312d353da ("oops handling: ensure that any oops is flushed to the mtdoops console") it is claimed that a printk(" "); flushed out the console buffer, which was removed in e3e8a75d2acf ("[PATCH] Extract and use wake_up_klogd()"). In today's kernels this is done way earlier in console_flush_on_panic with some really nasty tricks.
I didn't bother to fully reconstruct this all, not least because the call to bust_spinlocks(0); gets moved every few years, depending upon how the wind blows (or well, who screamed loudest about the various issues each call site caused). Before that commit the only calls to console_unblank() were in s390 arch code. The other side here is the console->unblank callback, which was introduced in 2.1.31 for the vt driver. That predates the console_unblank() function by a lot, which was added (without users) in 2.4.14.3. So it is pretty much impossible to guess at any motivation here. Also afaict the vt driver is the only (and always was the only) console driver implementing the unblank callback, so no idea why a call to console_unblank() was added for the mtdoops driver - the action actually flushing out the console buffers is done from console_unlock() only. Note that as prep for the s390 users the locking was adjusted in 2.5.22 (I couldn't figure out how to properly reference the BK commit from the historical git trees) from a normal semaphore to a trylock. Note that a copy of the direct unblank_screen() call was added to panic() in c7c3f05e341a ("panic: avoid deadlocks in re-entrant console drivers"), which partially inlined the bust_spinlocks(0); call. Long story short, I have no idea why the direct call to unblank_screen survived for so long (the infrastructure to do it properly existed for years), nor why it wasn't removed when the console_unblank() call was finally added. But it makes a ton more sense to finally do that than not - it's just better encapsulation to go through the console functions instead of doing a direct call, so let's dare. Plus it really does not make much sense to call the only unblank implementation there is twice, once without, and once with appropriate locking. Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Daniel Vetter Cc: "Ilpo Järvinen" Cc: Tetsuo Handa Cc: Xuezhi Zhang Cc: Yangxi Xiang Cc: nick black Cc: Petr Mladek Cc: Andrew Morton Cc: Luis Chamberlain Cc: "Guilherme G. Piccoli" Cc: Marco Elver Cc: John Ogness Cc: Sebastian Andrzej Siewior Cc: David Gow Cc: tangmeng Cc: Tiezhu Yang Cc: Chris Wilson Reviewed-by: Petr Mladek Acked-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Vetter Signed-off-by: Andrew Morton Link: https://lore.kernel.org/r/20220830145004.430545-1-daniel.vetter@ffwll.ch Signed-off-by: Greg Kroah-Hartman --- include/linux/vt_kern.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index b5ab452fca5b..c1f5aebef170 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -30,7 +30,6 @@ struct vc_data *vc_deallocate(unsigned int console); void reset_palette(struct vc_data *vc); void do_blank_screen(int entering_gfx); void do_unblank_screen(int leaving_gfx); -void unblank_screen(void); void poke_blanked_console(void); int con_font_op(struct vc_data *vc, struct console_font_op *op); int con_set_cmap(unsigned char __user *cmap); -- cgit v1.2.3 From 3954cf4338becb1f140bb6fa4f5e9a42f2529b86 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Aug 2022 08:14:24 +0200 Subject: devres: remove devm_ioremap_np devm_ioremap_np has never been used anywhere since it was added in early 2021, so remove it.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220822061424.151819-1-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- include/linux/io.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 5fc800390fe4..308f4f0cfb93 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -59,8 +59,6 @@ void __iomem *devm_ioremap_uc(struct device *dev, resource_size_t offset, resource_size_t size); void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset, resource_size_t size); -void __iomem *devm_ioremap_np(struct device *dev, resource_size_t offset, - resource_size_t size); void devm_iounmap(struct device *dev, void __iomem *addr); int check_signature(const volatile void __iomem *io_addr, const unsigned char *signature, int length); -- cgit v1.2.3 From c25491747b21536bd56dccb82a109754bbc8d52c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 27 Aug 2022 19:04:37 -1000 Subject: kernfs: Add KERNFS_REMOVING flags KERNFS_ACTIVATED tracks whether a given node has ever been activated. As a node was only deactivated on removal, this was used for:
1. Drain optimization (removed by the previous patch).
2. To hide !activated nodes.
3. To avoid double activations.
4. To reject adding children to a node being removed.
5. To skip activating a node which is being removed.
We want to decouple deactivation from removal so that nodes can be deactivated and hidden dynamically, which makes KERNFS_ACTIVATED useless for all of the above purposes. #1 is already gone. #2 and #3 can instead test whether the node is currently active. A new flag KERNFS_REMOVING is added to explicitly mark nodes which are being removed for #4 and #5. While this leaves KERNFS_ACTIVATED with no users, leave it be as it will be used in a following patch. Cc: Chengming Zhou Tested-by: Chengming Zhou Reviewed-by: Chengming Zhou Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20220828050440.734579-7-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 367044d7708c..b77d257c1f7e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -112,6 +112,7 @@ enum kernfs_node_flag { KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, + KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ -- cgit v1.2.3 From 783bd07d095b722108725af11113795ee046ed0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 27 Aug 2022 19:04:39 -1000 Subject: kernfs: Implement kernfs_show() Currently, kernfs nodes can be created hidden and activated later by calling kernfs_activate() to allow creation of multiple nodes to succeed or fail as a unit. This is a one-way, one-time-only transition. This patch introduces kernfs_show() which can toggle visibility dynamically. As the currently proposed use - toggling the cgroup pressure files - only requires operating on leaf nodes, for the sake of simplicity, restrict it as such for now. Hiding uses the same mechanism as deactivation and likewise guarantees that there are no in-flight operations on completion. KERNFS_ACTIVATED and KERNFS_HIDDEN are used to manage the interactions between activations and show/hide operations. A node is visible iff both activated & !hidden.
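In code, the new toggle is minimal (a sketch; kn is an already-created leaf node):

	/* hide: drains in-flight operations, then makes the node invisible */
	kernfs_show(kn, false);

	/* show again; takes effect only if the node was ever activated */
	kernfs_show(kn, true);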
Cc: Chengming Zhou Cc: Johannes Weiner Tested-by: Chengming Zhou Reviewed-by: Chengming Zhou Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20220828050440.734579-9-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b77d257c1f7e..73f5c120def8 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -108,6 +108,7 @@ enum kernfs_node_flag { KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, + KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, @@ -430,6 +431,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); +void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); -- cgit v1.2.3 From e2691f6b44ed2135bfd005ad5fbabac4f433a7a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 27 Aug 2022 19:04:40 -1000 Subject: cgroup: Implement cgroup_file_show() Add cgroup_file_show() which allows toggling visibility of a cgroup file using the new kernfs_show(). This will be used to hide psi interface files on cgroups where it's disabled. Cc: Chengming Zhou Cc: Johannes Weiner Tested-by: Chengming Zhou Reviewed-by: Chengming Zhou Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20220828050440.734579-10-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..f61dd135efbe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -114,6 +114,7 @@ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); +void cgroup_file_show(struct cgroup_file *cfile, bool show); int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -- cgit v1.2.3 From 16ede66973c84f890c03584f79158dd5b2d725f5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 25 Aug 2022 07:53:12 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220825145312.1217900-1-kbusch@fb.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. 
*/ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From e34a0425b8ef524355811e7408dc1d53d08dc538 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:01 -0300 Subject: vfio/pci: Split linux/vfio_pci_core.h The header in include/linux should have only the exported interface for other vfio_pci modules to use. Internal definitions for vfio_pci.ko should be in a "priv" header alongside the .c files. Move the internal declarations out of vfio_pci_core.h. They either move to vfio_pci_priv.h or to the C file that is the only user. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 134 +----------------------------------------- 1 file changed, 1 insertion(+), 133 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5579ece4347b..9d18b832e61a 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -20,39 +20,10 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 - #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -/* Special capability IDs predefined access */ -#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ -#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ - -/* Cap maximum number of ioeventfds per device (arbitrary) */ -#define VFIO_PCI_IOEVENTFD_MAX 1000 - -struct vfio_pci_ioeventfd { - struct list_head next; - struct vfio_pci_core_device *vdev; - struct virqfd *virqfd; - void __iomem *addr; - uint64_t data; - loff_t pos; - int bar; - int count; - bool test_mem; -}; - -struct vfio_pci_irq_ctx { - struct eventfd_ctx *trigger; - struct virqfd *unmask; - struct virqfd *mask; - char *name; - bool masked; - struct irq_bypass_producer producer; -}; - struct vfio_pci_core_device; struct vfio_pci_region; @@ -78,23 +49,6 @@ struct vfio_pci_region { u32 flags; }; -struct vfio_pci_dummy_resource { - struct resource resource; - int index; - struct list_head res_next; -}; - -struct vfio_pci_vf_token { - struct mutex lock; - uuid_t uuid; - int users; -}; - -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; @@ -141,92 +95,11 @@ struct vfio_pci_core_device { struct rw_semaphore memory_lock; }; -#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) -#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) -#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) -#define irq_is(vdev, type) (vdev->irq_type == type) - -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); -void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); - -int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, - uint32_t flags, unsigned index, - unsigned start, unsigned count, void *data); - -ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); - -ssize_t
vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); - -#ifdef CONFIG_VFIO_PCI_VGA -ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); -#else -static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite) -{ - return -EINVAL; -} -#endif - -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd); - -int vfio_pci_init_perm_bits(void); -void vfio_pci_uninit_perm_bits(void); - -int vfio_config_init(struct vfio_pci_core_device *vdev); -void vfio_config_free(struct vfio_pci_core_device *vdev); - +/* Will be exported for vfio pci drivers usage */ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); - -int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, - pci_power_t state); - -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); -void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); -u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); -void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, - u16 cmd); - -#ifdef CONFIG_VFIO_PCI_IGD -int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) -{ - return -ENODEV; -} -#endif - -#ifdef CONFIG_VFIO_PCI_ZDEV_KVM -int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps); -int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); -void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps) -{ - return -ENODEV; -} - -static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) -{ - return 0; -} - -static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) -{} -#endif - -/* Will be exported for vfio pci drivers usage */ void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); @@ -256,9 +129,4 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -static inline bool vfio_pci_is_vga(struct pci_dev *pdev) -{ - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; -} - #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From 1e979ef5df8b7b604a625343a179b812a7984068 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:02 -0300 Subject: vfio/pci: Rename vfio_pci_register_dev_region() As this is part of the vfio_pci_core component it should be called vfio_pci_core_register_dev_region() like everything else exported from this module. 
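A hedged sketch of a caller under the new name; the vendor ID, subtype, ops and size below are placeholders, while the region type and flag constants are standard vfio uapi:

static int my_variant_setup_region(struct vfio_pci_core_device *vdev)
{
	/* MY_VENDOR_ID, MY_SUBTYPE, my_regops and MY_SIZE are hypothetical */
	return vfio_pci_core_register_dev_region(vdev,
			MY_VENDOR_ID | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			MY_SUBTYPE, &my_regops, MY_SIZE,
			VFIO_REGION_INFO_FLAG_READ, NULL);
}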
Suggested-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/2-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 9d18b832e61a..e5cf0d3313a6 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -96,10 +96,10 @@ struct vfio_pci_core_device { }; /* Will be exported for vfio pci drivers usage */ -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data); +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -- cgit v1.2.3 From 4813724c4b76b62f8e6b60dd5655633d0db1c9a8 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:48 +0530 Subject: vfio/pci: Mask INTx during runtime suspend This patch adds INTx handling during runtime suspend/resume. All the suspend/resume related code for the user to put the device into the low power state will be added in subsequent patches. The INTx lines may be shared among devices. Whenever any INTx interrupt comes for the VFIO devices, then vfio_intx_handler() will be called for each device sharing the interrupt. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx() and checks if the interrupt has been generated for the current device. Now, if the device is already in the D3cold state, then the config space can not be read. An attempt to read config space in the D3cold state can cause system unresponsiveness on a few systems. To prevent this, mask INTx in the runtime suspend callback and unmask it in the runtime resume callback. If INTx has already been masked, then no handling is needed in the runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and vfio_pci_intx_mask() has been updated to return true if the INTx vfio_pci_irq_ctx.masked value is changed inside this function. For the runtime suspend which is triggered when there is no user of the VFIO device, the 'irq_type' will be VFIO_PCI_NUM_IRQS and these callbacks won't do anything. MSI/MSI-X are not shared, so similar handling should not be needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal() without doing any device-specific config access. When the user performs any config access or IOCTL after receiving the eventfd notification, then the device will be moved to the D0 state first before servicing any request. Another option was to check this flag 'pm_intx_masked' inside vfio_intx_handler() instead of masking the interrupts. This flag is being set inside the runtime_suspend callback but the device can be in a non-D3cold state (for example, if the user has disabled D3cold explicitly via sysfs, D3cold is not supported on the platform, etc.). Also, in the D3cold-supported case, the device will be in D0 till the PCI core moves the device into D3cold. In this case, there is a possibility that the device can generate an interrupt.
Adding a check in the IRQ handler will not clear the IRQ status and the interrupt line will still be asserted. This can cause interrupt flooding. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-4-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index e5cf0d3313a6..a0f1f36e42a2 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -78,6 +78,7 @@ struct vfio_pci_core_device { bool needs_reset; bool nointx; bool needs_pm_restore; + bool pm_intx_masked; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- cgit v1.2.3 From cc2742fe3660cc6500021d3da8f937d326392dbd Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:49 +0530 Subject: vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT Currently, if the runtime power management is enabled for vfio-pci based devices in the guest OS, then the guest OS will do the register write for the PCI_PM_CTRL register. This write request will be handled in vfio_pm_config_write() where it will do the actual register write of the PCI_PM_CTRL register. With this, the maximum D3hot state can be achieved for low power. If we can use the runtime PM framework, then we can achieve the D3cold state (on the supported systems), which will help in saving maximum power.
1. The D3cold state can't be achieved by writing PCI standard PM config registers. This patch implements the following newly added low power related device features:
   - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
   - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
   The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the device to make use of low power platform states on the host, while VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent further use of those power states.
2. The vfio-pci driver uses the runtime PM framework for low power entry and exit. On platforms where the D3cold state is supported, the runtime PM framework will put the device into D3cold; otherwise, D3hot or some other power state will be used. There are various cases where the device will not go into the runtime suspended state. For example:
   - The runtime power management is disabled on the host side for the device.
   - The user keeps the device busy after calling LOW_POWER_ENTRY.
   - There are dependent devices that are still in the runtime active state.
   For these cases, the device will be in the same power state that has been configured by the user through the PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example, in a guest Linux OS, if the PCI device ACPI node has _PR3 and _PR0 power resources with _ON/_OFF methods, then the guest Linux OS invokes the _OFF method during the D3cold transition and then _ON during the D0 transition. The hypervisor can tap these virtual ACPI calls and then call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to runtime PM. This flag is protected with the 'memory_lock' semaphore.
5. All the config and other region accesses are wrapped under pm_runtime_resume_and_get() and pm_runtime_put(). So, if any device access happens while the device is in the runtime suspended state, then the device will be resumed first before access. Once the access has been finished, the device will again go into the runtime suspended state.
Memory region access through mmap is not allowed in the low power state. Since __vfio_pci_memory_enabled() is a common function, the 'pm_runtime_engaged' check is added explicitly in vfio_pci_mmap_fault() so that only mmap'ed access is blocked. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a0f1f36e42a2..1025d53fde0b 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -79,6 +79,7 @@ struct vfio_pci_core_device { bool nointx; bool needs_pm_restore; bool pm_intx_masked; + bool pm_runtime_engaged; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; -- cgit v1.2.3 From 453e6c98fd2bdae0df9adbc86af8d8bf1164edd5 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:50 +0530 Subject: vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP This patch implements the VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device feature. With VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY, any host-side access to the VFIO device moves the device out of the low power state without involving the user's guest driver; once the access has finished, the host can move the device into the low power state again. With low power entry through VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP, the device is not moved back into the low power state, and the user is notified by triggering the wakeup eventfd. vfio_pci_core_pm_entry() is called for both variants of low power entry, so add an extra argument for the wakeup eventfd context and store it locally in 'struct vfio_pci_core_device'. For entry without a wakeup eventfd, all exit related handling is done by the LOW_POWER_EXIT device feature only. When LOW_POWER_EXIT is called, the vfio core layer's vfio_device_pm_runtime_get() increments the usage count and resumes the device. In the driver's runtime_resume callback, 'pm_wake_eventfd_ctx' will be NULL; vfio_pci_core_pm_exit() then calls vfio_pci_runtime_pm_exit() and all exit related handling is done there. For entry with a wakeup eventfd, the eventfd is triggered in the driver's resume callback and all exit related handling is done at that point; when vfio_pci_runtime_pm_exit() is later called by vfio_pci_core_pm_exit(), it returns early. But if runtime suspend has not actually happened on the host side, all exit related handling is done in vfio_pci_core_pm_exit() itself.
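As an editorial aside, here is a hedged userspace sketch of exercising these two entry variants through the VFIO_DEVICE_FEATURE ioctl (buffer layout per the uapi this series adds; device_fd, the eventfd handling, and the lack of error checking are illustrative only, not from the patches):

	/* Enter low power with a wakeup eventfd. */
	struct vfio_device_low_power_entry_with_wakeup entry = {
		.wakeup_eventfd = eventfd(0, EFD_CLOEXEC),
	};
	char buf[sizeof(struct vfio_device_feature) + sizeof(entry)];
	struct vfio_device_feature *feature = (void *)buf;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
	memcpy(feature->data, &entry, sizeof(entry));
	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);

	/* ... a signal on the eventfd means the host woke the device ... */

	/* Leave low power. */
	feature->argsz = sizeof(*feature);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);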
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-6-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 1025d53fde0b..089b603bcfdc 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -85,6 +85,7 @@ struct vfio_pci_core_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; -- cgit v1.2.3 From c8e477c649b40c1a073b7a843d89e51dc0037db7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 30 Jan 2022 19:57:52 -0500 Subject: ->getprocattr(): attribute name is const char *, TYVM... cast of ->d_name.name to char * is completely wrong - nothing is allowed to modify its contents. Reviewed-by: Christian Brauner (Microsoft) Acked-by: Paul Moore Acked-by: Casey Schaufler Signed-off-by: Al Viro --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..03360d27bedf 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -253,7 +253,7 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) -LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, char *name, +LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..93488c01d9bd 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -461,7 +461,7 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); -int security_getprocattr(struct task_struct *p, const char *lsm, char *name, +int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, char **value); int security_setprocattr(const char *lsm, const char *name, void *value, size_t size); @@ -1301,7 +1301,7 @@ static inline void security_d_instantiate(struct dentry *dentry, { } static inline int security_getprocattr(struct task_struct *p, const char *lsm, - char *name, char **value) + const char *name, char **value) { return -EINVAL; } -- cgit v1.2.3 From ea4af4aa03c3966c63231b4191da94497de8f034 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Aug 2022 13:19:18 -0400 Subject: nd_jump_link(): constify path Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- include/linux/namei.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index caeb08a98536..00fee52df842 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -83,7 +83,7 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct 
dentry *); extern void unlock_rename(struct dentry *, struct dentry *); -extern int __must_check nd_jump_link(struct path *path); +extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { -- cgit v1.2.3 From 977f1aa5e4d1942bc8012bf3f0e695008979ff4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 18:44:27 +0000 Subject: net: bql: add more documentation Add some documentation for netdev_tx_sent_queue() and netdev_tx_completed_queue() Stating that netdev_tx_completed_queue() must be called once per TX completion round is apparently not obvious for everybody. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eec35f9b6616..cfe41c053e11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3353,6 +3353,16 @@ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_qu #endif } +/** + * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue + * @dev_queue: network device queue + * @bytes: number of bytes queued to the device queue + * + * Report the number of bytes queued for sending/completion to the network + * device hardware queue. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). + */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { @@ -3398,13 +3408,14 @@ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, } /** - * netdev_sent_queue - report the number of bytes queued to hardware - * @dev: network device - * @bytes: number of bytes queued to the hardware device queue + * netdev_sent_queue - report the number of bytes queued to hardware + * @dev: network device + * @bytes: number of bytes queued to the hardware device queue * - * Report the number of bytes queued for sending/completion to the network - * device hardware queue. @bytes should be a good approximation and should - * exactly match netdev_completed_queue() @bytes + * Report the number of bytes queued for sending/completion to the network + * device hardware queue#0. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { @@ -3419,6 +3430,15 @@ static inline bool __netdev_sent_queue(struct net_device *dev, xmit_more); } +/** + * netdev_tx_completed_queue - report number of packets/bytes at TX completion. + * @dev_queue: network device queue + * @pkts: number of packets (currently ignored) + * @bytes: number of bytes dequeued from the device queue + * + * Must be called at most once per TX completion round (and not per + * individual packet), so that BQL can adjust its limits appropriately. 
+ */ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { -- cgit v1.2.3 From 3261400639463a853ba2b3be8bd009c2a8089775 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 23:38:09 +0000 Subject: tcp: TX zerocopy should not sense pfmemalloc status We got a recent syzbot report [1] showing a possible misuse of pfmemalloc page status in TCP zerocopy paths. Indeed, for pages coming from user space or other layers, using page_is_pfmemalloc() is moot, and possibly could give false positives. There have been attempts to make page_is_pfmemalloc() more robust, but not using it in the first place in this context is probably better, removing cpu cycles. Note to stable teams: you need to backport 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") as a prereq. Race is more probable after commit c07aea3ef4d4 ("mm: add a signature in struct page") because page_is_pfmemalloc() is now using the low order bit from page->lru.next, which can change more often than page->index. The low order bit should never be set for lru.next (when used as an anchor in an LRU list), so the KCSAN report is mostly a false positive. Backporting to older kernel versions does not seem necessary. [1] BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0: __list_add include/linux/list.h:73 [inline] list_add include/linux/list.h:88 [inline] lruvec_add_folio include/linux/mm_inline.h:105 [inline] lru_add_fn+0x440/0x520 mm/swap.c:228 folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246 folio_batch_add_and_move mm/swap.c:263 [inline] folio_add_lru+0xf1/0x140 mm/swap.c:490 filemap_add_folio+0xf8/0x150 mm/filemap.c:948 __filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981 pagecache_get_page+0x26/0x190 mm/folio-compat.c:104 grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116 ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988 generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738 ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270 ext4_file_write_iter+0x2e3/0x1210 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x468/0x760 fs/read_write.c:578 ksys_write+0xe8/0x1a0 fs/read_write.c:631 __do_sys_write fs/read_write.c:643 [inline] __se_sys_write fs/read_write.c:640 [inline] __x64_sys_write+0x3e/0x50 fs/read_write.c:640 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1: page_is_pfmemalloc include/linux/mm.h:1740 [inline] __skb_fill_page_desc include/linux/skbuff.h:2422 [inline] skb_fill_page_desc include/linux/skbuff.h:2443 [inline] tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018 do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075 tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline] tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833 kernel_sendpage+0x184/0x300 net/socket.c:3561 sock_sendpage+0x5a/0x70 net/socket.c:1054 pipe_to_sendpage+0x128/0x160 fs/splice.c:361 splice_from_pipe_feed fs/splice.c:415 [inline] __splice_from_pipe+0x222/0x4d0 fs/splice.c:559 splice_from_pipe fs/splice.c:594 [inline] generic_splice_sendpage+0x89/0xc0 fs/splice.c:743 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:931 splice_direct_to_actor+0x305/0x620 fs/splice.c:886 do_splice_direct+0xfb/0x180 fs/splice.c:974 do_sendfile+0x3bf/0x910 fs/read_write.c:1249
__do_sys_sendfile64 fs/read_write.c:1317 [inline] __se_sys_sendfile64 fs/read_write.c:1303 [inline] __x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0xffffea0004a1d288 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022 Fixes: c07aea3ef4d4 ("mm: add a signature in struct page") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..18e163a3460d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2444,6 +2444,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, skb_shinfo(skb)->nr_frags = i + 1; } +/** + * skb_fill_page_desc_noacc - initialise a paged fragment in an skb + * @skb: buffer containing fragment to be initialised + * @i: paged fragment index to initialise + * @page: the page to use for this fragment + * @off: the offset to the data with @page + * @size: the length of the data + * + * Variant of skb_fill_page_desc() which does not deal with + * pfmemalloc, if page is not owned by us. + */ +static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, + struct page *page, int off, + int size) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + __skb_fill_page_desc_noacc(shinfo, i, page, off, size); + shinfo->nr_frags = i + 1; +} + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); -- cgit v1.2.3 From c3f760ef128789252e7c4f10d3c1721422dceba9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2022 17:00:58 -0700 Subject: net: remove netif_tx_napi_add() All callers are now gone. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cfe41c053e11..f0068c1ff1df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2569,8 +2569,6 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } -#define netif_tx_napi_add netif_napi_add_tx_weight - /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device -- cgit v1.2.3 From b30f7c8eb0780e1479a9882526e838664271f4c9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 1 Sep 2022 13:07:32 +0100 Subject: spi: mux: Fix mux interaction with fast path optimisations The spi-mux driver is rather too clever and attempts to resubmit any message that is submitted to it to the parent controller with some adjusted callbacks. This does not play at all nicely with the fast path, which now sets flags on the message indicating that it's being handled through the fast path: we see async messages flagged as being on the fast path. Ideally the spi-mux code would duplicate the message, but that's rather invasive and a bit fragile in that it relies on the mux knowing which fields in the message to copy.
Instead, teach the core that there are controllers which can't cope with the fast path and have the mux flag itself as such a controller, ensuring that messages going via the mux don't get partially handled via the fast path. This will reduce the performance of any spi-mux connected device, since we'll now always use the thread for both the actual controller and the mux controller instead of just the actual controller, but given that we were always hitting the slow path anyway it's hopefully not too much of an additional cost, and it allows us to keep the fast path. Fixes: ae7d2346dc89 ("spi: Don't use the message queue if possible in spi_sync") Reported-by: Casper Andersson Tested-by: Casper Andersson Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220901120732.49245-1-broonie@kernel.org Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e6c73d5ff1a8..f089ee1ead58 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -469,6 +469,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * SPI_TRANS_FAIL_NO_START. * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. + * @must_async: disable all fast paths in the core * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -690,6 +691,7 @@ struct spi_controller { /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; + bool must_async; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) { -- cgit v1.2.3 From dc84dbbcc97bfb47e0f2b175d816e601b2890c91 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 31 Aug 2022 11:19:06 +0800 Subject: bpf, tnums: Warn against the usage of tnum_in(tnum_range(), ...) Commit a657182a5c51 ("bpf: Don't use tnum_range on array range checking for poke descriptors") has shown that using tnum_range() as an argument to tnum_in() can lead to misleading code that looks like a tight bound check when in fact the actual allowed range is much wider. Document this behavior to warn against such usage in general, and suggest some scenarios where the result can be trusted. Signed-off-by: Shung-Hsi Yu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Link: https://www.openwall.com/lists/oss-security/2022/08/26/1 Link: https://lore.kernel.org/bpf/20220831031907.16133-3-shung-hsi.yu@suse.com Link: https://lore.kernel.org/bpf/20220831031907.16133-2-shung-hsi.yu@suse.com --- include/linux/tnum.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 498dbcedb451..1c3948a1d6ad 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -21,7 +21,12 @@ struct tnum { struct tnum tnum_const(u64 value); /* A completely unknown value */ extern const struct tnum tnum_unknown; -/* A value that's unknown except that @min <= value <= @max */ +/* An unknown value that is a superset of @min <= value <= @max. + * + * Could include values outside the range of [@min, @max]. + * For example tnum_range(0, 2) is represented by {0, 1, 2, *3*}, + * rather than the intended set of {0, 1, 2}.
+ */ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ @@ -73,7 +78,18 @@ static inline bool tnum_is_unknown(struct tnum a) */ bool tnum_is_aligned(struct tnum a, u64 size); -/* Returns true if @b represents a subset of @a. */ +/* Returns true if @b represents a subset of @a. + * + * Note that using tnum_range() as @a requires extra cautions as tnum_in() may + * return true unexpectedly due to tnum limited ability to represent tight + * range, e.g. + * + * tnum_in(tnum_range(0, 2), tnum_const(3)) == true + * + * As a rule of thumb, if @a is explicitly coded rather than coming from + * reg->var_off, it should be in form of tnum_const(), tnum_range(0, 2**n - 1), + * or tnum_range(2**n, 2**(n+1) - 1). + */ bool tnum_in(struct tnum a, struct tnum b); /* Formatting functions. These have snprintf-like semantics: they will write -- cgit v1.2.3 From 4ff09db1b79b98b4a2a7511571c640b76cab3beb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:02 -0700 Subject: bpf: net: Change sk_getsockopt() to take the sockptr_t argument This patch changes sk_getsockopt() to take the sockptr_t argument such that it can be used by bpf_getsockopt(SOL_SOCKET) in a later patch. security_socket_getpeersec_stream() is not changed. It stays with the __user ptr (optval.user and optlen.user) to avoid changes to other security hooks. bpf_getsockopt(SOL_SOCKET) also does not support SO_PEERSEC. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002802.2888419-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +-- include/linux/sockptr.h | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index a5f21dc3c432..527ae1d64e27 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -900,8 +900,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); -int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, - unsigned int len); +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index d45902fb4cad..bae5e2369b4f 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -64,6 +64,11 @@ static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, return 0; } +static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset(dst, 0, src, size); +} + static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); -- cgit v1.2.3 From 728f064cd7ebea8c182e99e6f152c8b4a0a6b071 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:28 -0700 Subject: bpf: net: Change do_ip_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument. This patch also changes do_ip_getsockopt() to take the sockptr_t argument such that a later patch can make bpf_getsockopt(SOL_IP) reuse do_ip_getsockopt(). Note on the change in ip_mc_gsfget(). This function returns an array of sockaddr_storage in optval.
This function is shared between ip_get_mcast_msfilter() and compat_ip_get_mcast_msfilter(). However, the sockaddr_storage is stored at a different offset in the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip_mc_gsfget(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002828.2890585-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/igmp.h | 4 ++-- include/linux/mroute.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 93c262ecbdc9..78890143f079 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -118,9 +118,9 @@ extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen); + sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 6cbbfe94348c..80b8400ab8b2 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -17,7 +17,7 @@ static inline int ip_mroute_opt(int opt) } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); @@ -29,8 +29,8 @@ static inline int ip_mroute_setsockopt(struct sock *sock, int optname, return -ENOPROTOOPT; } -static inline int ip_mroute_getsockopt(struct sock *sock, int optname, - char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sk, int optname, + sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 6dadbe4bac68309eb46ab0f30e8ff47a789df49a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:53 -0700 Subject: bpf: net: Change do_ipv6_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument. This patch also changes do_ipv6_getsockopt() to take the sockptr_t argument such that a later patch can make bpf_getsockopt(SOL_IPV6) reuse do_ipv6_getsockopt(). Note on the change in ip6_mc_msfget(). This function returns an array of sockaddr_storage in optval. This function is shared between ipv6_get_msfilter() and compat_ipv6_get_msfilter(). However, the sockaddr_storage is stored at a different offset in the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip6_mc_msfget.
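To illustrate the sockptr_t pattern this series of conversions converges on, here is a hedged sketch of a getsockopt-style helper that serves both user-space and kernel (e.g. BPF) callers through one body; the option chosen and the function name are made up for illustration:

	/* Minimal sketch, not the kernel's actual sk_getsockopt() body. */
	static int example_getsockopt(struct sock *sk, sockptr_t optval,
				      sockptr_t optlen)
	{
		int len, val = READ_ONCE(sk->sk_rcvbuf);

		if (copy_from_sockptr(&len, optlen, sizeof(len)))
			return -EFAULT;
		len = min_t(unsigned int, len, sizeof(val));
		/* Works for USER_SOCKPTR() and KERNEL_SOCKPTR() alike. */
		if (copy_to_sockptr(optval, &val, len))
			return -EFAULT;
		return 0;
	}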
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002853.2892532-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/mroute6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index bc351a85ce9b..8f2b307fb124 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -27,7 +27,7 @@ struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -extern int ip6_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); @@ -42,7 +42,7 @@ static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, static inline int ip6_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) + int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 2aec909912da55a6e469fd6ee8412080a5433ed2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2022 11:46:38 +0200 Subject: wifi: use struct_group to copy addresses We sometimes copy all the addresses from the 802.11 header for the AAD, which may cause complaints from fortify checks. Use struct_group() to avoid the compiler warnings/errors. Change-Id: Ic3ea389105e7813b22095b295079eecdabde5045 Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..b6e6d5b40774 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -310,9 +310,11 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; - u8 addr1[ETH_ALEN]; - u8 addr2[ETH_ALEN]; - u8 addr3[ETH_ALEN]; + struct_group(addrs, + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + ); __le16 seq_ctrl; u8 addr4[ETH_ALEN]; } __packed __aligned(2); -- cgit v1.2.3 From bce1b56c73826fec8caf6187f0c922ede397a5a8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 4 Sep 2022 06:39:25 -0600 Subject: Revert "sbitmap: fix batched wait_cnt accounting" This reverts commit 16ede66973c84f890c03584f79158dd5b2d725f5. This is causing issues with CPU stalls on my test box, revert it for now until we understand what is going on. It looks like infinite looping off sbitmap_queue_wake_up(), but hard to tell with a lot of CPUs hitting this issue and the console scrolling infinitely. Link: https://lore.kernel.org/linux-block/e742813b-ce5c-0d58-205b-1626f639b1bd@kernel.dk/ Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 4d2d5205ab58..8f5a86e210b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,9 +575,8 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. - * @nr: Number of bits cleared. 
*/ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From 2e9bffc4f713db465177238f6033f7d367d6f151 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Thu, 25 Aug 2022 21:38:34 +0200 Subject: phy: rockchip: Support PCIe v3 Unlike PCIe v2 on the rk3566, which uses the Combphy, PCIe v3 on the RK3568 uses a dedicated PCIe PHY. Add support for this. Initial support by Shawn Lin, modifications by Peter Geis and Frank Wunderlich. Add a data-lanes property for splitting PCIe lanes across controllers. data-lanes is an array where 0 means the lane is disabled and x > 0 means the PHY lane is assigned to controller x. Signed-off-by: Shawn Lin Suggested-by: Peter Geis Signed-off-by: Frank Wunderlich Link: https://lore.kernel.org/r/20220825193836.54262-4-linux@fw-web.de Signed-off-by: Vinod Koul --- include/linux/phy/pcie.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 include/linux/phy/pcie.h (limited to 'include/linux') diff --git a/include/linux/phy/pcie.h b/include/linux/phy/pcie.h new file mode 100644 index 000000000000..e7ac81764576 --- /dev/null +++ b/include/linux/phy/pcie.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Rockchip Electronics Co., Ltd. + */ +#ifndef __PHY_PCIE_H +#define __PHY_PCIE_H + +#define PHY_MODE_PCIE_RC 20 +#define PHY_MODE_PCIE_EP 21 +#define PHY_MODE_PCIE_BIFURCATION 22 + +#endif -- cgit v1.2.3 From 9c06002682ae0cdd01caf011899224acbc6582b2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 Jul 2022 20:22:35 +0300 Subject: dmaengine: hsu: Include headers we are direct user of For the sake of integrity, include the headers we are a direct user of. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220713172235.22611-4-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dma/hsu.h | 6 ++++-- include/linux/platform_data/dma-hsu.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/hsu.h b/include/linux/dma/hsu.h index a6b7bc707356..77ea602c287c 100644 --- a/include/linux/dma/hsu.h +++ b/include/linux/dma/hsu.h @@ -8,11 +8,13 @@ #ifndef _DMA_HSU_H #define _DMA_HSU_H -#include -#include +#include +#include +#include #include +struct device; struct hsu_dma; /** diff --git a/include/linux/platform_data/dma-hsu.h b/include/linux/platform_data/dma-hsu.h index c65b412b2b33..611bae193c1c 100644 --- a/include/linux/platform_data/dma-hsu.h +++ b/include/linux/platform_data/dma-hsu.h @@ -8,7 +8,7 @@ #ifndef _PLATFORM_DATA_DMA_HSU_H #define _PLATFORM_DATA_DMA_HSU_H -#include +struct device; struct hsu_dma_slave { struct device *dma_dev; -- cgit v1.2.3 From 0eadd36d9123745f70e233a9d93951d05ca1916a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 3 Sep 2022 23:18:47 -0700 Subject: gpiolib: make fwnode_get_named_gpiod() static There are no external users of fwnode_get_named_gpiod() anymore, so let's stop exporting it and mark it as static.
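For reference, a hedged sketch of the surviving public accessor, whose signature appears in the diff below (the "reset" con_id and label are made up for illustration):

	struct gpio_desc *desc;

	desc = fwnode_gpiod_get_index(fwnode, "reset", 0, GPIOD_OUT_LOW, "reset");
	if (IS_ERR(desc))
		return PTR_ERR(desc);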
Signed-off-by: Dmitry Torokhov Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index fe0f460d9a3b..36460ced060b 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -174,10 +174,6 @@ int desc_to_gpio(const struct gpio_desc *desc); /* Child properties interface */ struct fwnode_handle; -struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label); struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, @@ -553,15 +549,6 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) /* Child properties interface */ struct fwnode_handle; -static inline -struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label) -{ - return ERR_PTR(-ENOSYS); -} - static inline struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, -- cgit v1.2.3 From 4a502cf4d77e12119e7061a05d5789cd3129d185 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 Sep 2022 10:32:03 +0200 Subject: net: pcs: add new PCS driver for altera TSE PCS The Altera Triple Speed Ethernet has an SGMII/1000BaseX PCS that can be integrated in several ways. It can either be part of the TSE MAC's address space, accessed through 32-bit accesses on the mapped mdio device 0, or through a dedicated 16-bit register set. This driver allows using the TSE PCS outside of the altera TSE driver, since it can be used standalone by other MACs. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/pcs-altera-tse.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/pcs-altera-tse.h (limited to 'include/linux') diff --git a/include/linux/pcs-altera-tse.h b/include/linux/pcs-altera-tse.h new file mode 100644 index 000000000000..92ab9f08e835 --- /dev/null +++ b/include/linux/pcs-altera-tse.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 Bootlin + * + * Maxime Chevallier + */ + +#ifndef __LINUX_PCS_ALTERA_TSE_H +#define __LINUX_PCS_ALTERA_TSE_H + +struct phylink_pcs; +struct net_device; + +struct phylink_pcs *alt_tse_pcs_create(struct net_device *ndev, + void __iomem *pcs_base, int reg_width); + +#endif /* __LINUX_PCS_ALTERA_TSE_H */ -- cgit v1.2.3 From dec9b2f1e0455a151a7293c367da22ab973f713e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 16:59:15 +0200 Subject: debugfs: add debugfs_lookup_and_remove() There is a very common pattern of using debugfs_remove(debugfs_lookup(..)), which leaks the dentry that was looked up. Instead of having to open-code the correct pattern of calling dput() on the dentry, create debugfs_lookup_and_remove() to handle this pattern automatically and properly without any memory leaks.
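A hedged before-and-after sketch of the pattern being fixed ("foo" and the parent dentry are illustrative, not from the patch):

	/* Leaky pattern: the reference returned by debugfs_lookup() is never dropped. */
	debugfs_remove(debugfs_lookup("foo", parent));

	/* Correct open-coded form: drop the looked-up reference with dput(). */
	struct dentry *dentry = debugfs_lookup("foo", parent);

	debugfs_remove(dentry);
	dput(dentry);

	/* New helper: lookup, removal and dput() in one call. */
	debugfs_lookup_and_remove("foo", parent);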
Cc: stable Reported-by: Kuyo Chang Tested-by: Kuyo Chang Link: https://lore.kernel.org/r/YxIaQ8cSinDR881k@kroah.com Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index c869f1e73d75..f60674692d36 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -91,6 +91,8 @@ struct dentry *debugfs_create_automount(const char *name, void debugfs_remove(struct dentry *dentry); #define debugfs_remove_recursive debugfs_remove +void debugfs_lookup_and_remove(const char *name, struct dentry *parent); + const struct file_operations *debugfs_real_fops(const struct file *filp); int debugfs_file_get(struct dentry *dentry); @@ -225,6 +227,10 @@ static inline void debugfs_remove(struct dentry *dentry) static inline void debugfs_remove_recursive(struct dentry *dentry) { } +static inline void debugfs_lookup_and_remove(const char *name, + struct dentry *parent) +{ } + const struct file_operations *debugfs_real_fops(const struct file *filp); static inline int debugfs_file_get(struct dentry *dentry) -- cgit v1.2.3 From 9ca05b0f27de928be121cccf07735819dc9e1ed3 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 29 Aug 2022 12:02:27 +0300 Subject: RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile When the RDMA auxiliary driver probes, it sets its profile based on the devlink driverinit value. The latter might not be in sync with FW yet (in case devlink reload has not been performed), causing a mismatch between the RDMA driver and FW. This results in the following FW syndrome when the RDMA driver tries to adjust the RoCE state, which fails the probe: "0xC1F678 | modify_nic_vport_context: roce_en set on a vport that doesn't support roce" To prevent this, select the PF profile based on the FW RoCE capability instead of relying on the devlink driverinit value. To provide backward compatibility for the RoCE disable feature on older FWs where roce_rw is not set (the FW RoCE capability is read-only), keep the current behavior, i.e., rely on the devlink driverinit value. Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level") Reviewed-by: Shay Drory Reviewed-by: Michael Guralnik Reviewed-by: Saeed Mahameed Signed-off-by: Maher Sanalla Link: https://lore.kernel.org/r/cb34ce9a1df4a24c135cb804db87f7d2418bd6cc.1661763459.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96b16fbe1aa4..d6338fb449c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1279,16 +1279,17 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; -static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev) +bool mlx5_is_roce_on(struct mlx5_core_dev *dev); + +static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) { - struct devlink *devlink = priv_to_devlink(dev); - union devlink_param_value val; - int err; - - err = devlink_param_driverinit_value_get(devlink, - DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, - &val); - return err ?
MLX5_CAP_GEN(dev, roce) : val.vbool; + if (MLX5_CAP_GEN(dev, roce_rw_supported)) + return MLX5_CAP_GEN(dev, roce); + + /* If RoCE cap is read-only in FW, get RoCE state from devlink + * in order to support RoCE enable/disable feature + */ + return mlx5_is_roce_on(dev); } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 8a2dd123f12f69e5373d3103da2c97fc36223e0c Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 29 Aug 2022 12:06:04 +0300 Subject: RDMA/mlx5: Move function mlx5_core_query_ib_ppcnt() to mlx5_ib This patch doesn't change any functionality, but moves one function to mlx5_ib because it is not used by mlx5_core. The actual fix is in the next patch. Reviewed-by: Roi Dayan Signed-off-by: Chris Mi Link: https://lore.kernel.org/r/fd47b9138412bd94ed30f838026cbb4cf3878150.1661763871.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76d7661e3e63..432bd202e2fe 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1079,8 +1079,6 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); -int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, - u8 port_num, void *out, size_t sz); int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 05ad5d4581c3c1cc724fe50d4652833fb9f3037b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 2 Sep 2022 18:02:39 -0400 Subject: net: phy: Add 1000BASE-KX interface mode Add 1000BASE-KX interface mode. This is 1G backplane Ethernet, as described in Clause 70. Clause 73 autonegotiation is mandatory, and only full duplex operation is supported. Although at the PMA level this interface mode is identical to 1000BASE-X, it uses a different form of in-band autonegotiation. This justifies a separate interface mode, since the interface mode (along with the MLO_AN_* autonegotiation mode) sets the type of autonegotiation which will be used on a link. This results in more than just electrical differences between the link modes. With regard to 1000BASE-X, 1000BASE-KX holds a similar position to SGMII: same signaling, but different autonegotiation. PCS drivers (which typically handle in-band autonegotiation) may only support 1000BASE-X, and not 1000BASE-KX. Similarly, the phy mode is used to configure serdes phys with phy_set_mode_ext. Due to the different electrical standards (SFI or XFI vs Clause 70), they will likely want to use a different configuration. Adding a phy interface mode for 1000BASE-KX helps simplify configuration in these areas. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c49ab95441b..337230c135f7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -116,6 +116,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII + * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY.
@@ -154,6 +155,7 @@ typedef enum { /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, + PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -251,6 +253,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; + case PHY_INTERFACE_MODE_1000BASEKX: + return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: -- cgit v1.2.3 From 7c8199e24fa09d2344ae0204527d55d7803e8409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:43 -0700 Subject: bpf: Introduce any context BPF specific memory allocator. Tracing BPF programs can attach to kprobe and fentry. Hence they run in unknown context where calling plain kmalloc() might not be safe. Front-end kmalloc() with minimal per-cpu cache of free elements. Refill this cache asynchronously from irq_work. BPF programs always run with migration disabled. It's safe to allocate from cache of the current cpu with irqs disabled. Free-ing is always done into bucket of the current cpu as well. irq_work trims extra free elements from buckets with kfree and refills them with kmalloc, so global kmalloc logic takes care of freeing objects allocated by one cpu and freed on another. struct bpf_mem_alloc supports two modes: - When size != 0 create kmem_cache and bpf_mem_cache for each cpu. This is typical bpf hash map use case when all elements have equal size. - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on kmalloc/kfree. Max allocation size is 4096 in this case. This is bpf_dynptr and bpf_kptr use case. bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree. bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free. The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/bpf_mem_alloc.h (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 4ab67149f3c6e97c5c506a726f0ebdec38241679 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:52 -0700 Subject: bpf: Add percpu allocation support to bpf_mem_alloc. 
Extend bpf_mem_alloc to cache free list of fixed size per-cpu allocations. Once such cache is created bpf_mem_cache_alloc() will return per-cpu objects. bpf_mem_cache_free() will free them back into global per-cpu pool after observing RCU grace period. per-cpu flavor of bpf_mem_alloc is going to be used by per-cpu hash maps. The free list cache consists of tuples { llist_node, per-cpu pointer } Unlike alloc_percpu() that returns per-cpu pointer the bpf_mem_cache_alloc() returns a pointer to per-cpu pointer and bpf_mem_cache_free() expects to receive it back. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-11-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 804733070f8d..653ed1584a03 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -12,7 +12,7 @@ struct bpf_mem_alloc { struct bpf_mem_cache __percpu *cache; }; -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ -- cgit v1.2.3 From 9f2c6e96c65e6fa1aebef546be0c30a5895fcb37 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:58 -0700 Subject: bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc. User space might be creating and destroying a lot of hash maps. Synchronous rcu_barrier-s in a destruction path of hash map delay freeing of hash buckets and other map memory and may cause artificial OOM situation under stress. Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc: - remove rcu_barrier from hash map, since htab doesn't use call_rcu directly and there are no callback to wait for. - bpf_mem_alloc has call_rcu_in_progress flag that indicates pending callbacks. Use it to avoid barriers in fast path. - When barriers are needed copy bpf_mem_alloc into temp structure and wait for rcu barrier-s in the worker to let the rest of hash map freeing to proceed. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220902211058.60789-17-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 653ed1584a03..3e164b8efaa9 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -3,6 +3,7 @@ #ifndef _BPF_MEM_ALLOC_H #define _BPF_MEM_ALLOC_H #include +#include struct bpf_mem_cache; struct bpf_mem_caches; @@ -10,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct work_struct work; }; int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); -- cgit v1.2.3 From f0b933236ec97de5ee49c60aae57a9c5c4dadc87 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Sun, 4 Sep 2022 12:28:39 +0200 Subject: lib/string_helpers: Introduce parse_int_array_user() Add new helper function to allow for splitting specified user string into a sequence of integers. 
Internally it makes use of get_options() so the returned sequence contains the integers extracted plus an additional element that begins the sequence and specifies the integers count. Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Cezary Rojewski Link: https://lore.kernel.org/r/20220904102840.862395-2-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/linux/string_helpers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 4d72258d42fd..dc2e726fd820 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -21,6 +21,8 @@ enum string_size_units { void string_get_size(u64 size, u64 blk_size, enum string_size_units units, char *buf, int len); +int parse_int_array_user(const char __user *from, size_t count, int **array); + #define UNESCAPE_SPACE BIT(0) #define UNESCAPE_OCTAL BIT(1) #define UNESCAPE_HEX BIT(2) -- cgit v1.2.3 From 8409fe92d88c332923130149fe209d1c882b286e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Sat, 27 Aug 2022 15:03:43 +0200 Subject: PCI: Move PCI_VENDOR_ID_MICROSOFT/PCI_DEVICE_ID_HYPERV_VIDEO definitions to pci_ids.h There are already three places in kernel which define PCI_VENDOR_ID_MICROSOFT and two for PCI_DEVICE_ID_HYPERV_VIDEO and there's a need to use these from core VMBus code. Move the defines where they belong. No functional change. Reviewed-by: Michael Kelley Acked-by: Bjorn Helgaas # pci_ids.h Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20220827130345.1320254-2-vkuznets@redhat.com Signed-off-by: Wei Liu --- include/linux/pci_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 6feade66efdb..15b49e655ce3 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2079,6 +2079,9 @@ #define PCI_DEVICE_ID_ICE_1712 0x1712 #define PCI_DEVICE_ID_VT1724 0x1724 +#define PCI_VENDOR_ID_MICROSOFT 0x1414 +#define PCI_DEVICE_ID_HYPERV_VIDEO 0x5353 + #define PCI_VENDOR_ID_OXSEMI 0x1415 #define PCI_DEVICE_ID_OXSEMI_12PCI840 0x8403 #define PCI_DEVICE_ID_OXSEMI_PCIe840 0xC000 -- cgit v1.2.3 From 2bc9cd66eb25d0fefbb081421d6586495e25840e Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 29 Aug 2022 11:18:40 +0200 Subject: iio: Use per-device lockdep class for mlock If an IIO driver uses callbacks from another IIO driver and calls iio_channel_start_all_cb() from one of its buffer setup ops, then lockdep complains due to the lock nesting, as in the below example with lmp91000. Since the locks are being taken on different IIO devices, there is no actual deadlock. Fix the warning by telling lockdep to use a different class for each iio_device. 
============================================ WARNING: possible recursive locking detected -------------------------------------------- python3/23 is trying to acquire lock: (&indio_dev->mlock){+.+.}-{3:3}, at: iio_update_buffers but task is already holding lock: (&indio_dev->mlock){+.+.}-{3:3}, at: enable_store other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&indio_dev->mlock); lock(&indio_dev->mlock); *** DEADLOCK *** May be due to missing lock nesting notation 5 locks held by python3/23: #0: (sb_writers#5){.+.+}-{0:0}, at: ksys_write #1: (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter #2: (kn->active#14){.+.+}-{0:0}, at: kernfs_fop_write_iter #3: (&indio_dev->mlock){+.+.}-{3:3}, at: enable_store #4: (&iio_dev_opaque->info_exist_lock){+.+.}-{3:3}, at: iio_update_buffers Call Trace: __mutex_lock iio_update_buffers iio_channel_start_all_cb lmp91000_buffer_postenable __iio_update_buffers enable_store Fixes: 67e17300dc1d76 ("iio: potentiostat: add LMP91000 support") Signed-off-by: Vincent Whitchurch Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220829091840.2791846-1-vincent.whitchurch@axis.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index 6b3586b3f952..d1f8b30a7c8b 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -11,6 +11,7 @@ * checked by device drivers but should be considered * read-only as this is a core internal bit * @driver_module: used to make it harder to undercut users + * @mlock_key: lockdep class for iio_dev lock * @info_exist_lock: lock to prevent use during removal * @trig_readonly: mark the current trigger immutable * @event_interface: event chrdevs associated with interrupt lines @@ -42,6 +43,7 @@ struct iio_dev_opaque { int currentmode; int id; struct module *driver_module; + struct lock_class_key mlock_key; struct mutex info_exist_lock; bool trig_readonly; struct iio_event_interface *event_interface; -- cgit v1.2.3 From 835e699ef82adfc85ac4cc3f1f237c1adfdefd20 Mon Sep 17 00:00:00 2001 From: Jagath Jog J Date: Wed, 31 Aug 2022 12:01:16 +0530 Subject: iio: Add new event type gesture and use direction for single and double tap Add a new event type for taps, called gesture; the direction can be used to differentiate single and double taps. This may be used by accelerometer sensors to express single and double tap events. For directional taps, modifiers like IIO_MOD_(X/Y/Z) can be used along with the singletap and doubletap directions.
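A hedged sketch of how a driver might report such an event, assuming the IIO_EV_TYPE_GESTURE and IIO_EV_DIR_DOUBLETAP enum values introduced alongside this change (the channel number and timestamp handling are illustrative):

	/* Push a double-tap gesture event on the X axis. */
	iio_push_event(indio_dev,
		       IIO_MOD_EVENT_CODE(IIO_ACCEL, 0, IIO_MOD_X,
					  IIO_EV_TYPE_GESTURE,
					  IIO_EV_DIR_DOUBLETAP),
		       iio_get_time_ns(indio_dev));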
Signed-off-by: Jagath Jog J Link: https://lore.kernel.org/r/20220831063117.4141-2-jagathjog1996@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 27143b03909d..82faa98c719a 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -17,6 +17,8 @@ enum iio_event_info { IIO_EV_INFO_HIGH_PASS_FILTER_3DB, IIO_EV_INFO_LOW_PASS_FILTER_3DB, IIO_EV_INFO_TIMEOUT, + IIO_EV_INFO_RESET_TIMEOUT, + IIO_EV_INFO_TAP2_MIN_DELAY, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From ead384d956345681e1ddf97890d5e15ded015f07 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 19 Aug 2022 18:20:37 +0800 Subject: efi/loongarch: Add efistub booting support This patch adds efistub booting support, which is the standard UEFI boot protocol for LoongArch to use. We use generic efistub, which means we can pass boot information (i.e., system table, memory map, kernel command line, initrd) via a light FDT and drop a lot of non-standard code. We use a flat mapping to map the efi runtime in the kernel's address space. In efi, VA = PA; in kernel, VA = PA + PAGE_OFFSET. As a result, flat mapping is not identity mapping, SetVirtualAddressMap() is still needed for the efi runtime. Tested-by: Xi Ruoyao Signed-off-by: Huacai Chen [ardb: change fpic to fpie as suggested by Xi Ruoyao] Signed-off-by: Ard Biesheuvel --- include/linux/pe.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index daf09ffffe38..1d3836ef9d92 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -65,6 +65,8 @@ #define IMAGE_FILE_MACHINE_SH5 0x01a8 #define IMAGE_FILE_MACHINE_THUMB 0x01c2 #define IMAGE_FILE_MACHINE_WCEMIPSV2 0x0169 +#define IMAGE_FILE_MACHINE_LOONGARCH32 0x6232 +#define IMAGE_FILE_MACHINE_LOONGARCH64 0x6264 /* flags */ #define IMAGE_FILE_RELOCS_STRIPPED 0x0001 -- cgit v1.2.3 From 3aac580d5cc3001ca1627725b3b61edb529f341d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:54 -0700 Subject: perf: Add sample_flags to indicate the PMU-filled sample data On some platforms, some data e.g., timestamps, can be retrieved from the PMU driver. Usually, the data from the PMU driver is more accurate. The current perf kernel should output the PMU-filled sample data if it's available. To check the availability of the PMU-filled sample data, the current perf kernel initializes the related fields in the perf_sample_data_init(). When outputting a sample, the perf checks whether the field is updated by the PMU driver. If yes, the updated value will be output. If not, the perf uses an SW way to calculate the value or just outputs the initialized value if an SW way is unavailable either. With more and more data being provided by the PMU driver, more fields has to be initialized in the perf_sample_data_init(). That will increase the number of cache lines touched in perf_sample_data_init() and be harmful to the performance. Add new "sample_flags" to indicate the PMU-filled sample data. The PMU driver should set the corresponding PERF_SAMPLE_ flag when the field is updated. The initialization of the corresponding field is not required anymore. The following patches will make use of it and remove the corresponding fields from the perf_sample_data_init(), which will further minimize the number of cache lines touched. 
Only clear the sample flags that have already been done by the PMU driver in the perf_prepare_sample() for the PERF_RECORD_SAMPLE. For the other PERF_RECORD_ event type, the sample data is not available. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1999408a9cbb..0978165a2d87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1008,6 +1008,7 @@ struct perf_sample_data { * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ + u64 sample_flags; u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -1057,6 +1058,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ + data->sample_flags = 0; data->addr = addr; data->raw = NULL; data->br_stack = NULL; -- cgit v1.2.3 From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- include/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 addr; struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; -- cgit v1.2.3 From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. 
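A sketch of the consumer side this implies (assumed shape, not the literal hunk): when a sample is prepared, the weight is only trusted if the PMU driver flagged it:

	/* sketch: fall back to a zero weight unless the PMU filled it in */
	if (!(data->sample_flags & PERF_SAMPLE_WEIGHT_TYPE))
		data->weight.full = 0;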
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } -- cgit v1.2.3 From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } -- cgit v1.2.3 From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* -- cgit v1.2.3 From 0083d27b21dd2a598df8275b98f89ced532e2e53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Sep 2022 09:38:42 -1000 Subject: cgroup: Improve cftype add/rm error handling Let's track whether a cftype is currently added or not using a new flag __CFTYPE_ADDED so that duplicate operations can be failed safely and consistently allow using empty cftypes. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1283993d7ea8..9fa1652d0493 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -131,6 +131,7 @@ enum { /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ + __CFTYPE_ADDED = (1 << 18), }; /* -- cgit v1.2.3 From 8a693f7766f9e27c390c5fec8a5db1f9d1d8177e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Sep 2022 09:38:55 -1000 Subject: cgroup: Remove CFTYPE_PRESSURE CFTYPE_PRESSURE is used to flag PSI related files so that they are not created if PSI is disabled during boot. It's a bit weird to use a generic flag to mark a specific file type. Let's instead move the PSI files into its own cftypes array and add/rm them conditionally. This is a bit more code but cleaner. No userland visible changes. Signed-off-by: Tejun Heo Cc: Johannes Weiner --- include/linux/cgroup-defs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9fa1652d0493..8f481d1b159a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -126,7 +126,6 @@ enum { CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ -- cgit v1.2.3 From 14db0b3c7b837f4edeb7c1794290c2f345c7f627 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 15 Aug 2022 16:50:51 -0700 Subject: fscrypt: stop using PG_error to track error status As a step towards freeing the PG_error flag for other uses, change ext4 and f2fs to stop using PG_error to track decryption errors. 
Instead, if a decryption error occurs, just mark the whole bio as failed. The coarser granularity isn't really a problem since it isn't any worse than what the block layer provides, and errors from a multi-page readahead aren't reported to applications unless a single-page read fails too. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu # for f2fs part Link: https://lore.kernel.org/r/20220815235052.86545-2-ebiggers@kernel.org --- include/linux/fscrypt.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b95b8601b9c1..488fd8c8f8af 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -351,7 +351,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ -void fscrypt_decrypt_bio(struct bio *bio); +bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); @@ -644,8 +644,9 @@ static inline int fscrypt_d_revalidate(struct dentry *dentry, } /* bio.c */ -static inline void fscrypt_decrypt_bio(struct bio *bio) +static inline bool fscrypt_decrypt_bio(struct bio *bio) { + return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, -- cgit v1.2.3 From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:46 -0700 Subject: bpf: Allow struct argument in trampoline based programs Allow struct argument in trampoline based programs where the struct size should be <= 16 bytes. In such cases, the argument will be put into up to 2 registers for bpf, x86_64 and arm64 architectures. To support arch-specific trampoline manipulation, add arg_flags for additional struct information about arguments in btf_func_model. Such information will be used in arch specific function arch_prepare_bpf_trampoline() to prepare argument access properly in trampoline. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..4d32f125f4af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_REG_ARGS 5 +/* The argument is a structure. */ +#define BTF_FMODEL_STRUCT_ARG BIT(0) + struct btf_func_model { u8 ret_size; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; + u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function -- cgit v1.2.3 From be376df724aa3b7abdf79390eaab60c58a92f4f0 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 27 Aug 2022 04:49:03 +0200 Subject: wifi: brcmfmac: add 43439 SDIO ids and initialization Add HW and SDIO ids for use with the muRata 1YN (Cypress CYW43439). Add the firmware mapping structures for the CYW43439 chipset. The 43439 needs some things setup similar to the 43430 chipset. 
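For context, the new id is typically consumed as an entry in the driver's sdio_device_id table; a hedged sketch (the surrounding table is illustrative, not part of this hunk):

	static const struct sdio_device_id brcmf_sdmmc_ids[] = {
		/* ... existing Broadcom entries elided ... */
		{ SDIO_DEVICE(SDIO_VENDOR_ID_BROADCOM,
			      SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439) },
		{ /* end: all zeroes */ }
	};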
Signed-off-by: Marek Vasut Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220827024903.617294-1-marex@denx.de --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 53f0efa0bccf..74f9d9a6d330 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -74,6 +74,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xa9af #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 -- cgit v1.2.3 From 9fc18f6d56d5b79d527c17a8100a0965d18345cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Aug 2022 16:06:44 +0200 Subject: dma-mapping: mark dma_supported static Now that the remaining users in drivers are gone, this function can be marked static. Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 25a30906289d..0ee20b764000 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -139,7 +139,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); -int dma_supported(struct device *dev, u64 mask); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); @@ -248,10 +247,6 @@ static inline bool dma_can_mmap(struct device *dev) { return false; } -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 0; -} static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; -- cgit v1.2.3 From 283945017cbf685546ba7d065f254ad77eb888b1 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Mon, 15 Aug 2022 01:33:39 +0000 Subject: iommu: Remove comment of dev_has_feat in struct doc dev_has_feat has been removed from iommu_ops in commit 309c56e84602 ("iommu: remove the unused dev_has_feat method"), remove its description in the struct doc. Signed-off-by: Yuan Can Link: https://lore.kernel.org/r/20220815013339.2552-1-yuancan@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ea30f00dc145..a004e87e0e1d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -212,7 +212,7 @@ struct iommu_iotlb_gather { * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) - * @dev_has/enable/disable_feat: per device entries to check/enable/disable + * @dev_enable/disable_feat: per device entries to enable/disable * iommu specific features. * @sva_bind: Bind process address space to device * @sva_unbind: Unbind process address space from device -- cgit v1.2.3 From a1be74c5384c4a47d91ab4af2ed854d3b7eaf763 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Sep 2022 13:58:43 +0300 Subject: net/mlx5: Introduce ifc bits for page tracker Introduce ifc related stuff to enable using page tracker. 
A page tracker is a dirty page tracking object used by the device to report the tracking log. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220905105852.26398-2-yishaih@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 83 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..06eab92b9fb3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -89,6 +89,7 @@ enum { MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c, MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, + MLX5_OBJ_TYPE_PAGE_TRACK = 0x46, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, MLX5_OBJ_TYPE_PSV = 0xff03, @@ -1733,7 +1734,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_geneve_tlv_options[0x8]; u8 reserved_at_568[0x3]; u8 max_geneve_tlv_option_data_len[0x5]; - u8 reserved_at_570[0x10]; + u8 reserved_at_570[0x9]; + u8 adv_virtualization[0x1]; + u8 reserved_at_57a[0x6]; u8 reserved_at_580[0xb]; u8 log_max_dci_stream_channels[0x5]; @@ -11818,4 +11821,82 @@ struct mlx5_ifc_load_vhca_state_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_adv_virtualization_cap_bits { + u8 reserved_at_0[0x3]; + u8 pg_track_log_max_num[0x5]; + u8 pg_track_max_num_range[0x8]; + u8 pg_track_log_min_addr_space[0x8]; + u8 pg_track_log_max_addr_space[0x8]; + + u8 reserved_at_20[0x3]; + u8 pg_track_log_min_msg_size[0x5]; + u8 reserved_at_28[0x3]; + u8 pg_track_log_max_msg_size[0x5]; + u8 reserved_at_30[0x3]; + u8 pg_track_log_min_page_size[0x5]; + u8 reserved_at_38[0x3]; + u8 pg_track_log_max_page_size[0x5]; + + u8 reserved_at_40[0x7c0]; +}; + +struct mlx5_ifc_page_track_report_entry_bits { + u8 dirty_address_high[0x20]; + + u8 dirty_address_low[0x20]; +}; + +enum { + MLX5_PAGE_TRACK_STATE_TRACKING, + MLX5_PAGE_TRACK_STATE_REPORTING, + MLX5_PAGE_TRACK_STATE_ERROR, +}; + +struct mlx5_ifc_page_track_range_bits { + u8 start_address[0x40]; + + u8 length[0x40]; +}; + +struct mlx5_ifc_page_track_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x10]; + u8 vhca_id[0x10]; + + u8 reserved_at_60[0x20]; + + u8 state[0x4]; + u8 track_type[0x4]; + u8 log_addr_space_size[0x8]; + u8 reserved_at_90[0x3]; + u8 log_page_size[0x5]; + u8 reserved_at_98[0x3]; + u8 log_msg_size[0x5]; + + u8 reserved_at_a0[0x8]; + u8 reporting_qpn[0x18]; + + u8 reserved_at_c0[0x18]; + u8 num_ranges[0x8]; + + u8 reserved_at_e0[0x20]; + + u8 range_start_address[0x40]; + + u8 length[0x40]; + + struct mlx5_ifc_page_track_range_bits track_range[0]; +}; + +struct mlx5_ifc_create_page_track_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + +struct mlx5_ifc_modify_page_track_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 939838632b9119614128028eaea3b1d7bf29f16f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Sep 2022 13:58:44 +0300 Subject: net/mlx5: Query ADV_VIRTUALIZATION capabilities Query ADV_VIRTUALIZATION capabilities which provide information for advanced virtualization related features. Current capabilities refer to the page tracker object which is used for tracking the pages that are dirtied by the device. 
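A hedged sketch of how a driver would consume these accessors (caller-side code assumed, not part of this patch):

	/* sketch: gate page-tracker setup on the capability, then read a limit */
	if (!MLX5_CAP_GEN(mdev, adv_virtualization))
		return -EOPNOTSUPP;

	max_ranges = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);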
Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220905105852.26398-3-yishaih@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b5f58fd37a0f..5b41b9fb3d48 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1200,6 +1200,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_SHAMPO = 0x1d, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, + MLX5_CAP_ADV_VIRTUALIZATION = 0x26, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1365,6 +1366,14 @@ enum mlx5_qcam_feature_groups { MLX5_GET(port_selection_cap, \ mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->max, cap) +#define MLX5_CAP_ADV_VIRTUALIZATION(mdev, cap) \ + MLX5_GET(adv_virtualization_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->cur, cap) + +#define MLX5_CAP_ADV_VIRTUALIZATION_MAX(mdev, cap) \ + MLX5_GET(adv_virtualization_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->max, cap) + #define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \ MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap) -- cgit v1.2.3 From 2d4697375dea514be202abf563a9419e74489c25 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Sep 2022 00:27:42 +0300 Subject: swab: Add array operations For now, some simple array operations to swab. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20220831212744.56435-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/swab.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swab.h b/include/linux/swab.h index bcff5149861a..9b804dbb0d79 100644 --- a/include/linux/swab.h +++ b/include/linux/swab.h @@ -20,4 +20,29 @@ # define swab64s __swab64s # define swahw32s __swahw32s # define swahb32s __swahb32s + +static inline void swab16_array(u16 *buf, unsigned int words) +{ + while (words--) { + swab16s(buf); + buf++; + } +} + +static inline void swab32_array(u32 *buf, unsigned int words) +{ + while (words--) { + swab32s(buf); + buf++; + } +} + +static inline void swab64_array(u64 *buf, unsigned int words) +{ + while (words--) { + swab64s(buf); + buf++; + } +} + #endif /* _LINUX_SWAB_H */ -- cgit v1.2.3 From 359ad15763762c713a51300134e784a72eb9cb80 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Aug 2022 16:26:49 +0100 Subject: iommu: Retire iommu_capable() With all callers now converted to the device-specific version, retire the old bus-based interface, and give drivers the chance to indicate accurate per-instance capabilities. 
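A sketch of what a converted callsite looks like (device and capability chosen for illustration):

	/* sketch: query the capability against the device's own IOMMU instance */
	if (device_iommu_capable(&pdev->dev, IOMMU_CAP_CACHE_COHERENCY))
		prot |= IOMMU_CACHE;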
Signed-off-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/d8bd8777d06929ad8f49df7fc80e1b9af32a41b5.1660574547.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a004e87e0e1d..0013a1befe7b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -227,7 +227,7 @@ struct iommu_iotlb_gather { * @owner: Driver module providing these ops */ struct iommu_ops { - bool (*capable)(enum iommu_cap); + bool (*capable)(struct device *dev, enum iommu_cap); /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); @@ -420,7 +420,6 @@ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); -extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); extern struct iommu_group *iommu_group_get_by_id(int id); extern void iommu_domain_free(struct iommu_domain *domain); @@ -697,11 +696,6 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } -static inline bool iommu_capable(struct bus_type *bus, enum iommu_cap cap) -{ - return false; -} - static inline struct iommu_domain *iommu_domain_alloc(struct bus_type *bus) { return NULL; -- cgit v1.2.3 From 29e932295bfaba792d29e66e8be0637ff3994724 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Aug 2022 17:20:17 +0100 Subject: iommu: Clean up bus_set_iommu() Clean up the remaining trivial bus_set_iommu() callsites along with the implementation. Now drivers only have to know and care about iommu_device instances, phew! Reviewed-by: Kevin Tian Tested-by: Matthew Rosato # s390 Tested-by: Niklas Schnelle # s390 Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/ea383d5f4d74ffe200ab61248e5de6e95846180a.1660572783.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0013a1befe7b..35810f87ae74 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -416,7 +416,6 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } -extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); -- cgit v1.2.3 From fa49364cd5e65797014688cba623541083b3e5b0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Aug 2022 18:28:04 +0100 Subject: iommu/dma: Move public interfaces to linux/iommu.h The iommu-dma layer is now mostly encapsulated by iommu_dma_ops, with only a couple more public interfaces left pertaining to MSI integration. Since these depend on the main IOMMU API header anyway, move their declarations there, taking the opportunity to update the half-baked comments to proper kerneldoc along the way. 
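For orientation, a hedged sketch of the MSI flow these interfaces serve in an irqchip driver (doorbell and msg names illustrative):

	/* sketch: map the MSI doorbell when the interrupt is allocated... */
	err = iommu_dma_prepare_msi(desc, doorbell_phys);
	if (err)
		return err;

	/* ...and later rewrite the composed message if it was remapped */
	iommu_dma_compose_msi_msg(desc, &msg);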
Signed-off-by: Robin Murphy Acked-by: Catalin Marinas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/9cd99738f52094e6bed44bfee03fa4f288d20695.1660668998.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 40 ---------------------------------------- include/linux/iommu.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 24607dc3c2ac..e83de4f1f3d6 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -15,27 +15,10 @@ /* Domain management interface for IOMMU drivers */ int iommu_get_dma_cookie(struct iommu_domain *domain); -int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); void iommu_put_dma_cookie(struct iommu_domain *domain); -/* Setup call for arch DMA mapping code */ -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); int iommu_dma_init_fq(struct iommu_domain *domain); -/* The DMA API isn't _quite_ the whole story, though... */ -/* - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU device - * - * The MSI page will be stored in @desc. - * - * Return: 0 on success otherwise an error describing the failure. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - -/* Update the MSI message if required. */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, - struct msi_msg *msg); - void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, @@ -46,15 +29,8 @@ extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; -struct msi_desc; -struct msi_msg; struct device; -static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, - u64 dma_limit) -{ -} - static inline int iommu_dma_init_fq(struct iommu_domain *domain) { return -EINVAL; @@ -65,26 +41,10 @@ static inline int iommu_get_dma_cookie(struct iommu_domain *domain) return -ENODEV; } -static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) -{ - return -ENODEV; -} - static inline void iommu_put_dma_cookie(struct iommu_domain *domain) { } -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, - phys_addr_t msi_addr) -{ - return 0; -} - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, - struct msi_msg *msg) -{ -} - static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 35810f87ae74..a325532aeab5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1063,4 +1063,40 @@ void iommu_debugfs_setup(void); static inline void iommu_debugfs_setup(void) {} #endif +#ifdef CONFIG_IOMMU_DMA +#include + +/* Setup call for arch DMA mapping code */ +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); + +int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); + +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); + +#else /* CONFIG_IOMMU_DMA */ + +struct msi_desc; +struct msi_msg; + +static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit) +{ +} + +static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) +{ + return -ENODEV; +} + +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t 
msi_addr) +{ + return 0; +} + +static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) +{ +} + +#endif /* CONFIG_IOMMU_DMA */ + #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From d1b2234b7fbf94348901bed4ea8d015c8a819d02 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:16 -0700 Subject: net/mlx5: Removed esp_id from struct mlx5_flow_act esp_id is no longer in use. Signed-off-by: Lior Nahmanson Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8e73c377da2c..920cbc9524ad 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -245,7 +245,6 @@ struct mlx5_flow_act { struct mlx5_pkt_reformat *pkt_reformat; union { u32 ipsec_obj_id; - uintptr_t esp_id; }; u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; -- cgit v1.2.3 From e227ee990bf974f655ec3132b496409990f6634b Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:17 -0700 Subject: net/mlx5: Generalize Flow Context for new crypto fields In order to support MACsec offload (and maybe some other crypto features in the future), generalize flow action parameters / defines to be used by crypto offloads other than IPsec. The following changes were made: the ipsec_obj_id field in the flow action context was changed to crypto_obj_id, and a new crypto_type field was introduced, where IPsec is the default zero type for backward compatibility. Action ipsec_decrypt was changed to crypto_decrypt. Action ipsec_encrypt was changed to crypto_encrypt. IPsec offload code was updated accordingly for backward compatibility. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- include/linux/mlx5/fs.h | 7 ++++--- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 920cbc9524ad..e62d50acb6bd 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -243,9 +243,10 @@ struct mlx5_flow_act { u32 action; struct mlx5_modify_hdr *modify_hdr; struct mlx5_pkt_reformat *pkt_reformat; - union { - u32 ipsec_obj_id; - }; + struct mlx5_flow_act_crypto_params { + u8 type; + u32 obj_id; + } crypto; u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..5758218cb3fa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3310,8 +3310,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 = 0x400, MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800, - MLX5_FLOW_CONTEXT_ACTION_IPSEC_DECRYPT = 0x1000, - MLX5_FLOW_CONTEXT_ACTION_IPSEC_ENCRYPT = 0x2000, + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT = 0x1000, + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT = 0x2000, MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO = 0x4000, }; @@ -3321,6 +3321,10 @@ enum { MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT = 0x2, }; +enum { + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, +}; + struct mlx5_ifc_vlan_bits { u8 ethtype[0x10]; u8 prio[0x3]; @@ -3374,7 +3378,7 @@ struct mlx5_ifc_flow_context_bits { u8 extended_destination[0x1]; u8 reserved_at_81[0x1]; u8 flow_source[0x2]; - u8 reserved_at_84[0x4]; + u8 encrypt_decrypt_type[0x4]; u8 destination_list_size[0x18]; u8 reserved_at_a0[0x8]; @@ -3386,7 +3390,7 @@ struct mlx5_ifc_flow_context_bits { struct mlx5_ifc_vlan_bits push_vlan_2; - u8 ipsec_obj_id[0x20]; + u8 encrypt_decrypt_obj_id[0x20]; u8 reserved_at_140[0xc0]; struct mlx5_ifc_fte_match_param_bits match_value; -- cgit v1.2.3 From 8385c51ff5bceb92f7401138e16b0a617ca2ab02 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:18 -0700 Subject: net/mlx5: Introduce MACsec Connect-X offload hardware bits and structures Add MACsec offload related IFC structs, layouts and enumerations. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- include/linux/mlx5/device.h | 4 ++ include/linux/mlx5/mlx5_ifc.h | 99 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b5f58fd37a0f..2927810f172b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1198,6 +1198,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, MLX5_CAP_DEV_SHAMPO = 0x1d, + MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, /* NUM OF CAP Types */ @@ -1446,6 +1447,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_DEV_SHAMPO(mdev, cap)\ MLX5_GET(shampo_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_SHAMPO], cap) +#define MLX5_CAP_MACSEC(mdev, cap)\ + MLX5_GET(macsec_cap, (mdev)->caps.hca[MLX5_CAP_MACSEC]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5758218cb3fa..8decbf9a7bdd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -82,6 +82,7 @@ enum { MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), + MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39), }; enum { @@ -449,7 +450,12 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_60[0x2]; u8 reformat_insert[0x1]; u8 reformat_remove[0x1]; - u8 reserver_at_64[0x14]; + u8 macsec_encrypt[0x1]; + u8 macsec_decrypt[0x1]; + u8 reserved_at_66[0x2]; + u8 reformat_add_macsec[0x1]; + u8 reformat_remove_macsec[0x1]; + u8 reserved_at_6a[0xe]; u8 log_max_ft_num[0x8]; u8 reserved_at_80[0x10]; @@ -611,7 +617,11 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 reserved_at_1a0[0x60]; + u8 reserved_at_1a0[0x8]; + + u8 macsec_syndrome[0x8]; + + u8 reserved_at_1b0[0x50]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -1276,6 +1286,24 @@ struct mlx5_ifc_ipsec_cap_bits { u8 reserved_at_30[0x7d0]; }; +struct mlx5_ifc_macsec_cap_bits { + u8 macsec_epn[0x1]; + u8 reserved_at_1[0x2]; + u8 macsec_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_macsec_offload[0x5]; + u8 reserved_at_10[0x10]; + + u8 min_log_macsec_full_replay_window[0x8]; + u8 max_log_macsec_full_replay_window[0x8]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x7c0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -3295,6 +3323,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; struct mlx5_ifc_shampo_cap_bits shampo_cap; + struct mlx5_ifc_macsec_cap_bits macsec_cap; u8 reserved_at_0[0x8000]; }; @@ -3323,6 +3352,7 @@ enum { enum { MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC = 0x1, }; struct mlx5_ifc_vlan_bits { @@ -6320,6 +6350,8 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, + MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, + MLX5_REFORMAT_TYPE_DEL_MACSEC = 0x12, }; struct mlx5_ifc_alloc_packet_reformat_context_in_bits { @@ -11475,6 +11507,7 @@ enum { MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, 
MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20, MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24, + MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27, }; enum { @@ -11525,6 +11558,67 @@ struct mlx5_ifc_modify_ipsec_obj_in_bits { struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; +struct mlx5_ifc_macsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_1[0x1]; + u8 mode[0x2]; + u8 window_size[0x2]; + u8 soft_lifetime_arm[0x1]; + u8 hard_lifetime_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 epn_event_arm[0x1]; + u8 reserved_at_a[0x16]; + + u8 remove_flow_packet_count[0x20]; + + u8 remove_flow_soft_lifetime[0x20]; + + u8 reserved_at_60[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[8][0x20]; +}; + +struct mlx5_ifc_macsec_offload_obj_bits { + u8 modify_field_select[0x40]; + + u8 confidentiality_en[0x1]; + u8 reserved_at_41[0x1]; + u8 esn_en[0x1]; + u8 esn_overlap[0x1]; + u8 reserved_at_44[0x2]; + u8 confidentiality_offset[0x2]; + u8 reserved_at_48[0x4]; + u8 aso_return_reg[0x4]; + u8 reserved_at_50[0x10]; + + u8 esn_msb[0x20]; + + u8 reserved_at_80[0x8]; + u8 dekn[0x18]; + + u8 reserved_at_a0[0x20]; + + u8 sci[0x40]; + + u8 reserved_at_100[0x8]; + u8 macsec_aso_access_pd[0x18]; + + u8 reserved_at_120[0x60]; + + u8 salt[3][0x20]; + + u8 reserved_at_1e0[0x20]; + + struct mlx5_ifc_macsec_aso_bits macsec_aso; +}; + +struct mlx5_ifc_create_macsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + struct mlx5_ifc_encryption_key_obj_bits { u8 modify_field_select[0x40]; @@ -11642,6 +11736,7 @@ enum { enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_TLS = 0x1, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_IPSEC = 0x2, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_MACSEC = 0x4, }; struct mlx5_ifc_tls_static_params_bits { -- cgit v1.2.3 From ee534d7f81ba9cec028580f91429b3dc29b90c7f Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:20 -0700 Subject: net/mlx5: Add MACsec Tx tables support to fs_core Changed EGRESS_KERNEL namespace to EGRESS_IPSEC and add new namespace for MACsec TX. This namespace should be the last namespace for transmitted packets. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e62d50acb6bd..53d186774206 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -92,7 +92,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, - MLX5_FLOW_NAMESPACE_EGRESS_KERNEL, + MLX5_FLOW_NAMESPACE_EGRESS_IPSEC, + MLX5_FLOW_NAMESPACE_EGRESS_MACSEC, MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, -- cgit v1.2.3 From e467b283ffd50cf15b84c73eef68787e257eaed5 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:21 -0700 Subject: net/mlx5e: Add MACsec TX steering rules Tx flow steering consists of two flow tables (FTs). The first FT (crypto table) has two fixed rules: One default miss rule so non MACsec offloaded packets bypass the MACSec tables, another rule to make sure that MACsec key exchange (MKE) traffic passes unencrypted as expected (matched of ethertype). On each new MACsec offload flow, a new MACsec rule is added. 
This rule is matched on metadata_reg_a (which contains the id of the flow) and invokes the MACsec offload action on match. The second FT (check table) has two fixed rules: One rule for verifying that the previous offload actions were finished successfully and packet need to be transmitted. Another default rule for dropping packets that were failed in the offload actions. The MACsec FTs should be created on demand when the first MACsec rule is added and destroyed when the last MACsec rule is deleted. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 8bda3ba5b109..be640c749d5e 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -252,6 +252,7 @@ enum { enum { MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), + MLX5_ETH_WQE_FT_META_MACSEC = BIT(1), }; struct mlx5_wqe_eth_seg { -- cgit v1.2.3 From 15d187e285b3d5f4fe0328c09566355c1e387ff6 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:24 -0700 Subject: net/mlx5: Add MACsec Rx tables support to fs_core Add new namespace for MACsec RX flows. Encrypted MACsec packets should be first decrypted and stripped from MACsec header and then continues with the kernel's steering pipeline. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 53d186774206..c7a91981cd5a 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -79,6 +79,7 @@ static inline void build_leftovers_ft_param(int *priority, enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, + MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC, MLX5_FLOW_NAMESPACE_LAG, MLX5_FLOW_NAMESPACE_OFFLOADS, MLX5_FLOW_NAMESPACE_ETHTOOL, -- cgit v1.2.3 From aaac38f614871df252aa7459647bf68d42f7c3e7 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 25 Aug 2022 06:39:36 +0000 Subject: iommu/amd: Initial support for AMD IOMMU v2 page table Introduce IO page table framework support for AMD IOMMU v2 page table. This patch implements 4 level page table within iommu amd driver and supports 4K/2M/1G page sizes. 
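A sketch of how a caller selects the new format through the io-pgtable framework (cfg setup abbreviated, illustrative only):

	struct io_pgtable_ops *ops;

	/* sketch: pick the v2 format; cfg carries ias/oas/pgsize_bitmap etc. */
	ops = alloc_io_pgtable_ops(AMD_IOMMU_V2, &pgtbl_cfg, domain);
	if (!ops)
		return -ENOMEM;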
Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20220825063939.8360-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ca98aeadcc80..ffe616db9409 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -16,6 +16,7 @@ enum io_pgtable_fmt { ARM_V7S, ARM_MALI_LPAE, AMD_IOMMU_V1, + AMD_IOMMU_V2, APPLE_DART, IO_PGTABLE_NUM_FMTS, }; @@ -260,6 +261,7 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns; extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns; +extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns; extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns; #endif /* __IO_PGTABLE_H */ -- cgit v1.2.3 From 5e0531f6b90ac096fedaf5bd0eae0bb4e5a39da5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 16:11:25 +0200 Subject: spi: Add capability to perform some transfer with chipselect off Some components require a few clock cycles with chipselect off before and/or after the data transfer done with CS on. Typically, the IDT 801034 QUAD PCM CODEC datasheet states "Note *: CCLK should have one cycle before CS goes low, and two cycles after CS goes high". The cycles "before" are implicitly provided by all previous activity on the SPI bus. But the cycles "after" must be provided in order to terminate the SPI transfer. In order to use that kind of component, add a cs_off flag to spi_transfer struct. When this flag is set, the transfer is performed with chipselect off. This allows a consumer to add a dummy transfer at the end of the transfer list which is performed with chipselect OFF, providing the required additional clock cycles. Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/434165c46f06d802690208a11e7ea2500e8da4c7.1662558898.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e6c73d5ff1a8..6e6c62c59957 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -847,6 +847,7 @@ struct spi_res { * for this transfer. If 0 the default (from @spi_device) is used. * @dummy_data: indicates transfer is dummy bytes transfer. * @cs_change: affects chipselect after this transfer completes + * @cs_off: performs the transfer with chipselect off. * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message * @delay: delay to be introduced after this transfer before @@ -959,6 +960,7 @@ struct spi_transfer { unsigned cs_change:1; unsigned tx_nbits:3; unsigned rx_nbits:3; + unsigned cs_off:1; #define SPI_NBITS_SINGLE 0x01 /* 1bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2bits transfer */ #define SPI_NBITS_QUAD 0x04 /* 4bits transfer */ -- cgit v1.2.3 From 787f51f210ebe5d7d5e4c8101b148f0018e9c409 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 1 Sep 2022 10:16:49 +0200 Subject: USB/ARM: Switch S3C2410 UDC to GPIO descriptors This converts the S3C2410 UDC USB device controller to use GPIO descriptor tables and modern GPIO.
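The board-file half of such a conversion typically supplies a lookup table like this sketch (the chip label, pin number and dev_id are illustrative of the mechanism, not taken from this patch):

	/* sketch: machine code hands the UDC its GPIOs by name, not number */
	static struct gpiod_lookup_table udc_gpio_table = {
		.dev_id = "s3c2410-usbgadget",
		.table = {
			GPIO_LOOKUP("GPIOC", 5, "pullup", GPIO_ACTIVE_HIGH),
			{ }
		},
	};

	gpiod_add_lookup_table(&udc_gpio_table);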
Cc: Krzysztof Kozlowski Cc: Alim Akhtar Cc: linux-arm-kernel@lists.infradead.org Cc: linux-samsung-soc@vger.kernel.org Acked-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220901081649.564348-1-linus.walleij@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/usb-s3c2410_udc.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-s3c2410_udc.h b/include/linux/platform_data/usb-s3c2410_udc.h index 07394819d03b..c0fbe1fe3426 100644 --- a/include/linux/platform_data/usb-s3c2410_udc.h +++ b/include/linux/platform_data/usb-s3c2410_udc.h @@ -22,12 +22,6 @@ enum s3c2410_udc_cmd_e { struct s3c2410_udc_mach_info { void (*udc_command)(enum s3c2410_udc_cmd_e); void (*vbus_draw)(unsigned int ma); - - unsigned int pullup_pin; - unsigned int pullup_pin_inverted; - - unsigned int vbus_pin; - unsigned char vbus_pin_inverted; }; extern void __init s3c24xx_udc_set_platdata(struct s3c2410_udc_mach_info *); -- cgit v1.2.3 From e77cab77f2cb3a1ca2ba8df4af45bb35617ac16d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 1 Sep 2022 17:39:32 +0300 Subject: serial: Create uart_xmit_advance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A very common pattern in the drivers is to advance xmit tail index and do bookkeeping of Tx'ed characters. Create uart_xmit_advance() to handle it. Reviewed-by: Andy Shevchenko Cc: stable Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220901143934.8850-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 6e4f4765d209..1eaea9fe44d8 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -624,6 +624,23 @@ struct uart_state { /* number of characters left in xmit buffer before we ask for more */ #define WAKEUP_CHARS 256 +/** + * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars + * @up: uart_port structure describing the port + * @chars: number of characters sent + * + * This function advances the tail of circular xmit buffer by the number of + * @chars transmitted and handles accounting of transmitted bytes (into + * @up's icount.tx). + */ +static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars) +{ + struct circ_buf *xmit = &up->state->xmit; + + xmit->tail = (xmit->tail + chars) & (UART_XMIT_SIZE - 1); + up->icount.tx += chars; +} + struct module; struct tty_driver; -- cgit v1.2.3 From 85d6b66d31c35158364058ee98fb69ab5bb6a6b1 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:39 -0600 Subject: dyndbg: fix module.dyndbg handling For CONFIG_DYNAMIC_DEBUG=N, the ddebug_dyndbg_module_param_cb() stub-fn is too permissive: bash-5.1# modprobe drm JUNKdyndbg bash-5.1# modprobe drm dyndbgJUNK [ 42.933220] dyndbg param is supported only in CONFIG_DYNAMIC_DEBUG builds [ 42.937484] ACPI: bus type drm_connector registered This caused no ill effects, because unknown parameters are either ignored by default with an "unknown parameter" warning, or ignored because dyndbg allows its no-effect use on non-dyndbg builds. But since the code has an explicit feedback message, it should be issued accurately. Fix with strcmp for exact param-name match. 
Fixes: b48420c1d301 ("dynamic_debug: make dynamic-debug work for module initialization") Reported-by: Rasmus Villemoes Acked-by: Jason Baron Acked-by: Daniel Vetter Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-3-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index dce631e678dd..f30b01aa9fa4 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -201,7 +201,7 @@ static inline int ddebug_remove_module(const char *mod) static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *modname) { - if (strstr(param, "dyndbg")) { + if (!strcmp(param, "dyndbg")) { /* avoid pr_warn(), which wants pr_fmt() fully defined */ printk(KERN_WARNING "dyndbg param is supported only in " "CONFIG_DYNAMIC_DEBUG builds\n"); -- cgit v1.2.3 From e26ef3af964acfea311403126acee8c56c89e26b Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:46 -0600 Subject: dyndbg: drop EXPORTed dynamic_debug_exec_queries This exported fn is unused, and will not be needed. Let's dump it. The export was added to let drm control pr_debugs, as part of using them to avoid drm_debug_enabled overheads. But it's better to just implement the drm.debug bitmap interface; then it's available for everyone. Fixes: a2d375eda771 ("dyndbg: refine export, rename to dynamic_debug_exec_queries()") Fixes: 4c0d77828d4f ("dyndbg: export ddebug_exec_queries") Acked-by: Jason Baron Acked-by: Daniel Vetter Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-10-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index f30b01aa9fa4..8d9eec5f6d8b 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -55,9 +55,6 @@ struct _ddebug { #if defined(CONFIG_DYNAMIC_DEBUG_CORE) -/* exported for module authors to exercise >control */ -int dynamic_debug_exec_queries(const char *query, const char *modname); - int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); @@ -221,12 +218,6 @@ static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, rowsize, groupsize, buf, len, ascii); \ } while (0) -static inline int dynamic_debug_exec_queries(const char *query, const char *modname) -{ - pr_warn("kernel not built with CONFIG_DYNAMIC_DEBUG_CORE\n"); - return 0; -} - #endif /* !CONFIG_DYNAMIC_DEBUG_CORE */ #endif -- cgit v1.2.3 From b7b4eebdba7b6aea6b34dc29691b71c39d1dbd6a Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:48 -0600 Subject: dyndbg: gather __dyndbg[] state into struct _ddebug_info This new struct composes the linker provided (vector,len) section, and provides a place to add other __dyndbg[] state-data later: descs - the vector of descriptors in __dyndbg section. num_descs - length of the data/section. Use it, in several different ways, as follows: In lib/dynamic_debug.c: ddebug_add_module(): Alter params-list, replacing 2 args (array,index) with a struct _ddebug_info * containing them both, with room for expansion.
This helps future-proof the function prototype against the looming addition of class-map info into the dyndbg-state, by providing a place to add more member fields later. NB: later add static struct _ddebug_info builtins_state declaration, not needed yet. ddebug_add_module() is called in 2 contexts: In dynamic_debug_init(), declare, init a struct _ddebug_info di auto-var to use as a cursor. Then iterate over the prdbg blocks of the builtin modules, and update the di cursor before calling _add_module for each. It's called from kernel/module/main.c:load_info() for each loaded module: In internal.h, alter struct load_info, replacing the dyndbg array,len fields with an embedded _ddebug_info containing them both; and populate its members in find_module_sections(). The 2 calling contexts differ in that _init deals with contiguous subranges of __dyndbgs[] section, packed together, while loadable modules are added one at a time. So rename ddebug_add_module() into outer/__inner fns, call __inner from _init, and provide the offset into the builtin __dyndbgs[] where the module's prdbgs reside. The cursor provides start, len of the subrange for each. The offset will be used later to pack the results of builtin __dyndbg_sites[] de-duplication, and is 0 and unneeded for loadable modules. Note: kernel/module/main.c includes the dynamic_debug header for struct _ddebug_info. This might be prone to include loops, since it's also included by printk.h. Nothing has broken in robot-land on this. cc: Luis Chamberlain Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-12-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 8d9eec5f6d8b..6a2001250da1 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -51,12 +51,16 @@ struct _ddebug { #endif } __attribute__((aligned(8))); - +/* encapsulate linker provided built-in (or module) dyndbg data */ +struct _ddebug_info { + struct _ddebug *descs; + unsigned int num_descs; +}; #if defined(CONFIG_DYNAMIC_DEBUG_CORE) -int ddebug_add_module(struct _ddebug *tab, unsigned int n, - const char *modname); +int ddebug_add_module(struct _ddebug_info *dyndbg, const char *modname); + extern int ddebug_remove_module(const char *mod_name); extern __printf(2, 3) void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); @@ -184,8 +188,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #include #include -static inline int ddebug_add_module(struct _ddebug *tab, unsigned int n, - const char *modname) +static inline int ddebug_add_module(struct _ddebug_info *dinfo, const char *modname) { return 0; } -- cgit v1.2.3 From ca90fca7f7b51830dfb95bf655210a1c84588f15 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:49 -0600 Subject: dyndbg: add class_id to pr_debug callsites DRM issues ~10 exclusive categories of debug messages; to represent this directly in dyndbg, add a new 6-bit field: struct _ddebug.class_id. This gives us 64 classes, which should be more than enough. #> echo 0x012345678 > /sys/module/drm/parameters/debug All existing callsites are initialized with _DPRINTK_CLASS_DFLT, which is 2^6-1. This reserves 0-62 for use in new categorized/class'd pr_debugs, which fits perfectly with natural enums (ints: 0..N). That's done by extending the init macro: DEFINE_DYNAMIC_DEBUG_METADATA() with _CLS(cls, ...)
suffix, and redefing old name using extended name. Then extend the factory macro callchain with _cls() versions to provide the callsite.class_id, with old-names passing _DPRINTK_CLASS_DFLT. This sets us up to create class'd prdebug callsites (class'd callsites are those with .class_id != _DPRINTK_CLASS_DFLT). No behavior change. cc: Rasmus Villemoes Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-13-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 71 +++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 6a2001250da1..633f4e463766 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -6,6 +6,8 @@ #include #endif +#include + /* * An instance of this structure is created in a special * ELF section at every dynamic debug callsite. At runtime, @@ -21,6 +23,9 @@ struct _ddebug { const char *filename; const char *format; unsigned int lineno:18; +#define CLS_BITS 6 + unsigned int class_id:CLS_BITS; +#define _DPRINTK_CLASS_DFLT ((1 << CLS_BITS) - 1) /* * The flags field controls the behaviour at the callsite. * The bits here are changed dynamically when the user @@ -88,7 +93,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); -#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ +#define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt) \ static struct _ddebug __aligned(8) \ __section("__dyndbg") name = { \ .modname = KBUILD_MODNAME, \ @@ -97,8 +102,14 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, .format = (fmt), \ .lineno = __LINE__, \ .flags = _DPRINTK_FLAGS_DEFAULT, \ + .class_id = cls, \ _DPRINTK_KEY_INIT \ - } + }; \ + BUILD_BUG_ON_MSG(cls > _DPRINTK_CLASS_DFLT, \ + "classid value overflow") + +#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, _DPRINTK_CLASS_DFLT, fmt) #ifdef CONFIG_JUMP_LABEL @@ -129,17 +140,34 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #endif /* CONFIG_JUMP_LABEL */ -#define __dynamic_func_call(id, fmt, func, ...) do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ - func(&id, ##__VA_ARGS__); \ -} while (0) - -#define __dynamic_func_call_no_desc(id, fmt, func, ...) do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(id, fmt); \ +/* + * Factory macros: ($prefix)dynamic_func_call($suffix) + * + * Lower layer (with __ prefix) gets the callsite metadata, and wraps + * the func inside a debug-branch/static-key construct. Upper layer + * (with _ prefix) does the UNIQUE_ID once, so that lower can ref the + * name/label multiple times, and tie the elements together. + * Multiple flavors: + * (|_cls): adds in _DPRINT_CLASS_DFLT as needed + * (|_no_desc): former gets callsite descriptor as 1st arg (for prdbgs) + */ +#define __dynamic_func_call_cls(id, cls, fmt, func, ...) do { \ + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ if (DYNAMIC_DEBUG_BRANCH(id)) \ - func(__VA_ARGS__); \ + func(&id, ##__VA_ARGS__); \ } while (0) +#define __dynamic_func_call(id, fmt, func, ...) \ + __dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt, \ + func, ##__VA_ARGS__) + +#define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) 
do { \ + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ + if (DYNAMIC_DEBUG_BRANCH(id)) \ + func(__VA_ARGS__); \ +} while (0) +#define __dynamic_func_call_no_desc(id, fmt, func, ...) \ + __dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT, \ + fmt, func, ##__VA_ARGS__) /* * "Factory macro" for generating a call to func, guarded by a @@ -149,22 +177,33 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, * the varargs. Note that fmt is repeated in invocations of this * macro. */ +#define _dynamic_func_call_cls(cls, fmt, func, ...) \ + __dynamic_func_call_cls(__UNIQUE_ID(ddebug), cls, fmt, func, ##__VA_ARGS__) #define _dynamic_func_call(fmt, func, ...) \ - __dynamic_func_call(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__) + _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__) + /* * A variant that does the same, except that the descriptor is not * passed as the first argument to the function; it is only called * with precisely the macro's varargs. */ -#define _dynamic_func_call_no_desc(fmt, func, ...) \ - __dynamic_func_call_no_desc(__UNIQUE_ID(ddebug), fmt, func, ##__VA_ARGS__) +#define _dynamic_func_call_cls_no_desc(cls, fmt, func, ...) \ + __dynamic_func_call_cls_no_desc(__UNIQUE_ID(ddebug), cls, fmt, \ + func, ##__VA_ARGS__) +#define _dynamic_func_call_no_desc(fmt, func, ...) \ + _dynamic_func_call_cls_no_desc(_DPRINTK_CLASS_DFLT, fmt, \ + func, ##__VA_ARGS__) + +#define dynamic_pr_debug_cls(cls, fmt, ...) \ + _dynamic_func_call_cls(cls, fmt, __dynamic_pr_debug, \ + pr_fmt(fmt), ##__VA_ARGS__) #define dynamic_pr_debug(fmt, ...) \ - _dynamic_func_call(fmt, __dynamic_pr_debug, \ + _dynamic_func_call(fmt, __dynamic_pr_debug, \ pr_fmt(fmt), ##__VA_ARGS__) #define dynamic_dev_dbg(dev, fmt, ...) \ - _dynamic_func_call(fmt,__dynamic_dev_dbg, \ + _dynamic_func_call(fmt, __dynamic_dev_dbg, \ dev, fmt, ##__VA_ARGS__) #define dynamic_netdev_dbg(dev, fmt, ...) \ -- cgit v1.2.3 From 3fc95d80a536e49e38ba4f79ca60cb4e64f99b3b Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:50 -0600 Subject: dyndbg: add __pr_debug_cls for testing For selftest purposes, add __pr_debug_cls(class, fmt, ...). I didn't think we'd need to define this, since DRM effectively has it already in drm_dbg, drm_devdbg. But test_dynamic_debug needs it in order to demonstrate all the moving parts. Note the __ prefix; it's not intended for general use, at least until a need emerges. ISTM the drm.debug model (macro wrappers inserting enum const 1st arg) is the baseline approach. That said, nouveau might want it for easy use in its debug macros. TBD. NB: it does require a builtin-constant class, __pr_debug_cls(i++, ...) is disallowed by the compiler. Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-14-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 633f4e463766..3c9690da44d9 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -221,6 +221,13 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, KERN_DEBUG, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) +/* for test only, generally expect drm.debug style macro wrappers */ +#define __pr_debug_cls(cls, fmt, ...)
do { \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(cls), \ + "expecting constant class int/enum"); \ + dynamic_pr_debug_cls(cls, fmt, ##__VA_ARGS__); \ + } while (0) + #else /* !CONFIG_DYNAMIC_DEBUG_CORE */ #include -- cgit v1.2.3 From aad0214f30264c19044a77fc4776def349b76fc4 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:51 -0600 Subject: dyndbg: add DECLARE_DYNDBG_CLASSMAP macro Using DECLARE_DYNDBG_CLASSMAP, modules can declare up to 31 classnames. By doing so, they authorize dyndbg to manipulate class'd prdbgs (ie: __pr_debug_cls, and soon drm_*dbg), ala:: :#> echo class DRM_UT_KMS +p > /proc/dynamic_debug/control The macro declares and initializes a static struct ddebug_class_map:: - maps approved class-names to class_ids used in the module, by array order. for example: DRM_UT_* - class-name vals allow validation of "class FOO" queries; using the macro is opt-in - enum class_map_type - determines interface, behavior Each module has its own class-type and class_id space, and only known class-names will be authorized for a manipulation. Only DRM modules should know and respond to this: :#> echo class DRM_UT_CORE +p > control # across all modules pr_debugs (with default class_id) are still controllable as before. DECLARE_DYNDBG_CLASSMAP(_var, _maptype, _base, classes...) is:: _var: name of the static struct var. user passes to module_param_cb() if they want a sysfs node. _maptype: this is hard-coded to DD_CLASS_TYPE_DISJOINT_BITS for now. _base: usually 0, it allows splitting 31 classes into subranges, so that multiple classes / sysfs-nodes can share the module's class-id space. classes: list of class_name strings, these are mapped to class-ids starting at _base. This class-names list must have a corresponding ENUM, with SYMBOLS that match the literals, and 1st enum val = _base. enum class_map_type has 4 values, on 2 factors:: - classes are disjoint/independent vs relative/level'd (x<y) - classes take numeric input vs [+-]classname lists Two caveats around the _NAMES handling: 1. The control interface doesn't enforce the LEVELS relationship, so you could confusingly have V3 enabled, but V1 disabled. OTOH, the control iface already allows infinite tweaking of the underlying callsites; sysfs node readback can only tell the user what they previously wrote. 2. All dyndbg >control reduces to a query/command, includes +/-, which is at-root a kernel patching operation with +/- semantics. And the _NAMES handling exposes it to the user, making it API-adjacent. And it's not just >control where +/- gets used (which is settled), the new place is with sysfs-nodes exposing _*_NAMES classes, and here it's subtly different. _DISJOINT_NAMES: is simple, independent _LEVEL_NAMES: masks-on bits 0 .. N-1, N..max off # turn on L3,L2,L1 others off echo +L3 > /sys/module/test_dynamic_debug/parameters/p_level_names # turn on L2,L1 others off echo -L3 > /sys/module/test_dynamic_debug/parameters/p_level_names IOW, the - changes the threshold-on bitpos by 1. Alternatively, we could treat the +/- as half-duplex, where -L3 turns off L>2 (and ignores L1), and +L2 would turn on L<=2 (and ignore others).
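To ground the declaration rules above, a minimal sketch of the enum/classmap pairing; the names are illustrative (modeled on the DRM usage this serves) and not taken from this patch:

  /* 1st enum val must equal _base; symbols must match the string literals */
  enum drm_debug_cat { DRM_UT_CORE, DRM_UT_DRIVER, DRM_UT_KMS };

  DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0,
                          "DRM_UT_CORE", "DRM_UT_DRIVER", "DRM_UT_KMS");

Per the _var description above, the declaring module would hand drm_debug_classes to module_param_cb() if it wants a sysfs node.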
Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-15-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 55 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 3c9690da44d9..98dbf1d49984 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -56,6 +56,61 @@ struct _ddebug { #endif } __attribute__((aligned(8))); +enum class_map_type { + DD_CLASS_TYPE_DISJOINT_BITS, + /** + * DD_CLASS_TYPE_DISJOINT_BITS: classes are independent, one per bit. + * expecting hex input. Built for drm.debug, basis for other types. + */ + DD_CLASS_TYPE_LEVEL_NUM, + /** + * DD_CLASS_TYPE_LEVEL_NUM: input is numeric level, 0-N. + * N turns on just bits N-1 .. 0, so N=0 turns all bits off. + */ + DD_CLASS_TYPE_DISJOINT_NAMES, + /** + * DD_CLASS_TYPE_DISJOINT_NAMES: input is a CSV of [+-]CLASS_NAMES, + * classes are independent, like _DISJOINT_BITS. + */ + DD_CLASS_TYPE_LEVEL_NAMES, + /** + * DD_CLASS_TYPE_LEVEL_NAMES: input is a CSV of [+-]CLASS_NAMES, + * intended for names like: INFO,DEBUG,TRACE, with a module prefix + * avoid EMERG,ALERT,CRIT,ERR,WARNING: they're not debug + */ +}; + +struct ddebug_class_map { + struct list_head link; + struct module *mod; + const char *mod_name; /* needed for builtins */ + const char **class_names; + const int length; + const int base; /* index of 1st .class_id, allows split/shared space */ + enum class_map_type map_type; +}; + +/** + * DECLARE_DYNDBG_CLASSMAP - declare classnames known by a module + * @_var: a struct ddebug_class_map, passed to module_param_cb + * @_type: enum class_map_type, chooses bits/verbose, numeric/symbolic + * @_base: offset of 1st class-name. splits .class_id space + * @classes: class-names used to control class'd prdbgs + */ +#define DECLARE_DYNDBG_CLASSMAP(_var, _maptype, _base, ...) \ + static const char *_var##_classnames[] = { __VA_ARGS__ }; \ + static struct ddebug_class_map __aligned(8) __used \ + __section("__dyndbg_classes") _var = { \ + .mod = THIS_MODULE, \ + .mod_name = KBUILD_MODNAME, \ + .base = _base, \ + .map_type = _maptype, \ + .length = NUM_TYPE_ARGS(char*, __VA_ARGS__), \ + .class_names = _var##_classnames, \ + } +#define NUM_TYPE_ARGS(eltype, ...) \ + (sizeof((eltype[]){__VA_ARGS__}) / sizeof(eltype)) + /* encapsulate linker provided built-in (or module) dyndbg data */ struct _ddebug_info { struct _ddebug *descs; -- cgit v1.2.3 From 66f4006b6ace1a1a1a1dca4225972f79a298e251 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:52 -0600 Subject: kernel/module: add __dyndbg_classes section Add __dyndbg_classes section, using __dyndbg as a model. Use it: vmlinux.lds.h: KEEP the new section, which also silences orphan section warning on loadable modules. Add (__start_/__stop_)__dyndbg_classes linker symbols for the c externs (below). kernel/module/main.c: - fill new fields in find_module_sections(), using section_objs() - extend callchain prototypes to pass classes, length load_module(): pass new info to dynamic_debug_setup() dynamic_debug_setup(): new params, pass through to ddebug_add_module() dynamic_debug.c: - add externs to the linker symbols. ddebug_add_module(): - It currently builds a debug_table, and *will* find and attach classes. dynamic_debug_init(): - add class fields to the _ddebug_info cursor var: di. 
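For reference, a sketch of the consuming side implied above; the extern names follow mechanically from the (__start_/__stop_)__dyndbg_classes linker symbols just described:

  /* dynamic_debug.c: bounds of the built-in classmaps, provided by the linker */
  extern struct ddebug_class_map __start___dyndbg_classes[];
  extern struct ddebug_class_map __stop___dyndbg_classes[];

dynamic_debug_init() can then derive num_classes from the difference of the two, using __dyndbg as the model.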
Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-16-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 98dbf1d49984..9073a43a2039 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -114,7 +114,9 @@ struct ddebug_class_map { /* encapsulate linker provided built-in (or module) dyndbg data */ struct _ddebug_info { struct _ddebug *descs; + struct ddebug_class_map *classes; unsigned int num_descs; + unsigned int num_classes; }; #if defined(CONFIG_DYNAMIC_DEBUG_CORE) -- cgit v1.2.3 From b9400852c0801aebee4fc8b62e6b7cc69c7fcbda Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 4 Sep 2022 15:40:57 -0600 Subject: dyndbg: add drm.debug style (drm/parameters/debug) bitmap support Add kernel_param_ops and callbacks to use a class-map to validate and apply input to a sysfs-node, which allows users to control classes defined in that class-map. This supports uses like: echo 0x3 > /sys/module/drm/parameters/debug IE add these: - int param_set_dyndbg_classes() - int param_get_dyndbg_classes() - struct kernel_param_ops param_ops_dyndbg_classes Following the model of kernel/params.c STANDARD_PARAM_DEFS, these are non-static and exported. This might be unnecessary here. get/set use an augmented kernel_param; the arg refs a new struct ddebug_class_param, which contains: - A ptr to user's state-store; a union of &ulong for drm.debug, &int for nouveau level debug. By ref'g the client's bit-state _var, code coordinates with existing code (like drm_debug_enabled) which uses it, so existing/remaining calls can work unchanged. Changing drm.debug to a ulong allows use of BIT() etc. - FLAGS: dyndbg.flags toggled by changes to bitmap. Usually just "p". - MAP: a pointer to struct ddebug_class_map, which maps those class-names to .class_ids 0..N that the module is using. This class-map is declared & initialized by DECLARE_DYNDBG_CLASSMAP. - map-type: 4 enums DD_CLASS_TYPE_* select 2 input forms and 2 meanings. numeric input: DD_CLASS_TYPE_DISJOINT_BITS integer input, independent bits. ie: drm.debug DD_CLASS_TYPE_LEVEL_NUM integer input, 0..N levels classnames-list (comma separated) input: DD_CLASS_TYPE_DISJOINT_NAMES each name affects a bit, others preserved DD_CLASS_TYPE_LEVEL_NAMES names have level meanings, like kern_levels.h _NAMES - comma-separated classnames (with optional +-) _NUM - numeric input, 0-N expected _BITS - numeric input, 0x1F bitmap form expected _DISJOINT - bits are independent _LEVEL - (x<y) relative levels A typical _DISJOINT_NAMES use: echo +DRM_UT_CORE,-DRM_UT_KMS > /sys/module/drm/parameters/debug_catnames A naive _LEVEL_NAMES use, with one class, that sets all in the class-map according to (x<y): echo +L1 > /sys/module/test_dynamic_debug/parameters/p_level_names : problem solved echo -L1 > /sys/module/test_dynamic_debug/parameters/p_level_names Note this artifact: : this is same as prev cmd (due to +/-) echo L0 > /sys/module/test_dynamic_debug/parameters/p_level_names : this is "even-more" off, but same w/o __pr_debug_cls(L0, ".."). echo -L0 > /sys/module/test_dynamic_debug/parameters/p_level_names A stress-test/make-work usage (kid toggling a light switch): echo +L7,L0,L7,L0,L7,L0,L7,L0,L7,L0,L7,L0,L7 \ > /sys/module/test_dynamic_debug/parameters/p_level_names ddebug_apply_class_bitmap(): inside-fn, works on bitmaps, receives new-bits, finds diffs vs client-bitvector holding "current" state, and issues exec_query to commit the adjustment.
param_set_dyndbg_classes(): interface fn, sends _NAMES to param_set_dyndbg_classnames() and returns, falls thru to handle _BITS, _NUM internally, and calls ddebug_apply_class_bitmap(). Finishes by updating state. param_set_dyndbg_classnames(): handles classnames-list in loop, calls ddebug_apply_class_bitmap for each, then updates state. NOTES: _LEVEL_ is overlay on _DISJOINT_; inputs are converted to a bitmask, by the callbacks. IOW this is possible, and possibly confusing: echo class V3 +p > control echo class V1 -p > control IMO that's ok, relative verbosity is an interface property. _LEVEL_NUM maps still need class-names, even though the names are not usable at the sysfs interface (unlike with _NAMES style). The names are the only way to >control the classes. - It must have a "V0" name, something below "V1" to turn "V1" off. __pr_debug_cls(V0,..) is printk, don't do that. - "class names" is required at the >control interface. - relative levels are not enforced at >control _LEVEL_NAMES bear +/- signs, which alter the on-bit-pos by 1. IOW, +L2 means L0,L1,L2, and -L2 means just L0,L1. This kinda spoils the readback fidelity, since the L0 bit gets turned on by any use of any L*, except "-L0". All the interface uncertainty here pertains to the _NAMES features. Nobody has actually asked for this, so it's practical (if a little tedious) to split it out. Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20220904214134.408619-21-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 9073a43a2039..41682278d2e8 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -119,6 +119,15 @@ struct _ddebug_info { unsigned int num_classes; }; +struct ddebug_class_param { + union { + unsigned long *bits; + unsigned int *lvl; + }; + char flags[8]; + const struct ddebug_class_map *map; +}; + #if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug_info *dyndbg, const char *modname); @@ -278,6 +287,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, KERN_DEBUG, prefix_str, prefix_type, \ rowsize, groupsize, buf, len, ascii) +struct kernel_param; +int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp); +int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp); + /* for test only, generally expect drm.debug style macro wrappers */ #define __pr_debug_cls(cls, fmt, ...)
do { \ BUILD_BUG_ON_MSG(!__builtin_constant_p(cls), \ @@ -324,6 +337,14 @@ static inline int ddebug_dyndbg_module_param_cb(char *param, char *val, rowsize, groupsize, buf, len, ascii); \ } while (0) +struct kernel_param; +static inline int param_set_dyndbg_classes(const char *instr, const struct kernel_param *kp) +{ return 0; } +static inline int param_get_dyndbg_classes(char *buffer, const struct kernel_param *kp) +{ return 0; } + #endif /* !CONFIG_DYNAMIC_DEBUG_CORE */ +extern const struct kernel_param_ops param_ops_dyndbg_classes; + #endif -- cgit v1.2.3 From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:12:58 +0200 Subject: bpf: split btf_check_subprog_arg_match in two btf_check_subprog_arg_match() was used twice in verifier.c: - when checking for the type mismatches between a (sub)prog declaration and BTF - when checking the call of a subprog to see if the provided arguments are correct and valid This is problematic when we check if the first argument of a program (pointer to ctx) is correctly accessed: To be able to ensure we access valid memory in the ctx, the verifier assumes the pointer to context is not null. This has the side effect of marking the program as accessing the entire context, even if the context is never dereferenced. For example, by checking the context access with the current code, the following eBPF program would fail with -EINVAL if the ctx is set to null from userspace: ``` SEC("syscall") int prog(struct my_ctx *args) { return 0; } ``` In that particular case, we do not want to actually check that the memory is correct while checking for the BTF validity, but we just want to ensure that the (sub)prog definition matches the BTF we have. So split btf_check_subprog_arg_match() in two so we can actually check for the memory used when in a call, and ignore that part when not. Note that a further patch is in preparation to disentangle btf_check_func_arg_match() from these two purposes, and so right now we just add a new hack around that by adding a boolean to this function. Signed-off-by: Benjamin Tissoires Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d32f125f4af..3cf161cfd396 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, -- cgit v1.2.3 From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:02 +0200 Subject: bpf/verifier: allow kfunc to return an allocated mem For drivers (outside of network), the incoming data is not statically defined in a struct. Most of the time the data buffer is kzalloc-ed and thus we can not rely on eBPF and BTF to explore the data. This commit allows returning arbitrary memory, previously allocated by the driver.
An interesting extra point is that the kfunc can mark the exported memory region as read only or read/write. So, when a kfunc is not returning a pointer to a struct but to a plain type, we can consider it valid allocated memory assuming that: - one of the arguments is either called rdonly_buf_size or rdwr_buf_size - and this argument is a const from the caller's point of view We can then use this parameter as the size of the allocated memory. The memory is either read-only or read-write based on the name of the size parameter. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++++- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf161cfd396..79883f883ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +struct bpf_kfunc_arg_meta { + u64 r0_size; + bool r0_rdonly; + int ref_obj_id; + u32 flags; +}; + struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); @@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags); + struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fdddbf3546b..8fbc1d05281e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +int mark_chain_precision(struct bpf_verifier_env *env, int regno); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ diff --git a/include/linux/btf.h b/include/linux/btf.h index ad93c2d9cc1c..1fcc833a8690 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt } #endif +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} + #endif -- cgit v1.2.3 From 448325199f574d33824dbf9121efb03558412966 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:14 +0200 Subject: bpf: Add copy_map_value_long to copy to remote percpu memory bpf_long_memcpy is used while copying to remote percpu regions from BPF syscall and helpers, so that the copy is atomic at word size granularity. This might not be possible when you copy from map value hosting kptrs from or to percpu maps, as the alignment or size in disjoint regions may not be a multiple of word size.
Hence, to avoid complicating the copy loop, we only use bpf_long_memcpy when special fields are not present, otherwise use normal memcpy to copy the disjoint regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 52 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79883f883ff3..6a73e94821c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,14 +280,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) } } -/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ -static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + +/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ +static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) { u32 curr_off = 0; int i; if (likely(!map->off_arr)) { - memcpy(dst, src, map->value_size); + if (long_memcpy) + bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + else + memcpy(dst, src, map->value_size); return; } @@ -299,6 +318,17 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } + +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, false); +} + +static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, true); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); @@ -1827,22 +1857,6 @@ int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); -/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and - * forced to use 'long' read/writes to try to atomically copy long counters. - * Best-effort only. No barriers here, since it _will_ race with concurrent - * updates from BPF programs. Called from bpf syscall and mostly used with - * size 8 or 16 bytes, so ask compiler to inline it. 
- */ -static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) -{ - const long *lsrc = src; - long *ldst = dst; - - size /= sizeof(long); - while (size--) - *ldst++ = *lsrc++; -} - /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); -- cgit v1.2.3 From cc48755808c646666436745b35629c3f0d05e165 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:16 +0200 Subject: bpf: Add zero_map_value to zero map value with special fields We need this helper to skip over special fields (bpf_spin_lock, bpf_timer, kptrs) while zeroing a map value. Use the same logic as copy_map_value but memset instead of memcpy. Currently, the code zeroing map value memory does not have to deal with special fields, hence this is a prerequisite for introducing such support. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6a73e94821c4..48ae05099f36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src __copy_map_value(map, dst, src, true); } +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + u32 curr_off = 0; + int i; + + if (likely(!map->off_arr)) { + memset(dst, 0, map->value_size); + return; + } + + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memset(dst + curr_off, 0, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; + } + memset(dst + curr_off, 0, map->value_size - curr_off); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); -- cgit v1.2.3 From 5950e5d574c636a07dd21a872c2f8b41f6d20c55 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:17 +0200 Subject: freezer: Have {,un}lock_system_sleep() save/restore flags Rafael explained that the reason for having both PF_NOFREEZE and PF_FREEZER_SKIP is that {,un}lock_system_sleep() is callable from kthread context that has previously called set_freezable(). In preparation for merging the flags, have {,un}lock_system_sleep() save and restore current->flags. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J.
Wysocki Link: https://lore.kernel.org/r/20220822114648.725003428@infradead.org --- include/linux/suspend.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 70f2921e2e70..34fb72c0db85 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -510,8 +510,8 @@ extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); extern void pm_print_active_wakeup_sources(void); -extern void lock_system_sleep(void); -extern void unlock_system_sleep(void); +extern unsigned int lock_system_sleep(void); +extern void unlock_system_sleep(unsigned int); #else /* !CONFIG_PM_SLEEP */ @@ -534,8 +534,8 @@ static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} -static inline void lock_system_sleep(void) {} -static inline void unlock_system_sleep(void) {} +static inline unsigned int lock_system_sleep(void) { return 0; } +static inline void unlock_system_sleep(unsigned int flags) {} #endif /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From 1fbcaa923ce2d7e6de17abd74fa076dc1e0be1a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:18 +0200 Subject: freezer,umh: Clean up freezer/initrd interaction handle_initrd() marks itself as PF_FREEZER_SKIP in order to ensure that the UMH, which is going to freeze the system, doesn't indefinitely wait for its caller. Rework things by adding UMH_FREEZABLE to indicate the completion is freezable. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220822114648.791019324@infradead.org --- include/linux/umh.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/umh.h b/include/linux/umh.h index 244aff638220..5d1f6129b847 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -11,10 +11,11 @@ struct cred; struct file; -#define UMH_NO_WAIT 0 /* don't wait at all */ -#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ -#define UMH_WAIT_PROC 2 /* wait for the process to complete */ -#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ +#define UMH_NO_WAIT 0x00 /* don't wait at all */ +#define UMH_WAIT_EXEC 0x01 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 0x02 /* wait for the process to complete */ +#define UMH_KILLABLE 0x04 /* wait for EXEC/PROC killable */ +#define UMH_FREEZABLE 0x08 /* wait for EXEC/PROC freezable */ struct subprocess_info { struct work_struct work; -- cgit v1.2.3 From f9fc8cad9728124cefe8844fb53d1814c92c6bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Sep 2022 12:39:55 +0200 Subject: sched: Add TASK_ANY for wait_task_inactive() Now that wait_task_inactive()'s @match_state argument is a mask (like ttwu()) it is possible to replace the special !match_state case with an 'all-states' value such that any blocked state will match.
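For illustration, a call site that previously passed 0 for the "don't care" case could now be written as (a sketch, not taken from this patch):

  /* wait until @p is fully off its CPU, in whatever blocked state */
  wait_task_inactive(p, TASK_ANY);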
Suggested-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YxhkzfuFTvRnpUaH@hirez.programming.kicks-ass.net --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..0facaadbae71 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -101,6 +101,8 @@ struct task_group; #define TASK_RTLOCK_WAIT 0x1000 #define TASK_STATE_MAX 0x2000 +#define TASK_ANY (TASK_STATE_MAX-1) + /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -- cgit v1.2.3 From 929659acea03db6411a32de9037abab9f856f586 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:20 +0200 Subject: sched/completion: Add wait_for_completion_state() Allows waiting with a custom @state. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lore.kernel.org/r/20220822114648.922711674@infradead.org --- include/linux/completion.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index 51d9ab079629..62b32b19e0a8 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -103,6 +103,7 @@ extern void wait_for_completion(struct completion *); extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); +extern int wait_for_completion_state(struct completion *x, unsigned int state); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); extern unsigned long wait_for_completion_io_timeout(struct completion *x, -- cgit v1.2.3 From 3f884a10ab41efe2d495bf8ec8d443d70c500a60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:21 +0200 Subject: sched/wait: Add wait_event_state() Allows waiting with a custom @state. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lore.kernel.org/r/20220822114648.989212021@infradead.org --- include/linux/wait.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 58cfbf81447c..b926eb9f3e26 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -932,6 +932,34 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_state(wq, condition, state) \ + ___wait_event(wq, condition, state, 0, 0, schedule()) + +/** + * wait_event_state - sleep until a condition gets true + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @state: state to sleep in + * + * The process is put to sleep (@state) until the @condition evaluates to true + * or a signal is received (when allowed by @state). The @condition is checked + * each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function will return -ERESTARTSYS if it was interrupted by a signal + * (when allowed by @state) and 0 if @condition evaluated to true.
+ */ +#define wait_event_state(wq_head, condition, state) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_state(wq_head, condition, state); \ + __ret; \ +}) + #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ -- cgit v1.2.3 From 9963e444f71e671bcbc30d61cf23a2c686ac7d05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Sep 2022 13:11:38 +0200 Subject: sched: Widen TASK_state literals In preparation for adding more states, add a few 0s to the literals as we've just about run out. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0facaadbae71..9fc23a2d9813 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -81,25 +81,25 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0x0000 -#define TASK_INTERRUPTIBLE 0x0001 -#define TASK_UNINTERRUPTIBLE 0x0002 -#define __TASK_STOPPED 0x0004 -#define __TASK_TRACED 0x0008 +#define TASK_RUNNING 0x00000000 +#define TASK_INTERRUPTIBLE 0x00000001 +#define TASK_UNINTERRUPTIBLE 0x00000002 +#define __TASK_STOPPED 0x00000004 +#define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 0x0010 -#define EXIT_ZOMBIE 0x0020 +#define EXIT_DEAD 0x00000010 +#define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_PARKED 0x0040 -#define TASK_DEAD 0x0080 -#define TASK_WAKEKILL 0x0100 -#define TASK_WAKING 0x0200 -#define TASK_NOLOAD 0x0400 -#define TASK_NEW 0x0800 +#define TASK_PARKED 0x00000040 +#define TASK_DEAD 0x00000080 +#define TASK_WAKEKILL 0x00000100 +#define TASK_WAKING 0x00000200 +#define TASK_NOLOAD 0x00000400 +#define TASK_NEW 0x00000800 /* RT specific auxilliary flag to mark RT lock waiters */ -#define TASK_RTLOCK_WAIT 0x1000 -#define TASK_STATE_MAX 0x2000 +#define TASK_RTLOCK_WAIT 0x00001000 +#define TASK_STATE_MAX 0x00002000 #define TASK_ANY (TASK_STATE_MAX-1) -- cgit v1.2.3 From f5d39b020809146cc28e6e73369bf8065e0310aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:22 +0200 Subject: freezer,sched: Rewrite core freezer logic Rewrite the core freezer to behave better wrt thawing and be simpler in general. By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is ensured frozen tasks stay frozen until thawed and don't randomly wake up early, as is currently possible. As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up two PF_flags (yay!). Specifically, the current scheme works a little like: freezer_do_not_count(); schedule(); freezer_count(); And either the task is blocked, or it lands in try_to_freeze() through freezer_count(). Now, when it is blocked, the freezer considers it frozen and continues. However, on thawing, once pm_freezing is cleared, freezer_count() stops working, and any random/spurious wakeup will let a task run before its time. That is, thawing tries to thaw things in explicit order; kernel threads and workqueues before bringing SMP back, before userspace, etc. However due to the above mentioned races it is entirely possible for userspace tasks to thaw (by accident) before SMP is back.
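To put the old and the new scheme side by side, a minimal sketch (the surrounding caller is hypothetical; the new form matches the wait.h hunks in this patch):

  /* old: hide from the freezer around a sleep */
  freezer_do_not_count();
  schedule();
  freezer_count();	/* a spurious wakeup here can run the task before SMP is back */

  /* new: fold freezability into the blocked state itself */
  set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
  schedule();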
This can be a fatal problem in asymmetric ISA architectures (eg ARMv9) where the userspace task requires a special CPU to run. As said; replace this with a special task state TASK_FROZEN and add the following state transitions: TASK_FREEZABLE -> TASK_FROZEN __TASK_STOPPED -> TASK_FROZEN __TASK_TRACED -> TASK_FROZEN The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL (IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state is already required to deal with spurious wakeups and the freezer causes one such when thawing the task (since the original state is lost). The special __TASK_{STOPPED,TRACED} states *can* be restored since their canonical state is in ->jobctl. With this, frozen tasks need an explicit TASK_FROZEN wakeup and are free of undue (early / spurious) wakeups. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org --- include/linux/freezer.h | 245 ++----------------------------------------- include/linux/sched.h | 13 ++- include/linux/sunrpc/sched.h | 7 +- include/linux/wait.h | 12 +-- 4 files changed, 26 insertions(+), 251 deletions(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0621c5f86c39..b303472255be 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -8,9 +8,11 @@ #include #include #include +#include #ifdef CONFIG_FREEZER -extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */ +DECLARE_STATIC_KEY_FALSE(freezer_active); + extern bool pm_freezing; /* PM freezing in effect */ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ @@ -22,10 +24,7 @@ extern unsigned int freeze_timeout_msecs; /* * Check if a process has been frozen */ -static inline bool frozen(struct task_struct *p) -{ - return p->flags & PF_FROZEN; -} +extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); @@ -34,9 +33,10 @@ extern bool freezing_slow_path(struct task_struct *p); */ static inline bool freezing(struct task_struct *p) { - if (likely(!atomic_read(&system_freezing_cnt))) - return false; - return freezing_slow_path(p); + if (static_branch_unlikely(&freezer_active)) + return freezing_slow_path(p); + + return false; } /* Takes and releases task alloc lock using task_lock() */ @@ -48,23 +48,14 @@ extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); -/* - * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION - * If try_to_freeze causes a lockdep warning it means the caller may deadlock - */ -static inline bool try_to_freeze_unsafe(void) +static inline bool try_to_freeze(void) { might_sleep(); if (likely(!freezing(current))) return false; - return __refrigerator(false); -} - -static inline bool try_to_freeze(void) -{ if (!(current->flags & PF_NOFREEZE)) debug_check_no_locks_held(); - return try_to_freeze_unsafe(); + return __refrigerator(false); } extern bool freeze_task(struct task_struct *p); @@ -79,195 +70,6 @@ static inline bool cgroup_freezing(struct task_struct *task) } #endif /* !CONFIG_CGROUP_FREEZER */ -/* - * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it - * calls wait_for_completion(&vfork) and reset right after it returns from this - * function. Next, the parent should call try_to_freeze() to freeze itself - * appropriately in case the child has exited before the freezing of tasks is - * complete. 
However, we don't want kernel threads to be frozen in unexpected - * places, so we allow them to block freeze_processes() instead or to set - * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the - * parent won't really block freeze_processes(), since ____call_usermodehelper() - * (the child) does a little before exec/exit and it can't be frozen before - * waking up the parent. - */ - - -/** - * freezer_do_not_count - tell freezer to ignore %current - * - * Tell freezers to ignore the current task when determining whether the - * target frozen state is reached. IOW, the current task will be - * considered frozen enough by freezers. - * - * The caller shouldn't do anything which isn't allowed for a frozen task - * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair - * wrap a scheduling operation and nothing much else. - */ -static inline void freezer_do_not_count(void) -{ - current->flags |= PF_FREEZER_SKIP; -} - -/** - * freezer_count - tell freezer to stop ignoring %current - * - * Undo freezer_do_not_count(). It tells freezers that %current should be - * considered again and tries to freeze if freezing condition is already in - * effect. - */ -static inline void freezer_count(void) -{ - current->flags &= ~PF_FREEZER_SKIP; - /* - * If freezing is in progress, the following paired with smp_mb() - * in freezer_should_skip() ensures that either we see %true - * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP. - */ - smp_mb(); - try_to_freeze(); -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline void freezer_count_unsafe(void) -{ - current->flags &= ~PF_FREEZER_SKIP; - smp_mb(); - try_to_freeze_unsafe(); -} - -/** - * freezer_should_skip - whether to skip a task when determining frozen - * state is reached - * @p: task in quesion - * - * This function is used by freezers after establishing %true freezing() to - * test whether a task should be skipped when determining the target frozen - * state is reached. IOW, if this function returns %true, @p is considered - * frozen enough. - */ -static inline bool freezer_should_skip(struct task_struct *p) -{ - /* - * The following smp_mb() paired with the one in freezer_count() - * ensures that either freezer_count() sees %true freezing() or we - * see cleared %PF_FREEZER_SKIP and return %false. This makes it - * impossible for a task to slip frozen state testing after - * clearing %PF_FREEZER_SKIP. - */ - smp_mb(); - return p->flags & PF_FREEZER_SKIP; -} - -/* - * These functions are intended to be used whenever you want allow a sleeping - * task to be frozen. Note that neither return any clear indication of - * whether a freeze event happened while in this function. - */ - -/* Like schedule(), but should not block the freezer. */ -static inline void freezable_schedule(void) -{ - freezer_do_not_count(); - schedule(); - freezer_count(); -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline void freezable_schedule_unsafe(void) -{ - freezer_do_not_count(); - schedule(); - freezer_count_unsafe(); -} - -/* - * Like schedule_timeout(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline long freezable_schedule_timeout(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout(timeout); - freezer_count(); - return __retval; -} - -/* - * Like schedule_timeout_interruptible(), but should not block the freezer. Do not - * call this with locks held. 
- */ -static inline long freezable_schedule_timeout_interruptible(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_interruptible(timeout); - freezer_count(); - return __retval; -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout) -{ - long __retval; - - freezer_do_not_count(); - __retval = schedule_timeout_interruptible(timeout); - freezer_count_unsafe(); - return __retval; -} - -/* Like schedule_timeout_killable(), but should not block the freezer. */ -static inline long freezable_schedule_timeout_killable(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_killable(timeout); - freezer_count(); - return __retval; -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline long freezable_schedule_timeout_killable_unsafe(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_killable(timeout); - freezer_count_unsafe(); - return __retval; -} - -/* - * Like schedule_hrtimeout_range(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - u64 delta, const enum hrtimer_mode mode) -{ - int __retval; - freezer_do_not_count(); - __retval = schedule_hrtimeout_range(expires, delta, mode); - freezer_count(); - return __retval; -} - -/* - * Freezer-friendly wrappers around wait_event_interruptible(), - * wait_event_killable() and wait_event_interruptible_timeout(), originally - * defined in - */ - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -#define wait_event_freezekillable_unsafe(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_killable(wq, (condition)); \ - freezer_count_unsafe(); \ - __retval; \ -}) - #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } @@ -281,35 +83,8 @@ static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } -static inline void freezer_do_not_count(void) {} -static inline void freezer_count(void) {} -static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} -#define freezable_schedule() schedule() - -#define freezable_schedule_unsafe() schedule() - -#define freezable_schedule_timeout(timeout) schedule_timeout(timeout) - -#define freezable_schedule_timeout_interruptible(timeout) \ - schedule_timeout_interruptible(timeout) - -#define freezable_schedule_timeout_interruptible_unsafe(timeout) \ - schedule_timeout_interruptible(timeout) - -#define freezable_schedule_timeout_killable(timeout) \ - schedule_timeout_killable(timeout) - -#define freezable_schedule_timeout_killable_unsafe(timeout) \ - schedule_timeout_killable(timeout) - -#define freezable_schedule_hrtimeout_range(expires, delta, mode) \ - schedule_hrtimeout_range(expires, delta, mode) - -#define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - #endif /* !CONFIG_FREEZER */ #endif /* FREEZER_H_INCLUDED */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fc23a2d9813..4c91efd6df82 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -97,12 +97,19 @@ struct task_group; #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 -/* RT specific auxilliary flag to 
mark RT lock waiters */ #define TASK_RTLOCK_WAIT 0x00001000 -#define TASK_STATE_MAX 0x00002000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) +/* + * DO NOT ADD ANY NEW USERS ! + */ +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) + /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) @@ -1716,7 +1723,6 @@ extern struct pid *cad_pid; #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ @@ -1727,7 +1733,6 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index acc62647317c..baeca2f564dc 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -252,7 +252,7 @@ int rpc_malloc(struct rpc_task *); void rpc_free(struct rpc_task *); int rpciod_up(void); void rpciod_down(void); -int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); +int rpc_wait_for_completion_task(struct rpc_task *task); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct net; void rpc_show_tasks(struct net *); @@ -264,11 +264,6 @@ extern struct workqueue_struct *xprtiod_workqueue; void rpc_prepare_task(struct rpc_task *task); gfp_t rpc_task_gfp_mask(void); -static inline int rpc_wait_for_completion_task(struct rpc_task *task) -{ - return __rpc_wait_for_completion_task(task, NULL); -} - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) static inline const char * rpc_qname(const struct rpc_wait_queue *q) { diff --git a/include/linux/wait.h b/include/linux/wait.h index b926eb9f3e26..14ad8a0e9fac 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -361,8 +361,8 @@ do { \ } while (0) #define __wait_event_freezable(wq_head, condition) \ - ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ - freezable_schedule()) + ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ + 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true @@ -420,8 +420,8 @@ do { \ #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, timeout, \ - __ret = freezable_schedule_timeout(__ret)) + (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ + __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE 
to avoid @@ -642,8 +642,8 @@ do { \ #define __wait_event_freezable_exclusive(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ - freezable_schedule()) + ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ + schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ -- cgit v1.2.3 From fb04563d1cae6f361892b4a339ad92100b1eb0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Sep 2022 13:27:32 +0200 Subject: sched: Show PF_flag holes Put placeholders in PF_flag so we can see how holey it is. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/YxctoffFFPXONESt@hirez.programming.kicks-ass.net --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c91efd6df82..15e3bd96e4ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1722,7 +1722,9 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ +#define PF__HOLE__00004000 0x00004000 #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ +#define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ @@ -1730,9 +1732,14 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ +#define PF__HOLE__00800000 0x00800000 +#define PF__HOLE__01000000 0x01000000 +#define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ +#define PF__HOLE__20000000 0x20000000 +#define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* -- cgit v1.2.3 From 03b02db93be407103c385814033633364674a6f6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Sep 2022 14:14:14 +0530 Subject: perf: Consolidate branch sample filter helpers Besides the branch type filtering requests, 'event.attr.branch_sample_type' also contains various flags indicating which additional information should be captured, along with the base branch record. These flags help configure the underlying hardware, and capture the branch records appropriately when required e.g after PMU interrupt. But first, this moves an existing helper perf_sample_save_hw_index() into the header before adding some more helpers for other branch sample filter flags. 
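A hedged sketch of the intended call pattern in a PMU driver; the MY_PMU_* config bits are hypothetical, purely for illustration:

  static u64 my_pmu_branch_config(struct perf_event *event)
  {
  	u64 config = 0;

  	/* translate sample-type filter requests into hardware filter bits */
  	if (branch_sample_no_flags(event))
  		config |= MY_PMU_BR_NO_FLAGS;	/* hypothetical hw bit */
  	if (branch_sample_hw_index(event))
  		config |= MY_PMU_BR_HW_INDEX;	/* hypothetical hw bit */
  	return config;
  }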
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220906084414.396220-1-anshuman.khandual@arm.com --- include/linux/perf_event.h | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 581880ddb9ef..a627528e5f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1685,4 +1685,30 @@ static inline void perf_lopwr_cb(bool mode) } #endif +#ifdef CONFIG_PERF_EVENTS +static inline bool branch_sample_no_flags(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; +} + +static inline bool branch_sample_no_cycles(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; +} + +static inline bool branch_sample_type(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; +} + +static inline bool branch_sample_hw_index(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + +static inline bool branch_sample_priv(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; +} +#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 7517f08b9a5eef0fa683b976c97d6178d00e6a3d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:21 +0530 Subject: perf/core: Expand PERF_EVENT_FLAG_ARCH Two hardware event flags on the x86 platform have overshot PERF_EVENT_FLAG_ARCH (0x0000ffff). These flags are PERF_X86_EVENT_PEBS_LAT_HYBRID (0x20000) and PERF_X86_EVENT_AMD_BRS (0x10000). Let's expand the PERF_EVENT_FLAG_ARCH mask to accommodate those flags, and also create room for two more in the future. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-2-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a627528e5f3a..3e3c07512b75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -138,7 +138,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 /** -- cgit v1.2.3 From f67dd218fafd9de9a13d095e775b621db76a058f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:22 +0530 Subject: perf/core: Assert PERF_EVENT_FLAG_ARCH does not overlap with generic flags This just ensures that PERF_EVENT_FLAG_ARCH does not overlap with generic hardware event flags.
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-3-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e3c07512b75..f88cb31eaf75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -141,6 +141,8 @@ struct hw_perf_event_extra { #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); + /** * struct hw_perf_event - performance event hardware details: */ -- cgit v1.2.3 From 91207f62616f9f51b52436364e6d064f002e9112 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:23 +0530 Subject: arm64/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-4-anshuman.khandual@arm.com --- include/linux/perf/arm_pmu.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0407a38b470a..0356cb6a215d 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,11 @@ /* * ARM PMU hw_event flags */ -/* Event uses a 64bit counter */ -#define ARMPMU_EVT_64BIT 1 -/* Event uses a 47bit counter */ -#define ARMPMU_EVT_47BIT 2 +#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ +#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ + +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) PERF_COUNT_HW_CACHE_##_x -- cgit v1.2.3 From f3c0eba287049237b23d1300376768293eb89e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2022 18:48:55 +0200 Subject: perf: Add a few assertions While auditing 6b959ba22d34 ("perf/core: Fix reentry problem in perf_output_read_group()") a few spots were found that wanted assertions. Notably, for_each_sibling_event() relies on exclusion from modification. This would normally be holding either ctx->lock or ctx->mutex, however due to how things are constructed disabling IRQs is a valid and sufficient substitute for ctx->lock. Another possible site to add assertions would be the various pmu::{add,del,read,..}() methods, but that's not trivially expressible in C -- the best option is wrappers, but those are easy enough to forget. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f88cb31eaf75..368bdc4f563f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -634,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified while holding both ctx->lock and ctx->mutex, + * as such iteration must hold either lock.
However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. + */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) || \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) -- cgit v1.2.3 From d219d2a9a92e39aa92799efe8f2aa21259b6dd82 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Aug 2022 13:37:17 -0700 Subject: overflow: Allow mixed type arguments When the check_[op]_overflow() helpers were introduced, all arguments were required to be the same type to make the fallback macros simpler. However, now that the fallback macros have been removed[1], it is fine to allow mixed types, which makes using the helpers much more useful, as they can be used to test for type-based overflows (e.g. adding two large ints but storing into a u8), as would be handy in the drm core[2]. Remove the restriction, and add additional self-tests that exercise some of the mixed-type overflow cases, and double-check for accidental macro side-effects. [1] https://git.kernel.org/linus/4eb6bd55cfb22ffc20652732340c4962f3ac9a91 [2] https://lore.kernel.org/lkml/20220824084514.2261614-2-gwan-gyeong.mun@intel.com Cc: Rasmus Villemoes Cc: Gwan-gyeong Mun Cc: "Gustavo A. R. Silva" Cc: Nick Desaulniers Cc: linux-hardening@vger.kernel.org Reviewed-by: Andrzej Hajda Reviewed-by: Gwan-gyeong Mun Tested-by: Gwan-gyeong Mun Signed-off-by: Kees Cook --- include/linux/overflow.h | 72 +++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 0eb3b192f07a..19dfdd74835e 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -51,40 +51,50 @@ static inline bool __must_check __must_check_overflow(bool overflow) return unlikely(overflow); } -/* - * For simplicity and code hygiene, the fallback code below insists on - * a, b and *d having the same type (similar to the min() and max() - * macros), whereas gcc's type-generic overflow checkers accept - * different types. Hence we don't just make check_add_overflow an - * alias for __builtin_add_overflow, but add type checks similar to - * below. +/** check_add_overflow() - Calculate addition with overflow checking + * + * @a: first addend + * @b: second addend + * @d: pointer to store sum + * + * Returns 0 on success. + * + * *@d holds the results of the attempted addition, but is not considered + * "safe for use" on a non-zero return value, which indicates that the + * sum has overflowed or been truncated. 
*/ -#define check_add_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_add_overflow(__a, __b, __d); \ -})) +#define check_add_overflow(a, b, d) \ + __must_check_overflow(__builtin_add_overflow(a, b, d)) -#define check_sub_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_sub_overflow(__a, __b, __d); \ -})) +/** check_sub_overflow() - Calculate subtraction with overflow checking + * + * @a: minuend; value to subtract from + * @b: subtrahend; value to subtract from @a + * @d: pointer to store difference + * + * Returns 0 on success. + * + * *@d holds the results of the attempted subtraction, but is not considered + * "safe for use" on a non-zero return value, which indicates that the + * difference has underflowed or been truncated. + */ +#define check_sub_overflow(a, b, d) \ + __must_check_overflow(__builtin_sub_overflow(a, b, d)) -#define check_mul_overflow(a, b, d) __must_check_overflow(({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_mul_overflow(__a, __b, __d); \ -})) +/** check_mul_overflow() - Calculate multiplication with overflow checking + * + * @a: first factor + * @b: second factor + * @d: pointer to store product + * + * Returns 0 on success. + * + * *@d holds the results of the attempted multiplication, but is not + * considered "safe for use" on a non-zero return value, which indicates + * that the product has overflowed or been truncated. + */ +#define check_mul_overflow(a, b, d) \ + __must_check_overflow(__builtin_mul_overflow(a, b, d)) /** check_shl_overflow() - Calculate a left-shifted value and check overflow * -- cgit v1.2.3 From dfbafa70bde26c40615f8c538ce68dac82a64fb4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 26 Aug 2022 11:04:43 -0700 Subject: string: Introduce strtomem() and strtomem_pad() One of the "legitimate" uses of strncpy() is copying a NUL-terminated string into a fixed-size non-NUL-terminated character array. To avoid the weaknesses and ambiguity of intent when using strncpy(), provide replacement functions that explicitly distinguish between trailing padding and not, and require the destination buffer size be discoverable by the compiler. For example: struct obj { int foo; char small[4] __nonstring; char big[8] __nonstring; int bar; }; struct obj p; /* This will truncate to 4 chars with no trailing NUL */ strncpy(p.small, "hello", sizeof(p.small)); /* p.small contains 'h', 'e', 'l', 'l' */ /* This will NUL pad to 8 chars. */ strncpy(p.big, "hello", sizeof(p.big)); /* p.big contains 'h', 'e', 'l', 'l', 'o', '\0', '\0', '\0' */ When the "__nonstring" attributes are missing, the intent of the programmer becomes ambiguous for whether the lack of a trailing NUL in the p.small copy is a bug. Additionally, it's not clear whether the trailing padding in the p.big copy is _needed_. Both cases become unambiguous with: strtomem(p.small, "hello"); strtomem_pad(p.big, "hello", 0); See also https://github.com/KSPP/linux/issues/90 Expand the memcpy KUnit tests to include these functions. 
Cc: Wolfram Sang Cc: Nick Desaulniers Cc: Geert Uytterhoeven Cc: Guenter Roeck Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 32 +++++++++++++++++++++++++++++++ include/linux/string.h | 43 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f374..8e8c2c87b1d5 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -77,6 +77,38 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) +/** + * strncpy - Copy a string to memory with non-guaranteed NUL padding + * + * @p: pointer to destination of copy + * @q: pointer to NUL-terminated source string to copy + * @size: bytes to write at @p + * + * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, + * and @p will NOT be NUL-terminated + * + * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes + * will be written to @p until @size total bytes have been written. + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * over-reads of @q, it cannot defend against writing unterminated + * results to @p. Using strncpy() remains ambiguous and fragile. + * Instead, please choose an alternative, so that the expectation + * of @p's contents is unambiguous: + * + * +--------------------+-----------------+------------+ + * | @p needs to be: | padded to @size | not padded | + * +====================+=================+============+ + * | NUL-terminated | strscpy_pad() | strscpy() | + * +--------------------+-----------------+------------+ + * | not NUL-terminated | strtomem_pad() | strtomem() | + * +--------------------+-----------------+------------+ + * + * Note strscpy*()'s differing return values for detecting truncation, + * and strtomem*()'s expectation that the destination is marked with + * __nonstring when it is a character array. + * + */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { diff --git a/include/linux/string.h b/include/linux/string.h index 61ec7e4f6311..cf7607b32102 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -260,6 +260,49 @@ static inline const char *kbasename(const char *path) void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad); +/** + * strtomem_pad - Copy NUL-terminated string to non-NUL-terminated buffer + * + * @dest: Pointer of destination character array (marked as __nonstring) + * @src: Pointer to NUL-terminated string + * @pad: Padding character to fill any remaining bytes of @dest after copy + * + * This is a replacement for strncpy() uses where the destination is not + * a NUL-terminated string, but with bounds checking on the source size, and + * an explicit padding character. If padding is not required, use strtomem(). + * + * Note that the size of @dest is not an argument, as the length of @dest + * must be discoverable by the compiler. 
+ */ +#define strtomem_pad(dest, src, pad) do { \ + const size_t _dest_len = __builtin_object_size(dest, 1); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ + _dest_len == (size_t)-1); \ + memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \ +} while (0) + +/** + * strtomem - Copy NUL-terminated string to non-NUL-terminated buffer + * + * @dest: Pointer of destination character array (marked as __nonstring) + * @src: Pointer to NUL-terminated string + * + * This is a replacement for strncpy() uses where the destination is not + * a NUL-terminated string, but with bounds checking on the source size, and + * without trailing padding. If padding is required, use strtomem_pad(). + * + * Note that the size of @dest is not an argument, as the length of @dest + * must be discoverable by the compiler. + */ +#define strtomem(dest, src) do { \ + const size_t _dest_len = __builtin_object_size(dest, 1); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ + _dest_len == (size_t)-1); \ + memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len))); \ +} while (0) + /** * memset_after - Set a value after a struct member to the end of a struct * -- cgit v1.2.3 From d07c0acb4f41cc42a0d97530946965b3e4fa68c1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Sep 2022 13:02:26 -0700 Subject: fortify: Fix __compiletime_strlen() under UBSAN_BOUNDS_LOCAL With CONFIG_FORTIFY=y and CONFIG_UBSAN_LOCAL_BOUNDS=y enabled, we observe a runtime panic while running Android's Compatibility Test Suite's (CTS) android.hardware.input.cts.tests. This stems from a strlen() call in hidinput_allocate(). __compiletime_strlen() is implemented in terms of __builtin_object_size(), then does an array access to check for NUL-termination. A quirk of __builtin_object_size() is that for strings whose values are runtime dependent, __builtin_object_size(str, 1 or 0) returns the maximum size of possible values when those sizes are determinable at compile time. Example: static const char *v = "FOO BAR"; static const char *y = "FOO BA"; unsigned long x (int z) { // Returns 8, which is: // max(__builtin_object_size(v, 1), __builtin_object_size(y, 1)) return __builtin_object_size(z ? v : y, 1); } So when FORTIFY_SOURCE is enabled, the current implementation of __compiletime_strlen() will try to access beyond the end of y at runtime using the size of v. Mixed with UBSAN_LOCAL_BOUNDS we get a fault. hidinput_allocate() has a local C string whose value is control flow dependent on a switch statement, so __builtin_object_size(str, 1) evaluates to the maximum string length, making all other cases fault on the last character check. hidinput_allocate() could be cleaned up to avoid runtime calls to strlen() since the local variable can only have literal values, so there's no benefit to trying to fortify the strlen call site there. Perform a __builtin_constant_p() check against index 0 earlier in the macro to filter out the control-flow-dependent case. Add a KUnit test for checking the expected behavioral characteristics of FORTIFY_SOURCE internals.
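To illustrate the pattern that faulted, here is a simplified sketch modeled on hidinput_allocate() (not the actual driver code):

	const char *suffix;
	size_t len;

	switch (type) {
	case T_STYLUS:
		suffix = "Stylus";	/* 7 bytes including NUL */
		break;
	default:
		suffix = "Pen";		/* 4 bytes including NUL */
		break;
	}
	/*
	 * __builtin_object_size(suffix, 1) is 7, the maximum of the two
	 * possible values, but *suffix is not __builtin_constant_p() here,
	 * so the fixed macro now falls back to the runtime strlen()
	 * instead of reading suffix[6], which is past the end of "Pen".
	 */
	len = strlen(suffix);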
Cc: Nathan Chancellor Cc: Tom Rix Cc: Andrew Morton Cc: Vlastimil Babka Cc: "Steven Rostedt (Google)" Cc: David Gow Cc: Yury Norov Cc: Masami Hiramatsu Cc: Sander Vanheule Cc: linux-hardening@vger.kernel.org Cc: llvm@lists.linux.dev Reviewed-by: Nick Desaulniers Tested-by: Android Treehugger Robot Link: https://android-review.googlesource.com/c/kernel/common/+/2206839 Co-developed-by: Nick Desaulniers Signed-off-by: Nick Desaulniers Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 8e8c2c87b1d5..be264091f7a7 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -19,7 +19,8 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" unsigned char *__p = (unsigned char *)(p); \ size_t __ret = (size_t)-1; \ size_t __p_size = __builtin_object_size(p, 1); \ - if (__p_size != (size_t)-1) { \ + if (__p_size != (size_t)-1 && \ + __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ -- cgit v1.2.3 From 311fb40aa0569abacc430b0d66ee41470803111f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Sep 2022 13:23:06 -0700 Subject: fortify: Use SIZE_MAX instead of (size_t)-1 Clean up uses of "(size_t)-1" in favor of SIZE_MAX. Cc: linux-hardening@vger.kernel.org Suggested-by: Nick Desaulniers Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index be264091f7a7..e46af17d23d0 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -3,6 +3,7 @@ #define _LINUX_FORTIFY_STRING_H_ #include +#include #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) @@ -17,9 +18,9 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" #define __compiletime_strlen(p) \ ({ \ unsigned char *__p = (unsigned char *)(p); \ - size_t __ret = (size_t)-1; \ + size_t __ret = SIZE_MAX; \ size_t __p_size = __builtin_object_size(p, 1); \ - if (__p_size != (size_t)-1 && \ + if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ @@ -127,7 +128,7 @@ char *strcat(char * const POS p, const char *q) { size_t p_size = __builtin_object_size(p, 1); - if (p_size == (size_t)-1) + if (p_size == SIZE_MAX) return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); @@ -142,7 +143,7 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size size_t ret; /* We can take compile-time actions when maxlen is const. */ - if (__builtin_constant_p(maxlen) && p_len != (size_t)-1) { + if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; @@ -170,7 +171,7 @@ __kernel_size_t __fortify_strlen(const char * const POS p) size_t p_size = __builtin_object_size(p, 1); /* Give up if we don't know how large p is. 
*/ - if (p_size == (size_t)-1) + if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) @@ -187,7 +188,7 @@ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, si size_t q_len; /* Full count of source string length. */ size_t len; /* Count of characters going into destination. */ - if (p_size == (size_t)-1 && q_size == (size_t)-1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcpy(p, q, size); q_len = strlen(q); len = (q_len >= size) ? size - 1 : q_len; @@ -215,7 +216,7 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s size_t q_size = __builtin_object_size(q, 1); /* If we cannot get size of p and q default to call strscpy. */ - if (p_size == (size_t) -1 && q_size == (size_t) -1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* @@ -260,7 +261,7 @@ char *strncat(char * const POS p, const char * const POS q, __kernel_size_t coun size_t p_size = __builtin_object_size(p, 1); size_t q_size = __builtin_object_size(q, 1); - if (p_size == (size_t)-1 && q_size == (size_t)-1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); @@ -301,10 +302,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. - * (The -1 test is to optimize away checks where the buffer + * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ - if (p_size != (size_t)(-1) && p_size < size) + if (p_size != SIZE_MAX && p_size < size) fortify_panic("memset"); } @@ -395,11 +396,11 @@ __FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size, /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. - * (The -1 test is to optimize away checks where the buffer + * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ - if ((p_size != (size_t)(-1) && p_size < size) || - (q_size != (size_t)(-1) && q_size < size)) + if ((p_size != SIZE_MAX && p_size < size) || + (q_size != SIZE_MAX && q_size < size)) fortify_panic(func); } @@ -498,7 +499,7 @@ char *strcpy(char * const POS p, const char * const POS q) size_t size; /* If neither buffer size is known, immediately give up. */ - if (p_size == (size_t)-1 && q_size == (size_t)-1) + if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ -- cgit v1.2.3 From 54d9469bc515dc5fcbc20eecbe19cea868b70d68 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 24 Jun 2021 15:39:26 -0700 Subject: fortify: Add run-time WARN for cross-field memcpy() Enable run-time checking of dynamic memcpy() and memmove() lengths, issuing a WARN when a write would exceed the size of the target struct member, when built with CONFIG_FORTIFY_SOURCE=y. This would have caught all of the memcpy()-based buffer overflows in the last 3 years, specifically covering all the cases where the destination buffer size is known at compile time. This change ONLY adds a run-time warning. As false positives are currently still expected, this will not block the overflow. 
The new warnings will look like this: memcpy: detected field-spanning write (size N) of single field "var->dest" (size M) WARNING: CPU: n PID: pppp at source/file/path.c:nr function+0xXX/0xXX [module] There may be false positives in the kernel where intentional field-spanning writes are happening. These need to be addressed similarly to how the compile-time cases were addressed: add a struct_group(), split the memcpy(), or some other refactoring (see the sketch after this message). In order to make counting/investigating instances of added runtime checks easier, each instance includes the destination variable name as a WARN argument, prefixed with 'field "'. Therefore, on an x86_64 defconfig build, it is trivial to inspect the build artifacts to find instances. For example, there are 78 new run-time memcpy() bounds checks added: $ for i in vmlinux $(find . -name '*.ko'); do \ strings "$i" | grep '^field "'; done | wc -l 78 Simple cases where a destination buffer is known to be a dynamic size do not generate a WARN. For example: struct normal_flex_array { void *a; int b; u32 c; size_t array_size; u8 flex_array[]; }; struct normal_flex_array *instance; ... /* These will be ignored for run-time bounds checking. */ memcpy(instance, src, len); memcpy(instance->flex_array, src, len); However, one of the dynamic-sized destination cases is irritatingly unable to be detected by the compiler: when using memcpy() to target a composite struct member which contains a trailing flexible array struct. For example: struct wrapper { int foo; char bar; struct normal_flex_array embedded; }; struct wrapper *instance; ... /* This will incorrectly WARN when len > sizeof(instance->embedded) */ memcpy(&instance->embedded, src, len); These cases end up appearing to the compiler to be sized as if the flexible array had 0 elements. :( For more details see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 https://godbolt.org/z/vW6x8vh4P These "composite flexible array structure destination" cases will need to be flushed out and addressed on a case-by-case basis. Regardless, for the general case of using memcpy() on flexible array destinations, future APIs will be created to handle common cases. Those can be used to migrate away from open-coded memcpy() so that proper error handling (instead of trapping) can be used. As mentioned, none of these bounds checks block any overflows currently. For users that have tested their workloads, do not encounter any warnings, and wish to make these checks stop any overflows, they can use a big hammer and set the sysctl panic_on_warn=1.
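To make the struct_group() option above concrete, a sketch (not from this patch) of how an intentional field-spanning write can be expressed:

	struct pkt {
		u8 type;
		/* Writable as one unit via memcpy(&p->hdr, ...) */
		struct_group(hdr,
			u8 src[4];
			u8 dst[4];
		);
		u32 crc;
	};

	/* Targets a field whose compile-time size is 8 bytes: no WARN */
	memcpy(&p->hdr, src_buf, 8);

Without the struct_group(), memcpy(&p->src, src_buf, 8) would be seen as a size 8 write into a 4 byte field and trigger the new warning.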
Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 70 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index e46af17d23d0..ff879efe94ed 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ +#include #include #include @@ -353,7 +354,7 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * V = vulnerable to run-time overflow (will need refactoring to solve) * */ -__FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size, +__FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, @@ -402,16 +403,79 @@ __FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size, if ((p_size != SIZE_MAX && p_size < size) || (q_size != SIZE_MAX && q_size < size)) fortify_panic(func); + + /* + * Warn when writing beyond destination field size. + * + * We must ignore p_size_field == 0 for existing 0-element + * fake flexible arrays, until they are all converted to + * proper flexible arrays. + * + * The implementation of __builtin_object_size() behaves + * like sizeof() when not directly referencing a flexible + * array member, which means there will be many bounds checks + * that will appear at run-time, without a way for them to be + * detected at compile-time (as can be done when the destination + * is specifically the flexible array member). + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 + */ + if (p_size_field != 0 && p_size_field != SIZE_MAX && + p_size != p_size_field && p_size_field < size) + return true; + + return false; } #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ size_t __fortify_size = (size_t)(size); \ - fortify_memcpy_chk(__fortify_size, p_size, q_size, \ - p_size_field, q_size_field, #op); \ + WARN_ONCE(fortify_memcpy_chk(__fortify_size, p_size, q_size, \ + p_size_field, q_size_field, #op), \ + #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ + __fortify_size, \ + "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \ + p_size_field); \ __underlying_##op(p, q, __fortify_size); \ }) +/* + * Notes about compile-time buffer size detection: + * + * With these types... + * + * struct middle { + * u16 a; + * u8 middle_buf[16]; + * int b; + * }; + * struct end { + * u16 a; + * u8 end_buf[16]; + * }; + * struct flex { + * int a; + * u8 flex_buf[]; + * }; + * + * void func(TYPE *ptr) { ... 
} + * Cases where destination size cannot be currently detected: + * - the size of ptr's object (seemingly by design, gcc & clang fail): + * __builtin_object_size(ptr, 1) == SIZE_MAX + * - the size of flexible arrays in ptr's obj (by design, dynamic size): + * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX + * - the size of ANY array at the end of ptr's obj (gcc and clang bug): + * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 + * + * Cases where destination size is currently detected: + * - the size of non-array members within ptr's object: + * __builtin_object_size(ptr->a, 1) == 2 + * - the size of non-flexible-array in the middle of ptr's obj: + * __builtin_object_size(ptr->middle_buf, 1) == 16 + * + */ + /* * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. -- cgit v1.2.3 From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:28 +0200 Subject: bpf: Add helper macro bpf_for_each_reg_in_vstate For a lot of use cases in future patches, we will want to modify the state of registers that are part of the same 'group' (e.g. the same ref_obj_id). It won't just be limited to releasing reference state, but setting a type flag dynamically based on certain actions, etc. Hence, we need a way to easily pass a callback to the function that iterates over all registers in current bpf_verifier_state in all frames up to (and including) the curframe. While in C++ we would be able to easily use a lambda to pass state and the callback together, sadly we aren't using C++ in the kernel. The next best thing to avoid defining a function for each case seems like statement expressions in GNU C. The kernel already uses them heavily, hence they can be passed to the macro in the style of a lambda. The statement expression will then be substituted in the for loop bodies. Variables __state and __reg are set to current bpf_func_state and reg for each invocation of the expression inside the passed in verifier state. Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers, release_reference, find_good_pkt_pointers, find_equal_scalars to use bpf_for_each_reg_in_vstate.
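For example (a sketch modeled on the converted release_reference()), dropping every register that shares a ref_obj_id becomes a single statement expression:

	struct bpf_func_state *state;
	struct bpf_reg_state *reg;

	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
		if (reg->ref_obj_id == ref_obj_id)
			__mark_reg_unknown(env, reg);
	}));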
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8fbc1d05281e..b49a349cc6ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -348,6 +348,27 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame)) +/* Invoke __expr over registers in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_verifier_state *___vstate = __vst; \ + int ___i, ___j; \ + for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ + struct bpf_reg_state *___regs; \ + __state = ___vstate->frame[___i]; \ + ___regs = __state->regs; \ + for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ + __reg = &___regs[___j]; \ + (void)(__expr); \ + } \ + bpf_for_each_spilled_reg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ + } \ + }) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; -- cgit v1.2.3 From d01387fc16421cbbf95d1fda8fe1258195396c64 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:31 +0100 Subject: firmware: arm_ffa: Add pointer to the ffa_dev_ops in struct ffa_dev Currently ffa_dev_ops_get() is the way to fetch the ffa_dev_ops pointer. It checks if the ffa_dev structure pointer is valid before returning the ffa_dev_ops pointer. Instead, the pointer can be made part of the ffa_dev structure and since the core driver is in charge of creating ffa_device for each identified partition, there is no need to check for the validity explicitly if the pointer is embedded in the structure. Add the pointer to the ffa_dev_ops in the ffa_dev structure itself and initialise the same as part of creation of the device.
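A client driver then uses the embedded pointer directly, e.g. (illustrative sketch; my_ffa_probe() is hypothetical):

	static int my_ffa_probe(struct ffa_device *ffa_dev)
	{
		const struct ffa_dev_ops *ops = ffa_dev->ops;
		struct ffa_send_direct_data data = {};

		/* previously: ops = ffa_dev_ops_get(ffa_dev); plus a NULL check */
		return ops->sync_send_receive(ffa_dev, &data);
	}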
Link: https://lore.kernel.org/r/20220907145240.1683088-2-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index e5c76c1ef9ed..91b47e42b73d 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -17,6 +17,7 @@ struct ffa_device { bool mode_32bit; uuid_t uuid; struct device dev; + const struct ffa_dev_ops *ops; }; #define to_ffa_dev(d) container_of(d, struct ffa_device, dev) @@ -47,7 +48,8 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) } #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id); +struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, + const struct ffa_dev_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -57,7 +59,8 @@ const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev); #else static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id) +struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, + const struct ffa_dev_ops *ops) { return NULL; } -- cgit v1.2.3 From 55bf84fd0a76894ae29c69b3552e073fa37818be Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:33 +0100 Subject: firmware: arm_ffa: Remove ffa_dev_ops_get() The only user of this exported ffa_dev_ops_get() was the OP-TEE driver which now uses ffa_dev->ops directly, there are no other users for this. Also, since any ffa driver can use ffa_dev->ops directly, there will be no need for ffa_dev_ops_get(), so just remove ffa_dev_ops_get(). Link: https://lore.kernel.org/r/20220907145240.1683088-4-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 91b47e42b73d..556f50f27fb1 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -55,7 +55,6 @@ int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); -const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev); #else static inline @@ -79,11 +78,6 @@ static inline void ffa_driver_unregister(struct ffa_driver *driver) {} static inline bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } -static inline -const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev) -{ - return NULL; -} #endif /* CONFIG_ARM_FFA_TRANSPORT */ #define ffa_register(driver) \ -- cgit v1.2.3 From 8c3812c8f74f050278d734ec4b90149d84bdbefb Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:36 +0100 Subject: firmware: arm_ffa: Make memory apis ffa_device independent There is a requirement to make memory APIs independent of the ffa_device. One of the use-cases is to have a common memory driver that manages the memory for all the ffa_devices. That common memory driver won't be an ffa_driver and won't have any ffa_device associated with it. So having these memory APIs accessible without a ffa_device is needed and should be possible as most of these are handled by the partition manager (SPM or hypervisor).
Drop the ffa_device argument to the memory APIs and make them ffa_device independent. Link: https://lore.kernel.org/r/20220907145240.1683088-7-sudeep.holla@arm.com Acked-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 556f50f27fb1..eafab07c9f58 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -262,10 +262,8 @@ struct ffa_dev_ops { int (*sync_send_receive)(struct ffa_device *dev, struct ffa_send_direct_data *data); int (*memory_reclaim)(u64 g_handle, u32 flags); - int (*memory_share)(struct ffa_device *dev, - struct ffa_mem_ops_args *args); - int (*memory_lend)(struct ffa_device *dev, - struct ffa_mem_ops_args *args); + int (*memory_share)(struct ffa_mem_ops_args *args); + int (*memory_lend)(struct ffa_mem_ops_args *args); }; #endif /* _LINUX_ARM_FFA_H */ -- cgit v1.2.3 From 7aa7a97989557011f762a4b7c2e4e3b061b638e4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:37 +0100 Subject: firmware: arm_ffa: Rename ffa_dev_ops as ffa_ops Except the message APIs, all other APIs are ffa_device independent and can be used without any associated ffa_device from a non ffa_driver. In order to reflect the same, just rename ffa_dev_ops as ffa_ops to avoid any confusion or to keep it simple. Link: https://lore.kernel.org/r/20220907145240.1683088-8-sudeep.holla@arm.com Suggested-by: Sumit Garg Reviewed-by: Sumit Garg Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index eafab07c9f58..4c4b06783035 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -17,7 +17,7 @@ struct ffa_device { bool mode_32bit; uuid_t uuid; struct device dev; - const struct ffa_dev_ops *ops; + const struct ffa_ops *ops; }; #define to_ffa_dev(d) container_of(d, struct ffa_device, dev) @@ -49,7 +49,7 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_dev_ops *ops); + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -59,7 +59,7 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else static inline struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_dev_ops *ops) + const struct ffa_ops *ops) { return NULL; } @@ -254,7 +254,7 @@ struct ffa_mem_ops_args { struct ffa_mem_region_attributes *attrs; }; -struct ffa_dev_ops { +struct ffa_ops { u32 (*api_version_get)(void); int (*partition_info_get)(const char *uuid_str, struct ffa_partition_info *buffer); -- cgit v1.2.3 From bb1be749850055d88d839eff0962e5915788f228 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:38 +0100 Subject: firmware: arm_ffa: Add v1.1 get_partition_info support FF-A v1.1 adds support to discover the UUIDs of the partitions, which was missing in v1.0 and which the driver works around by using UUIDs supplied by the ffa_drivers. Add the v1.1 get_partition_info support and disable the workaround if the detected FF-A version is greater than v1.0.
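With the v1.1 information available, the core driver can populate each device's UUID itself, along these lines (sketch; drv_info and pbuf mirror the driver's internal naming and are illustrative here):

	if (drv_info->version > FFA_VERSION_1_0)
		import_uuid(&ffa_dev->uuid, (u8 *)pbuf->uuid);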
Link: https://lore.kernel.org/r/20220907145240.1683088-9-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 4c4b06783035..09567ffd1f49 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -107,6 +107,7 @@ struct ffa_partition_info { /* partition can send and receive indirect messages. */ #define FFA_PARTITION_INDIRECT_MSG BIT(2) u32 properties; + u32 uuid[4]; }; /* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP} which pass data via registers */ -- cgit v1.2.3 From 106b11b1ccd5a43432d9517f4a26629a1658cfe6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:39 +0100 Subject: firmware: arm_ffa: Set up 32bit execution mode flag using partition property FF-A v1.1 adds a flag in the partition properties to indicate if the partition runs in the AArch32 or AArch64 execution state. Use the same to set up the 32-bit execution mode flag in the ffa_dev automatically if the detected firmware version is above v1.0 and ignore any requests to do the same from the ffa_driver. Link: https://lore.kernel.org/r/20220907145240.1683088-10-sudeep.holla@arm.com Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 09567ffd1f49..5964b6104996 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -106,6 +106,8 @@ struct ffa_partition_info { #define FFA_PARTITION_DIRECT_SEND BIT(1) /* partition can send and receive indirect messages. */ #define FFA_PARTITION_INDIRECT_MSG BIT(2) +/* partition runs in the AArch64 execution state. */ +#define FFA_PARTITION_AARCH64_EXEC BIT(8) u32 properties; u32 uuid[4]; }; -- cgit v1.2.3 From 5b0c6328e47dccf552996ca711005ca3f44034e9 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 7 Sep 2022 15:52:40 +0100 Subject: firmware: arm_ffa: Split up ffa_ops into info, message and memory operations In preparation to make memory operations accessible for a non ffa_driver/device, it is better to split the ffa_ops into different categories of operations: info, message and memory. The info and memory are ffa_device independent and can be used without any associated ffa_device from a non ffa_driver. However, we don't export these info and memory APIs yet, as there are no users; the first users of these APIs can export them.
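At a call site the extra level of indirection looks like this (illustrative sketch):

	const struct ffa_ops *ops = ffa_dev->ops;

	/* message ops still take the device */
	ret = ops->msg_ops->sync_send_receive(ffa_dev, &data);

	/* info and memory ops are ffa_device independent */
	ver = ops->info_ops->api_version_get();
	ret = ops->mem_ops->memory_share(&args);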
Link: https://lore.kernel.org/r/20220907145240.1683088-11-sudeep.holla@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 5964b6104996..5f02d2e6b9d9 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -257,16 +257,28 @@ struct ffa_mem_ops_args { struct ffa_mem_region_attributes *attrs; }; -struct ffa_ops { +struct ffa_info_ops { u32 (*api_version_get)(void); int (*partition_info_get)(const char *uuid_str, struct ffa_partition_info *buffer); +}; + +struct ffa_msg_ops { void (*mode_32bit_set)(struct ffa_device *dev); int (*sync_send_receive)(struct ffa_device *dev, struct ffa_send_direct_data *data); +}; + +struct ffa_mem_ops { int (*memory_reclaim)(u64 g_handle, u32 flags); int (*memory_share)(struct ffa_mem_ops_args *args); int (*memory_lend)(struct ffa_mem_ops_args *args); }; +struct ffa_ops { + const struct ffa_info_ops *info_ops; + const struct ffa_msg_ops *msg_ops; + const struct ffa_mem_ops *mem_ops; +}; + #endif /* _LINUX_ARM_FFA_H */ -- cgit v1.2.3 From 2f79cdfe58c13949bbbb65ba5926abfe9561d0ec Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 31 Aug 2022 09:46:12 -0700 Subject: fs: only do a memory barrier for the first set_buffer_uptodate() Commit d4252071b97d ("add barriers to buffer_uptodate and set_buffer_uptodate") added proper memory barriers to the buffer head BH_Uptodate bit, so that anybody who tests a buffer for being up-to-date will be guaranteed to actually see initialized state. However, that commit didn't _just_ add the memory barrier, it also ended up dropping the "was it already set" logic that the BUFFER_FNS() macro had. That's conceptually the right thing for a generic "this is a memory barrier" operation, but in the case of the buffer contents, we really only care about the memory barrier for the _first_ time we set the bit, in that the only memory ordering protection we need is to avoid anybody seeing uninitialized memory contents. Any other access ordering wouldn't be about the BH_Uptodate bit anyway, and would require some other proper lock (typically BH_Lock or the folio lock). A reader that races with somebody invalidating the buffer head isn't an issue wrt the memory ordering, it's a serialization issue. Now, you'd think that the buffer head operations don't matter in this day and age (and I certainly thought so), but apparently some loads still end up being heavy users of buffer heads. In particular, the kernel test robot reported that not having this bit access optimization in place caused a noticeable direct IO performance regression on ext4: fxmark.ssd_ext4_no_jnl_DWTL_54_directio.works/sec -26.5% regression although you presumably need a fast disk and a lot of cores to actually notice. 
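In sketch form, the ordering contract being preserved (read_from() stands in for any consumer):

	/* writer, after initializing bh->b_data */
	set_buffer_uptodate(bh);	/* barrier only on the 0->1 transition */

	/* reader */
	if (buffer_uptodate(bh))	/* acquire via smp_load_acquire */
		read_from(bh->b_data);	/* sees fully initialized contents */

Only the first transition to uptodate needs the barrier; once a reader can observe the bit set, the buffer contents were already published.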
Link: https://lore.kernel.org/all/Yw8L7HTZ%2FdE2%2Fo9C@xsang-OptiPlex-9020/ Reported-by: kernel test robot Tested-by: Fengwei Yin Cc: Mikulas Patocka Cc: Matthew Wilcox (Oracle) Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..df518c429667 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -137,6 +137,17 @@ BUFFER_FNS(Defer_Completion, defer_completion) static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { + /* + * If somebody else already set this uptodate, they will + * have done the memory barrier, and a reader will thus + * see *some* valid buffer state. + * + * Any other serialization (with IO errors or whatever that + * might clear the bit) has to come from other state (eg BH_Lock). + */ + if (test_bit(BH_Uptodate, &bh->b_state)) + return; + /* * make it consistent with folio_mark_uptodate * pairs with smp_load_acquire in buffer_uptodate -- cgit v1.2.3 From 86432b7f8f92b784c2e4af5b02766fb44052abf7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Sep 2022 16:05:18 +0300 Subject: spi: Group cs_change and cs_off flags together in struct spi_transfer The commit 5e0531f6b90a ("spi: Add capability to perform some transfer with chipselect off") added a new flag but squeezed it into a wrong group of struct spi_transfer members (note that SPI_NBITS_* are macros for easier interpretation of the tx_nbits and rx_nbits bitfields). Group cs_change and cs_off flags together and their doc strings. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220908130518.32186-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 6e6c62c59957..19acd4c9c7e3 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -846,8 +846,8 @@ struct spi_res { * @bits_per_word: select a bits_per_word other than the device default * for this transfer. If 0 the default (from @spi_device) is used. * @dummy_data: indicates transfer is dummy bytes transfer. - * @cs_change: affects chipselect after this transfer completes * @cs_off: performs the transfer with chipselect off. + * @cs_change: affects chipselect after this transfer completes * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message * @delay: delay to be introduced after this transfer before @@ -957,10 +957,10 @@ struct spi_transfer { struct sg_table rx_sg; unsigned dummy_data:1; + unsigned cs_off:1; unsigned cs_change:1; unsigned tx_nbits:3; unsigned rx_nbits:3; - unsigned cs_off:1; #define SPI_NBITS_SINGLE 0x01 /* 1bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2bits transfer */ #define SPI_NBITS_QUAD 0x04 /* 4bits transfer */ -- cgit v1.2.3 From 58ccf0190d19d9a8a41f8a02b9e06742b58df4a1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 8 Sep 2022 21:34:42 +0300 Subject: vfio: Add an IOVA bitmap support The new facility adds a bunch of wrappers that abstract how an IOVA range is represented in a bitmap that is granulated by a given page_size. 
It does all the lifting of translating user pointers into the corresponding kernel addresses backing said user memory, finally performing the (non-atomic) bitmap ops to change various bits. The formula for the bitmap is: data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) where 64 is the number of bits in an unsigned long (depending on arch). It introduces an IOVA iterator that uses a windowing scheme to minimize the pinning overhead, as opposed to pinning it on demand 4K at a time. Assuming a 4K kernel page and 4K requested page size, we can use a single kernel page to hold 512 page pointers, mapping 2M of bitmap, representing 64G of IOVA space. An example usage of these helpers for a given @base_iova, @page_size, @length and __user @data: bitmap = iova_bitmap_alloc(base_iova, length, page_size, data); if (IS_ERR(bitmap)) return -ENOMEM; ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); iova_bitmap_free(bitmap); Each iteration of the @dirty_reporter_fn is called with a unique @iova and @length argument, indicating the current range available through the iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty areas (@iova_length) within that provided range, as follows: iova_bitmap_set(bitmap, iova, iova_length); The facility is intended to be used for user bitmaps representing dirtied IOVAs by IOMMU (via IOMMUFD) and PCI Devices (via vfio-pci). Signed-off-by: Joao Martins Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/iova_bitmap.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/iova_bitmap.h (limited to 'include/linux') diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h new file mode 100644 index 000000000000..c006cf0a25f3 --- /dev/null +++ b/include/linux/iova_bitmap.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ +#ifndef _IOVA_BITMAP_H_ +#define _IOVA_BITMAP_H_ + +#include + +struct iova_bitmap; + +typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque); + +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, + u64 __user *data); +void iova_bitmap_free(struct iova_bitmap *bitmap); +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn); +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length); + +#endif -- cgit v1.2.3 From 80c4b92a2dc48cce82a0348add48533db7e07314 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:43 +0300 Subject: vfio: Introduce the DMA logging feature support Introduce the DMA logging feature support in the vfio core layer. It includes the processing of the device start/stop/report DMA logging UAPIs and calling the relevant driver 'op' to do the work. Specifically, upon start, the core translates the given input ranges into an interval tree, checks for unexpected overlapping or non-aligned ranges and then passes the translated input to the driver to start tracking the given ranges. Upon report, the core translates the given input user space bitmap and page size into an IOVA kernel bitmap iterator. Then it iterates it and calls the driver to set the corresponding bits for the dirtied pages in a specific IOVA range.
Upon stop, the driver is called to stop the previously started tracking. The next patches from the series will introduce the mlx5 driver implementation for the logging ops. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e05ddc6fe6a5..0e2826559091 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -14,6 +14,7 @@ #include #include #include +#include struct kvm; @@ -33,10 +34,11 @@ struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; /* - * mig_ops is a static property of the vfio_device which must be set - * prior to registering the vfio_device. + * mig_ops/log_ops is a static property of the vfio_device which must + * be set prior to registering the vfio_device. */ const struct vfio_migration_ops *mig_ops; + const struct vfio_log_ops *log_ops; struct vfio_group *group; struct vfio_device_set *dev_set; struct list_head dev_set_list; @@ -108,6 +110,28 @@ struct vfio_migration_ops { enum vfio_device_mig_state *curr_state); }; +/** + * @log_start: Optional callback to ask the device start DMA logging. + * @log_stop: Optional callback to ask the device stop DMA logging. + * @log_read_and_clear: Optional callback to ask the device read + * and clear the dirty DMAs in some given range. + * + * The vfio core implementation of the DEVICE_FEATURE_DMA_LOGGING_ set + * of features does not track logging state relative to the device, + * therefore the device implementation of vfio_log_ops must handle + * arbitrary user requests. This includes rejecting subsequent calls + * to log_start without an intervening log_stop, as well as graceful + * handling of log_stop and log_read_and_clear from invalid states. + */ +struct vfio_log_ops { + int (*log_start)(struct vfio_device *device, + struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); + int (*log_stop)(struct vfio_device *device); + int (*log_read_and_clear)(struct vfio_device *device, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty); +}; + /** * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl * @flags: Arg from the device_feature op -- cgit v1.2.3 From 1537bf26e212ffcf007d0590958025f6bfdd4ac8 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Tue, 30 Aug 2022 18:53:05 +0300 Subject: perf: RISC-V: exclude invalid pmu counters from SBI calls SBI firmware may not provide information for some counters in response to the SBI_EXT_PMU_COUNTER_GET_INFO call. Exclude such counters from the subsequent SBI requests. For this purpose, use a global mask to keep track of fully specified counters.
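With a mask instead of a plain count, callers can then skip the invalid indices, e.g. (sketch; rvpmu and pmu_sbi_update_counter() are illustrative):

	unsigned long cmask = rvpmu->cmask;
	int idx;

	/* only counters the SBI firmware fully described are visited */
	for_each_set_bit(idx, &cmask, RISCV_MAX_COUNTERS)
		pmu_sbi_update_counter(idx);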
Signed-off-by: Sergey Matyukevich Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20220830155306.301714-3-geomatsi@gmail.com Signed-off-by: Palmer Dabbelt --- include/linux/perf/riscv_pmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index bf66fe011fa8..e17e86ad6f3a 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -45,7 +45,7 @@ struct riscv_pmu { irqreturn_t (*handle_irq)(int irq_num, void *dev); - int num_counters; + unsigned long cmask; u64 (*ctr_read)(struct perf_event *event); int (*ctr_get_idx)(struct perf_event *event); int (*ctr_get_width)(int idx); -- cgit v1.2.3 From bb5721f063ba63cd54cc5916881a706c21e6abc6 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Mon, 5 Sep 2022 09:21:25 -0700 Subject: mfd: ocelot: Add helper to get regmap from a resource Several ocelot-related modules are designed for MMIO / regmaps. As such, they often use a combination of devm_platform_get_and_ioremap_resource() and devm_regmap_init_mmio(). Operating in an MFD might be different, in that it could be memory mapped, or it could be SPI, I2C... In these cases a fallback to use IORESOURCE_REG instead of IORESOURCE_MEM becomes necessary. When this happens, there's redundant logic that needs to be implemented in every driver. In order to avoid this redundancy, utilize a single function that, if the MFD scenario is enabled, will perform this fallback logic. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905162132.2943088-2-colin.foster@in-advantage.com --- include/linux/mfd/ocelot.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/linux/mfd/ocelot.h (limited to 'include/linux') diff --git a/include/linux/mfd/ocelot.h b/include/linux/mfd/ocelot.h new file mode 100644 index 000000000000..dd72073d2d4f --- /dev/null +++ b/include/linux/mfd/ocelot.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* Copyright 2022 Innovative Advantage Inc. */ + +#ifndef _LINUX_MFD_OCELOT_H +#define _LINUX_MFD_OCELOT_H + +#include +#include +#include +#include +#include +#include + +struct resource; + +static inline struct regmap * +ocelot_regmap_from_resource_optional(struct platform_device *pdev, + unsigned int index, + const struct regmap_config *config) +{ + struct device *dev = &pdev->dev; + struct resource *res; + void __iomem *regs; + + /* + * Don't use _get_and_ioremap_resource() here, since that will invoke + * prints of "invalid resource" which will simply add confusion. 
+ */ + res = platform_get_resource(pdev, IORESOURCE_MEM, index); + if (res) { + regs = devm_ioremap_resource(dev, res); + if (IS_ERR(regs)) + return ERR_CAST(regs); + return devm_regmap_init_mmio(dev, regs, config); + } + + /* + * Fall back to using REG and getting the resource from the parent + * device, which is possible in an MFD configuration + */ + if (dev->parent) { + res = platform_get_resource(pdev, IORESOURCE_REG, index); + if (!res) + return NULL; + + return dev_get_regmap(dev->parent, res->name); + } + + return NULL; +} + +static inline struct regmap * +ocelot_regmap_from_resource(struct platform_device *pdev, unsigned int index, + const struct regmap_config *config) +{ + struct regmap *map; + + map = ocelot_regmap_from_resource_optional(pdev, index, config); + return map ?: ERR_PTR(-ENOENT); +} + +#endif -- cgit v1.2.3 From 39f7d0832c28a6a31abb5b7363e7b1ba0714fe18 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Mon, 5 Sep 2022 09:21:30 -0700 Subject: resource: add define macro for register address resources DEFINE_RES_ macros have been created for the commonly used resource types, but not IORESOURCE_REG. Add the macro so it can be used in a similar manner to all other resource types. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905162132.2943088-7-colin.foster@in-advantage.com --- include/linux/ioport.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 616b683563a9..8a76dca9deee 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -172,6 +172,11 @@ enum { #define DEFINE_RES_MEM(_start, _size) \ DEFINE_RES_MEM_NAMED((_start), (_size), NULL) +#define DEFINE_RES_REG_NAMED(_start, _size, _name) \ + DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_REG) +#define DEFINE_RES_REG(_start, _size) \ + DEFINE_RES_REG_NAMED((_start), (_size), NULL) + #define DEFINE_RES_IRQ_NAMED(_irq, _name) \ DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ) #define DEFINE_RES_IRQ(_irq) \ -- cgit v1.2.3 From f2042ed21da7f8886c93efefff61f93e6d57e9bd Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Aug 2022 18:28:05 +0100 Subject: iommu/dma: Make header private Now that dma-iommu.h only contains internal interfaces, make it private to the IOMMU subsystem. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b237e06c56a101f77af142a54b629b27aa179d22.1660668998.git.robin.murphy@arm.com [ joro : re-add stub for iommu_dma_get_resv_regions ] Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 53 ----------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 include/linux/dma-iommu.h (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h deleted file mode 100644 index e83de4f1f3d6..000000000000 --- a/include/linux/dma-iommu.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2014-2015 ARM Ltd.
- */ -#ifndef __DMA_IOMMU_H -#define __DMA_IOMMU_H - -#include -#include - -#ifdef CONFIG_IOMMU_DMA -#include -#include -#include - -/* Domain management interface for IOMMU drivers */ -int iommu_get_dma_cookie(struct iommu_domain *domain); -void iommu_put_dma_cookie(struct iommu_domain *domain); - -int iommu_dma_init_fq(struct iommu_domain *domain); - -void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); - -void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain); - -extern bool iommu_dma_forcedac; - -#else /* CONFIG_IOMMU_DMA */ - -struct iommu_domain; -struct device; - -static inline int iommu_dma_init_fq(struct iommu_domain *domain) -{ - return -EINVAL; -} - -static inline int iommu_get_dma_cookie(struct iommu_domain *domain) -{ - return -ENODEV; -} - -static inline void iommu_put_dma_cookie(struct iommu_domain *domain) -{ -} - -static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) -{ -} - -#endif /* CONFIG_IOMMU_DMA */ -#endif /* __DMA_IOMMU_H */ -- cgit v1.2.3 From c9874d3ffeaf8ee215187692ed918b3031d996d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Aug 2018 11:47:53 -0400 Subject: termios: start unifying non-UAPI parts of asm/termios.h * new header (linux/termios_internal.h), pulled by the users of those suckers * defaults for INIT_C_CC and externs for conversion helpers moved over there * remove termios-base.h (empty now) Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDmptU7dNGZ+/Hn@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/linux/termios_internal.h (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h new file mode 100644 index 000000000000..103ca0370948 --- /dev/null +++ b/include/linux/termios_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TERMIOS_CONV_H +#define _LINUX_TERMIOS_CONV_H + +#include +#include + +#ifndef INIT_C_CC /* intr=^C quit=^\ erase=del kill=^U + eof=^D vtime=\0 vmin=\1 sxtc=\0 + start=^Q stop=^S susp=^Z eol=\0 + reprint=^R discard=^U werase=^W lnext=^V + eol2=\0 */ +#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" +#endif + int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); #ifdef TCGETS2 int user_termios_to_kernel_termios(struct ktermios *, struct termios2 __user *); int kernel_termios_to_user_termios(struct termios2 __user *, struct ktermios *); int user_termios_to_kernel_termios_1(struct ktermios *, struct termios __user *); int kernel_termios_to_user_termios_1(struct termios __user *, struct ktermios *); #else /* TCGETS2 */ int user_termios_to_kernel_termios(struct ktermios *, struct termios __user *); int kernel_termios_to_user_termios(struct termios __user *, struct ktermios *); #endif /* TCGETS2 */ #endif /* _LINUX_TERMIOS_CONV_H */ -- cgit v1.2.3 From 38fc315a73f7a1b2d19eb32dd55de089b78b2a54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 17:17:04 -0400 Subject: termios: consolidate values for VDISCARD in INIT_C_CC On old systems it used to be ^O. Linux had never actually used the value, but INIT_C_CC (on i386) did initialize it to ^O; unfortunately, it had a typo in the comment claiming that to be ^U.
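For readers unfamiliar with the notation: ^X is Ctrl-X, i.e. the character value minus 0x40. The two competing values are, in plain C (illustrative arithmetic only, not from the patch):

char discard = 'O' - 0x40;	/* 0x0f, octal \017 -- the historical VDISCARD */
char kill    = 'U' - 0x40;	/* 0x15, octal \025 -- actually VKILL's value */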
Most of the architectures copied the (correct) definition along with the mistaken comment. alpha, powerpc and sparc tried to make the definition match the comment. However, util-linux still resets it to ^O on any architecture; ^O is the historical value; the kernel ignores it anyway; and finally, Linus said "Just change everybody to do the same, nobody cares about VDISCARD". Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDmy//MKzs3ye7l@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 103ca0370948..7eb3598c109d 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -9,7 +9,7 @@ /* intr=^C quit=^\ erase=del kill=^U eof=^D vtime=\0 vmin=\1 sxtc=\0 start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V + reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ #define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" -- cgit v1.2.3 From d04f9915fa44b52d7a91080677381a082238e9c4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Sep 2018 17:57:42 -0400 Subject: make generic INIT_C_CC a bit more generic turn it into an array initializer; then alpha, mips and powerpc variants fold into it. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDm7M6M91gC2RPL@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 7eb3598c109d..8a53141ab44a 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -12,7 +12,20 @@ reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" +#define INIT_C_CC { \ + [VINTR] = 'C'-0x40, \ + [VQUIT] = '\\'-0x40, \ + [VERASE] = '\177', \ + [VKILL] = 'U'-0x40, \ + [VEOF] = 'D'-0x40, \ + [VSTART] = 'Q'-0x40, \ + [VSTOP] = 'S'-0x40, \ + [VSUSP] = 'Z'-0x40, \ + [VREPRINT] = 'R'-0x40, \ + [VDISCARD] = 'O'-0x40, \ + [VWERASE] = 'W'-0x40, \ + [VLNEXT] = 'V'-0x40, \ + [VMIN] = 1 } #endif int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); -- cgit v1.2.3 From e7b4c812b9685e22753d6355e53fdeaaa22862dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 19:41:40 -0400 Subject: termios: convert the last (sparc) INIT_C_CC to array Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDnDCR2VRTA3Etp@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/termios_internal.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/termios_internal.h b/include/linux/termios_internal.h index 8a53141ab44a..d77f29e5e2b7 100644 --- a/include/linux/termios_internal.h +++ b/include/linux/termios_internal.h @@ -5,13 +5,19 @@ #include #include -#ifndef INIT_C_CC /* intr=^C quit=^\ erase=del kill=^U eof=^D vtime=\0 vmin=\1 sxtc=\0 start=^Q stop=^S susp=^Z eol=\0 reprint=^R discard=^O werase=^W lnext=^V eol2=\0 */ + +#ifdef VDSUSP +#define INIT_C_CC_VDSUSP_EXTRA [VDSUSP] = 'Y'-0x40, +#else +#define INIT_C_CC_VDSUSP_EXTRA +#endif + #define INIT_C_CC { \ [VINTR] = 'C'-0x40, \ [VQUIT] = '\\'-0x40, \ @@ -25,8 +31,8 @@ [VDISCARD] = 'O'-0x40, \ [VWERASE] = 'W'-0x40, \ [VLNEXT] = 'V'-0x40, \ + INIT_C_CC_VDSUSP_EXTRA \ [VMIN] = 1 } -#endif
int user_termio_to_kernel_termios(struct ktermios *, struct termio __user *); int kernel_termios_to_user_termio(struct termio __user *, struct ktermios *); -- cgit v1.2.3 From 89bbeb7e3199e1514729aa6de8057289e6375fe6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Aug 2022 19:41:40 -0400 Subject: termios: get rid of non-UAPI asm/termios.h All non-UAPI asm/termios.h headers consist of an include of the UAPI counterpart and, possibly, an include of linux/uaccess.h. The latter can't be simply removed, even though nothing in linux/termios.h depends upon it anymore - there are several places that rely upon that indirect chain of includes to pull linux/uaccess.h. So the include needs to be lifted out of there - we lift it into tty_driver.h, serdev.h and places that pull asm/termios.h, but none of * linux/uaccess.h (obvious) * net/sock.h (pulls uaccess.h) * linux/{tty,tty_driver,serdev}.h (tty.h pulls tty_driver.h) That leaves us just with the include of UAPI asm/termios.h, which is what asm/termios.h will resolve to if we simply remove the non-UAPI header. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/YxDnKvYCHn/ogBUv@ZenIV Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 1 + include/linux/tty_driver.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 3368c261ab62..66f624fc618c 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b2456b545ba0..f961164a5274 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From d79ddb069c5257a924456eb99b53fc1ea715c0a3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:05 +0800 Subject: sched/psi: Move private helpers to sched/stats.h This patch moves the psi_task_change/psi_task_switch declarations out of the PSI public header, since they are only needed for implementing the PSI stats tracking in sched/stats.h. psi_task_switch is obvious; psi_task_change can't be a public helper since it doesn't check the psi_disabled static key. And there are no users now, so put it in sched/stats.h too. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-5-zhouchengming@bytedance.com --- include/linux/psi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index dd74411ac21d..fffd229fbf19 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -18,10 +18,6 @@ extern struct psi_group psi_system; void psi_init(void); -void psi_task_change(struct task_struct *task, int clear, int set); -void psi_task_switch(struct task_struct *prev, struct task_struct *next, - bool sleep); - void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); -- cgit v1.2.3 From 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Aug 2022 00:41:07 +0800 Subject: sched/psi: Remove NR_ONCPU task accounting We put all fields updated by the scheduler in the first cacheline of struct psi_group_cpu for performance. Since we want to add another PSI_IRQ_FULL to track IRQ/SOFTIRQ pressure, we need to reclaim space first.
This patch removes NR_ONCPU task accounting in struct psi_group_cpu and uses one bit in state_mask to track it instead. Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com --- include/linux/psi_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index c7fe7c089718..54cb74946db4 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -15,13 +15,6 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - /* - * This can't have values other than 0 or 1 and could be - * implemented as a bit flag. But for now we still have room - * in the first cacheline of psi_group_cpu, and this way we - * don't have to special case any state tracking for it. - */ - NR_ONCPU, /* * For IO and CPU stalls the presence of running/oncpu tasks * in the domain means a partial rather than a full stall. @@ -32,16 +25,18 @@ enum psi_task_count { * threads and memstall ones. */ NR_MEMSTALL_RUNNING, - NR_PSI_TASK_COUNTS = 5, + NR_PSI_TASK_COUNTS = 4, }; /* Task state bitmasks */ #define TSK_IOWAIT (1 << NR_IOWAIT) #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) -#define TSK_ONCPU (1 << NR_ONCPU) #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) +/* Only one task can be scheduled, no corresponding task count */ +#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS) + /* Resources that workloads could be stalled on */ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, @@ -68,6 +63,9 @@ enum psi_states { NR_PSI_STATES = 7, }; +/* Use one bit in the state mask to track TSK_ONCPU */ +#define PSI_ONCPU (1 << NR_PSI_STATES) + enum psi_aggregators { PSI_AVGS = 0, PSI_POLL, -- cgit v1.2.3 From 52b1364ba0b105122d6de0e719b36db705011ac1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:08 +0800 Subject: sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure PSI already tracks workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have an obvious impact on some workload productivity, such as web service workloads. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, we can get the IRQ/SOFTIRQ delta time from update_rq_clock_task() and record that delta to the CPU's current task's cgroups as PSI_IRQ_FULL status. Note we don't use PSI_IRQ_SOME, since IRQ/SOFTIRQ time always happens in the current task on the CPU, so nothing productive can run even if it were runnable; hence we only use PSI_IRQ_FULL.
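The wiring implied by the description looks roughly like this (a sketch only; the psi_account_irqtime() name and its call site are inferred from the text, not shown in this hunk):

/* In update_rq_clock_task(), once the IRQ/SOFTIRQ delta is known: */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	if (irq_delta + steal)
		psi_account_irqtime(rq->curr, irq_delta + steal);
#endif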
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com --- include/linux/psi_types.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 54cb74946db4..40c28171cd91 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -42,7 +42,10 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES = 3, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ, +#endif + NR_PSI_RESOURCES, }; /* @@ -58,9 +61,12 @@ enum psi_states { PSI_MEM_FULL, PSI_CPU_SOME, PSI_CPU_FULL, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ_FULL, +#endif /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 7, + NR_PSI_STATES, }; /* Use one bit in the state mask to track TSK_ONCPU */ -- cgit v1.2.3 From 57899a6610e67ba26fa3251ebbef4a5ed21efc5d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:09 +0800 Subject: sched/psi: Consolidate cgroup_psi() cgroup_psi() can't return the psi_group for the root cgroup, so we have many open-coded instances of "psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi". This patch moves the cgroup_psi() definition to linux/psi.h, where we can return psi_system for the root cgroup, so it can handle all cgroups. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-9-zhouchengming@bytedance.com --- include/linux/cgroup.h | 5 ----- include/linux/psi.h | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b0914aa26506..80cb970257be 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -673,11 +673,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return cgrp->psi; -} - bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) diff --git a/include/linux/psi.h b/include/linux/psi.h index fffd229fbf19..362a74ca1d3b 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -7,6 +7,7 @@ #include #include #include +#include struct seq_file; struct css_set; @@ -30,6 +31,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); #ifdef CONFIG_CGROUPS +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; +} + int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); -- cgit v1.2.3 From dc86aba751e2867244411adda1562f6664747019 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:10 +0800 Subject: sched/psi: Cache parent psi_group to speed up group iteration We use iterate_groups() to iterate over each level's psi_group to update PSI stats, which is a very hot path. In the current code, iterate_groups() has to use multiple branches and cgroup_parent() to get the parent psi_group for each level, which is not very efficient. This patch caches the parent psi_group in struct psi_group; we only need to get the psi_group of the task itself first, then just use group->parent to iterate.
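With the cached pointer, the hot-path walk collapses to a plain parent-chain traversal, along these lines (sketch; task_psi_group() is an assumed accessor for the task's leaf group):

	struct psi_group *group;

	for (group = task_psi_group(task); group; group = group->parent)
		psi_group_change(group, cpu, clear, set, now, true);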
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-10-zhouchengming@bytedance.com --- include/linux/psi_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 40c28171cd91..a0b746258c68 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -151,6 +151,8 @@ struct psi_trigger { }; struct psi_group { + struct psi_group *parent; + /* Protects data used by the aggregator */ struct mutex avgs_lock; -- cgit v1.2.3 From 34f26a15611afb03c33df6819359d36f5b382589 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Sep 2022 17:03:32 +0800 Subject: sched/psi: Per-cgroup PSI accounting disable/re-enable interface PSI accounts stalls for each cgroup separately and aggregates them at each level of the hierarchy. This may cause non-negligible overhead for some workloads at deep levels of the hierarchy. Commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") made PSI skip per-cgroup stall accounting and only account system-wide, to avoid this per-level overhead. But for our use case we also want leaf-cgroup PSI stats accounted, for userspace adjustment on that cgroup, in addition to the system-wide adjustment. So this patch introduces a per-cgroup PSI accounting disable/re-enable interface, "cgroup.pressure": a read-write single-value file whose allowed values are "0" and "1"; the default is "1", so per-cgroup PSI stats are enabled by default. Implementation details: It should be relatively straightforward to disable and re-enable state aggregation, time tracking, and averaging on a per-cgroup level, if we can live with losing history from the time it was disabled; i.e. the avgs will restart from 0 and total= will have gaps. But it's hard or complex to stop/restart groupc->tasks[] updates, which is not implemented in this patch. So we always update groupc->tasks[] and the PSI_ONCPU bit in psi_group_change() even when the cgroup's PSI stats are disabled. Suggested-by: Johannes Weiner Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com --- include/linux/cgroup-defs.h | 3 +++ include/linux/psi.h | 2 ++ include/linux/psi_types.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..7df76b318245 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -428,6 +428,9 @@ struct cgroup { struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ + /* handles for "{cpu,memory,io,irq}.pressure" */ + struct cgroup_file psi_files[NR_PSI_RESOURCES]; + /* * The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through diff --git a/include/linux/psi.h b/include/linux/psi.h index 362a74ca1d3b..b029a847def1 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); +void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ @@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } +static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index a0b746258c68..6e4372735068 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -152,6 +152,7 @@ struct psi_trigger { struct psi_group { struct psi_group *parent; + bool enabled; /* Protects data used by the aggregator */ struct mutex avgs_lock; @@ -194,6 +195,8 @@ struct psi_group { #else /* CONFIG_PSI */ +#define NR_PSI_RESOURCES 0 + struct psi_group { }; #endif /* CONFIG_PSI */ -- cgit v1.2.3 From b9be19eef35587b15da0a5d887d6e04c0c6cefab Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 6 Sep 2022 16:35:42 -0400 Subject: drivers/base: Fix unsigned comparison to -1 in CPUMAP_FILE_MAX_BYTES As PAGE_SIZE is unsigned long, -1 > PAGE_SIZE when NR_CPUS <= 3. This leads to very large file sizes: topology$ ls -l total 0 -r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 core_cpus -r--r--r-- 1 root root 4096 Sep 5 11:59 core_cpus_list -r--r--r-- 1 root root 4096 Sep 5 10:58 core_id -r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 core_siblings -r--r--r-- 1 root root 4096 Sep 5 11:59 core_siblings_list -r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 die_cpus -r--r--r-- 1 root root 4096 Sep 5 11:59 die_cpus_list -r--r--r-- 1 root root 4096 Sep 5 11:59 die_id -r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 package_cpus -r--r--r-- 1 root root 4096 Sep 5 11:59 package_cpus_list -r--r--r-- 1 root root 4096 Sep 5 10:58 physical_package_id -r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 thread_siblings -r--r--r-- 1 root root 4096 Sep 5 11:59 thread_siblings_list Adjust the inequality to catch the case when NR_CPUS is configured to a small value. Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Yury Norov Cc: stable@vger.kernel.org Cc: feng xiangjun Fixes: 7ee951acd31a ("drivers/base: fix userspace break from using bin_attributes for cpumap and cpulist") Reported-by: feng xiangjun Signed-off-by: Phil Auld Signed-off-by: Yury Norov --- include/linux/cpumask.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bd047864c7ac..e8ad12b5b9d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1127,9 +1127,10 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * - * Use PAGE_SIZE as a minimum for smaller configurations. + * Use PAGE_SIZE as a minimum for smaller configurations while avoiding + * unsigned comparison to -1. */ -#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \ +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? 
(NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) -- cgit v1.2.3 From 811d59fdf56a17c66742578fe8be8a6a841af448 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 29 Aug 2022 11:29:49 -0500 Subject: ACPI: s2idle: Add a new ->check() callback for platform_s2idle_ops On some platforms it is found that Linux more aggressively enters s2idle than Windows enters Modern Standby and this uncovers some synchronization issues for the platform. To aid in debugging this class of problems in the future, add support for an extra optional callback intended for drivers to emit extra debugging. Signed-off-by: Mario Limonciello Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220829162953.5947-2-mario.limonciello@amd.com Signed-off-by: Hans de Goede --- include/linux/acpi.h | 1 + include/linux/suspend.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..acaa2ddc067d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1075,6 +1075,7 @@ acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); + void (*check)(void); void (*restore)(void); }; int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 70f2921e2e70..03ed42ed2c7f 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -191,6 +191,7 @@ struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); int (*prepare_late)(void); + void (*check)(void); bool (*wake)(void); void (*restore_early)(void); void (*restore)(void); -- cgit v1.2.3 From 6bb057bfd9d509755349cd2a6ca5e5e6e6071304 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 16 Aug 2022 13:16:26 +0300 Subject: ACPI: resource: Add helper function acpi_dev_get_memory_resources() Wrapper function that finds all memory type resources by using acpi_dev_get_resources(). It removes the need for the drivers to check the resource data type separately. Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..ed4aa395cc49 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -506,6 +506,7 @@ int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, void *preproc_data); int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list); +int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); -- cgit v1.2.3 From d4f7bdb2ed7bf320a8772258fdc257655d225afb Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:37 -0600 Subject: bpf: Add stub for btf_struct_access() Add corresponding unimplemented stub for when CONFIG_BPF_SYSCALL=n Signed-off-by: Daniel Xu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/4021398e884433b1fef57a4d28361bb9fcf1bd05.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48ae05099f36..54178b9e9c3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2211,6 +2211,15 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) return ERR_PTR(-ENOTSUPP); } +static inline int btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) +{ + return -EACCES; +} + static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { -- cgit v1.2.3 From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 8 Sep 2022 16:07:16 -0700 Subject: bpf: Add verifier support for custom callback return range Verifier logic to confirm that a callback function returns 0 or 1 was added in commit 69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper"). At the time, callback return value was only used to continue or stop iteration. In order to support callbacks with a broader return value range, such as those added in rbtree series[0] and others, add a callback_ret_range to bpf_func_state. Verifier's helpers which set in_callback_fn will also set the new field, which the verifier will later use to check return value bounds. Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel value as the latter would prevent the valid range (0, U64_MAX) being used. Previous global default tnum_range(0, 1) is explicitly set for extant callback helpers. The change to global default was made after discussion around this patch in rbtree series [1], goal here is to make it more obvious that callback_ret_range should be explicitly set. 
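Concretely, a helper that installs a callback now records both facts, e.g. (sketch of a set_callee_state routine; the field names are the ones added by this patch):

	callee->in_callback_fn = true;
	callee->callback_ret_range = tnum_range(0, 1);	/* extant 0/1 helpers */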
[0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/ [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b49a349cc6ae..e197f8fb27e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,7 @@ struct bpf_func_state { */ u32 async_entry_cnt; bool in_callback_fn; + struct tnum callback_ret_range; bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ -- cgit v1.2.3 From 9cd4f1434479f1ac25c440c421fbf52069079914 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 11 Sep 2022 11:18:45 +0800 Subject: iommu/vt-d: Fix possible recursive locking in intel_iommu_init() The global rwsem dmar_global_lock was introduced by commit 3a5670e8ac932 ("iommu/vt-d: Introduce a rwsem to protect global data structures"). It is used to protect DMAR related global data from DMAR hotplug operations. The dmar_global_lock used in the intel_iommu_init() might cause recursive locking issue, for example, intel_iommu_get_resv_regions() is taking the dmar_global_lock from within a section where intel_iommu_init() already holds it via probe_acpi_namespace_devices(). Using dmar_global_lock in intel_iommu_init() could be relaxed since it is unlikely that any IO board must be hot added before the IOMMU subsystem is initialized. This eliminates the possible recursive locking issue by moving down DMAR hotplug support after the IOMMU is initialized and removing the uses of dmar_global_lock in intel_iommu_init(). Fixes: d5692d4af08cd ("iommu/vt-d: Fix suspicious RCU usage in probe_acpi_namespace_devices()") Reported-by: Robin Murphy Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/894db0ccae854b35c73814485569b634237b5538.1657034828.git.robin.murphy@arm.com Link: https://lore.kernel.org/r/20220718235325.3952426-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index d81a51978d01..8917a32173c4 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -65,6 +65,7 @@ struct dmar_pci_notify_info { extern struct rw_semaphore dmar_global_lock; extern struct list_head dmar_drhd_units; +extern int intel_iommu_enabled; #define for_each_drhd_unit(drhd) \ list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ @@ -88,7 +89,8 @@ extern struct list_head dmar_drhd_units; static inline bool dmar_rcu_check(void) { return rwsem_is_locked(&dmar_global_lock) || - system_state == SYSTEM_BOOTING; + system_state == SYSTEM_BOOTING || + (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); } #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -- cgit v1.2.3 From a1c7c1a40478db4edef8ad0a0c6975c8921088b7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 19 Jul 2022 13:41:31 +0200 Subject: power: supply: Explain maintenance charging In order for everyone to understand clearly why we want to use maintenance charging for batteries, expand the description with two diagrams and some text. 
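As a worked example, the EB425161LA settings quoted in the new kernel-doc could be expressed as a table roughly like this (illustrative; the field names other than charge_current_max_ua are assumed from context rather than shown in this hunk):

static const struct power_supply_maintenance_charge_table eb425161la_maint[] = {
	{
		/* up to 60 hours at 600 mA / 4.15 V */
		.charge_current_max_ua = 600000,
		.charge_voltage_max_uv = 4150000,
		.charge_safety_timer_minutes = 60 * 60,
	},
	{
		/* up to 200 hours at 600 mA / 4.10 V */
		.charge_current_max_ua = 600000,
		.charge_voltage_max_uv = 4100000,
		.charge_safety_timer_minutes = 200 * 60,
	},
};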
Signed-off-by: Linus Walleij Reviewed-by: Matti Vaittinen Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 48 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index cb380c1d9459..aa2c4a7c4826 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -374,9 +374,37 @@ struct power_supply_vbat_ri_table { * These timers should be chosen to align with the typical discharge curve * for the battery. * - * When the main CC/CV charging is complete the battery can optionally be - * maintenance charged at the voltages from this table: a table of settings is - * traversed using a slightly lower current and voltage than what is used for + * Ordinary CC/CV charging will stop charging when the charge current goes + * below charge_term_current_ua, and then restart it (if the device is still + * plugged into the charger) at charge_restart_voltage_uv. This happens in most + * consumer products because the power usage while connected to a charger is + * not zero, and devices are not manufactured to draw power directly from the + * charger: instead they will at all times dissipate the battery a little, like + * the power used in standby mode. This will over time give a charge graph + * such as this: + * + * Energy + * ^ ... ... ... ... ... ... ... + * | . . . . . . . . . . . . . + * | .. . .. . .. . .. . .. . .. . .. + * |. .. .. .. .. .. .. + * +-------------------------------------------------------------------> t + * + * Practically this means that the Li-ions are wandering back and forth in the + * battery and this causes degeneration of the battery anode and cathode. + * To prolong the life of the battery, maintenance charging is applied after + * reaching charge_term_current_ua to hold up the charge in the battery while + * consuming power, thus lowering the wear on the battery: + * + * Energy + * ^ ....................................... + * | . ...................... + * | .. + * |. + * +-------------------------------------------------------------------> t + * + * Maintenance charging uses the voltages from this table: a table of settings + * is traversed using a slightly lower current and voltage than what is used for * CC/CV charging. The maintenance charging will for safety reasons not go on * indefinately: we lower the current and voltage with successive maintenance * settings, then disable charging completely after we reach the last one, @@ -385,14 +413,22 @@ struct power_supply_vbat_ri_table { * ordinary CC/CV charging from there. * * As an example, a Samsung EB425161LA Lithium-Ion battery is CC/CV charged - * at 900mA to 4340mV, then maintenance charged at 600mA and 4150mV for - * 60 hours, then maintenance charged at 600mA and 4100mV for 200 hours. + * at 900mA to 4340mV, then maintenance charged at 600mA and 4150mV for up to + * 60 hours, then maintenance charged at 600mA and 4100mV for up to 200 hours. * After this the charge cycle is restarted waiting for * charge_restart_voltage_uv. * * For most mobile electronics this type of maintenance charging is enough for * the user to disconnect the device and make use of it before both maintenance - * charging cycles are complete. + * charging cycles are complete, if the current and voltage has been chosen + * appropriately. These need to be determined from battery discharge curves + * and expected standby current. 
+ * + * If the voltage anyway drops to charge_restart_voltage_uv during maintenance + * charging, ordinary CC/CV charging is restarted. This can happen if the + * device is e.g. actively used during charging, so more current is drawn than + * the expected stand-by current. Also overvoltage protection will be applied + * as usual. */ struct power_supply_maintenance_charge_table { int charge_current_max_ua; -- cgit v1.2.3 From 65d3440e8d2e9e024ef2357f8ff021611b068c99 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 26 Aug 2022 10:18:07 -0700 Subject: mm/memory-failure: fix detection of memory_failure() handlers Some pagemap types, like MEMORY_DEVICE_GENERIC (device-dax) do not even have pagemap ops which results in crash signatures like this: BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000205073067 P4D 8000000205073067 PUD 2062b3067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 22 PID: 4535 Comm: device-dax Tainted: G OE N 6.0.0-rc2+ #59 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:memory_failure+0x667/0xba0 [..] Call Trace: ? _printk+0x58/0x73 do_madvise.part.0.cold+0xaf/0xc5 Check for ops before checking if the ops have a memory_failure() handler. Link: https://lkml.kernel.org/r/166153428781.2758201.1990616683438224741.stgit@dwillia2-xfh.jf.intel.com Fixes: 33a8f7f2b3a3 ("pagemap,pmem: introduce ->memory_failure()") Signed-off-by: Dan Williams Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Shiyang Ruan Cc: Darrick J. Wong Cc: Al Viro Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/memremap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 19010491a603..c3b4cc84877b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -139,6 +139,11 @@ struct dev_pagemap { }; }; +static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) +{ + return pgmap->ops && pgmap->ops->memory_failure; +} + static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) -- cgit v1.2.3 From 825cf206ed510c4a1758bef8957e2b039253e2e3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:44 -0700 Subject: statx: add direct I/O alignment information Traditionally, the conditions for when DIO (direct I/O) is supported were fairly simple. For both block devices and regular files, DIO had to be aligned to the logical block size of the block device. However, due to filesystem features that have been added over time (e.g. multi-device support, data journalling, inline data, encryption, verity, compression, checkpoint disabling, log-structured mode), the conditions for when DIO is allowed on a regular file have gotten increasingly complex. Whether a particular regular file supports DIO, and with what alignment, can depend on various file attributes and filesystem mount options, as well as which block device(s) the file's data is located on. Moreover, the general rule of DIO needing to be aligned to the block device's logical block size was recently relaxed to allow user buffers (but not file offsets) aligned to the DMA alignment instead. See commit bf8d08532bc1 ("iomap: add support for dma aligned direct-io"). 
XFS has an ioctl XFS_IOC_DIOINFO that exposes DIO alignment information. Uplifting this to the VFS is one possibility. However, as discussed (https://lore.kernel.org/linux-fsdevel/20220120071215.123274-1-ebiggers@kernel.org/T/#u), this ioctl is rarely used and not known to be used outside of XFS-specific code. It was also never intended to indicate when a file doesn't support DIO at all, nor was it intended for block devices. Therefore, let's expose this information via statx(). Add the STATX_DIOALIGN flag and two new statx fields associated with it: * stx_dio_mem_align: the alignment (in bytes) required for user memory buffers for DIO, or 0 if DIO is not supported on the file. * stx_dio_offset_align: the alignment (in bytes) required for file offsets and I/O segment lengths for DIO, or 0 if DIO is not supported on the file. This will only be nonzero if stx_dio_mem_align is nonzero, and vice versa. Note that as with other statx() extensions, if STATX_DIOALIGN isn't set in the returned statx struct, then these new fields won't be filled in. This will happen if the file is neither a regular file nor a block device, or if the file is a regular file and the filesystem doesn't support STATX_DIOALIGN. It might also happen if the caller didn't include STATX_DIOALIGN in the request mask, since statx() isn't required to return unrequested information. This commit only adds the VFS-level plumbing for STATX_DIOALIGN. For regular files, individual filesystems will still need to add code to support it. For block devices, a separate commit will wire it up too. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-2-ebiggers@kernel.org --- include/linux/stat.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stat.h b/include/linux/stat.h index 7df06931f25d..ff277ced50e9 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,6 +50,8 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u32 dio_mem_align; + u32 dio_offset_align; }; #endif -- cgit v1.2.3 From 2d985f8c6b91b5007a16e640bb9c038c5fb2839b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:45 -0700 Subject: vfs: support STATX_DIOALIGN on block devices Add support for STATX_DIOALIGN to block devices, so that direct I/O alignment restrictions are exposed to userspace in a generic way. Note that this breaks the tradition of stat operating only on the block device node, not the block device itself. However, it was felt that doing this is preferable, in order to make the interface useful and avoid needing separate interfaces for regular files and block devices. 
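From userspace the new fields are consumed like any other statx() extension; a minimal query looks like this (assuming kernel/libc headers new enough to declare statx(2), STATX_DIOALIGN and the stx_dio_* fields):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0)
		return 1;
	if (!(stx.stx_mask & STATX_DIOALIGN) || stx.stx_dio_mem_align == 0)
		printf("DIO not supported (or not reported) for %s\n", argv[1]);
	else
		printf("mem align %u, offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}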
Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220827065851.135710-3-ebiggers@kernel.org --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..8038c5fbde40 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); +void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) @@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } +static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +{ +} static inline void printk_all_partitions(void) { } -- cgit v1.2.3 From 53dd3f802a6e269868cb599609287a841e65a996 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 26 Aug 2022 23:58:46 -0700 Subject: fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN To prepare for STATX_DIOALIGN support, make two changes to fscrypt_dio_supported(). First, remove the filesystem-block-alignment check and make the filesystems handle it instead. It previously made sense to have it in fs/crypto/; however, to support STATX_DIOALIGN the alignment restriction would have to be returned to filesystems. It ends up being simpler if filesystems handle this part themselves, especially for f2fs which only allows fs-block-aligned DIO in the first place. Second, make fscrypt_dio_supported() work on inodes whose encryption key hasn't been set up yet, by making it set up the key if needed. This is required for statx(), since statx() doesn't require a file descriptor. Reviewed-by: Christoph Hellwig Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-4-ebiggers@kernel.org --- include/linux/fscrypt.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 7d2f1e0f23b1..13598859d5b3 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -768,7 +768,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter); +bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); @@ -801,11 +801,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, return true; } -static inline bool fscrypt_dio_supported(struct kiocb *iocb, - struct iov_iter *iter) +static inline bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - return !fscrypt_needs_contents_encryption(inode); } -- cgit v1.2.3 From a7f4e6e4c47c41869fe5bea17e013b5557c57ed3 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:25 -0700 Subject: mm/thp: add flag to enforce sysfs THP in hugepage_vma_check() MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1]. 
hugepage_vma_check() is the authority on determining if a VMA is eligible for THP allocation/collapse, and currently enforces the sysfs THP settings. Add a flag to disable these checks. For now, only apply this arg to anon and file, which use /sys/kernel/transparent_hugepage/enabled. We can expand this to shmem, which uses /sys/kernel/transparent_hugepage/shmem_enabled, later. Use this flag in collapse_pte_mapped_thp() where previously the VMA flags passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check THP flag in hugepage_vma_check()", this check also didn't check "never" THP mode. As such, this restores the previous behavior of collapse_pte_mapped_thp() where sysfs THP settings are ignored. See comment in code for justification why this is OK. [1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220706235936.2197195-8-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 768e5261fdae..b0e80cc72b0c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf); +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -321,8 +320,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, } static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs) { return false; } -- cgit v1.2.3 From 7d8faaf155454f8798ec56404faca29a82689c77 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:27 -0700 Subject: mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse This idea was introduced by David Rientjes[1]. Introduce a new madvise mode, MADV_COLLAPSE, that allows users to request a synchronous collapse of memory at their own expense. The benefits of this approach are: * CPU is charged to the process that wants to spend the cycles for the THP * Avoid unpredictable timing of khugepaged collapse Semantics This call is independent of the system-wide THP sysfs settings, but will fail for memory marked VM_NOHUGEPAGE. If the ranges provided span multiple VMAs, the semantics of the collapse over each VMA is independent from the others. This implies a hugepage cannot cross a VMA boundary. 
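A minimal invocation from userspace looks like this (sketch; the fallback define uses MADV_COLLAPSE's uapi value for libcs that don't carry it yet):

#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* uapi asm-generic/mman-common.h */
#endif

/* Ask for [addr, addr + len) to be backed by THPs right now. */
int collapse_range(void *addr, size_t len)
{
	return madvise(addr, len, MADV_COLLAPSE);	/* 0 on success */
}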
If collapse of a given hugepage-aligned/sized region fails, the operation may continue to attempt collapsing the remainder of the memory specified. The memory ranges provided must be page-aligned, but are not required to be hugepage-aligned. If the memory ranges are not hugepage-aligned, the start/end of the range will be clamped to the first/last hugepage-aligned address covered by said range. The memory ranges must span at least one hugepage-sized region. All non-resident pages covered by the range will first be swapped/faulted-in, before being internally copied onto a freshly allocated hugepage. Unmapped pages will have their data directly initialized to 0 in the new hugepage. However, for every eligible hugepage-aligned/sized region to be collapsed, at least one page must currently be backed by memory (a PMD covering the address range must already exist). Allocation for the new hugepage may enter direct reclaim and/or compaction, regardless of VMA flags. When the system has multiple NUMA nodes, the hugepage will be allocated from the node providing the most native pages. This operation operates on the current state of the specified process and makes no persistent changes or guarantees on how pages will be mapped, constructed, or faulted in the future. Return Value: If all hugepage-sized/aligned regions covered by the provided range were either successfully collapsed or were already PMD-mapped THPs, the operation is deemed successful. On success, process_madvise(2) returns the number of bytes advised, and madvise(2) returns 0. Else, -1 is returned and errno is set to indicate the error for the most recently attempted hugepage collapse. Note that many failures might have occurred, since the operation may continue to collapse in the event a single hugepage-sized/aligned region fails.
ENOMEM Memory allocation failed or VMA not found
EBUSY Memcg charging failed
EAGAIN Required resource temporarily unavailable. Trying again might succeed.
EINVAL Other error: no PMD found, subpage doesn't have the Present bit set, "Special" page not backed by a struct page, VMA incorrectly sized, address not page-aligned, ...
Most notable here are ENOMEM and EBUSY (new to madvise), which are intended to provide the caller with actionable feedback so they may take an appropriate fallback measure. Use Cases: Immediate users of this new functionality are malloc() implementations that manage memory in hugepage-sized chunks, but sometimes subrelease memory back to the system in native-sized chunks via MADV_DONTNEED, zapping the pmd. Later, when the memory is hot, the implementation could madvise(MADV_COLLAPSE) to re-back the memory with THPs to regain hugepage coverage and dTLB performance. TCMalloc is such an implementation that could benefit from this[2]. Only privately-mapped anon memory is supported for now, but additional support for file, shmem, and HugeTLB high-granularity mappings[2] is expected. File and tmpfs/shmem support would permit: * Backing executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system, which might keep services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevent page sharing and demand paging, both of which increase steady-state memory footprint. With MADV_COLLAPSE, we get the best of both worlds: peak upfront performance and lower RAM footprints.
* Backing guest memory by hugepages after the memory contents have been migrated in native-page-sized chunks to a new host, in a userfaultfd-based live-migration stack. [1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com/ [2] https://github.com/google/tcmalloc/tree/master/tcmalloc [jrdr.linux@gmail.com: avoid possible memory leak in failure path] Link: https://lkml.kernel.org/r/20220713024109.62810-1-jrdr.linux@gmail.com [zokeefe@google.com: add missing kfree() to madvise_collapse()] Link: https://lore.kernel.org/linux-mm/20220713024109.62810-1-jrdr.linux@gmail.com/ Link: https://lkml.kernel.org/r/20220713161851.1879439-1-zokeefe@google.com [zokeefe@google.com: delay computation of hpage boundaries until use] Link: https://lkml.kernel.org/r/20220720140603.1958773-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220706235936.2197195-10-zokeefe@google.com Signed-off-by: Zach O'Keefe Signed-off-by: "Souptick Joarder (HPE)" Suggested-by: David Rientjes Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b0e80cc72b0c..38265f9f782e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -218,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); +int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); @@ -361,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - BUG(); - return 0; + return -EINVAL; } + +static inline int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + return -EINVAL; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From e9c2dbc8bf71a5039604a1dc45b10f24a2098f3b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 8 Aug 2022 00:56:45 +0000 Subject: mm/vmscan: define macros for refaults in struct lruvec The magic numbers 0 and 1 are used in several places in vmscan.c. Define macros for them to improve code readability.
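The readability gain shows up at call sites such as this before/after pair (illustrative, not a hunk from the patch):

	/* before */
	target_lruvec->refaults[0] = refaults;
	/* after */
	target_lruvec->refaults[WORKINGSET_ANON] = refaults;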
Link: https://lkml.kernel.org/r/20220808005644.1721066-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e24b40c52468..8f571dc7c524 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -306,6 +306,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define WORKINGSET_ANON 0 +#define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { -- cgit v1.2.3 From d2226ebd5484afcf9f9b71b394ec1567a7730eb1 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 5 Aug 2022 08:59:03 +0800 Subject: mm/hugetlb: add dedicated func to get 'allowed' nodemask for current process Muchun Song found that after the MPOL_PREFERRED_MANY policy was introduced in commit b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes"), policy_nodemask_current()'s semantics for this new policy changed: it returns 'preferred' nodes instead of 'allowed' nodes. With the changed semantics of policy_nodemask_current(), a task with the MPOL_PREFERRED_MANY policy could fail to get its reservation even though it can fall back to other nodes (either defined by cpusets or all online nodes) for that reservation, failing mmap calls unnecessarily early. The fix is to not consider MPOL_PREFERRED_MANY for reservations at all, because it, unlike MPOL_BIND, does not pose any actual hard constraint. Michal suggested that policy_nodemask_current() is only used by hugetlb and could be moved to the hugetlb code with a more explicit name to enforce the 'allowed' semantics, for which only the MPOL_BIND policy matters. apply_policy_zone() is made extern to be called in hugetlb code and its return value is changed to bool. [1].
https://lore.kernel.org/lkml/20220801084207.39086-1-songmuchun@bytedance.com/t/ Link: https://lkml.kernel.org/r/20220805005903.95563-1-feng.tang@intel.com Fixes: b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes") Signed-off-by: Feng Tang Reported-by: Muchun Song Suggested-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Dave Hansen Cc: Ben Widawsky Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d..d232de7cdc56 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - struct mempolicy *mpol = get_task_policy(current); - - return policy_nodemask(gfp, mpol); -} - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) return (pol->mode == MPOL_PREFERRED_MANY); } +extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else @@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task) { } -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - return NULL; -} - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; -- cgit v1.2.3 From b84e04f1baeebe6872b22a027cfc558621e842d4 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Mon, 15 Aug 2022 05:53:53 +1000 Subject: kfence: add sysfs interface to disable kfence for selected slabs. By default kfence allocation can happen for any slab object whose size is up to PAGE_SIZE, as long as that allocation is the first allocation after expiration of the kfence sample interval. But in certain debugging scenarios we may be interested in debugging corruptions involving some specific slub objects like dentry or ext4_* etc. In such cases, limiting kfence to allocations involving only specific slub objects will increase the probability of catching the issue, since the kfence pool will not be consumed by other slab objects. This patch introduces a sysfs interface '/sys/kernel/slab/<name>/skip_kfence' to disable kfence for specific slabs. Having the interface work in this way does not impact the current/default behavior of kfence and allows us to use kfence for specific slabs (when needed) as well. The decision to skip/use kfence is taken depending on whether kmem_cache.flags has the (newly introduced) SLAB_SKIP_KFENCE flag set or not.
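On the allocation side, the flag check is then a one-line gate. The following is only a sketch (the wrapper name and its placement are illustrative; upstream performs the equivalent check inside the KFENCE allocation path in mm/kfence):

	/*
	 * Never hand out a KFENCE object for caches that opted out via
	 * /sys/kernel/slab/<name>/skip_kfence.
	 */
	static void *kfence_alloc_gated(struct kmem_cache *s, size_t size,
					gfp_t flags)
	{
		if (s->flags & SLAB_SKIP_KFENCE)
			return NULL;	/* take the normal slab path */

		return __kfence_alloc(s, size, flags);
	}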
Link: https://lkml.kernel.org/r/20220814195353.2540848-1-imran.f.khan@oracle.com Signed-off-by: Imran Khan Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/slab.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..352e3f082acc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -119,6 +119,12 @@ */ #define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U) +#ifdef CONFIG_KFENCE +#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U) +#else +#define SLAB_SKIP_KFENCE 0 +#endif + /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) -- cgit v1.2.3 From 736a8ccce99c633b2457996bb9bf2cefff0db43d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Jul 2022 16:01:04 +0800 Subject: hugetlb_cgroup: remove unneeded return value The return values of set_hugetlb_cgroup() and set_hugetlb_cgroup_rsvd() are always ignored. Remove them to clean up the code. Link: https://lkml.kernel.org/r/20220729080106.12752-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 379344828e78..630cd255d0cf 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -90,32 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return __hugetlb_cgroup_from_page(page, true); } -static inline int __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_PAGE(!PageHuge(page), page); if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) - return -1; + return; if (rsvd) set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, (unsigned long)h_cg); else set_page_private(page + SUBPAGE_INDEX_CGROUP, (unsigned long)h_cg); - return 0; } -static inline int set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - return __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(page, h_cg, false); } -static inline int set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - return __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(page, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -199,16 +198,14 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return NULL; } -static inline int set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - return 0; } -static inline int set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - return 0; } static inline bool hugetlb_cgroup_disabled(void) -- cgit v1.2.3 From 4d86d4f7227c6f2acfbbbe0623d49865aa71b756 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 26 Jul 2022 16:02:41 -0700
Subject: mm: add more BUILD_BUG_ONs to gfp_migratetype() gfp_migratetype() also expects GFP_RECLAIMABLE and GFP_MOVABLE|GFP_RECLAIMABLE to be shiftable into MIGRATE_* enum values, so add some more BUILD_BUG_ONs to reflect this assumption. Link: https://linux-review.googlesource.com/id/Iae64e2182f75c3aca776a486b71a72571d66d83e Link: https://lkml.kernel.org/r/20220726230241.3770532-1-pcc@google.com Signed-off-by: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/gfp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f314be58fa77..ea6cb9399152 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -18,6 +18,9 @@ static inline int gfp_migratetype(const gfp_t gfp_flags) VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); + BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); + BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> + GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; -- cgit v1.2.3 From 33024536bafd9129f1d16ade0974671c648700ac Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:51 +0800 Subject: memory tiering: hot page selection with hint page fault latency Patch series "memory tiering: hot page selection", v4. To optimize page placement in a memory tiering system with NUMA balancing, the hot pages in the slow memory nodes need to be identified. Essentially, the original NUMA balancing implementation selects the most recently accessed (MRU) pages to promote. But this isn't a perfect algorithm to identify the hot pages, because pages with quite low access frequency may still be accessed eventually, given that the NUMA balancing page table scanning period can be quite long (e.g. 60 seconds). So in this patchset, we implement a new hot page identification algorithm based on the latency between NUMA balancing page table scanning and the hint page fault, which is a kind of most frequently accessed (MFU) algorithm. In NUMA balancing memory tiering mode, if there are hot pages in the slow memory node and cold pages in the fast memory node, we need to promote/demote hot/cold pages between the fast and slow memory nodes. A choice is to promote/demote as fast as possible, but the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workloads, because of access latency inflation and contention for slow memory bandwidth. A way to resolve this issue is to restrict the max promoting/demoting throughput. It will take longer to finish the promoting/demoting, but the workload latency will be better. This is implemented in this patchset as the page promotion rate limit mechanism. The promotion hot threshold is workload and system configuration dependent, so in this patchset a method to adjust the hot threshold automatically is implemented. The basic idea is to control the number of candidate promotion pages to match the promotion rate limit. We used the pmbench memory accessing benchmark to test the patchset on a 2-socket server system with DRAM and PMEM installed.
The test results are as follows,

	                 pmbench score  promote rate
	                 (accesses/s)   (MB/s)
	                 -------------  ------------
	base             146887704.1    725.6
	hot selection    165695601.2    544.0
	rate limit       162814569.8    165.2
	auto adjustment  170495294.0    136.9

From the results above, with the hot page selection patch [1/3], the pmbench score increases by about 12.8%, and the promote rate (overhead) decreases by about 25.0%, compared with the base kernel. With the rate limit patch [2/3], the pmbench score decreases by about 1.7%, and the promote rate decreases by about 69.6%, compared with the hot page selection patch. With the threshold auto adjustment patch [3/3], the pmbench score increases by about 4.7%, and the promote rate decreases by about 17.1%, compared with the rate limit patch.

Baolin helped to test the patchset with MySQL on a machine which contains 1 DRAM node (30G) and 1 PMEM node (126G).

	sysbench /usr/share/sysbench/oltp_read_write.lua \
	  ...... \
	  --tables=200 \
	  --table-size=1000000 \
	  --report-interval=10 \
	  --threads=16 \
	  --time=120

The TPS can be improved by about 5%.

This patch (of 3): To optimize page placement in a memory tiering system with NUMA balancing, the hot pages in the slow memory node need to be identified. Essentially, the original NUMA balancing implementation selects the most recently accessed (MRU) pages to promote. But this isn't a perfect algorithm to identify the hot pages, because pages with quite low access frequency may still be accessed eventually, given that the NUMA balancing page table scanning period can be quite long (e.g. 60 seconds). The most frequently accessed (MFU) algorithm is better. So, in this patch we implemented a better hot page selection algorithm, which is based on NUMA balancing page table scanning and the hint page fault, as follows:

- When the page tables of the processes are scanned to change PTE/PMD to be PROT_NONE, the current time is recorded in struct page as the scan time.
- When the page is accessed, a hint page fault will occur. The scan time is read back from the struct page, and the hint page fault latency is defined as

	hint page fault time - scan time

The shorter the hint page fault latency of a page is, the higher the probability that its access frequency is high. So the hint page fault latency is a better estimate of whether a page is hot or cold.

It's hard to find some extra space in struct page to hold the scan time. Fortunately, we can reuse some bits used by the original NUMA balancing. NUMA balancing uses some bits in struct page to store the page-accessing CPU and PID (referring to page_cpupid_xchg_last()), which are used by the multi-stage node selection algorithm to avoid migrating pages shared between NUMA nodes back and forth. But for pages in the slow memory node, even if they are accessed by multiple NUMA nodes, as long as the pages are hot, they need to be promoted to the fast memory node. So the accessing CPU and PID information is unnecessary for the slow memory pages, and we can reuse these bits in struct page to record the scan time. For the fast memory pages, these bits are used as before.

For the hot threshold, the default value is 1 second, which works well in our performance test. All pages with a hint page fault latency < hot threshold will be considered hot. It's hard for users to determine the hot threshold, so we don't provide a kernel ABI to set it, just a debugfs interface for advanced users to experiment. We will continue to work on a hot threshold automatic adjustment mechanism. The downside of the above method is that the response time to workload hot spot changes may be much longer.
For example,

- A previously cold memory area becomes hot.
- A hint page fault will be triggered, but the hint page fault latency isn't shorter than the hot threshold, so the pages will not be promoted.
- When the memory area is scanned again, maybe after a scan period, the hint page fault latency measured will be shorter than the hot threshold and the pages will be promoted.

To mitigate this, if there is enough free space in the fast memory node, the hot threshold will not be used and all pages will be promoted upon the hint page fault, for fast response.

Thanks to Zhong Jiang, who reported and tested the fix for a bug when disabling memory tiering mode dynamically.

Link: https://lkml.kernel.org/r/20220713083954.34196-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20220713083954.34196-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Dave Hansen Cc: Yang Shi Cc: Zi Yan Cc: Wei Xu Cc: osalvador Cc: Shakeel Butt Cc: Zhong Jiang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21f8b27bd9fd..27839b158ca4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1255,6 +1255,18 @@ static inline int folio_nid(const struct folio *folio) } #ifdef CONFIG_NUMA_BALANCING +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); @@ -1318,12 +1330,25 @@ static inline void page_cpupid_reset_last(struct page *page) page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ + +static inline int xchg_page_access_time(struct page *page, int time) +{ + int last_time; + + last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return page_to_nid(page); /* XXX */ } +static inline int xchg_page_access_time(struct page *page, int time) +{ + return 0; +} + static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ -- cgit v1.2.3 From c6833e10008f976a173dd5abdf992e492cbc3bcf Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:52 +0800 Subject: memory tiering: rate limit NUMA migration throughput In NUMA balancing memory tiering mode, if there are hot pages in the slow memory node and cold pages in the fast memory node, we need to promote/demote hot/cold pages between the fast and slow memory nodes. A choice is to promote/demote as fast as possible, but the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workloads, because of access latency inflation and contention for slow memory bandwidth. A way to resolve this issue is to restrict the max promoting/demoting throughput. It will take longer to finish the promoting/demoting.
But the workload latency will be better. This is implemented in this patch as the page promotion rate limit mechanism. The number of candidate pages to be promoted to the fast memory node via NUMA balancing is counted; if the count exceeds the limit specified by the users, the NUMA balancing promotion will be stopped until the next second. A new sysctl knob kernel.numa_balancing_promote_rate_limit_MBps is added for the users to specify the limit. Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: osalvador Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Wei Xu Cc: Yang Shi Cc: Zhong Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 +++++++ include/linux/sched/sysctl.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f571dc7c524..a0003eaa751f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,7 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ + PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; @@ -998,6 +999,12 @@ typedef struct pglist_data { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_NUMA_BALANCING + /* start time in ms of current promote rate limit period */ + unsigned int nbp_rl_start; + /* number of promote candidate pages at start time of current rate limit period */ + unsigned long nbp_rl_nr_cand; +#endif /* Fields commonly accessed by the page reclaim scanner */ /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e650946816d0..303ee7dd0c7e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -27,6 +27,7 @@ enum sched_tunable_scaling { #ifdef CONFIG_NUMA_BALANCING extern int sysctl_numa_balancing_mode; +extern unsigned int sysctl_numa_balancing_promote_rate_limit; #else #define sysctl_numa_balancing_mode 0 #endif -- cgit v1.2.3 From c959924b0dc53bf6252793f41480bc01b9792570 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:53 +0800 Subject: memory tiering: adjust hot threshold automatically The promotion hot threshold is workload and system configuration dependent, so in this patch a method to adjust the hot threshold automatically is implemented. The basic idea is to control the number of candidate promotion pages to match the promotion rate limit. If the hint page fault latency of a page is less than the hot threshold, we will try to promote the page, and the page is called a candidate promotion page. If the number of candidate promotion pages in the statistics interval is much more than the promotion rate limit, the hot threshold will be decreased to reduce the number of candidate promotion pages. Otherwise, the hot threshold will be increased to increase the number of candidate promotion pages. To make the above method work, in each statistics interval, the total number of pages to check (on which the hint page faults occur) and the hot/cold distribution need to be stable. Because the page tables are scanned linearly in NUMA balancing, but the hot/cold distribution usually isn't uniform along the address space, the statistics interval should be larger than the NUMA balancing scan period.
So in the patch, the max scan period is used as the statistics interval, and it works well in our tests. Link: https://lkml.kernel.org/r/20220713083954.34196-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: osalvador Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Wei Xu Cc: Yang Shi Cc: Zhong Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a0003eaa751f..025754b0bc09 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1004,6 +1004,15 @@ typedef struct pglist_data { unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; + /* promote threshold in ms */ + unsigned int nbp_threshold; + /* start time in ms of current promote threshold adjustment period */ + unsigned int nbp_th_start; + /* + * number of promote candidate pages at start time of current promote + * threshold adjustment period + */ + unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ -- cgit v1.2.3 From 0192445cb2f7ed1cd7a95a0fc8c7645480baba25 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 15 Aug 2022 10:39:59 -0400 Subject: arch: mm: rename FORCE_MAX_ZONEORDER to ARCH_FORCE_MAX_ORDER This Kconfig option is used by individual architectures to set their desired MAX_ORDER. Rename it to reflect its actual use. Link: https://lkml.kernel.org/r/20220815143959.1511278-1-zi.yan@sent.com Acked-by: Mike Rapoport Signed-off-by: Zi Yan Acked-by: Guo Ren [csky] Acked-by: Arnd Bergmann Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Acked-by: Michael Ellerman [powerpc] Cc: Vineet Gupta Cc: Taichi Sugaya Cc: Neil Armstrong Cc: Qin Jian Cc: Guo Ren Cc: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: Dinh Nguyen Cc: Christophe Leroy Cc: Yoshinori Sato Cc: "David S. Miller" Cc: Chris Zankel Cc: Ley Foon Tan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 025754b0bc09..fd61347b4b1f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -24,10 +24,10 @@ #include /* Free memory management - zoned buddy allocator. */ -#ifndef CONFIG_FORCE_MAX_ZONEORDER +#ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_ORDER 11 #else -#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER +#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) -- cgit v1.2.3 From fb70c4878d6b3001ef40fa39432a38d8cabdcbf7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 15 Aug 2022 19:10:17 +0800 Subject: mm: kill find_min_pfn_with_active_regions() find_min_pfn_with_active_regions() is only called from free_area_init(). Open-code the PHYS_PFN(memblock_start_of_DRAM()) into free_area_init(), and kill find_min_pfn_with_active_regions().
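The resulting change in free_area_init() is then roughly the following; this is only a sketch of the open-coding described above, with the rest of the function elided:

	void __init free_area_init(unsigned long *max_zone_pfn)
	{
		unsigned long start_pfn;

		/* was: start_pfn = find_min_pfn_with_active_regions(); */
		start_pfn = PHYS_PFN(memblock_start_of_DRAM());

		/* ... node/zone setup continues unchanged ... */
	}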
Link: https://lkml.kernel.org/r/20220815111017.39341-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 27839b158ca4..e98ef2cb1176 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2520,7 +2520,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) -- cgit v1.2.3 From b1d5488a252dc9c0d9574100d0b8d807bf154603 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 18 Aug 2022 19:20:00 +0530 Subject: mm: fix use-after free of page_ext after race with memory-offline Below is one path where a race between page_ext access and offlining of the respective memory blocks causes a use-after-free on access of the page_ext structure.

	process1                              process2
	---------                             ---------
	a) doing /proc/page_owner             doing memory offline
	                                      through offline_pages.

	b) PageBuddy check fails, thus
	   proceed to get the page_owner
	   information through page_ext
	   access:
	   page_ext = lookup_page_ext(page);

	                                      migrate_pages();
	                                      .................
	                                      Since all pages are successfully
	                                      migrated as part of the offline
	                                      operation, send the MEM_OFFLINE
	                                      notification, which for page_ext
	                                      calls:
	                                      offline_page_ext() -->
	                                        __free_page_ext() -->
	                                          free_page_ext() -->
	                                            vfree(ms->page_ext)
	                                      mem_section->page_ext = NULL

	c) Checking the PAGE_EXT flags in
	   page_ext->flags now results in
	   the use-after-free (leading to
	   the translation faults).

As mentioned above, there is really no synchronization between page_ext access and its freeing during memory offline. The memory offline steps (roughly) on a memory block are as below:

1) Isolate all the pages.
2) while(1), try to free the pages to the buddy (->free_list[MIGRATE_ISOLATE]).
3) Delete the pages from this buddy list.
4) Then free page_ext. (Note: the struct page is still alive, as it is freed only during hot remove of the memory, which frees the memmap, a step the user might not perform.)

This design leads to a state where the struct page is alive but the struct page_ext is freed, where the latter is ideally part of the former, just representing the page flags (check [3] for why this design was chosen). The above-mentioned race is just one example, but the problem persists in the other paths involving page_ext->flags access too (e.g. page_is_idle()). Fix all the paths where offline races with page_ext access by maintaining synchronization with an RCU lock; this is achieved in 3 steps:

1) Invalidate all the page_ext's of the sections of a memory block by storing a flag in the LSB of mem_section->page_ext.
2) Wait until all existing readers finish working with their ->page_ext's, using synchronize_rcu(). Any parallel process that starts after this call will not get the page_ext, through lookup_page_ext(), for the block on which the parallel offline operation is being performed.
3) Now safely free all the sections' ->page_ext's of the block on which the offline operation is being performed.

Note: if synchronize_rcu() takes time, then optimizations can be done in this path through call_rcu()[2]. Thanks to David Hildenbrand for his views/suggestions on the initial discussion[1] and Pavan Kondeti for various inputs on this patch.
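The reader side of this scheme pairs an RCU read lock with the lookup. Below is a minimal sketch of what page_ext_get()/page_ext_put() could look like under the description above; the real implementation lives in mm/page_ext.c and also has to recognize the invalidation flag stored in the LSB:

	struct page_ext *page_ext_get(struct page *page)
	{
		struct page_ext *page_ext;

		rcu_read_lock();
		page_ext = lookup_page_ext(page);
		if (!page_ext) {
			/* no page_ext, or the section was invalidated */
			rcu_read_unlock();
			return NULL;
		}

		return page_ext;	/* caller holds the RCU read lock */
	}

	void page_ext_put(struct page_ext *page_ext)
	{
		if (unlikely(!page_ext))
			return;

		rcu_read_unlock();
	}

With this pairing, the offline path's synchronize_rcu() in step 2 cannot complete while any reader is between page_ext_get() and page_ext_put(), which is exactly what makes step 3's vfree() safe.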
[1] https://lore.kernel.org/linux-mm/59edde13-4167-8550-86f0-11fc67882107@quicinc.com/ [2] https://lore.kernel.org/all/a26ce299-aed1-b8ad-711e-a49e82bdd180@quicinc.com/T/#u [3] https://lore.kernel.org/all/6fa6b7aa-731e-891c-3efb-a03d6a700efa@redhat.com/ [quic_charante@quicinc.com: rename label `loop' to `ext_put_continue' per David] Link: https://lkml.kernel.org/r/1661496993-11473-1-git-send-email-quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1660830600-9068-1-git-send-email-quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Suggested-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Fernand Sieber Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavan Kondeti Cc: SeongJae Park Cc: Shakeel Butt Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 17 +++++++++++------ include/linux/page_idle.h | 34 ++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index fabb2e1e087f..ed27198cdaf4 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -55,7 +55,8 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(const struct page *page); +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); static inline struct page_ext *page_ext_next(struct page_ext *curr) { @@ -71,11 +72,6 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(const struct page *page) -{ - return NULL; -} - static inline void page_ext_init(void) { } @@ -87,5 +83,14 @@ static inline void page_ext_init_flatmem_late(void) static inline void page_ext_init_flatmem(void) { } + +static inline struct page_ext *page_ext_get(struct page *page) +{ + return NULL; +} + +static inline void page_ext_put(struct page_ext *page_ext) +{ +} #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 4663dfed1293..5cb7bd2078ec 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,65 +13,79 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. 
*/ - static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* !CONFIG_64BIT */ -- cgit v1.2.3 From e2f8f44b7686bf65747f485ac7c9c43de8b8de09 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 22 Aug 2022 15:01:32 +0200 Subject: mm: pagewalk: fix documentation of PTE hole handling Empty PTEs are passed to the pte_entry callback, not to pte_hole. Link: https://lkml.kernel.org/r/3695521.kQq0lBPeGt@devpool047 Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index ac7b38ad5903..f3fafb731ffd 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -15,12 +15,12 @@ struct mm_walk; * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (lowest-level) - * entry + * @pte_entry: if set, called for each PTE (lowest-level) entry, + * including empty ones * @pte_hole: if set, called for each hole at all levels, - * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD - * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal - * to 1) are skipped. + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. + * Any folded depths (where PTRS_PER_P?D is equal to 1) + * are skipped. 
* @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means -- cgit v1.2.3 From 408587baee39753304dd572dacf374f75545d25a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:05 +0000 Subject: mm: page_counter: rearrange struct page_counter fields With memcg v2 enabled, memcg->memory.usage is a very hot member for workloads doing memcg charging on multiple CPUs concurrently, particularly network intensive workloads. In addition, there is false cache sharing between memory.usage and memory.high on the charge path. This patch moves the usage into a separate cacheline and moves all the read-mostly fields into a separate cacheline.

To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level cgroup hierarchy.

	$ netserver -6
	# 36 instances of netperf with following params
	$ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K

Results (average throughput of netperf):

	Without (6.0-rc1)	10482.7 Mbps
	With patch		12413.7 Mbps (18.4% improvement)

With the patch, the throughput improved by 18.4%. One side-effect of this patch is the increase in the size of struct mem_cgroup. For example, with this patch on a 64-bit build, the size of struct mem_cgroup increased from 4032 bytes to 4416 bytes. However, for the performance improvement, this additional size is worth it. In addition, there are opportunities to reduce the size of struct mem_cgroup, like deprecation of the kmem and tcpmem page counters and better packing. Link: https://lkml.kernel.org/r/20220825000506.239406-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Reviewed-by: Feng Tang Acked-by: Soheil Hassas Yeganeh Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 679591301994..78a1c934e416 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -3,15 +3,26 @@ #define _LINUX_PAGE_COUNTER_H #include +#include #include #include +#if defined(CONFIG_SMP) +struct pc_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define PC_PADDING(name) struct pc_padding name +#else +#define PC_PADDING(name) +#endif + struct page_counter { + /* + * Make sure 'usage' does not share cacheline with any other field. The + * memcg->memory.usage is a hot member of struct mem_cgroup. + */ atomic_long_t usage; - unsigned long min; - unsigned long low; - unsigned long high; - unsigned long max; + PC_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -23,18 +34,18 @@ struct page_counter { atomic_long_t low_usage; atomic_long_t children_low_usage; - /* legacy */ unsigned long watermark; unsigned long failcnt; - /* - * 'parent' is placed here to be far from 'usage' to reduce - * cache false sharing, as 'usage' is written mostly while - * parent is frequently read for cgroup's hierarchical - * counting nature. - */ + /* Keep all the read-mostly fields in a separate cacheline.
*/ + PC_PADDING(_pad2_); + + unsigned long min; + unsigned long low; + unsigned long high; + unsigned long max; struct page_counter *parent; -}; +} ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX -- cgit v1.2.3 From 1813e51eece0ad6f4aacaeb738e7cced46feb470 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:06 +0000 Subject: memcg: increase MEMCG_CHARGE_BATCH to 64 For several years, MEMCG_CHARGE_BATCH was kept at 32, but with bigger machines and network intensive workloads requiring throughput in Gbps, 32 is too small and makes the memcg charging path a bottleneck. For now, increase it to 64 for easy acceptance into 6.0. We will need to revisit this in the future for the ever increasing demand for higher performance. Please note that the memcg charge path drains the per-cpu memcg charge stock, so there should not be any oom behavior change. Though it does have an impact on rstat flushing and high limit reclaim backoff.

To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level cgroup hierarchy.

	$ netserver -6
	# 36 instances of netperf with following params
	$ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K

Results (average throughput of netperf):

	Without (6.0-rc1)	10482.7 Mbps
	With patch		17064.7 Mbps (62.7% improvement)

With the patch, the throughput improved by 62.7%. Link: https://lkml.kernel.org/r/20220825000506.239406-4-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Acked-by: Soheil Hassas Yeganeh Reviewed-by: Feng Tang Acked-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6257867fbf95..a2461f9a8738 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,10 +354,11 @@ struct mem_cgroup { }; /* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. + * size of first charge trial. + * TODO: maybe necessary to use big numbers in big irons, or make it + * dynamic based on the workload. */ -#define MEMCG_CHARGE_BATCH 32U +#define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; -- cgit v1.2.3 From c4f20f1479c456d9dd1c1e6d8bf956a25de742dc Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 25 Aug 2022 18:27:14 +0800 Subject: page_ext: introduce boot parameter 'early_page_ext' In commit 2f1ee0913ce5 ("Revert "mm: use early_pfn_to_nid in page_ext_init""), we call page_ext_init() after page_alloc_init_late() to avoid a panic problem. It seems that we cannot track early page allocations in the current kernel even if the page structures have been initialized early. This patch introduces a new boot parameter 'early_page_ext' to resolve this problem. If we pass it to the kernel, page_ext_init() will be moved up and the feature 'deferred initialization of struct pages' will be disabled, so that the page allocator is initialized early enough to prevent the panic problem above. It can help us to catch early page allocations. This is useful especially when we find that the free memory value is not the same right after booting different kernels.
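Wiring up such a boot parameter is a small amount of glue; the following is a sketch of what the parsing side could look like (the patch places this logic in mm/page_ext.c, and per the akpm note below, the flag carries no __meminitdata section annotation):

	/* Set from the kernel command line; queried via early_page_ext_enabled(). */
	bool early_page_ext;

	static int __init setup_early_page_ext(char *str)
	{
		early_page_ext = true;
		return 0;
	}
	early_param("early_page_ext", setup_early_page_ext);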
[akpm@linux-foundation.org: fix section issue by removing __meminitdata] Link: https://lkml.kernel.org/r/20220825102714.669-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jason A. Donenfeld Cc: Jonathan Corbet Cc: Kees Cook Cc: Mark-PK Tsai Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index ed27198cdaf4..22be4582faae 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -36,9 +36,15 @@ struct page_ext { unsigned long flags; }; +extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); +static inline bool early_page_ext_enabled(void) +{ + return early_page_ext; +} + #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { @@ -68,6 +74,11 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; +static inline bool early_page_ext_enabled(void) +{ + return false; +} + static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -- cgit v1.2.3 From 35b471467f88b8d8b52ba5b006a29b9d057d09bf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:17 -0700 Subject: filemap: add filemap_get_folios_contig() Patch series "Convert to filemap_get_folios_contig()", v3. This patch series replaces find_get_pages_contig() with filemap_get_folios_contig(). This patch (of 7): This function is meant to replace find_get_pages_contig(). Unlike find_get_pages_contig(), filemap_get_folios_contig() no longer takes in a target number of pages to find; it returns up to 15 contiguous folios (the capacity of a folio_batch). To be more consistent with filemap_get_folios(), filemap_get_folios_contig() now also updates the start index passed in, and takes an end index. Link: https://lkml.kernel.org/r/20220824004023.77310-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20220824004023.77310-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Ryusuke Konishi Cc: Al Viro Cc: Matthew Wilcox Cc: David Sterba Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..8689c32d628b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -718,6 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, -- cgit v1.2.3 From 48658d8509d2db3391c95aa74308a2b1fc8e8461 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:23 -0700 Subject: filemap: remove find_get_pages_contig() All callers of find_get_pages_contig() have been removed, so it is no longer needed.
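A typical caller-side conversion then iterates a folio_batch rather than a page array. The following is a hedged sketch of the new calling pattern (the function name and the processing body are local to this example; the filemap_get_folios_contig() signature is the one declared in the diff above):

	static void process_contig_range(struct address_space *mapping,
					 pgoff_t start, pgoff_t end)
	{
		struct folio_batch fbatch;
		unsigned int i, nr;

		folio_batch_init(&fbatch);
		do {
			/* 'start' is advanced past the folios returned */
			nr = filemap_get_folios_contig(mapping, &start, end,
						       &fbatch);
			for (i = 0; i < nr; i++) {
				struct folio *folio = fbatch.folios[i];
				/* ... operate on each contiguous folio ... */
			}
			folio_batch_release(&fbatch);	/* drop the refs */
		} while (nr);
	}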
Link: https://lkml.kernel.org/r/20220824004023.77310-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8689c32d628b..09de43e36a64 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -720,8 +720,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); -- cgit v1.2.3 From 639118d1571f70b1157b4bb5ac574b0ab0f38099 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:20:43 +0800 Subject: mm: kill is_memblock_offlined() Directly check the state of struct memory_block; there is no need for a dedicated helper. Link: https://lkml.kernel.org/r/20220827112043.187028-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..54675791bc50 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,7 +11,6 @@ struct page; struct zone; struct pglist_data; struct mem_section; -struct memory_block; struct memory_group; struct resource; struct vmem_altmap; @@ -333,7 +332,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -- cgit v1.2.3 From b4a0215e11dcfe23a48c65c6d6c82c0c2c551a48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:19:59 +0800 Subject: mm: fix null-ptr-deref in kswapd_is_running() kswapd_run/stop() will set pgdat->kswapd to NULL, which could race with kswapd_is_running() in kcompactd():

	kswapd_run/stop()                     kcompactd()
	                                        kswapd_is_running()
	  pgdat->kswapd // error or normal ptr
	                                        verify pgdat->kswapd
	                                          // load non-NULL pgdat->kswapd
	  pgdat->kswapd = NULL
	                                        task_is_running(pgdat->kswapd)
	                                          // NULL pointer dereference

KASAN reports the null-ptr-deref shown below,

	vmscan: Failed to start kswapd on node 0
	...
	BUG: KASAN: null-ptr-deref in kcompactd+0x440/0x504
	Read of size 8 at addr 0000000000000024 by task kcompactd0/37

	CPU: 0 PID: 37 Comm: kcompactd0 Kdump: loaded Tainted: G OE 5.10.60 #1
	Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
	Call trace:
	 dump_backtrace+0x0/0x394
	 show_stack+0x34/0x4c
	 dump_stack+0x158/0x1e4
	 __kasan_report+0x138/0x140
	 kasan_report+0x44/0xdc
	 __asan_load8+0x94/0xd0
	 kcompactd+0x440/0x504
	 kthread+0x1a4/0x1f0
	 ret_from_fork+0x10/0x18

At present kswapd/kcompactd_run() and kswapd/kcompactd_stop() are protected by mem_hotplug_begin/done(), but kcompactd() itself is not. There is no need to involve the memory hotplug lock in kcompactd(), so let's add a new mutex to protect pgdat->kswapd accesses. Also, because the kcompactd task will check the state of the kswapd task, it's better to call kcompactd_stop() before kswapd_stop() to reduce lock conflicts. [akpm@linux-foundation.org: add comments] Link: https://lkml.kernel.org/r/20220827111959.186838-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 20 ++++++++++++++++++++ include/linux/mmzone.h | 6 ++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 54675791bc50..51052969dbfe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -215,6 +215,22 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); +/* See kswapd_is_running() */ +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) +{ + mutex_lock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) +{ + mutex_unlock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) +{ + mutex_init(&pgdat->kswapd_lock); +} + #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -251,6 +267,10 @@ static inline bool movable_node_is_enabled(void) { return false; } + +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} #endif /* ! CONFIG_MEMORY_HOTPLUG */ /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fd61347b4b1f..18cf0fc5ce67 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -956,8 +956,10 @@ typedef struct pglist_data { atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ - struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/done() */ +#ifdef CONFIG_MEMORY_HOTPLUG + struct mutex kswapd_lock; +#endif + struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; -- cgit v1.2.3 From a38c94ed59fc312b51a33d867444ce73c6805ff6 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 29 Aug 2022 17:57:09 +0800 Subject: mm/thp: simplify has_transparent_hugepage by using IS_BUILTIN Simplify the has_transparent_hugepage() define by using IS_BUILTIN(). No functional change.
Link: https://lkml.kernel.org/r/20220829095709.3287462-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 014ee8f0fbaa..5911761487de 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1598,11 +1598,7 @@ typedef unsigned int pgtbl_mod_mask; #endif #ifndef has_transparent_hugepage -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 -#else -#define has_transparent_hugepage() 0 -#endif +#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif /* -- cgit v1.2.3 From bcd0dea5f4fb3d098b03ef3064fe6c8201d7c515 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 29 Aug 2022 17:51:25 +0800 Subject: mm/thp: remove redundant CONFIG_TRANSPARENT_HUGEPAGE Simplify the code by removing a redundant CONFIG_TRANSPARENT_HUGEPAGE check. No functional change. Link: https://lkml.kernel.org/r/20220829095125.3284567-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5911761487de..d13b4f7cc5be 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1276,8 +1276,7 @@ static inline int pgd_devmap(pgd_t pgd) #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ - (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; -- cgit v1.2.3 From 214f8796907b8015b778badf4710a4701472779a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:34:52 +0800 Subject: fs/buffer: remove __breadahead_gfp() Patch series "fs/buffer: remove ll_rw_block()", v2. ll_rw_block() will skip locked buffers before submitting IO; it assumes that a locked buffer means it is under IO. This assumption is not always true, because we cannot guarantee that every buffer lock path would submit IO. After commit 88dbcbb3a484 ("blkdev: avoid migration stalls for blkdev pages"), buffer_migrate_folio_norefs() becomes one exceptional case, and there may be others. So ll_rw_block() is not safe on the sync read path; we could get a false positive EIO return value when a filesystem reads metadata. It seems that it could only be used on the readahead path. Unfortunately, many filesystems misuse ll_rw_block() on the sync read path. This patch set just removes ll_rw_block() and adds new, friendlier helpers, which prevent false positive EIO on the metadata read path. Thanks for the suggestion from Jan; the original discussion is at [1].

patch 1: remove unused helpers in fs/buffer.c
patch 2: add new bh_read_[*] helpers
patch 3-11: remove all ll_rw_block() calls in filesystems
patch 12-14: do some leftover cleanups

[1]. https://lore.kernel.org/linux-mm/20220825080146.2021641-1-chengzhihao1@huawei.com/

This patch (of 14): No one uses __breadahead_gfp() and sb_breadahead_unmovable() any more; remove them.
Link: https://lkml.kernel.org/r/20220901133505.2510834-1-yi.zhang@huawei.com Link: https://lkml.kernel.org/r/20220901133505.2510834-2-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Andreas Gruenbacher Cc: Bob Peterson Cc: Evgeniy Dushistov Cc: Heming Zhao Cc: Jens Axboe Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Theodore Ts'o Cc: Yu Kuai Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..c3863c417b00 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -214,8 +214,6 @@ struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, - gfp_t gfp); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); @@ -340,12 +338,6 @@ sb_breadahead(struct super_block *sb, sector_t block) __breadahead(sb->s_bdev, block, sb->s_blocksize); } -static inline void -sb_breadahead_unmovable(struct super_block *sb, sector_t block) -{ - __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0); -} - static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { -- cgit v1.2.3 From fdee117ee86479fd2644bcd9ac2b2469e55722d1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:34:53 +0800 Subject: fs/buffer: add some new buffer read helpers The current ll_rw_block() helper is fragile, because it assumes that a locked buffer means it is under IO submitted by whoever holds the lock; it skips the buffer if it fails to get the lock, so it is only safe on the readahead path. Unfortunately, most filesystems still use this helper mistakenly on the sync metadata read path. There is no guarantee that the one who holds the buffer lock always submits IO (e.g. buffer_migrate_folio_norefs() after commit 88dbcbb3a484 ("blkdev: avoid migration stalls for blkdev pages")), which could lead to a false positive -EIO when submitting read IO. This patch adds some friendly buffer read helpers to prepare for replacing ll_rw_block() and similar calls. We can only call the bh_readahead_[*] helpers for the readahead paths.
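The following is a hedged sketch of the conversion the later patches in the series perform with these helpers. The "before" half shows the fragile pattern being removed, with error handling abbreviated; the signatures used are the ones visible in this series' diffs:

	/*
	 * Before: not safe on the sync read path -- a locked buffer is
	 * silently skipped, and the later uptodate check can report a
	 * false positive -EIO.
	 */
	ll_rw_block(REQ_OP_READ, 1, &bh);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		return -EIO;

	/*
	 * After: bh_read() locks the buffer itself, submits the read and
	 * waits; it returns 1 if already uptodate, 0 on successful read,
	 * and -EIO only on a real IO error.
	 */
	if (bh_read(bh, 0) < 0)
		return -EIO;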
Link: https://lkml.kernel.org/r/20220901133505.2510834-3-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c3863c417b00..6d09785bed9f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -232,6 +232,9 @@ void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); int bh_submit_read(struct buffer_head *bh); +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock); extern int buffer_heads_over_limit; @@ -399,6 +402,41 @@ static inline struct buffer_head *__getblk(struct block_device *bdev, return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); } +static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (!buffer_uptodate(bh) && trylock_buffer(bh)) { + if (!buffer_uptodate(bh)) + __bh_read(bh, op_flags, false); + else + unlock_buffer(bh); + } +} + +static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (!bh_uptodate_or_lock(bh)) + __bh_read(bh, op_flags, false); +} + +/* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */ +static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags) +{ + if (bh_uptodate_or_lock(bh)) + return 1; + return __bh_read(bh, op_flags, true); +} + +static inline void bh_read_batch(int nr, struct buffer_head *bhs[]) +{ + __bh_read_batch(nr, bhs, 0, true); +} + +static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags) +{ + __bh_read_batch(nr, bhs, op_flags, false); +} + /** * __bread() - reads a specified block and returns the bh * @bdev: the block_device to read from -- cgit v1.2.3 From 79f5978420691fb84593f98557ea56f8b32228f2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:35:03 +0800 Subject: fs/buffer: remove ll_rw_block() helper Now that all ll_rw_block() users have been replaced with the new safe helpers, we can just remove it here. Link: https://lkml.kernel.org/r/20220901133505.2510834-13-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6d09785bed9f..b415d8bc2a09 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -223,7 +223,6 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); -void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); -- cgit v1.2.3 From 454552d0145486cf82572e73cca266ab6a56e86b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 1 Sep 2022 21:35:05 +0800 Subject: fs/buffer: remove bh_submit_read() helper bh_submit_read() has no users anymore; just remove it.
Link: https://lkml.kernel.org/r/20220901133505.2510834-15-yi.zhang@huawei.com Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b415d8bc2a09..9b6556d3f110 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -230,7 +230,6 @@ int submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); -int bh_submit_read(struct buffer_head *bh); int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); -- cgit v1.2.3 From 263b899802fc43cd0e6979f819271dfbb93c94af Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 1 Sep 2022 20:00:21 +0800 Subject: hugetlb: make hugetlb_cma_check() static Patch series "A few cleanup patches for hugetlb", v2. This series contains a few cleanup patches to use helper functions to simplify the code, remove an unneeded nid parameter, and so on. More details can be found in the respective changelogs. This patch (of 10): Make hugetlb_cma_check() static as it's only used inside mm/hugetlb.c. Link: https://lkml.kernel.org/r/20220901120030.63318-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220901120030.63318-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3..57e72954a482 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1123,14 +1123,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h, #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); -extern void __init hugetlb_cma_check(void); #else static inline __init void hugetlb_cma_reserve(int order) { } -static inline __init void hugetlb_cma_check(void) -{ -} #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); -- cgit v1.2.3 From 088b8aa537c2c767765f1c19b555f21ffe555786 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 1 Sep 2022 10:35:59 +0200 Subject: mm: fix PageAnonExclusive clearing racing with concurrent RCU GUP-fast commit 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive") made sure that when PageAnonExclusive() has to be cleared during temporary unmapping of a page, the PTE is cleared/invalidated and the TLB is flushed. What we want to achieve in all cases is that we cannot end up with a pin on an anonymous page that may be shared, because such pins would be unreliable and could result in memory corruption when the mapped page and the pin go out of sync due to a write fault. That TLB flush handling was inspired by an outdated comment in mm/ksm.c:write_protect_page(), which similarly required the TLB flush in the past to synchronize with GUP-fast. However, ever since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer sufficient to handle concurrent GUP-fast in all cases -- it only handles traditional IPI-based GUP-fast correctly.
Peter Xu (thankfully) questioned whether that TLB flush is really required. On architectures that send an IPI broadcast on TLB flush, it works as expected. To synchronize with RCU GUP-fast properly, we are conceptually fine; however, we have to enforce a certain memory order and are missing memory barriers. Let's document that, avoid the TLB flush where possible and use proper explicit memory barriers where required. We shouldn't really care about the additional memory barriers here, as we're not on extremely hot paths -- and we're getting rid of some TLB flushes. We use a smp_mb() pair for handling concurrent pinning and a smp_rmb()/smp_wmb() pair for handling the corner case of only temporary PTE changes but permanent PageAnonExclusive changes. One extreme example, whereby GUP-fast takes a R/O pin and KSM wants to convert an exclusive anonymous page to a KSM page, and that page is already mapped write-protected (-> no PTE change) would be:

	Thread 0 (KSM)			Thread 1 (GUP-fast)

					(B1) Read the PTE
					# (B2) skipped without FOLL_WRITE
	(A1) Clear PTE
	smp_mb()
	(A2) Check pinned
					(B3) Pin the mapped page
					smp_mb()
	(A3) Clear PageAnonExclusive
	smp_wmb()
	(A4) Restore PTE
					(B4) Check if the PTE changed
					smp_rmb()
					(B5) Check PageAnonExclusive

Thread 1 will properly detect that PageAnonExclusive was cleared and back off. Note that we don't need a memory barrier between checking if the page is pinned and clearing PageAnonExclusive, because stores are not speculated. The possible issues due to reordering are of a theoretical nature so far, and attempts to reproduce the race failed. Especially the "no PTE change" case isn't the common case, because we'd need an exclusive anonymous page that's mapped R/O and the PTE is clean in KSM code -- and using KSM with page pinning isn't extremely common. Further, the clear+TLB flush we used for now implies a memory barrier. So the problematic missing part should be the missing memory barrier after pinning but before checking if the PTE changed. Link: https://lkml.kernel.org/r/20220901083559.67446-1-david@redhat.com Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive") Signed-off-by: David Hildenbrand Cc: Jason Gunthorpe Cc: John Hubbard Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Peter Xu Cc: Alistair Popple Cc: Nadav Amit Cc: Yang Shi Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mike Kravetz Cc: Andrea Parri Cc: Will Deacon Cc: Peter Zijlstra Cc: "Paul E.
McKenney" Cc: Christoph von Recklinghausen Cc: Don Dutile Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +++++-- include/linux/rmap.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e98ef2cb1176..8a5ad9d050bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2999,8 +2999,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): - * clear/invalidate+flush of the page table entry + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a @@ -3021,6 +3021,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) */ if (!PageAnon(page)) return false; + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf80adca980b..72b2bcc37f73 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -267,7 +267,7 @@ dup: * @page: the exclusive anonymous page to try marking possibly shared * * The caller needs to hold the PT lock and has to have the page table entry - * cleared/invalidated+flushed, to properly sync against GUP-fast. + * cleared/invalidated. * * This is similar to page_try_dup_anon_rmap(), however, not used during fork() * to duplicate a mapping, but instead to prepare for KSM or temporarily @@ -283,12 +283,68 @@ static inline int page_try_share_anon_rmap(struct page *page) { VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); - /* See page_try_dup_anon_rmap(). */ - if (likely(!is_device_private_page(page) && - unlikely(page_maybe_dma_pinned(page)))) - return -EBUSY; + /* device private pages cannot get pinned via GUP. */ + if (unlikely(is_device_private_page(page))) { + ClearPageAnonExclusive(page); + return 0; + } + /* + * We have to make sure that when we clear PageAnonExclusive, that + * the page is not pinned and that concurrent GUP-fast won't succeed in + * concurrently pinning the page. + * + * Conceptually, PageAnonExclusive clearing consists of: + * (A1) Clear PTE + * (A2) Check if the page is pinned; back off if so. + * (A3) Clear PageAnonExclusive + * (A4) Restore PTE (optional, but certainly not writable) + * + * When clearing PageAnonExclusive, we cannot possibly map the page + * writable again, because anon pages that may be shared must never + * be writable. So in any case, if the PTE was writable it cannot + * be writable anymore afterwards and there would be a PTE change. Only + * if the PTE wasn't writable, there might not be a PTE change. + * + * Conceptually, GUP-fast pinning of an anon page consists of: + * (B1) Read the PTE + * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so. + * (B3) Pin the mapped page + * (B4) Check if the PTE changed by re-reading it; back off if so. + * (B5) If the original PTE is not writable, check if + * PageAnonExclusive is not set; back off if so. 
+ * + * If the PTE was writable, we only have to make sure that GUP-fast + * observes a PTE change and properly backs off. + * + * If the PTE was not writable, we have to make sure that GUP-fast either + * detects a (temporary) PTE change or that PageAnonExclusive is cleared + * and properly backs off. + * + * Consequently, when clearing PageAnonExclusive(), we have to make + * sure that (A1), (A2)/(A3) and (A4) happen in the right memory + * order. In GUP-fast pinning code, we have to make sure that (B3),(B4) + * and (B5) happen in the right memory order. + * + * We assume that there might not be a memory barrier after + * clearing/invalidating the PTE (A1) and before restoring the PTE (A4), + * so we use explicit ones here. + */ + + /* Paired with the memory barrier in try_grab_folio(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb(); + + if (unlikely(page_maybe_dma_pinned(page))) + return -EBUSY; ClearPageAnonExclusive(page); + + /* + * This is conceptually a smp_wmb() paired with the smp_rmb() in + * gup_must_unshare(). + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb__after_atomic(); return 0; } -- cgit v1.2.3 From 7bb5da0d490b2d836c5218f5186ee588d2145310 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 30 Jun 2022 23:32:57 +0100 Subject: kexec: turn all kexec_mutex acquisitions into trylocks Patch series "kexec, panic: Making crash_kexec() NMI safe", v4. This patch (of 2): Most acquisitions of kexec_mutex are done via mutex_trylock() - those were a direct "translation" from: 8c5a1cf0ad3a ("kexec: use a mutex for locking rather than xchg()"). There have, however, been two additions since then that use mutex_lock(): crash_get_memory_size() and crash_shrink_memory(). A later commit will replace said mutex with an atomic variable, and locking operations will become atomic_cmpxchg(). Rather than having those mutex_lock() become while (atomic_cmpxchg(&lock, 0, 1)), turn them into trylocks that can return -EBUSY on acquisition failure. This does halve the printable size of the crash kernel, but that's still neighbouring 2G for 32bit kernels which should be ample enough. Link: https://lkml.kernel.org/r/20220630223258.4144112-1-vschneid@redhat.com Link: https://lkml.kernel.org/r/20220630223258.4144112-2-vschneid@redhat.com Signed-off-by: Valentin Schneider Cc: Arnd Bergmann Cc: "Eric W . Biederman" Cc: Juri Lelli Cc: Luis Claudio R. Goncalves Cc: Miaohe Lin Cc: Petr Mladek Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Baoquan He Signed-off-by: Andrew Morton --- include/linux/kexec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 13e6c4b58f07..41a686996aaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -427,7 +427,7 @@ extern int kexec_load_disabled; extern bool kexec_in_progress; int crash_shrink_memory(unsigned long new_size); -size_t crash_get_memory_size(void); +ssize_t crash_get_memory_size(void); #ifndef arch_kexec_protect_crashkres /* -- cgit v1.2.3 From 2be9880dc87342dc7ae459c9ea5c9ee2a45b33d8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 09:44:06 +0800 Subject: kernel: exit: cleanup release_thread() Only x86 has its own release_thread(); introduce a new weak release_thread() function to remove the empty definitions in other architectures.
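A minimal sketch of the weak-function pattern this relies on (the actual default lands in kernel/exit.c; shown here only to illustrate how the override works):

	/* Sketch: weak default that an architecture such as x86 overrides
	 * simply by providing its own non-weak release_thread(). */
	void __weak release_thread(struct task_struct *dead_task)
	{
	}
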
Link: https://lkml.kernel.org/r/20220819014406.32266-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Guo Ren [csky] Acked-by: Russell King (Oracle) Acked-by: Geert Uytterhoeven Acked-by: Brian Cain Acked-by: Michael Ellerman [powerpc] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Huacai Chen [LoongArch] Cc: Alexander Gordeev Cc: Anton Ivanov Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren [csky] Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/sched/task.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 81cab4b01edc..d6c48163c6de 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -127,6 +127,9 @@ static inline void put_task_struct_many(struct task_struct *t, int nr) void put_task_struct_rcu_user(struct task_struct *task); +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else -- cgit v1.2.3 From da3f52ba359590d8eb465ae0d8b6e11d6fd9432f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 21 Aug 2022 21:30:11 +0200 Subject: iversion: use atomic64_try_cmpxchg() Use atomic64_try_cmpxchg instead of atomic64_cmpxchg (*ptr, old, new) == old in inode_set_max_iversion_raw, inode_maybe_inc_iversion and inode_query_iversion. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
The loop in inode_maybe_inc_iversion improves from:

	5563:	48 89 ca		mov    %rcx,%rdx
	5566:	48 89 c8		mov    %rcx,%rax
	5569:	48 83 e2 fe		and    $0xfffffffffffffffe,%rdx
	556d:	48 83 c2 02		add    $0x2,%rdx
	5571:	f0 48 0f b1 16		lock cmpxchg %rdx,(%rsi)
	5576:	48 39 c1		cmp    %rax,%rcx
	5579:	0f 84 85 fc ff ff	je     5204 <...>
	557f:	48 89 c1		mov    %rax,%rcx
	5582:	eb df			jmp    5563 <...>

to:

	5563:	48 89 c2		mov    %rax,%rdx
	5566:	48 83 e2 fe		and    $0xfffffffffffffffe,%rdx
	556a:	48 83 c2 02		add    $0x2,%rdx
	556e:	f0 48 0f b1 11		lock cmpxchg %rdx,(%rcx)
	5573:	0f 84 8b fc ff ff	je     5204 <...>
	5579:	eb e8			jmp    5563 <...>

Link: https://lkml.kernel.org/r/20220821193011.88208-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/iversion.h | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 3bfebde5a1a6..eb5a15810169 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -123,17 +123,12 @@ inode_peek_iversion_raw(const struct inode *inode) static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { - u64 cur, old; + u64 cur = inode_peek_iversion_raw(inode); - cur = inode_peek_iversion_raw(inode); - for (;;) { + do { if (cur > val) break; - old = atomic64_cmpxchg(&inode->i_version, cur, val); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** @@ -197,7 +192,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) static inline bool inode_maybe_inc_iversion(struct inode *inode, bool force) { - u64 cur, old, new; + u64 cur, new; /* * The i_version field is not strictly ordered with any other inode @@ -211,19 +206,14 @@ inode_maybe_inc_iversion(struct inode *inode, bool force) */ smp_mb(); cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } @@ -304,10 +294,10 @@ inode_peek_iversion(const struct inode *inode) static inline u64 inode_query_iversion(struct inode *inode) { - u64 cur, old, new; + u64 cur, new; cur = inode_peek_iversion_raw(inode); - for (;;) { + do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* @@ -320,11 +310,7 @@ inode_query_iversion(struct inode *inode) } new = cur | I_VERSION_QUERIED; - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } -- cgit v1.2.3 From e1d7c7609ae0933b59840390c1b207ac0a925c8b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:38:51 +0200 Subject: bitops: use try_cmpxchg in set_mask_bits and bit_clear_unless Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in set_mask_bits and bit_clear_unless. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and the related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns the old *ptr value to "old" when cmpxchg fails, enabling further code simplifications.
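The conversion pattern is generic and applies to any cmpxchg() retry loop; a sketch, where compute() stands in for whatever hypothetical update step the loop performs:

	/* Before: open-coded loop comparing the returned old value. */
	old = READ_ONCE(*ptr);
	for (;;) {
		new = compute(old);		/* hypothetical update step */
		cur = cmpxchg(ptr, old, new);
		if (likely(cur == old))
			break;
		old = cur;
	}

	/* After: try_cmpxchg() returns a bool and updates 'old' on failure,
	 * which maps directly onto the CMPXCHG ZF result on x86. */
	old = READ_ONCE(*ptr);
	do {
		new = compute(old);		/* hypothetical update step */
	} while (!try_cmpxchg(ptr, &old, new));
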
Link: https://lkml.kernel.org/r/20220822143851.3290-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/bitops.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8..fb24a513d917 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -328,10 +328,10 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ new__ = (old__ & ~mask__) | bits__; \ - } while (cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) @@ -343,11 +343,12 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ + old__ = READ_ONCE(*(ptr)); \ do { \ - old__ = READ_ONCE(*(ptr)); \ + if (old__ & test__) \ + break; \ new__ = old__ & ~clear__; \ - } while (!(old__ & test__) && \ - cmpxchg(ptr, old__, new__) != old__); \ + } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) -- cgit v1.2.3 From 4acb83417cadfdcbe64215f9d0ddcf3132af808e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Sep 2022 11:40:22 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220909184022.1709476-1-kbusch@fb.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. */ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From 320fb0f91e55ba248d4bad106b408e59099cfa89 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:37 +0800 Subject: blk-throttle: fix that io throttle can only work for single bio Test scripts:

	cd /sys/fs/cgroup/blkio/
	echo "8:0 1024" > blkio.throttle.write_bps_device
	echo $$ > cgroup.procs
	dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &
	dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &

Test result:

	10240 bytes (10 kB, 10 KiB) copied, 10.0134 s, 1.0 kB/s
	10240 bytes (10 kB, 10 KiB) copied, 10.0135 s, 1.0 kB/s

The problem is that the second bio is finished after 10s instead of 20s. Root cause: 1) second bio will be flagged:

__blk_throtl_bio
 while (true) {
  ...
  if (sq->nr_queued[rw]) -> some bio is throttled already
   break
 };
 bio_set_flag(bio, BIO_THROTTLED); -> flag the bio

2) flagged bio will be dispatched without waiting:

throtl_dispatch_tg
 tg_may_dispatch
  tg_with_in_bps_limit
   if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED))
    *wait = 0; -> wait time is zero
    return true;

Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") added support for counting split bios toward the iops limit, and thus it added a flagged-bio check in tg_with_in_bps_limit() so that split bios only count once toward the bps limit. However, it introduced a new problem: io throttle won't work if multiple bios are throttled. In order to fix the problem, handle the iops and bps limits in different ways: 1) for the iops limit, there is no flag to record whether the bio was throttled, and the iops limit is always applied; 2) for the bps limit, the original bio is flagged with BIO_BPS_THROTTLED, and io throttle ignores bios with the flag. Note that this patch also removes the code that sets the flag in __bio_clone(); it was introduced in commit 111be8839817 ("block-throttle: avoid double charge"), whose author thought a split bio could be resubmitted and throttled again, which is wrong because a split bio continues to be dispatched from the caller. Fixes: 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") Cc: Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blk_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ca22b06700a9..2c5806997bbf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -509,7 +509,7 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) - bio_clear_flag(bio, BIO_THROTTLED); + bio_clear_flag(bio, BIO_BPS_THROTTLED); bio->bi_bdev = bdev; bio_associate_blkg(bio); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ef99790f6ed..41afb4cfb9b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -325,7 +325,7 @@ enum { BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ - BIO_THROTTLED, /* This bio has already been subjected to + BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ -- cgit v1.2.3 From fd72cb1bb5c71d2738a17cbdee6280365a451706 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 8 Sep 2022 00:50:59 +0300 Subject: mei: add kdoc for struct mei_aux_device struct mei_aux_device is an interface structure and requires proper documentation.
Signed-off-by: Tomas Winkler Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patchwork.freedesktop.org/patch/msgid/20220907215113.1596567-3-tomas.winkler@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Joonas Lahtinen --- include/linux/mei_aux.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_aux.h b/include/linux/mei_aux.h index 587f25128848..a0cb587006d5 100644 --- a/include/linux/mei_aux.h +++ b/include/linux/mei_aux.h @@ -7,6 +7,12 @@ #include +/** + * struct mei_aux_device - mei auxiliary device + * @aux_dev: - auxiliary device object + * @irq: interrupt driving the mei auxiliary device + * @bar: mmio resource bar reserved to mei auxiliary device + */ struct mei_aux_device { struct auxiliary_device aux_dev; int irq; -- cgit v1.2.3 From ed57967ab64f1dcdb9efb78a3f4a950dfdccf544 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 8 Sep 2022 00:51:00 +0300 Subject: mei: add slow_firmware flag to the mei auxiliary device Add slow_firmware flag to the mei auxiliary device info to inform the mei driver about slow underlying firmware. Such firmware will require to use larger operation timeouts. Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patchwork.freedesktop.org/patch/msgid/20220907215113.1596567-4-tomas.winkler@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Joonas Lahtinen --- include/linux/mei_aux.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_aux.h b/include/linux/mei_aux.h index a0cb587006d5..4894d8bf4159 100644 --- a/include/linux/mei_aux.h +++ b/include/linux/mei_aux.h @@ -12,11 +12,14 @@ * @aux_dev: - auxiliary device object * @irq: interrupt driving the mei auxiliary device * @bar: mmio resource bar reserved to mei auxiliary device + * @slow_firmware: The device has slow underlying firmware. + * Such firmware will require to use larger operation timeouts. */ struct mei_aux_device { struct auxiliary_device aux_dev; int irq; struct resource bar; + bool slow_firmware; }; #define auxiliary_dev_to_mei_aux_dev(auxiliary_dev) \ -- cgit v1.2.3 From 342e4c7e2d38e421f99898b629b5d82e5ae90323 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 8 Sep 2022 00:51:08 +0300 Subject: mei: gsc: setup gsc extended operational memory 1. Retrieve extended operational memory physical pointers from the auxiliary device info. 2. Setup memory registers. 3. Notify firmware that the memory is ready by sending the memory ready command. 4. Disable PXP device if GSC is not in PXP mode. 
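For illustration, a consumer driver bound via the auxiliary bus might pick up these resources roughly as follows (a sketch only; the probe function, handler and setup_ext_op_mem() are hypothetical, and error handling is elided):

	static int foo_gsc_probe(struct auxiliary_device *aux_dev,
				 const struct auxiliary_device_id *aux_dev_id)
	{
		struct mei_aux_device *adev = auxiliary_dev_to_mei_aux_dev(aux_dev);
		void __iomem *regs;

		/* MMIO registers and interrupt reserved for the mei device */
		regs = ioremap(adev->bar.start, resource_size(&adev->bar));
		if (!regs)
			return -ENOMEM;

		/* extended operational memory, when running in graphics PXP mode */
		if (adev->ext_op_mem.start)
			setup_ext_op_mem(adev->ext_op_mem.start,	/* hypothetical */
					 resource_size(&adev->ext_op_mem));

		return request_irq(adev->irq, foo_gsc_irq_handler,	/* hypothetical */
				   IRQF_SHARED, "foo-gsc", adev);
	}
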
CC: Daniele Ceraolo Spurio Signed-off-by: Tomas Winkler Signed-off-by: Alexander Usyskin Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patchwork.freedesktop.org/patch/msgid/20220907215113.1596567-12-tomas.winkler@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Joonas Lahtinen --- include/linux/mei_aux.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_aux.h b/include/linux/mei_aux.h index 4894d8bf4159..506912ad363b 100644 --- a/include/linux/mei_aux.h +++ b/include/linux/mei_aux.h @@ -12,6 +12,8 @@ * @aux_dev: - auxiliary device object * @irq: interrupt driving the mei auxiliary device * @bar: mmio resource bar reserved to mei auxiliary device + * @ext_op_mem: resource for extend operational memory + * used in graphics PXP mode. * @slow_firmware: The device has slow underlying firmware. * Such firmware will require to use larger operation timeouts. */ @@ -19,6 +21,7 @@ struct mei_aux_device { struct auxiliary_device aux_dev; int irq; struct resource bar; + struct resource ext_op_mem; bool slow_firmware; }; -- cgit v1.2.3 From a47126ec29f538e1197862919f94d3b6668144a4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:24:57 -0500 Subject: PCI/PTM: Cache PTM Capability offset Cache the PTM Capability offset instead of searching for it every time we enable/disable PTM or save/restore PTM state. No functional change intended. Link: https://lore.kernel.org/r/20220909202505.314195-2-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Mika Westerberg --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd..54be939023a3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -475,6 +475,7 @@ struct pci_dev { unsigned int broken_cmd_compl:1; /* No compl for some cmds */ #endif #ifdef CONFIG_PCIE_PTM + u16 ptm_cap; /* PTM Capability */ unsigned int ptm_root:1; unsigned int ptm_enabled:1; u8 ptm_granularity; -- cgit v1.2.3 From e8bdc5ea481638e0a4fd5639050d2b170417f493 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Sep 2022 15:25:00 -0500 Subject: PCI/PTM: Add pci_suspend_ptm() and pci_resume_ptm() We disable PTM during suspend because that allows some Root Ports to enter lower-power PM states, which means we also need to disable PTM for all downstream devices. Add pci_suspend_ptm() and pci_resume_ptm() for this purpose. pci_enable_ptm() and pci_disable_ptm() are for drivers to use to enable or disable PTM. They use dev->ptm_enabled to keep track of whether PTM should be enabled. pci_suspend_ptm() and pci_resume_ptm() are PCI core-internal functions to temporarily disable PTM during suspend and (depending on dev->ptm_enabled) re-enable PTM during resume. Enable/disable/suspend/resume all use internal __pci_enable_ptm() and __pci_disable_ptm() functions that only update the PTM Control register.
Outline:

	pci_enable_ptm(struct pci_dev *dev)
	{
		__pci_enable_ptm(dev);
		dev->ptm_enabled = 1;
		pci_ptm_info(dev);
	}

	pci_disable_ptm(struct pci_dev *dev)
	{
		if (dev->ptm_enabled) {
			__pci_disable_ptm(dev);
			dev->ptm_enabled = 0;
		}
	}

	pci_suspend_ptm(struct pci_dev *dev)
	{
		if (dev->ptm_enabled)
			__pci_disable_ptm(dev);
	}

	pci_resume_ptm(struct pci_dev *dev)
	{
		if (dev->ptm_enabled)
			__pci_enable_ptm(dev);
	}

Nothing currently calls pci_resume_ptm(); the suspend path saves the PTM state before disabling PTM, so the PTM state restore in the resume path implicitly re-enables it. A future change will use pci_resume_ptm() to fix some problems with this approach. Link: https://lore.kernel.org/r/20220909202505.314195-5-helgaas@kernel.org Tested-by: Rajvi Jingar Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 54be939023a3..cb5f796e3319 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1678,10 +1678,12 @@ bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +void pci_disable_ptm(struct pci_dev *dev); bool pcie_ptm_enabled(struct pci_dev *dev); #else static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { return -EINVAL; } +static inline void pci_disable_ptm(struct pci_dev *dev) { } static inline bool pcie_ptm_enabled(struct pci_dev *dev) { return false; } #endif -- cgit v1.2.3 From 4b9d1bc7911c9d9159c4881455c584cde99fbb19 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Tue, 6 Sep 2022 13:49:01 +0800 Subject: Input: hp_sdc: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Helge Deller --- include/linux/hp_sdc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hp_sdc.h b/include/linux/hp_sdc.h index 6f1dee7e67e0..9be8704e2d38 100644 --- a/include/linux/hp_sdc.h +++ b/include/linux/hp_sdc.h @@ -180,7 +180,7 @@ switch (val) { \ #define HP_SDC_CMD_SET_IM 0x40 /* 010xxxxx == set irq mask */ -/* The documents provided do not explicitly state that all registers betweem +/* The documents provided do not explicitly state that all registers between * 0x01 and 0x1f inclusive can be read by sending their register index as a * command, but this is implied and appears to be the case. */ -- cgit v1.2.3 From 6f7a71f804287a7566314ab1a73d8ca2c18ca0d7 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 13 Sep 2022 14:34:54 +0200 Subject: regulator: Add driver for MT6331 PMIC regulators Add a driver for the regulators found in the MT6331 PMIC. This PMIC features six buck and 21 Low DropOut (LDO) regulators. Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220913123456.384513-3-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6331-regulator.h | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/regulator/mt6331-regulator.h (limited to 'include/linux') diff --git a/include/linux/regulator/mt6331-regulator.h b/include/linux/regulator/mt6331-regulator.h new file mode 100644 index 000000000000..2801a9879c14 --- /dev/null +++ b/include/linux/regulator/mt6331-regulator.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Collabora Ltd.
+ * Author: AngeloGioacchino Del Regno + */ + +#ifndef __LINUX_REGULATOR_MT6331_H +#define __LINUX_REGULATOR_MT6331_H + +enum { + /* BUCK */ + MT6331_ID_VDVFS11 = 0, + MT6331_ID_VDVFS12, + MT6331_ID_VDVFS13, + MT6331_ID_VDVFS14, + MT6331_ID_VCORE2, + MT6331_ID_VIO18, + /* LDO */ + MT6331_ID_VTCXO1, + MT6331_ID_VTCXO2, + MT6331_ID_AVDD32_AUD, + MT6331_ID_VAUXA32, + MT6331_ID_VCAMA, + MT6331_ID_VIO28, + MT6331_ID_VCAM_AF, + MT6331_ID_VMC, + MT6331_ID_VMCH, + MT6331_ID_VEMC33, + MT6331_ID_VGP1, + MT6331_ID_VSIM1, + MT6331_ID_VSIM2, + MT6331_ID_VMIPI, + MT6331_ID_VIBR, + MT6331_ID_VGP4, + MT6331_ID_VCAMD, + MT6331_ID_VUSB10, + MT6331_ID_VCAM_IO, + MT6331_ID_VSRAM_DVFS1, + MT6331_ID_VGP2, + MT6331_ID_VGP3, + MT6331_ID_VRTC, + MT6331_ID_VDIG18, + MT6331_ID_VREG_MAX +}; + +#endif /* __LINUX_REGULATOR_MT6331_H */ -- cgit v1.2.3 From 1cc5a52e873a4f9725eafe5aa9cd213b7b58e29e Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 13 Sep 2022 14:34:56 +0200 Subject: regulator: Add driver for MT6332 PMIC regulators Add a driver for the regulators found in the MT6332 PMICs, including six buck and four LDO regulators. Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220913123456.384513-5-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6332-regulator.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/regulator/mt6332-regulator.h (limited to 'include/linux') diff --git a/include/linux/regulator/mt6332-regulator.h b/include/linux/regulator/mt6332-regulator.h new file mode 100644 index 000000000000..af5e3ed31029 --- /dev/null +++ b/include/linux/regulator/mt6332-regulator.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Collabora Ltd. + * Author: AngeloGioacchino Del Regno + */ + +#ifndef __LINUX_REGULATOR_MT6332_H +#define __LINUX_REGULATOR_MT6332_H + +enum { + /* BUCK */ + MT6332_ID_VDRAM = 0, + MT6332_ID_VDVFS2, + MT6332_ID_VPA, + MT6332_ID_VRF1, + MT6332_ID_VRF2, + MT6332_ID_VSBST, + /* LDO */ + MT6332_ID_VAUXB32, + MT6332_ID_VBIF28, + MT6332_ID_VDIG18, + MT6332_ID_VSRAM_DVFS2, + MT6332_ID_VUSB33, + MT6332_ID_VREG_MAX +}; + +#endif /* __LINUX_REGULATOR_MT6332_H */ -- cgit v1.2.3 From 1dd611a9c55f6657328429b988e302323691c3dc Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 19 Aug 2022 23:26:17 +0200 Subject: mmc: core: Switch to basic workqueue API for sdio_irq_work The delay parameter isn't set by any user, therefore simplify the code and switch to the basic workqueue API w/o delay support. This also reduces the size of struct mmc_host. 
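The conversion follows the usual pattern for dropping an unused delay; a generic sketch (not the driver diff itself; sdio_irq_work_fn stands in for the actual work handler):

	/* Before: delayed work that was only ever queued with a 0 delay. */
	INIT_DELAYED_WORK(&host->sdio_irq_work, sdio_irq_work_fn);
	queue_delayed_work(system_wq, &host->sdio_irq_work, 0);

	/* After: a plain work item; struct work_struct also drops the
	 * embedded timer that struct delayed_work carries, shrinking
	 * struct mmc_host. */
	INIT_WORK(&host->sdio_irq_work, sdio_irq_work_fn);
	queue_work(system_wq, &host->sdio_irq_work);
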
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/13d8200a-e2a8-d907-38ce-a16fc5ce14aa@gmail.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index eb8bc5b9b0b7..8fdd3cf971a3 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -476,7 +476,7 @@ struct mmc_host { unsigned int sdio_irqs; struct task_struct *sdio_irq_thread; - struct delayed_work sdio_irq_work; + struct work_struct sdio_irq_work; bool sdio_irq_pending; atomic_t sdio_irq_thread_abort; -- cgit v1.2.3 From 96a7457a14d9cf98cf58de1e26c03180e0f28141 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:19 +0200 Subject: can: skb: unify skb CAN frame identification helpers Replace open coded checks for sk_buffs containing Classical CAN and CAN FD frame structures as a preparation for CAN XL support. With the added length check, the unintended processing of CAN XL frames (which have the CANXL_XLF bit set) can be suppressed even when skb->len matches a non-CAN-XL frame. The CAN_RAW socket needs a rework to use these helpers. Therefore the use of these helpers is postponed to the CAN_RAW CAN XL integration. The J1939 protocol gets a check for Classical CAN frames too. Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 182749e858b3..27ebfc28510f 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -97,10 +97,20 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) return nskb; } +static inline bool can_is_can_skb(const struct sk_buff *skb) +{ + struct can_frame *cf = (struct can_frame *)skb->data; + + /* the CAN specific type of skb is identified by its data length */ + return (skb->len == CAN_MTU && cf->len <= CAN_MAX_DLEN); +} + static inline bool can_is_canfd_skb(const struct sk_buff *skb) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + /* the CAN specific type of skb is identified by its data length */ - return skb->len == CANFD_MTU; + return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 467ef4c7b9d1c22ee64342804bf92549d765df14 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:20 +0200 Subject: can: skb: add skb CAN frame data length helpers Add two helpers to retrieve the data length from CAN sk_buffs and prepare the length information to be a uint16 value for the CAN XL support.
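A typical caller, e.g. a driver TX-done path updating statistics, might use the helper like this (a sketch; everything except the helper itself is made up):

	/* Sketch: account transmitted payload bytes, RTR-aware. */
	static void foo_can_tx_done(struct net_device *dev, struct sk_buff *skb)
	{
		dev->stats.tx_packets++;
		/* 0 for RTR frames, the cf->len / cfd->len value otherwise */
		dev->stats.tx_bytes += can_skb_get_data_len(skb);
	}
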
Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-3-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 27ebfc28510f..ddffc2fc008c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -20,7 +20,8 @@ void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, - u8 *len_ptr, unsigned int *frame_len_ptr); + unsigned int *len_ptr, + unsigned int *frame_len_ptr); unsigned int __must_check can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); @@ -113,4 +114,25 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } +/* get length element value from can[fd]_frame structure */ +static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) +{ + const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + + return cfd->len; +} + +/* get needed data length inside CAN frame for all frame types (RTR aware) */ +static inline unsigned int can_skb_get_data_len(struct sk_buff *skb) +{ + unsigned int len = can_skb_get_len_val(skb); + const struct can_frame *cf = (struct can_frame *)skb->data; + + /* RTR frames have an actual length of zero */ + if (can_is_can_skb(skb) && cf->can_id & CAN_RTR_FLAG) + return 0; + + return len; +} + #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From fb08cba12b52cba4366e858932307649dc5304e2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:23 +0200 Subject: can: canxl: update CAN infrastructure for CAN XL frames

- add new ETH_P_CANXL ethernet protocol type
- update skb checks for CAN XL
- add alloc_canxl_skb() which now needs a data length parameter
- introduce init_can_skb_reserve() to reduce code duplication

Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-6-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index ddffc2fc008c..1abc25a8d144 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -30,6 +30,9 @@ void can_flush_echo_skb(struct net_device *dev); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); +struct sk_buff *alloc_canxl_skb(struct net_device *dev, + struct canxl_frame **cxl, + unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); @@ -114,11 +117,29 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } -/* get length element value from can[fd]_frame structure */ +static inline bool can_is_canxl_skb(const struct sk_buff *skb) +{ + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; + + if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU)
return false; + + /* this also checks valid CAN XL data length boundaries */ + if (skb->len != CANXL_HDR_SIZE + cxl->len) + return false; + + return cxl->flags & CANXL_XLF; +} + +/* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + if (can_is_canxl_skb(skb)) + return cxl->len; + return cfd->len; } -- cgit v1.2.3 From ebf87fc728502244550eaf8819fc785e2014b2ad Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:24 +0200 Subject: can: dev: add CAN XL support to virtual CAN Make use of the new can_skb_get_data_len() helper. Add support for variable CANXL MTU using the new can_is_canxl_dev_mtu(). Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-7-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c3e50e537e39..58f5431a5559 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -147,6 +147,11 @@ static inline u32 can_get_static_ctrlmode(struct can_priv *priv) return priv->ctrlmode & ~priv->ctrlmode_supported; } +static inline bool can_is_canxl_dev_mtu(unsigned int mtu) +{ + return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU); +} + void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, -- cgit v1.2.3 From 85ebe0afd3f8377a9b506321314f4b8344caa8ec Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 18 Aug 2022 12:28:10 -0400 Subject: isa: Introduce the module_isa_driver_with_irq helper macro Several ISA drivers feature IRQ support that can be configured via an "irq" array module parameter. This array typically matches directly with the respective "base" array module parameter. To reduce code repetition, a module_isa_driver_with_irq helper macro is introduced to provide a check ensuring that the number of "irq" passed to the module matches the respective number of "base".
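Usage in a driver then collapses to a single line; a sketch with made-up names (foo_probe and FOO_MAX_DEV are hypothetical):

	static unsigned int foo_base[FOO_MAX_DEV];	/* "base" module parameter */
	static unsigned int foo_num;			/* entries given for "base" */
	static unsigned int foo_irq[FOO_MAX_DEV];	/* "irq" module parameter */
	static unsigned int foo_num_irq;		/* entries given for "irq" */

	static struct isa_driver foo_driver = {
		.probe	= foo_probe,
		.driver	= {
			.name = "foo",
		},
	};

	/* Registers init/exit and fails with -EINVAL on a base/irq count mismatch */
	module_isa_driver_with_irq(foo_driver, foo_num, foo_num_irq);
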
Signed-off-by: William Breathitt Gray Acked-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- include/linux/isa.h | 52 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/isa.h b/include/linux/isa.h index e30963190968..4fbbf5e36e08 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -38,6 +38,32 @@ static inline void isa_unregister_driver(struct isa_driver *d) } #endif +#define module_isa_driver_init(__isa_driver, __num_isa_dev) \ +static int __init __isa_driver##_init(void) \ +{ \ + return isa_register_driver(&(__isa_driver), __num_isa_dev); \ +} \ +module_init(__isa_driver##_init) + +#define module_isa_driver_with_irq_init(__isa_driver, __num_isa_dev, __num_irq) \ +static int __init __isa_driver##_init(void) \ +{ \ + if (__num_irq != __num_isa_dev) { \ + pr_err("%s: Number of irq (%u) does not match number of base (%u)\n", \ + __isa_driver.driver.name, __num_irq, __num_isa_dev); \ + return -EINVAL; \ + } \ + return isa_register_driver(&(__isa_driver), __num_isa_dev); \ +} \ +module_init(__isa_driver##_init) + +#define module_isa_driver_exit(__isa_driver) \ +static void __exit __isa_driver##_exit(void) \ +{ \ + isa_unregister_driver(&(__isa_driver)); \ +} \ +module_exit(__isa_driver##_exit) + /** * module_isa_driver() - Helper macro for registering a ISA driver * @__isa_driver: isa_driver struct @@ -48,16 +74,22 @@ static inline void isa_unregister_driver(struct isa_driver *d) * use this macro once, and calling it replaces module_init and module_exit. */ #define module_isa_driver(__isa_driver, __num_isa_dev) \ -static int __init __isa_driver##_init(void) \ -{ \ - return isa_register_driver(&(__isa_driver), __num_isa_dev); \ -} \ -module_init(__isa_driver##_init); \ -static void __exit __isa_driver##_exit(void) \ -{ \ - isa_unregister_driver(&(__isa_driver)); \ -} \ -module_exit(__isa_driver##_exit); +module_isa_driver_init(__isa_driver, __num_isa_dev); \ +module_isa_driver_exit(__isa_driver) + +/** + * module_isa_driver_with_irq() - Helper macro for registering an ISA driver with irq + * @__isa_driver: isa_driver struct + * @__num_isa_dev: number of devices to register + * @__num_irq: number of IRQ to register + * + * Helper macro for ISA drivers with irq that do not do anything special in + * module init/exit. Each module may only use this macro once, and calling it + * replaces module_init and module_exit. + */ +#define module_isa_driver_with_irq(__isa_driver, __num_isa_dev, __num_irq) \ +module_isa_driver_with_irq_init(__isa_driver, __num_isa_dev, __num_irq); \ +module_isa_driver_exit(__isa_driver) /** * max_num_isa_dev() - Maximum possible number registered of an ISA device -- cgit v1.2.3 From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 12 Sep 2022 08:45:44 -0700 Subject: bpf: Add verifier check for BPF_PTR_POISON retval and arg BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing referenced kptr in map") to denote a bpf_func_proto btf_id which the verifier will replace with a dynamically-determined btf_id at verification time. This patch adds verifier 'poison' functionality to BPF_PTR_POISON in order to prepare for expanded use of the value to poison ret- and arg-btf_id in ongoing work, namely rbtree and linked list patchsets [0, 1]. 
Specifically, when the verifier checks helper calls, it assumes that a BPF_PTR_POISON'ed ret type will be replaced with a valid type before - or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id. If a poisoned btf_id reaches the default handling block for either, consider this a verifier internal error and fail verification. Otherwise a helper w/ poisoned btf_id but no verifier logic replacing the type will cause a crash as the invalid pointer is dereferenced. Also move BPF_PTR_POISON to the existing include/linux/poison.h header and remove an unnecessary shift. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com [1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com Signed-off-by: Dave Marchevsky Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/poison.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a6b4e9..2d3249eb0e62 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,7 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + #endif -- cgit v1.2.3 From 2747b93ebbede2af2d7bb088b9ddae3193ceede8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 18:08:56 +0200 Subject: locking: Detect includes rwlock.h outside of spinlock.h From: Michael S. Tsirkin The check for __LINUX_SPINLOCK_H within rwlock.h (and other files) detects the direct include of the header file if it is at the very beginning of the include section. If it is listed later then chances are high that spinlock.h was already included (including rwlock.h) and the additional listing of rwlock.h will not cause any failure. On PREEMPT_RT this additional rwlock.h will lead to compile failures since it uses a different rwlock implementation. Add __LINUX_INSIDE_SPINLOCK_H to spinlock.h and check for this instead of __LINUX_SPINLOCK_H to detect wrong includes. This will help detect direct includes of rwlock.h with or without PREEMPT_RT enabled. [ bigeasy: add remaining __LINUX_SPINLOCK_H user and rewrite commit description. ] Signed-off-by: Michael S.
Tsirkin Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YweemHxJx7O8rjBx@linutronix.de --- include/linux/rwlock.h | 2 +- include/linux/spinlock.h | 2 ++ include/linux/spinlock_api_smp.h | 2 +- include/linux/spinlock_api_up.h | 2 +- include/linux/spinlock_rt.h | 2 +- include/linux/spinlock_up.h | 2 +- 6 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 8f416c5e929e..c0ef596f340b 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 5c0c5174155d..1341f7d62da4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H +#define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations @@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, void free_bucket_spinlocks(spinlock_t *locks); +#undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51fa0dab68c4..89eb6f4c659c 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index b8ba00ccccde..819aeba1c87e 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 835aedaf68ac..61c49b16f69a 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H #error Do not include directly. Use spinlock.h #endif diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 16521074b6f7..c87204247592 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif -- cgit v1.2.3 From f24a0b1c22c2e90abb4ee1f7a3b0f0d8fc2ede5f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:09 +0200 Subject: clk: Mention that .recalc_rate can return 0 on error Multiple platforms (amlogic, imx8) return 0 when the clock rate cannot be determined properly by the recalc_rate hook. Mention in the documentation that the framework is ok with that. 
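A provider following this convention might look like the sketch below (the register layout, wrapper struct and FOO_DIV_MASK are invented):

	static unsigned long foo_clk_recalc_rate(struct clk_hw *hw,
						 unsigned long parent_rate)
	{
		struct foo_clk *foo = to_foo_clk(hw);	/* hypothetical wrapper */
		u32 div = readl(foo->reg) & FOO_DIV_MASK;

		/* The rate cannot be determined from the hardware: return 0. */
		if (!div)
			return 0;

		return parent_rate / div;
	}
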
Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-5-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..9a14cfa0d201 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -118,8 +118,9 @@ struct clk_duty { * * @recalc_rate Recalculate the rate of this clock, by querying hardware. The * parent rate is an input parameter. It is up to the caller to - * ensure that the prepare_mutex is held across this call. - * Returns the calculated rate. Optional, but recommended - if + * ensure that the prepare_mutex is held across this call. If the + * driver cannot figure out a rate for this clock, it must return + * 0. Returns the calculated rate. Optional, but recommended - if * this op is not set then clock rate will be initialized to 0. * * @round_rate: Given a target rate as input, returns the closest rate actually -- cgit v1.2.3 From c35e84b0977617f96fb1dbef3fb8d71a861325c0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:21 +0200 Subject: clk: Introduce clk_hw_init_rate_request() clk-divider instantiates clk_rate_request internally for its round_rate implementations to share the code with its determine_rate implementations. However, it's missing a few fields (min_rate, max_rate) that would be initialized properly if it were using clk_core_init_rate_req(). Let's create the clk_hw_init_rate_request() function for clock providers to be able to share the code that instantiates clk_rate_requests with the framework. This will also be useful for some tests introduced in later patches. Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-17-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 9a14cfa0d201..d857717aa386 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -42,6 +42,8 @@ struct dentry; * struct clk_rate_request - Structure encoding the clk constraints that * a clock user might require. * + * Should be initialized by calling clk_hw_init_rate_request(). + * * @rate: Requested clock rate. This field will be adjusted by * clock drivers according to hardware capabilities. * @min_rate: Minimum rate imposed by clk users. @@ -60,6 +62,10 @@ struct clk_rate_request { struct clk_hw *best_parent_hw; }; +void clk_hw_init_rate_request(const struct clk_hw *hw, + struct clk_rate_request *req, + unsigned long rate); + /** * struct clk_duty - Struture encoding the duty cycle ratio of a clock * -- cgit v1.2.3 From 22fb0e284fbc3c1b85d24c5a1df8ea3ac82ab1d1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:25 +0200 Subject: clk: Constify clk_has_parent() clk_has_parent() doesn't modify the clocks being passed, so let's make it const.
Suggested-by: Stephen Boyd Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-21-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index c13061cabdfc..1ef013324237 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -799,7 +799,7 @@ int clk_set_rate_exclusive(struct clk *clk, unsigned long rate); * * Returns true if @parent is a possible parent for @clk, false otherwise. */ -bool clk_has_parent(struct clk *clk, struct clk *parent); +bool clk_has_parent(const struct clk *clk, const struct clk *parent); /** * clk_set_rate_range - set a rate range for a clock source -- cgit v1.2.3 From 262ca38f4b6eb418b20b8e1d6d8495c6a98727c1 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:26 +0200 Subject: clk: Stop forwarding clk_rate_requests to the parent If the clock cannot modify its rate and has CLK_SET_RATE_PARENT, clk_mux_determine_rate_flags(), clk_core_round_rate_nolock() and a number of drivers will forward the clk_rate_request to the parent clock. clk_core_round_rate_nolock() will pass the pointer directly, which means that we pass a clk_rate_request to the parent that has the rate, min_rate and max_rate of the child, and the best_parent_rate and best_parent_hw fields will be relative to the child as well, so will point to our current clock and its rate. The most common case for CLK_SET_RATE_PARENT is that the child and parent clock rates will be equal, so the rate field isn't a worry, but the other fields are. Similarly, if the parent clock driver ever modifies the best_parent_rate or best_parent_hw, this will be applied to the child once the call to clk_core_round_rate_nolock() is done. best_parent_hw is probably not going to be a valid parent, and best_parent_rate might lead to a parent rate change different to the one that was initially computed. clk_mux_determine_rate_flags() and the affected drivers will copy the request before forwarding it to the parents, so they won't be affected by the latter issue, but the former is still going to be there and will lead to erroneous data and context being passed to the various clock drivers in the same sub-tree. Let's create two new functions, clk_core_forward_rate_req() and clk_hw_forward_rate_request() for the framework and the clock providers that will copy a request from a child clock and update the context to match the parent's. We also update the relevant call sites in the framework and drivers to use that new function. Let's also add a test to make sure we avoid regressions there. 
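As a rough provider-side sketch of the new forwarding helper (assuming a 1:1 pass-through clock with CLK_SET_RATE_PARENT; foo_clk_determine_rate is hypothetical, and the helper signature is the one added in the hunk below):

#include <linux/clk-provider.h>

static int foo_clk_determine_rate(struct clk_hw *hw,
				  struct clk_rate_request *req)
{
	struct clk_hw *parent = clk_hw_get_parent(hw);
	struct clk_rate_request parent_req;
	int ret;

	if (!parent)
		return -EINVAL;

	/* Copy the request, rewriting its context to match the parent. */
	clk_hw_forward_rate_request(hw, req, parent, &parent_req, req->rate);

	ret = __clk_determine_rate(parent, &parent_req);
	if (ret)
		return ret;

	/* Only write back the fields the child actually owns. */
	req->best_parent_rate = parent_req.rate;
	req->rate = parent_req.rate;

	return 0;
}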
Tested-by: Alexander Stein # imx8mp Tested-by: Marek Szyprowski # exynos4210, meson g12b Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-22-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d857717aa386..8bce6c524f29 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -65,6 +65,11 @@ struct clk_rate_request { void clk_hw_init_rate_request(const struct clk_hw *hw, struct clk_rate_request *req, unsigned long rate); +void clk_hw_forward_rate_request(const struct clk_hw *core, + const struct clk_rate_request *old_req, + const struct clk_hw *parent, + struct clk_rate_request *req, + unsigned long parent_rate); /** * struct clk_duty - Struture encoding the duty cycle ratio of a clock -- cgit v1.2.3 From 253993253466ba7187730b196174146d5247e97b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 16 Aug 2022 13:25:28 +0200 Subject: clk: Introduce the clk_hw_get_rate_range function Some clock providers are hand-crafting their clk_rate_request, and need to figure out the current boundaries of their clk_hw to fill it properly. Let's create such a function for clock providers. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20220816112530.1837489-24-maxime@cerno.tech Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 8bce6c524f29..8724a3547a79 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1267,6 +1267,8 @@ int clk_mux_determine_rate_flags(struct clk_hw *hw, struct clk_rate_request *req, unsigned long flags); void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent); +void clk_hw_get_rate_range(struct clk_hw *hw, unsigned long *min_rate, + unsigned long *max_rate); void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate, unsigned long max_rate); -- cgit v1.2.3 From b404cb45990bf24d41c29fe856aafb0746a7b81f Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Wed, 14 Sep 2022 21:21:00 +0800 Subject: soc: mediatek: Add mmsys func to adapt to dpi output for MT8186 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add mmsys func to manipulate dpi output format config for MT8186. Co-developed-by: Jitao Shi Signed-off-by: Jitao Shi Signed-off-by: Xinlei Lee Reviewed-by: Nís F. R. A. 
Prado Link: https://lore.kernel.org/all/1663161662-1598-2-git-send-email-xinlei.lee@mediatek.com/ Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mmsys.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 59117d970daf..d2b02bb43768 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -65,4 +65,6 @@ void mtk_mmsys_ddp_disconnect(struct device *dev, enum mtk_ddp_comp_id cur, enum mtk_ddp_comp_id next); +void mtk_mmsys_ddp_dpi_fmt_config(struct device *dev, u32 val); + #endif /* __MTK_MMSYS_H */ -- cgit v1.2.3 From 7187440dd7c45fd8b6dd4f3ff56b03ca4aa1bbd2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Sep 2022 15:20:23 +0300 Subject: iov_iter: use "maxpages" parameter This was intended to be "maxpages" instead of INT_MAX. There is only one caller and it passes INT_MAX so this does not affect runtime. Fixes: b93235e68921 ("tls: cap the output scatter list to something reasonable") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 5896af36199c..2e3134b14ffd 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -298,7 +298,7 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } - npages = iov_iter_npages(i, INT_MAX); + npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); -- cgit v1.2.3 From 82f00b24f532557fb0e15a6a2747859e4b70c4bd Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 9 Sep 2022 17:46:55 +0800 Subject: crypto: hisilicon/qm - get hardware features from hardware registers Before hardware V3, the hardware does not provide feature registers, so the driver resolves hardware differences based on the hardware version. As a result, the driver does not support new hardware. Hardware V3 and later versions support obtaining hardware features, such as power-gating management and doorbell isolation, through hardware registers. To be compatible with later hardware versions, the features of the current device are obtained by reading the hardware registers instead of relying on the hardware version.
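A sketch of how a driver could consume this interface (everything here is illustrative: the foo_* names and the table's offsets, shifts, masks and per-version defaults are made up, and using cap_ver as the is_read argument is an assumption, not a documented convention):

#include <linux/bitops.h>
#include <linux/hisi_acc_qm.h>
#include <linux/kernel.h>
#include <linux/pci.h>

/* Hypothetical capability table; the values do not describe real hardware. */
static const struct hisi_qm_cap_info foo_qm_cap_info[] = {
	{ QM_SUPPORT_DB_ISOLATION, 0x30, 0, 0x1, 0x0, 0x0, 0x1 },
	{ QM_SUPPORT_STOP_QP, 0x30, 1, 0x1, 0x0, 0x0, 0x1 },
};

static void foo_qm_get_hw_caps(struct hisi_qm *qm)
{
	u32 i, val;

	for (i = 0; i < ARRAY_SIZE(foo_qm_cap_info); i++) {
		/* Read the register where supported, else use the vN default. */
		val = hisi_qm_get_hw_info(qm, foo_qm_cap_info, i, qm->cap_ver);
		if (val)
			set_bit(foo_qm_cap_info[i].type, &qm->caps);
	}

	/* Feature checks then become simple bit tests. */
	if (test_bit(QM_SUPPORT_DB_ISOLATION, &qm->caps))
		dev_info(&qm->pdev->dev, "doorbell isolation supported\n");
}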
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 116e8bd68c99..851c962ba473 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -168,6 +168,15 @@ enum qm_vf_state { QM_NOT_READY, }; +enum qm_cap_bits { + QM_SUPPORT_DB_ISOLATION = 0x0, + QM_SUPPORT_FUNC_QOS, + QM_SUPPORT_STOP_QP, + QM_SUPPORT_MB_COMMAND, + QM_SUPPORT_SVA_PREFETCH, + QM_SUPPORT_RPM, +}; + struct dfx_diff_registers { u32 *regs; u32 reg_offset; @@ -258,6 +267,18 @@ struct hisi_qm_err_ini { void (*err_info_init)(struct hisi_qm *qm); }; +struct hisi_qm_cap_info { + u32 type; + /* Register offset */ + u32 offset; + /* Bit offset in register */ + u32 shift; + u32 mask; + u32 v1_val; + u32 v2_val; + u32 v3_val; +}; + struct hisi_qm_list { struct mutex lock; struct list_head list; @@ -278,6 +299,9 @@ struct hisi_qm { struct pci_dev *pdev; void __iomem *io_base; void __iomem *db_io_base; + + /* Capbility version, 0: not supports */ + u32 cap_ver; u32 sqe_size; u32 qp_base; u32 qp_num; @@ -304,6 +328,8 @@ struct hisi_qm { struct hisi_qm_err_info err_info; struct hisi_qm_err_status err_status; unsigned long misc_ctl; /* driver removing and reset sched */ + /* Device capability bit */ + unsigned long caps; struct rw_semaphore qps_lock; struct idr qp_idr; @@ -326,8 +352,6 @@ struct hisi_qm { bool use_sva; bool is_frozen; - /* doorbell isolation enable */ - bool use_db_isolation; resource_size_t phys_base; resource_size_t db_phys_base; struct uacce_device *uacce; @@ -501,6 +525,9 @@ void hisi_qm_pm_init(struct hisi_qm *qm); int hisi_qm_get_dfx_access(struct hisi_qm *qm); void hisi_qm_put_dfx_access(struct hisi_qm *qm); void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset); +u32 hisi_qm_get_hw_info(struct hisi_qm *qm, + const struct hisi_qm_cap_info *info_table, + u32 index, bool is_read); /* Used by VFIO ACC live migration driver */ struct pci_driver *hisi_sec_get_pf_driver(void); -- cgit v1.2.3 From 129a9f340172b4f3857260a7a7bb9d7b3496ba50 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 9 Sep 2022 17:46:56 +0800 Subject: crypto: hisilicon/qm - get qp num and depth from hardware registers Hardware V3 and later versions can obtain the qp num and depth supported by the hardware from registers. To be compatible with later hardware versions, get the qp num and depth from registers instead of fixed macros.
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 851c962ba473..371c0e7ffd27 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -109,7 +109,6 @@ QM_MAILBOX_TIMEOUT | QM_FLR_TIMEOUT) #define QM_BASE_CE QM_ECC_1BIT -#define QM_Q_DEPTH 1024 #define QM_MIN_QNUM 2 #define HISI_ACC_SGL_SGE_NR_MAX 255 #define QM_SHAPER_CFG 0x100164 @@ -310,6 +309,8 @@ struct hisi_qm { u32 max_qp_num; u32 vfs_num; u32 db_interval; + u16 eq_depth; + u16 aeq_depth; struct list_head list; struct hisi_qm_list *qm_list; @@ -375,6 +376,8 @@ struct hisi_qp_ops { struct hisi_qp { u32 qp_id; + u16 sq_depth; + u16 cq_depth; u8 alg_type; u8 req_type; -- cgit v1.2.3 From d90fab0deb8e580aa001f6876e4436c21e944f27 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 9 Sep 2022 17:46:58 +0800 Subject: crypto: hisilicon/qm - get error type from hardware registers Hardware V3 and later versions support getting the error type from registers. To be compatible with later hardware versions, get the error type from registers instead of fixed macros. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 371c0e7ffd27..e230c7c46110 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -87,28 +87,6 @@ #define PEH_AXUSER_CFG 0x401001 #define PEH_AXUSER_CFG_ENABLE 0xffffffff -#define QM_AXI_RRESP BIT(0) -#define QM_AXI_BRESP BIT(1) -#define QM_ECC_MBIT BIT(2) -#define QM_ECC_1BIT BIT(3) -#define QM_ACC_GET_TASK_TIMEOUT BIT(4) -#define QM_ACC_DO_TASK_TIMEOUT BIT(5) -#define QM_ACC_WB_NOT_READY_TIMEOUT BIT(6) -#define QM_SQ_CQ_VF_INVALID BIT(7) -#define QM_CQ_VF_INVALID BIT(8) -#define QM_SQ_VF_INVALID BIT(9) -#define QM_DB_TIMEOUT BIT(10) -#define QM_OF_FIFO_OF BIT(11) -#define QM_DB_RANDOM_INVALID BIT(12) -#define QM_MAILBOX_TIMEOUT BIT(13) -#define QM_FLR_TIMEOUT BIT(14) -#define QM_BASE_NFE (QM_AXI_RRESP | QM_AXI_BRESP | QM_ECC_MBIT | \ - QM_ACC_GET_TASK_TIMEOUT | QM_DB_TIMEOUT | \ - QM_OF_FIFO_OF | QM_DB_RANDOM_INVALID | \ - QM_MAILBOX_TIMEOUT | QM_FLR_TIMEOUT) -#define QM_BASE_CE QM_ECC_1BIT - #define QM_MIN_QNUM 2 #define HISI_ACC_SGL_SGE_NR_MAX 255 #define QM_SHAPER_CFG 0x100164 @@ -240,7 +218,10 @@ struct hisi_qm_err_info { char *acpi_rst; u32 msi_wr_port; u32 ecc_2bits_mask; - u32 dev_ce_mask; + u32 qm_shutdown_mask; + u32 dev_shutdown_mask; + u32 qm_reset_mask; + u32 dev_reset_mask; u32 ce; u32 nfe; u32 fe; -- cgit v1.2.3 From 882597aff2d442a52ca173c293939232c2e1ebea Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 14 Sep 2022 07:14:24 -0700 Subject: Input: auo-pixcir-ts - drop support for platform data Currently there are no users of auo_pixcir_ts_platdata in the mainline, and having it (with legacy gpio numbers) prevents us from converting the driver to the gpiod API, so let's drop it. If, in the future, someone wants to use this driver on a non-device-tree, non-ACPI system, they should use static device properties instead of platform data.
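As a rough illustration of the static-device-properties route suggested above, board code could describe the device via a software node instead of platform data. This is a sketch only: the I2C address, node contents and use of the generic touchscreen property names are assumptions, as is reusing the "auo_pixcir_ts" device name.

#include <linux/i2c.h>
#include <linux/property.h>

static const struct property_entry foo_ts_props[] = {
	PROPERTY_ENTRY_U32("touchscreen-size-x", 1024),
	PROPERTY_ENTRY_U32("touchscreen-size-y", 600),
	{ }
};

static const struct software_node foo_ts_node = {
	.properties = foo_ts_props,
};

static const struct i2c_board_info foo_ts_info = {
	I2C_BOARD_INFO("auo_pixcir_ts", 0x5c),
	.swnode = &foo_ts_node,
};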
Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220914141428.2201784-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/input/auo-pixcir-ts.h | 44 ------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 include/linux/input/auo-pixcir-ts.h (limited to 'include/linux') diff --git a/include/linux/input/auo-pixcir-ts.h b/include/linux/input/auo-pixcir-ts.h deleted file mode 100644 index ed0776997a7a..000000000000 --- a/include/linux/input/auo-pixcir-ts.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Driver for AUO in-cell touchscreens - * - * Copyright (c) 2011 Heiko Stuebner - * - * based on auo_touch.h from Dell Streak kernel - * - * Copyright (c) 2008 QUALCOMM Incorporated. - * Copyright (c) 2008 QUALCOMM USA, INC. - */ - -#ifndef __AUO_PIXCIR_TS_H__ -#define __AUO_PIXCIR_TS_H__ - -/* - * Interrupt modes: - * periodical: interrupt is asserted periodicaly - * compare coordinates: interrupt is asserted when coordinates change - * indicate touch: interrupt is asserted during touch - */ -#define AUO_PIXCIR_INT_PERIODICAL 0x00 -#define AUO_PIXCIR_INT_COMP_COORD 0x01 -#define AUO_PIXCIR_INT_TOUCH_IND 0x02 - -/* - * @gpio_int interrupt gpio - * @int_setting one of AUO_PIXCIR_INT_* - * @init_hw hardwarespecific init - * @exit_hw hardwarespecific shutdown - * @x_max x-resolution - * @y_max y-resolution - */ -struct auo_pixcir_ts_platdata { - int gpio_int; - int gpio_rst; - - int int_setting; - - unsigned int x_max; - unsigned int y_max; -}; - -#endif -- cgit v1.2.3 From f67f12ff57bcfcd7d64280f748787793217faeaf Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 9 Sep 2022 22:36:08 +0300 Subject: ata: libahci_platform: Introduce reset assertion/deassertion methods Currently the AHCI-platform library supports only asserting and deasserting reset signals and ignores platforms with self-deasserting reset lines. That makes platforms with a self-deasserting reset method prone to misbehaviour when resuming from a sleep state after the clocks have been fully disabled. For such cases the controller needs to be fully reset all over again after the reference clocks are enabled and stable, otherwise the controller state machine might be in an undetermined state. The best solution would be to auto-detect which reset method is supported by the particular platform and use it implicitly in the framework of the ahci_platform_enable_resources()/ahci_platform_disable_resources() methods. Alas, it can't be implemented because the AHCI-platform library already supports shared reset control lines. As [1] says, in such a case we have to use only one of the following methods: + reset_control_assert()/reset_control_deassert(); + reset_control_reset()/reset_control_rearm(). If the driver had exclusive control over the reset lines, we would have been able to manipulate the lines with few limitations and simply use a combination of the methods above to cover all the possible reset-control cases. Since the shared reset control has already been advertised and can't be changed without risking breakage of the platforms relying on it, we have no choice but to let the platform drivers determine which reset methods the platform reset system supports.
In order to implement both types of reset control support, we suggest introducing a new AHCI-platform flag: AHCI_PLATFORM_RST_TRIGGER, which, when passed to the ahci_platform_get_resources() method together with the AHCI_PLATFORM_GET_RESETS flag, will indicate that the reset lines are self-deasserting, and thus reset_control_reset()/reset_control_rearm() will be used to control the reset state. Otherwise the reset_control_deassert()/reset_control_assert() methods will be utilized. [1] Documentation/driver-api/reset.rst Signed-off-by: Serge Semin Reviewed-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/ahci_platform.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 49e5383d4222..6d7dd472d370 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -23,6 +23,8 @@ int ahci_platform_enable_phys(struct ahci_host_priv *hpriv); void ahci_platform_disable_phys(struct ahci_host_priv *hpriv); int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); +int ahci_platform_deassert_rsts(struct ahci_host_priv *hpriv); +int ahci_platform_assert_rsts(struct ahci_host_priv *hpriv); int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv); void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); @@ -41,6 +43,7 @@ int ahci_platform_resume_host(struct device *dev); int ahci_platform_suspend(struct device *dev); int ahci_platform_resume(struct device *dev); -#define AHCI_PLATFORM_GET_RESETS 0x01 +#define AHCI_PLATFORM_GET_RESETS BIT(0) +#define AHCI_PLATFORM_RST_TRIGGER BIT(1) #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 6ce73f3a6fc0ad6af56ba4c14ab7a410876f8286 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 9 Sep 2022 22:36:16 +0300 Subject: ata: libahci_platform: Add function returning a clock-handle by id Since all the clocks are retrieved by the ahci_platform_get_resources() method, there is no need for the LLD (glue) drivers to look a particular one of them up in the kernel clocks table again. Instead, we suggest adding a simple method returning a device-specific clock with the passed connection ID if one can be found. Otherwise the function will return NULL. Thus the glue drivers won't need to either manually touch the hpriv->clks array or call the clk_get() friends. The AHCI platform drivers will be able to use the new function right after the ahci_platform_get_resources() method invocation and up to the device removal. Note the method is left unused here, but will be utilized in the framework of the DWC AHCI SATA driver being added in the next commit.
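A minimal sketch of the intended usage pattern (foo_ahci_probe and the "ref" connection ID are hypothetical):

#include <linux/ahci_platform.h>
#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int foo_ahci_probe(struct platform_device *pdev)
{
	struct ahci_host_priv *hpriv;
	struct clk *ref_clk;

	hpriv = ahci_platform_get_resources(pdev, AHCI_PLATFORM_GET_RESETS);
	if (IS_ERR(hpriv))
		return PTR_ERR(hpriv);

	/* Look one of the already retrieved clocks up by connection ID. */
	ref_clk = ahci_platform_find_clk(hpriv, "ref");
	if (!ref_clk)
		return -ENOENT;

	/* ... e.g. use clk_get_rate(ref_clk) to configure the controller ... */

	return 0;
}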
Signed-off-by: Serge Semin Signed-off-by: Damien Le Moal --- include/linux/ahci_platform.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 6d7dd472d370..17fa26215292 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -13,6 +13,7 @@ #include +struct clk; struct device; struct ata_port_info; struct ahci_host_priv; @@ -21,6 +22,8 @@ struct scsi_host_template; int ahci_platform_enable_phys(struct ahci_host_priv *hpriv); void ahci_platform_disable_phys(struct ahci_host_priv *hpriv); +struct clk *ahci_platform_find_clk(struct ahci_host_priv *hpriv, + const char *con_id); int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); int ahci_platform_deassert_rsts(struct ahci_host_priv *hpriv); -- cgit v1.2.3 From bfeb7e399bacae4ee46ad978f5fce3e47f0978d6 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Mon, 5 Sep 2022 12:01:49 +0300 Subject: bpf: Use bpf_capable() instead of CAP_SYS_ADMIN for blinding decision The full CAP_SYS_ADMIN requirement for blinding looks too strict nowadays. Given that unprivileged BPF is disabled by default these days, the main users of constant blinding come from unprivileged contexts, in particular via the cBPF -> eBPF migration (e.g. old-style socket filters). Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831090655.156434-1-ykaliuta@redhat.com Link: https://lore.kernel.org/bpf/20220905090149.61221-1-ykaliuta@redhat.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 527ae1d64e27..75335432fcbc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1099,7 +1099,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; -- cgit v1.2.3 From ceea991a019c57a1fb0edd12a5f836a0fa431aee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 3 Sep 2022 15:11:54 +0200 Subject: bpf: Move bpf_dispatcher function out of ftrace locations The dispatcher function is attached to/detached from the trampoline by the dispatcher update function. At the same time it's available as an ftrace-attachable function. After discussion [1] the proposed solution is to use compiler attributes to alter the bpf_dispatcher_##name##_func function: - remove it from being instrumented with the __no_instrument_function__ attribute, so ftrace does not track it - but still generate 5 nop instructions with the patchable_function_entry(5) attribute, which are expected by bpf_arch_text_poke as used by the dispatcher update function Enable the HAVE_DYNAMIC_FTRACE_NO_PATCHABLE option for x86, so __patchable_function_entries functions are not part of the ftrace/mcount locations. Add the attributes to the bpf_dispatcher_XXX function on x86_64 so it's kept out of the ftrace locations and has a 5-byte nop generated at entry. These attributes need to be arch-specific, as pointed out by Ilya Leoshkevich in [2]. The dispatcher image is generated only for the x86_64 arch, so the code can stay as is for other archs.
[1] https://lore.kernel.org/bpf/20220722110811.124515-1-jolsa@kernel.org/ [2] https://lore.kernel.org/bpf/969a14281a7791c334d476825863ee449964dd0c.camel@linux.ibm.com/ Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220903131154.420467-3-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54178b9e9c3a..e0dbe0c0a17e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -977,7 +977,14 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); }, \ } +#ifdef CONFIG_X86_64 +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#else +#define BPF_DISPATCHER_ATTRIBUTES +#endif + #define DEFINE_BPF_DISPATCHER(name) \ + notrace BPF_DISPATCHER_ATTRIBUTES \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ -- cgit v1.2.3 From c2f2e2c3aecdbabf822272a4b6e7d91537633cd9 Mon Sep 17 00:00:00 2001 From: ChiaEn Wu Date: Thu, 15 Sep 2022 17:47:32 +0800 Subject: lib: add linear range index macro Add the LINEAR_RANGE() and LINEAR_RANGE_IDX() macros to simplify declaring struct linear_range entries. Reviewed-by: Matti Vaittinen Signed-off-by: ChiaEn Wu Signed-off-by: Sebastian Reichel --- include/linux/linear_range.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/linear_range.h b/include/linux/linear_range.h index fd3d0b358f22..2e4f4c3539c0 100644 --- a/include/linux/linear_range.h +++ b/include/linux/linear_range.h @@ -26,6 +26,17 @@ struct linear_range { unsigned int step; }; +#define LINEAR_RANGE(_min, _min_sel, _max_sel, _step) \ + { \ + .min = _min, \ + .min_sel = _min_sel, \ + .max_sel = _max_sel, \ + .step = _step, \ + } + +#define LINEAR_RANGE_IDX(_idx, _min, _min_sel, _max_sel, _step) \ + [_idx] = LINEAR_RANGE(_min, _min_sel, _max_sel, _step) + unsigned int linear_range_values_in_range(const struct linear_range *r); unsigned int linear_range_values_in_range_array(const struct linear_range *r, int ranges); -- cgit v1.2.3 From c7007d9f19527b47992ff78a088e8697a9e9d5f5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 May 2022 00:55:49 +0200 Subject: efi/libstub: add some missing EFI prototypes Define the correct prototypes for the load_image, start_image and unload_image boot service pointers so we can call them from the EFI zboot code. Also add some prototypes related to the installation and uninstallation of protocols in the EFI protocol database, including some definitions related to device paths.
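To make the device path definitions below concrete, a loader could describe a blob embedded in its own image as a relative-offset node terminated by an end-of-path node, roughly as follows. This is a sketch only: the offsets are invented, and EFI_DEV_MEDIA and EFI_DEV_END_ENTIRE are assumed to be the type/sub-type constants already defined elsewhere in efi.h.

#include <linux/efi.h>

static const struct {
	struct efi_rel_offset_dev_path	node;
	struct efi_generic_dev_path	end;
} __packed foo_blob_devpath = {
	.node = {
		.header = {
			.type		= EFI_DEV_MEDIA,
			.sub_type	= EFI_DEV_MEDIA_REL_OFFSET,
			.length		= sizeof(struct efi_rel_offset_dev_path),
		},
		.starting_offset	= 0x1000,	/* hypothetical */
		.ending_offset		= 0x8fff,	/* hypothetical */
	},
	.end = {
		.type		= EFI_DEV_END_PATH,
		.sub_type	= EFI_DEV_END_ENTIRE,
		.length		= sizeof(struct efi_generic_dev_path),
	},
};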
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index d2b84c2fec39..af90f7989f80 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -368,6 +368,9 @@ void efi_native_runtime_setup(void); #define UV_SYSTEM_TABLE_GUID EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93) #define LINUX_EFI_CRASH_GUID EFI_GUID(0xcfc8fc79, 0xbe2e, 0x4ddc, 0x97, 0xf0, 0x9f, 0x98, 0xbf, 0xe2, 0x98, 0xa0) #define LOADED_IMAGE_PROTOCOL_GUID EFI_GUID(0x5b1b31a1, 0x9562, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) +#define LOADED_IMAGE_DEVICE_PATH_PROTOCOL_GUID EFI_GUID(0xbc62157e, 0x3e33, 0x4fec, 0x99, 0x20, 0x2d, 0x3b, 0x36, 0xd7, 0x50, 0xdf) +#define EFI_DEVICE_PATH_PROTOCOL_GUID EFI_GUID(0x09576e91, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) +#define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c) #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a) #define EFI_UGA_PROTOCOL_GUID EFI_GUID(0x982c298b, 0xf4fa, 0x41cb, 0xb8, 0x38, 0x77, 0xaa, 0x68, 0x8f, 0xb8, 0x39) #define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a) @@ -952,6 +955,7 @@ extern int efi_status_to_err(efi_status_t status); #define EFI_DEV_MEDIA_VENDOR 3 #define EFI_DEV_MEDIA_FILE 4 #define EFI_DEV_MEDIA_PROTOCOL 5 +#define EFI_DEV_MEDIA_REL_OFFSET 8 #define EFI_DEV_BIOS_BOOT 0x05 #define EFI_DEV_END_PATH 0x7F #define EFI_DEV_END_PATH2 0xFF @@ -982,12 +986,20 @@ struct efi_vendor_dev_path { u8 vendordata[]; } __packed; +struct efi_rel_offset_dev_path { + struct efi_generic_dev_path header; + u32 reserved; + u64 starting_offset; + u64 ending_offset; +} __packed; + struct efi_dev_path { union { struct efi_generic_dev_path header; struct efi_acpi_dev_path acpi; struct efi_pci_dev_path pci; struct efi_vendor_dev_path vendor; + struct efi_rel_offset_dev_path rel_offset; }; } __packed; -- cgit v1.2.3 From f5a5e83379b537f6252526bb4d285b771f6f0b89 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 14 Sep 2022 09:34:31 +0100 Subject: soc: apple: rtkit: Add apple_rtkit_poll This allows a client to receive messages in atomic context, by polling. Signed-off-by: Hector Martin Signed-off-by: Russell King (Oracle) Reviewed-by: Sven Peter Reviewed-by: Eric Curtin Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/soc/apple/rtkit.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index 88eb832eac7b..c9cabb679cd1 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -152,4 +152,16 @@ int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message, int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message, unsigned long timeout, bool atomic); +/* + * Process incoming messages in atomic context. + * This only guarantees that messages arrive as far as the recv_message_early + * callback; drivers expecting to handle incoming messages synchronously + * by calling this function must do it that way. + * Will return 1 if some data was processed, 0 if none was, or a + * negative error code on failure. 
+ * + * @rtk: RTKit reference + */ +int apple_rtkit_poll(struct apple_rtkit *rtk); + #endif /* _LINUX_APPLE_RTKIT_H_ */ -- cgit v1.2.3 From 460d9cb62f7fe82cc835d6b3924a8d06fd2d510a Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 14 Aug 2022 23:12:44 -0500 Subject: soc: sunxi: sram: Return void from the release function There is no point in returning an error here, as the caller can do nothing about it. In fact, all callers already ignore the return value. Acked-by: Jernej Skrabec Signed-off-by: Samuel Holland Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220815041248.53268-8-samuel@sholland.org Signed-off-by: Jernej Skrabec --- include/linux/soc/sunxi/sunxi_sram.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/sunxi/sunxi_sram.h b/include/linux/soc/sunxi/sunxi_sram.h index c5f663bba9c2..60e274d1b821 100644 --- a/include/linux/soc/sunxi/sunxi_sram.h +++ b/include/linux/soc/sunxi/sunxi_sram.h @@ -14,6 +14,6 @@ #define _SUNXI_SRAM_H_ int sunxi_sram_claim(struct device *dev); -int sunxi_sram_release(struct device *dev); +void sunxi_sram_release(struct device *dev); #endif /* _SUNXI_SRAM_H_ */ -- cgit v1.2.3 From e63efbcaba7d6f6846b1c2ec5cf259249b9ed88f Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 16 Sep 2022 17:02:46 +0100 Subject: wifi: brcmfmac: pcie: Read Apple OTP information On Apple platforms, the One Time Programmable ROM in the Broadcom chips contains information about the specific board design (module, vendor, version) that is required to select the correct NVRAM file. Parse this OTP ROM and extract the required strings. Note that the user OTP offset/size is per-chip. This patch does not add any chips yet. Reviewed-by: Arend van Spriel Signed-off-by: Hector Martin Signed-off-by: Russell King (Oracle) Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/E1oZDni-0077aM-I6@rmk-PC.armlinux.org.uk --- include/linux/bcma/bcma_driver_chipcommon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index e3314f746bfa..2d94c30ed439 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -271,6 +271,7 @@ #define BCMA_CC_SROM_CONTROL_OP_WRDIS 0x40000000 #define BCMA_CC_SROM_CONTROL_OP_WREN 0x60000000 #define BCMA_CC_SROM_CONTROL_OTPSEL 0x00000010 +#define BCMA_CC_SROM_CONTROL_OTP_PRESENT 0x00000020 #define BCMA_CC_SROM_CONTROL_LOCK 0x00000008 #define BCMA_CC_SROM_CONTROL_SIZE_MASK 0x00000006 #define BCMA_CC_SROM_CONTROL_SIZE_1K 0x00000000 -- cgit v1.2.3 From 555bb4ccd1dd78d0263eae31629fe1fdd65c1fb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:24 +0200 Subject: preempt: Provide preempt_[dis|en]able_nested() On PREEMPT_RT enabled kernels, spinlocks and rwlocks are neither disabling preemption nor interrupts. Though there are a few places which depend on the implicit preemption/interrupt disable of those locks, e.g. seqcount write sections, per CPU statistics updates etc. To avoid sprinkling CONFIG_PREEMPT_RT conditionals all over the place, add preempt_disable_nested() and preempt_enable_nested() which should be descriptive enough. Add a lockdep assertion for the !PREEMPT_RT case to catch callers which do not have preemption disabled. 
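A sketch of the intended use, modelled on the per-CPU RMW case named in the kernel-doc below (the foo_* structures are hypothetical):

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

struct foo_stats {
	u64 packets;
};

struct foo_dev {
	spinlock_t lock;
	struct foo_stats __percpu *stats;
};

static void foo_account_packet(struct foo_dev *fd)
{
	spin_lock(&fd->lock);
	/*
	 * On !PREEMPT_RT the spinlock already disabled preemption, so this
	 * merely asserts it; on PREEMPT_RT it actually disables preemption
	 * so the non-atomic per-CPU RMW below stays CPU-local.
	 */
	preempt_disable_nested();
	__this_cpu_inc(fd->stats->packets);
	preempt_enable_nested();
	spin_unlock(&fd->lock);
}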
Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-2-bigeasy@linutronix.de --- include/linux/preempt.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b4381f255a5c..0df425bf9bd7 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -421,4 +421,46 @@ static inline void migrate_enable(void) { } #endif /* CONFIG_SMP */ +/** + * preempt_disable_nested - Disable preemption inside a normally preempt disabled section + * + * Use for code which requires preemption protection inside a critical + * section which has preemption disabled implicitly on non-PREEMPT_RT + * enabled kernels, by e.g.: + * - holding a spinlock/rwlock + * - soft interrupt context + * - regular interrupt handlers + * + * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft + * interrupt context and regular interrupt handlers are preemptible and + * only prevent migration. preempt_disable_nested() ensures that preemption + * is disabled for cases which require CPU local serialization even on + * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. + * + * The use cases are code sequences which are not serialized by a + * particular lock instance, e.g.: + * - seqcount write side critical sections where the seqcount is not + * associated to a particular lock and therefore the automatic + * protection mechanism does not work. This prevents a live lock + * against a preempting high priority reader. + * - RMW per CPU variable updates like vmstat. + */ +/* Macro to avoid header recursion hell vs. lockdep */ +#define preempt_disable_nested() \ +do { \ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ + preempt_disable(); \ + else \ + lockdep_assert_preemption_disabled(); \ +} while (0) + +/** + * preempt_enable_nested - Undo the effect of preempt_disable_nested() + */ +static __always_inline void preempt_enable_nested(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} + #endif /* __LINUX_PREEMPT_H */ -- cgit v1.2.3 From a738e9bad6030d4fd33bfd7db3399a250b7e94d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:27 +0200 Subject: mm/debug: Provide VM_WARN_ON_IRQS_ENABLED() Some places in the VM code expect interrupts disabled, which is a valid expectation on non-PREEMPT_RT kernels, but does not hold on RT kernels in some places because the RT spinlock substitution does not disable interrupts. To avoid sprinkling CONFIG_PREEMPT_RT conditionals into those places, provide VM_WARN_ON_IRQS_ENABLED() which is only enabled when VM_DEBUG=y and PREEMPT_RT=n. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Link: https://lore.kernel.org/r/20220825164131.402717-5-bigeasy@linutronix.de --- include/linux/mmdebug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 15ae78cd2853..b8728d11c949 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif +#ifdef CONFIG_DEBUG_VM_IRQSOFF +#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) +#else +#define VM_WARN_ON_IRQS_ENABLED() do { } while (0) +#endif + #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -- cgit v1.2.3 From 44b0c2957adc62b86fcd51adeaf8e993171bc319 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:31 +0200 Subject: u64_stats: Streamline the implementation The u64 stats code handles 3 different cases: - 32bit UP - 32bit SMP - 64bit with an unreadable #ifdef maze, which was recently expanded with PREEMPT_RT conditionals. Reduce it to two cases (32bit and 64bit) and drop the optimization for 32bit UP as suggested by Linus. Use the new preempt_disable/enable_nested() helpers to get rid of the CONFIG_PREEMPT_RT conditionals. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-9-bigeasy@linutronix.de --- include/linux/u64_stats_sync.h | 145 ++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 6ad4e9032d53..46040d66334a 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -8,7 +8,7 @@ * * Key points : * - * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. + * - Use a seqcount on 32-bit * - The whole thing is a no-op on 64-bit architectures. * * Usage constraints: @@ -20,7 +20,8 @@ * writer and also spin forever. * * 3) Write side must use the _irqsave() variant if other writers, or a reader, - * can be invoked from an IRQ context. + * can be invoked from an IRQ context. On 64bit systems this variant does not + * disable interrupts. * * 4) If reader fetches several counters, there is no guarantee the whole values * are consistent w.r.t. each other (remember point #2: seqcounts are not @@ -29,11 +30,6 @@ * 5) Readers are allowed to sleep or be preempted/interrupted: they perform * pure reads. * - * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats - * might be updated from a hardirq or softirq context (remember point #1: - * seqcounts are not used for UP kernels). 32-bit UP stat readers could read - * corrupted 64-bit values otherwise. 
- * * Usage : * * Stats producer (writer) should use following template granted it already got @@ -66,7 +62,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +#if BITS_PER_LONG == 32 seqcount_t seq; #endif }; @@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) local64_inc(&p->v); } -#else +static inline void u64_stats_init(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { } +static inline unsigned long __u64_stats_irqsave(void) { return 0; } +static inline void __u64_stats_irqrestore(unsigned long flags) { } +static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ + return 0; +} +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return false; +} + +#else /* 64 bit */ typedef struct { u64 v; @@ -123,123 +134,95 @@ static inline void u64_stats_inc(u64_stats_t *p) { p->v++; } -#endif -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) -#else static inline void u64_stats_init(struct u64_stats_sync *syncp) { + seqcount_init(&syncp->seq); } -#endif -static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); write_seqcount_begin(&syncp->seq); -#endif } -static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif + preempt_enable_nested(); } -static inline unsigned long -u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +static inline unsigned long __u64_stats_irqsave(void) { - unsigned long flags = 0; + unsigned long flags; -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); - else - local_irq_save(flags); - write_seqcount_begin(&syncp->seq); -#endif + local_irq_save(flags); return flags; } -static inline void -u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, - unsigned long flags) +static inline void __u64_stats_irqrestore(unsigned long flags) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); - else - local_irq_restore(flags); -#endif + local_irq_restore(flags); } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); -#else - return 0; -#endif } -static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_disable(); -#endif - return __u64_stats_fetch_begin(syncp); + return 
read_seqcount_retry(&syncp->seq, start); } +#endif /* !64 bit */ -static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - return read_seqcount_retry(&syncp->seq, start); -#else - return false; -#endif + __u64_stats_update_begin(syncp); +} + +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +{ + __u64_stats_update_end(syncp); +} + +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); + + __u64_stats_update_begin(syncp); + return flags; +} + +static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); +} + +static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ + return __u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif return __u64_stats_fetch_retry(syncp, start); } -/* - * In case irq handlers can update u64 counters, readers can use following helpers - * - SMP 32bit arches use seqcount protection, irq safe. - * - UP 32bit must disable irqs. - * - 64bit have no problem atomically reading u64 values, irq safe. - */ +/* Obsolete interfaces */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif - return __u64_stats_fetch_begin(syncp); + return u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_enable(); -#endif - return __u64_stats_fetch_retry(syncp, start); + return u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */ -- cgit v1.2.3 From 6a164c646999847b843e651f71c53dfaceb2c2b4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 9 May 2022 16:04:08 +0200 Subject: genirq: Provide generic_handle_domain_irq_safe(). commit 509853f9e1e7b ("genirq: Provide generic_handle_irq_safe()") addressed the problem of demultiplexing interrupt handlers which are force threaded on PREEMPT_RT enabled kernels. This means that the demultiplexed handler is invoked with interrupts enabled, which triggers a lockdep warning due to a non-irq-safe lock acquisition. The same problem exists for the irq domain based interrupt handling via generic_handle_domain_irq(), which has been reported against the AMD pin-ctrl driver. Provide generic_handle_domain_irq_safe(), which can be used from any context.
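For instance, a demultiplexing handler that may run force-threaded with interrupts enabled can stay context-agnostic along these lines (a sketch; the foo_* chip, register layout and interrupt count are hypothetical):

#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

#define FOO_IRQ_STATUS	0x00
#define FOO_NR_IRQS	32

struct foo_chip {
	void __iomem *base;
	struct irq_domain *domain;
};

static irqreturn_t foo_demux_irq(int irq, void *data)
{
	struct foo_chip *chip = data;
	unsigned long pending = readl(chip->base + FOO_IRQ_STATUS);
	unsigned int hwirq;

	/* Safe whether we run in hard-irq context or force-threaded. */
	for_each_set_bit(hwirq, &pending, FOO_NR_IRQS)
		generic_handle_domain_irq_safe(chip->domain, hwirq);

	return pending ? IRQ_HANDLED : IRQ_NONE;
}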
[ tglx: Split the usage sites out and massaged changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YnkfWFzvusFFktSt@linutronix.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=215954 --- include/linux/irqdesc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 1cd4e36890fb..844a8e30e6de 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); +int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq); int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif -- cgit v1.2.3 From b88c48bfdd85820b19f0c0295a88d1596876e7c8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Aug 2022 20:26:41 +0300 Subject: pwm: core: Get rid of unused devm_of_pwm_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_of_pwm_get() has recently lost its single user, so drop the dead API as well. Note, new code should use either plain pwm_get() or the managed devm_pwm_get() or devm_fwnode_pwm_get() APIs. Signed-off-by: Andy Shevchenko Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220826172642.16404-2-andriy.shevchenko@linux.intel.com Signed-off-by: Guenter Roeck --- include/linux/pwm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 9429930c5566..572ba92e4206 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -408,8 +408,6 @@ struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, void pwm_put(struct pwm_device *pwm); struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id); -struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np, - const char *con_id); struct pwm_device *devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, const char *con_id); @@ -517,14 +515,6 @@ static inline struct pwm_device *devm_pwm_get(struct device *dev, return ERR_PTR(-ENODEV); } -static inline struct pwm_device *devm_of_pwm_get(struct device *dev, - struct device_node *np, - const char *con_id) -{ - might_sleep(); - return ERR_PTR(-ENODEV); -} - static inline struct pwm_device * devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, const char *con_id) -- cgit v1.2.3 From b5ae0ad56455dfb277b165b9270382f0e1c1d943 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Aug 2022 20:26:42 +0300 Subject: pwm: core: Make of_pwm_get() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of of_pwm_get() outside of the PWM core. Make it static.
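Picking up the replacement note from the devm_of_pwm_get() removal above, a short sketch of the managed lookup in a probe path (foo_probe is hypothetical):

#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>

static int foo_probe(struct platform_device *pdev)
{
	struct pwm_device *pwm;

	/* Resolved via the device's firmware node; released automatically. */
	pwm = devm_pwm_get(&pdev->dev, NULL);
	if (IS_ERR(pwm))
		return dev_err_probe(&pdev->dev, PTR_ERR(pwm),
				     "failed to get PWM\n");

	return 0;
}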
Signed-off-by: Andy Shevchenko Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220826172642.16404-3-andriy.shevchenko@linux.intel.com Signed-off-by: Guenter Roeck --- include/linux/pwm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 572ba92e4206..d70c6e5a839d 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -403,8 +403,6 @@ struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc, const struct of_phandle_args *args); struct pwm_device *pwm_get(struct device *dev, const char *con_id); -struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, - const char *con_id); void pwm_put(struct pwm_device *pwm); struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id); @@ -495,14 +493,6 @@ static inline struct pwm_device *pwm_get(struct device *dev, return ERR_PTR(-ENODEV); } -static inline struct pwm_device *of_pwm_get(struct device *dev, - struct device_node *np, - const char *con_id) -{ - might_sleep(); - return ERR_PTR(-ENODEV); -} - static inline void pwm_put(struct pwm_device *pwm) { might_sleep(); -- cgit v1.2.3 From 41929b72eb5dab26280dfc1e7c1523f0f4853dde Mon Sep 17 00:00:00 2001 From: Michael Shych Date: Wed, 10 Aug 2022 20:15:50 +0300 Subject: platform_data/emc2305: define platform data for EMC2305 driver Introduce a platform data structure for the EMC2305 driver to allow configuring device PWMs and thermal zones by passing the required platform data to the driver. If no platform data is provided, the driver is supposed to work with default settings. Signed-off-by: Michael Shych Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20220810171552.56417-2-michaelsh@nvidia.com Signed-off-by: Guenter Roeck --- include/linux/platform_data/emc2305.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/linux/platform_data/emc2305.h (limited to 'include/linux') diff --git a/include/linux/platform_data/emc2305.h b/include/linux/platform_data/emc2305.h new file mode 100644 index 000000000000..54d672dd6f7d --- /dev/null +++ b/include/linux/platform_data/emc2305.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_PLATFORM_DATA_EMC2305__ +#define __LINUX_PLATFORM_DATA_EMC2305__ + +#define EMC2305_PWM_MAX 5 + +/** + * struct emc2305_platform_data - EMC2305 driver platform data + * @max_state: maximum cooling state of the cooling device; + * @pwm_num: number of active channels; + * @pwm_separate: separate PWM settings for every channel; + * @pwm_min: array of minimum PWM per channel; + */ +struct emc2305_platform_data { + u8 max_state; + u8 pwm_num; + bool pwm_separate; + u8 pwm_min[EMC2305_PWM_MAX]; +}; + +#endif -- cgit v1.2.3 From 5db72fdb74983a1e81331aadf99ae2305f277562 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 13 Sep 2022 19:31:40 +0300 Subject: ACPI: utils: Add acpi_dev_uid_to_integer() helper to get _UID as integer Some users interpret _UID only as an integer, and for them it's easier to have an integer representation of _UID. Add a respective helper for that. Signed-off-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..9434db02cb60 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -798,6 +798,11 @@ acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *u return false; } +static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) +{ + return -ENODEV; +} + static inline struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) { -- cgit v1.2.3 From 38bef8e57f2149acd2c910a98f57dd6291d2e0ec Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:17 -0700 Subject: smp: add set_nr_cpu_ids() In preparation for supporting a compile-time nr_cpu_ids, add a setter for the variable. This is a no-op for all arches. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e8ad12b5b9d2..24763997894d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -39,6 +39,11 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; #define nr_cpu_ids 1U #else extern unsigned int nr_cpu_ids; + +static inline void set_nr_cpu_ids(unsigned int nr) +{ + nr_cpu_ids = nr; +} #endif #ifdef CONFIG_CPUMASK_OFFSTACK -- cgit v1.2.3 From 7102b3bb070fdf4580a05cbfc5ad3c0691dc4bf9 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:18 -0700 Subject: lib/cpumask: delete misleading comment The comment says that the HOTPLUG config option enables all cpus in cpu_possible_mask up to NR_CPUS. This is wrong. Even if HOTPLUG is enabled, the mask is populated on boot with respect to ACPI/DT records. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 24763997894d..9d2f0e3e927e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -72,10 +72,6 @@ static inline void set_nr_cpu_ids(unsigned int nr) * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * - * If HOTPLUG is enabled, then cpu_possible_mask is forced to have - * all NR_CPUS bits set, otherwise it is just the set of CPUs that - * ACPI reports present at boot. - * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. -- cgit v1.2.3 From aa47a7c215e79a2ade6916f163c5a17b561bce4f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:19 -0700 Subject: lib/cpumask: deprecate nr_cpumask_bits The cpumask code is written under the assumption that when CONFIG_CPUMASK_OFFSTACK is enabled, all cpumasks have a boot-time defined size, otherwise the size is always NR_CPUS. The latter is wrong because the number of possible cpus is always calculated on boot, and it may be less than NR_CPUS. On my 4-cpu arm64 VM the nr_cpu_ids is 4, as expected, and nr_cpumask_bits is 256, which corresponds to NR_CPUS. This not only leads to useless traversal of cpumask bits greater than 4, it also makes some cpumask routines fail. For example, cpumask_full(0b1111000..000) would erroneously return false in the example above because the tail bits in the mask are all unset.
This patch deprecates nr_cpumask_bits and wires it to nr_cpu_ids unconditionally, so that cpumask routines will not waste time traversing unused part of cpu masks. It also fixes cpumask_full() and similar routines. As a side effect, because now a length of cpumasks is defined at run-time even if CPUMASK_OFFSTACK is disabled, compiler can't optimize corresponding functions. It increases kernel size by ~2.5KB if OFFSTACK is off. This is addressed in the following patch. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9d2f0e3e927e..2f6622cead1f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -46,13 +46,8 @@ static inline void set_nr_cpu_ids(unsigned int nr) } #endif -#ifdef CONFIG_CPUMASK_OFFSTACK -/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, - * not all bits may be allocated. */ +/* Deprecated. Always use nr_cpu_ids. */ #define nr_cpumask_bits nr_cpu_ids -#else -#define nr_cpumask_bits ((unsigned int)NR_CPUS) -#endif /* * The following particular system cpumasks and operations manage -- cgit v1.2.3 From a050910972bb25152b42ad2e544652117c5ad915 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 May 2022 01:08:16 +0200 Subject: efi/libstub: implement generic EFI zboot Implement a minimal EFI app that decompresses the real kernel image and launches it using the firmware's LoadImage and StartImage boot services. This removes the need for any arch-specific hacks. Note that on systems that have UEFI secure boot policies enabled, LoadImage/StartImage require images to be signed, or their hashes known a priori, in order to be permitted to boot. There are various possible strategies to work around this requirement, but they all rely either on overriding internal PI/DXE protocols (which are not part of the EFI spec) or omitting the firmware provided LoadImage() and StartImage() boot services, which is also undesirable, given that they encapsulate platform specific policies related to secure boot and measured boot, but also related to memory permissions (whether or not and which types of heap allocations have both write and execute permissions.) The only generic and truly portable way around this is to simply sign both the inner and the outer image with the same key/cert pair, so this is what is implemented here. 
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index af90f7989f80..5efc3105f8e0 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -411,6 +411,7 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_TPM_FINAL_LOG_GUID EFI_GUID(0x1e2ed096, 0x30e2, 0x4254, 0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25) #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) +#define LINUX_EFI_ZBOOT_MEDIA_GUID EFI_GUID(0xe565a30d, 0x47da, 0x4dbd, 0xb3, 0x54, 0x9b, 0xb5, 0xc8, 0x4f, 0x8b, 0xe2) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) -- cgit v1.2.3 From db01868bf2e919a10551066d54cf4bef5dd5a01e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:06:57 +0300 Subject: net: introduce iterators over synced hw addresses Some network drivers use __dev_mc_sync()/__dev_uc_sync() and therefore program the hardware only with addresses with a non-zero sync_cnt. Some of the above drivers also need to save/restore the address filtering lists when certain events happen, and they need to walk through the struct net_device :: uc and struct net_device :: mc lists. But these lists contain unsynced addresses too. To keep the appearance of an elementary form of data encapsulation, provide iterators through these lists that only look at entries with a non-zero sync_cnt, instead of filtering entries out from device drivers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f0068c1ff1df..9f42fc871c3b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -253,11 +253,17 @@ struct netdev_hw_addr_list { #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) +#define netdev_for_each_synced_uc_addr(_ha, _dev) \ + netdev_for_each_uc_addr((_ha), (_dev)) \ + if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) +#define netdev_for_each_synced_mc_addr(_ha, _dev) \ + netdev_for_each_mc_addr((_ha), (_dev)) \ + if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; -- cgit v1.2.3 From 1e839143d674603b0bbbc4c513bca35404967dbc Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:23 +0200 Subject: HID: core: store the unique system identifier in hid_device This unique identifier is currently used only for ensuring uniqueness in sysfs. However, it could be handy for userspace to refer to a specific hid_device by this id. Two use cases come to mind: LEDs (and their naming convention), and HID-BPF.
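As a sketch of the intent (illustrative, not the exact patch; the IDA and helper name here are assumptions, while the "BUS:VID:PID.ID" naming scheme is the existing sysfs convention):

static DEFINE_IDA(hid_ida);

/* Assign the system-unique id once at allocation time and reuse it
 * for the sysfs device name; error handling omitted for brevity. */
static void hid_assign_unique_id(struct hid_device *hdev)
{
	hdev->id = ida_alloc(&hid_ida, GFP_KERNEL);
	dev_set_name(&hdev->dev, "%04X:%04X:%04X.%04X",
		     hdev->bus, hdev->vendor, hdev->product, hdev->id);
}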
Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-9-benjamin.tissoires@redhat.com --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 4363a63b9775..a43dd17bc78f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -658,6 +658,8 @@ struct hid_device { /* device report descriptor */ struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; + + unsigned int id; /* system unique id */ }; #define to_hid_device(pdev) \ -- cgit v1.2.3 From ead77b65aef430d3bfe63524c243a60a29eb8d90 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:24 +0200 Subject: HID: export hid_report_type to uapi When we are dealing with eBPF, we need to have access to the report type. Currently our implementation differs from the USB standard, making it impossible for users to know the exact value besides hardcoding it themselves. So instead of bare defines, convert it into an enum. Note that we also need to make changes in the ll_driver API, but given that this will have a wider impact outside of this tree, we leave this as a TODO for the future. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-10-benjamin.tissoires@redhat.com --- include/linux/hid.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index a43dd17bc78f..b1a33dbbc78e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -314,15 +314,6 @@ struct hid_item { #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 -/* - * HID report types --- Ouch! HID spec says 1 2 3!
- */ - -#define HID_INPUT_REPORT 0 -#define HID_OUTPUT_REPORT 1 -#define HID_FEATURE_REPORT 2 - -#define HID_REPORT_TYPES 3 /* * HID connect requests @@ -509,7 +500,7 @@ struct hid_report { struct list_head hidinput_list; struct list_head field_entry_list; /* ordered list of input fields */ unsigned int id; /* id of this report */ - unsigned int type; /* report type */ + enum hid_report_type type; /* report type */ unsigned int application; /* application usage for this report */ struct hid_field *field[HID_MAX_FIELDS]; /* fields of the report */ struct hid_field_entry *field_entries; /* allocated memory of input field_entry */ @@ -926,7 +917,8 @@ extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); -int hid_input_report(struct hid_device *, int type, u8 *, u32, int); +int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); @@ -935,11 +927,11 @@ int __hid_request(struct hid_device *hid, struct hid_report *rep, int reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int application); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts); @@ -1111,7 +1103,7 @@ void hid_hw_request(struct hid_device *hdev, struct hid_report *report, int reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, unsigned char rtype, int reqtype); + size_t len, enum hid_report_type rtype, int reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** @@ -1184,8 +1176,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); -- cgit v1.2.3 From 735e1bb1b8067e209941a6bdfde23214696ff47e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:25 +0200 Subject: HID: convert defines of HID class requests into a proper enum This allows exporting the type in BTF, and hence in the automatically generated vmlinux.h. It will also add some static checks on the users when we change the ll driver API (see the note below). Note that we also need to make changes in the ll_driver API, but given that this will have a wider impact outside of this tree, we leave this as a TODO for the future.
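The defines being replaced carry the class request codes from the USB HID specification, so the resulting enum can be expected to look roughly like this (a sketch based on the spec values, not copied from the patch):

enum hid_class_request {
	HID_REQ_GET_REPORT   = 0x01,
	HID_REQ_GET_IDLE     = 0x02,
	HID_REQ_GET_PROTOCOL = 0x03,
	HID_REQ_SET_REPORT   = 0x09,
	HID_REQ_SET_IDLE     = 0x0A,
	HID_REQ_SET_PROTOCOL = 0x0B,
};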
Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-11-benjamin.tissoires@redhat.com --- include/linux/hid.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index b1a33dbbc78e..8677ae38599e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -923,7 +923,7 @@ struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); void hid_output_report(struct hid_report *report, __u8 *data); -int __hid_request(struct hid_device *hid, struct hid_report *rep, int reqtype); +int __hid_request(struct hid_device *hid, struct hid_report *rep, enum hid_class_request reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, @@ -1100,10 +1100,11 @@ void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, - struct hid_report *report, int reqtype); + struct hid_report *report, enum hid_class_request reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, enum hid_report_type rtype, int reqtype); + size_t len, enum hid_report_type rtype, + enum hid_class_request reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** @@ -1131,7 +1132,7 @@ static inline int hid_hw_power(struct hid_device *hdev, int level) * @reqtype: hid request type */ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, - int reqtype) + enum hid_class_request reqtype) { if (hdev->ll_driver->idle) return hdev->ll_driver->idle(hdev, report, idle, reqtype); -- cgit v1.2.3 From 176042404ee6a96ba7e9054e1bda6220360a26ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:56 +0100 Subject: mm: add PSI accounting around ->read_folio and ->readahead calls PSI tries to account for the cost of bringing back in pages discarded by the MM LRU management. Currently the prime place for that is hooked into the bio submission path, which is a rather bad place: - it does not actually account I/O for non-block file systems, of which we have many - it adds overhead and a layering violation to the block layer Add the accounting into the two places in the core MM code that read pages into an address space by calling into ->read_folio and ->readahead so that the entire file system operations are covered, to broaden the coverage and allow removing the accounting in the block layer going forward. As psi_memstall_enter can deal with nested calls this will not lead to double accounting even while the bio annotations are still present. 
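The resulting pattern in the MM callers is roughly the following (a sketch of the approach, not the exact hunks from the patch):

/* Sketch: charge a PSI memstall only when the folio being read back
 * was previously part of the workingset. */
static int read_folio_accounted(struct file *file,
				struct address_space *mapping,
				struct folio *folio)
{
	unsigned long pflags;
	bool workingset = folio_test_workingset(folio);
	int ret;

	if (workingset)
		psi_memstall_enter(&pflags);

	ret = mapping->a_ops->read_folio(file, folio);

	if (workingset)
		psi_memstall_leave(&pflags);

	return ret;
}

For readahead, the new _workingset and _pflags fields added to struct readahead_control carry the same state across the ->readahead call.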
Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..201dc7281640 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1173,6 +1173,8 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool _workingset; + unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ -- cgit v1.2.3 From 118f3663fbc658e9ad6165e129076981c7b685c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:42:00 +0100 Subject: block: remove PSI accounting from the bio layer PSI accounting is now done by the VM code, where it should have been since the beginning. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 41afb4cfb9b0..e0b098089ef2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -321,7 +321,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ -- cgit v1.2.3 From 256dea9134c38fcc409ec7ea6a1eb67829b563bc Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Wed, 14 Sep 2022 18:03:15 +0530 Subject: firmware: xilinx: add support for sd/gem config Add new APIs in firmware to configure SD/GEM registers. Internally it calls PM IOCTL for below SD/GEM register configuration: - SD/EMMC select - SD slot type - SD base clock - SD 8 bit support - SD fixed config - GEM SGMII Mode - GEM fixed config Signed-off-by: Ronak Jain Signed-off-by: Radhey Shyam Pandey Reviewed-by: Claudiu Beznea Signed-off-by: Jakub Kicinski --- include/linux/firmware/xlnx-zynqmp.h | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 9f50dacbf7d6..76d2b3ebad84 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -153,6 +153,9 @@ enum pm_ioctl_id { /* Runtime feature configuration */ IOCTL_SET_FEATURE_CONFIG = 26, IOCTL_GET_FEATURE_CONFIG = 27, + /* Dynamic SD/GEM configuration */ + IOCTL_SET_SD_CONFIG = 30, + IOCTL_SET_GEM_CONFIG = 31, }; enum pm_query_id { @@ -399,6 +402,30 @@ enum pm_feature_config_id { PM_FEATURE_EXTWDT_VALUE = 4, }; +/** + * enum pm_sd_config_type - PM SD configuration. + * @SD_CONFIG_EMMC_SEL: To set SD_EMMC_SEL in CTRL_REG_SD and SD_SLOTTYPE + * @SD_CONFIG_BASECLK: To set SD_BASECLK in SD_CONFIG_REG1 + * @SD_CONFIG_8BIT: To set SD_8BIT in SD_CONFIG_REG2 + * @SD_CONFIG_FIXED: To set fixed config registers + */ +enum pm_sd_config_type { + SD_CONFIG_EMMC_SEL = 1, + SD_CONFIG_BASECLK = 2, + SD_CONFIG_8BIT = 3, + SD_CONFIG_FIXED = 4, +}; + +/** + * enum pm_gem_config_type - PM GEM configuration. 
+ * @GEM_CONFIG_SGMII_MODE: To set GEM_SGMII_MODE in GEM_CLK_CTRL register + * @GEM_CONFIG_FIXED: To set fixed config registers + */ +enum pm_gem_config_type { + GEM_CONFIG_SGMII_MODE = 1, + GEM_CONFIG_FIXED = 2, +}; + /** * struct zynqmp_pm_query_data - PM query data * @qid: query ID @@ -475,6 +502,9 @@ int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); +int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); +int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, + u32 value); #else static inline int zynqmp_pm_get_api_version(u32 *version) { @@ -745,6 +775,21 @@ static inline int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset) { return -ENODEV; } + +static inline int zynqmp_pm_set_sd_config(u32 node, + enum pm_sd_config_type config, + u32 value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_gem_config(u32 node, + enum pm_gem_config_type config, + u32 value) +{ + return -ENODEV; +} + #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 17df341d35265c8e14d71a48886fc7dcf92ef8a1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Sep 2022 13:39:01 +0200 Subject: headers: Remove some left-over license text Remove a left-over from commit 2874c5fd2842 ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152") There is no need for an empty "License:". Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/0e5ff727626b748238f4b78932f81572143d8f0b.1662896317.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/if_pppol2tp.h | 2 -- include/linux/if_pppox.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index 96d40942e5a3..c87efd333faa 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -4,8 +4,6 @@ * * This file supplies definitions required by the PPP over L2TP driver * (l2tp_ppp.c). All version information wrt this file is located in l2tp_ppp.c - * - * License: */ #ifndef __LINUX_IF_PPPOL2TP_H #define __LINUX_IF_PPPOL2TP_H diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 69e813bcb947..ff3beda1312c 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -5,8 +5,6 @@ * * This file supplies definitions required by the PPP over Ethernet driver * (pppox.c). All version information wrt this file is located in pppox.c - * - * License: */ #ifndef __LINUX_IF_PPPOX_H #define __LINUX_IF_PPPOX_H -- cgit v1.2.3 From fdf214978a71b2749d26f6da2b1d51d9ac23831d Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:24 -0600 Subject: bpf: Move nf_conn extern declarations to filter.h We're seeing the following new warnings on netdev/build_32bit and netdev/build_allmodconfig_warn CI jobs: ../net/core/filter.c:8608:1: warning: symbol 'nf_conn_btf_access_lock' was not declared. Should it be static? ../net/core/filter.c:8611:5: warning: symbol 'nfct_bsa' was not declared. Should it be static? Fix by ensuring extern declaration is present while compiling filter.o. 
Fixes: 864b656f82cc ("bpf: Add support for writing to nf_conn:mark") Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/2bd2e0283df36d8a4119605878edb1838d144174.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/linux/filter.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 75335432fcbc..98e28126c24b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -567,6 +567,12 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +extern struct mutex nf_conn_btf_access_lock; +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); + typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, -- cgit v1.2.3 From 6f9c07be9d020489326098801f0661f754c7c865 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 5 Sep 2022 16:08:20 -0700 Subject: lib/cpumask: add FORCE_NR_CPUS config option The size of cpumasks is hard-limited by the compile-time parameter NR_CPUS, but defined at boot time when the kernel parses ACPI/DT tables, and stored in nr_cpu_ids. In many practical cases, the number of CPUs for a target is known at compile time and can be provided with NR_CPUS. In that case, the compiler may be instructed to treat NR_CPUS as the actual number of CPUs, not just an upper limit. This allows optimizing many cpumask routines and significantly shrinking the size of the kernel image. This patch adds the FORCE_NR_CPUS option to teach the compiler to rely on NR_CPUS and enable the corresponding optimizations. If FORCE_NR_CPUS=y, the kernel will not set nr_cpu_ids at boot, but only check that the actual number of possible CPUs is equal to NR_CPUS, and WARN if that doesn't hold. The new option is especially useful in embedded applications because kernel configurations are unique for each SoC, the number of CPUs is constant and well known, and memory limits are typically tighter. For my 4-CPU ARM64 build with NR_CPUS=4, FORCE_NR_CPUS=y saves 46KB: add/remove: 3/4 grow/shrink: 46/729 up/down: 652/-46952 (-46300) Signed-off-by: Yury Norov --- include/linux/cpumask.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2f6622cead1f..1b442fb2001f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -35,16 +35,20 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) -#if NR_CPUS == 1 -#define nr_cpu_ids 1U +#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) +#define nr_cpu_ids ((unsigned int)NR_CPUS) #else extern unsigned int nr_cpu_ids; +#endif static inline void set_nr_cpu_ids(unsigned int nr) { +#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) + WARN_ON(nr != nr_cpu_ids); +#else nr_cpu_ids = nr; -} #endif +} /* Deprecated. Always use nr_cpu_ids.
*/ #define nr_cpumask_bits nr_cpu_ids -- cgit v1.2.3 From 690aa8c3ae308bc696ec8b1b357b995193927083 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:32 +0200 Subject: ata: fix ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting() ACS-5 section 7.13.6.41 Words 85..87, 120: Commands and feature sets supported or enabled states that: If bit 15 of word 86 is set to one, bit 14 of word 119 is set to one, and bit 15 of word 119 is cleared to zero, then word 119 is valid. If bit 15 of word 86 is set to one, bit 14 of word 120 is set to one, and bit 15 of word 120 is cleared to zero, then word 120 is valid. (This text also exists in really old ACS standards, e.g. ACS-3.) Currently, ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting() both check bit 15 of word 86, but neither of them check that bit 14 of word 119 is set to one, or that bit 15 of word 119 is cleared to zero. Additionally, make ata_id_sense_reporting_enabled() return false if !ata_id_has_sense_reporting(), similar to how e.g. ata_id_flush_ext_enabled() returns false if !ata_id_has_flush_ext(). Fixes: e87fd28cf9a2 ("libata: Implement support for sense data reporting") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 21292b5bbb55..868bfd503aee 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -771,16 +771,21 @@ static inline bool ata_id_has_read_log_dma_ext(const u16 *id) static inline bool ata_id_has_sense_reporting(const u16 *id) { - if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) + if (!(id[ATA_ID_CFS_ENABLE_2] & BIT(15))) + return false; + if ((id[ATA_ID_COMMAND_SET_3] & (BIT(15) | BIT(14))) != BIT(14)) return false; - return id[ATA_ID_COMMAND_SET_3] & (1 << 6); + return id[ATA_ID_COMMAND_SET_3] & BIT(6); } static inline bool ata_id_sense_reporting_enabled(const u16 *id) { - if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) + if (!ata_id_has_sense_reporting(id)) + return false; + /* ata_id_has_sense_reporting() == true, word 86 must have bit 15 set */ + if ((id[ATA_ID_COMMAND_SET_4] & (BIT(15) | BIT(14))) != BIT(14)) return false; - return id[ATA_ID_COMMAND_SET_4] & (1 << 6); + return id[ATA_ID_COMMAND_SET_4] & BIT(6); } /** -- cgit v1.2.3 From 9c6e09a434e1317e09b78b3b69cd384022ec9a03 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:33 +0200 Subject: ata: fix ata_id_has_devslp() ACS-5 section 7.13.6.36 Word 78: Serial ATA features supported states that: If word 76 is not 0000h or FFFFh, word 78 reports the features supported by the device. If this word is not supported, the word shall be cleared to zero. (This text also exists in really old ACS standards, e.g. ACS-3.) Additionally, move the macro to the other ATA_ID_FEATURE_SUPP macros (which already have this check), thus making it more likely that the next ATA_ID_FEATURE_SUPP macro that is added will include this check. 
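The validity rule that each of these ATA_ID_FEATURE_SUPP macros open-codes could equally be written as a shared helper (an illustrative equivalent, not part of the patch):

/* Word 76 (SATA capabilities) must be neither 0000h nor FFFFh for
 * word 78 (SATA features supported) to be valid. */
static inline bool ata_id_sata_feature_words_valid(const u16 *id)
{
	return id[ATA_ID_SATA_CAPABILITY] != 0x0000 &&
	       id[ATA_ID_SATA_CAPABILITY] != 0xffff;
}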
Fixes: 65fe1f0f66a5 ("ahci: implement aggressive SATA device sleep support") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 868bfd503aee..bc136a43689f 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -566,6 +566,10 @@ struct ata_bmdma_prd { ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 2))) +#define ata_id_has_devslp(id) \ + ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ + ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8))) #define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) #define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) #define ata_id_u32(id,n) \ @@ -578,7 +582,6 @@ struct ata_bmdma_prd { #define ata_id_cdb_intr(id) (((id)[ATA_ID_CONFIG] & 0x60) == 0x20) #define ata_id_has_da(id) ((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4)) -#define ata_id_has_devslp(id) ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8)) #define ata_id_has_ncq_autosense(id) \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7)) -- cgit v1.2.3 From a5fb6bf853148974dbde092ec1bde553bea5e49f Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:34 +0200 Subject: ata: fix ata_id_has_ncq_autosense() ACS-5 section 7.13.6.36 Word 78: Serial ATA features supported states that: If word 76 is not 0000h or FFFFh, word 78 reports the features supported by the device. If this word is not supported, the word shall be cleared to zero. (This text also exists in really old ACS standards, e.g. ACS-3.) Additionally, move the macro to the other ATA_ID_FEATURE_SUPP macros (which already have this check), thus making it more likely that the next ATA_ID_FEATURE_SUPP macro that is added will include this check. Fixes: 5b01e4b9efa0 ("libata: Implement NCQ autosense") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index bc136a43689f..4845443e0f08 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -570,6 +570,10 @@ struct ata_bmdma_prd { ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 8))) +#define ata_id_has_ncq_autosense(id) \ + ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ + ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7))) #define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) #define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) #define ata_id_u32(id,n) \ @@ -582,8 +586,6 @@ struct ata_bmdma_prd { #define ata_id_cdb_intr(id) (((id)[ATA_ID_CONFIG] & 0x60) == 0x20) #define ata_id_has_da(id) ((id)[ATA_ID_SATA_CAPABILITY_2] & (1 << 4)) -#define ata_id_has_ncq_autosense(id) \ - ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7)) static inline bool ata_id_has_hipm(const u16 *id) { -- cgit v1.2.3 From 630624cb1b5826d753ac8e01a0e42de43d66dedf Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Sep 2022 14:28:35 +0200 Subject: ata: fix ata_id_has_dipm() ACS-5 section 7.13.6.36 Word 78: Serial ATA features supported states that: If word 76 is not 0000h or FFFFh, word 78 reports the features supported by the device. If this word is not supported, the word shall be cleared to zero. 
(This text also exists in really old ACS standards, e.g. ACS-3.) The problem with ata_id_has_dipm() is that while it performs a check against 0 and 0xffff, it performs the check against ATA_ID_FEATURE_SUPP (word 78), the same word where the feature bit is stored. Fix this by performing the check against ATA_ID_SATA_CAPABILITY (word 76), as required by the spec. The feature bit check itself is of course still performed against ATA_ID_FEATURE_SUPP (word 78). Additionally, move the macro to the other ATA_ID_FEATURE_SUPP macros (which already have this check), thus making it more likely that the next ATA_ID_FEATURE_SUPP macro that is added will include this check. Fixes: ca77329fb713 ("[libata] Link power management infrastructure") Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/ata.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 4845443e0f08..e3050e153a71 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -574,6 +574,10 @@ struct ata_bmdma_prd { ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ ((id)[ATA_ID_FEATURE_SUPP] & (1 << 7))) +#define ata_id_has_dipm(id) \ + ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ + ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ + ((id)[ATA_ID_FEATURE_SUPP] & (1 << 3))) #define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) #define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) #define ata_id_u32(id,n) \ @@ -597,17 +601,6 @@ static inline bool ata_id_has_hipm(const u16 *id) return val & (1 << 9); } -static inline bool ata_id_has_dipm(const u16 *id) -{ - u16 val = id[ATA_ID_FEATURE_SUPP]; - - if (val == 0 || val == 0xffff) - return false; - - return val & (1 << 3); -} - - static inline bool ata_id_has_fua(const u16 *id) { if ((id[ATA_ID_CFSSE] & 0xC000) != 0x4000) -- cgit v1.2.3 From 7ebb5f8e001037efbdf18afabbbebf71bd98825e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 21 Sep 2022 10:40:53 +0800 Subject: Revert "iommu/vt-d: Fix possible recursive locking in intel_iommu_init()" This reverts commit 9cd4f1434479f1ac25c440c421fbf52069079914. Some issues were reported on the original commit. Some thunderbolt devices don't work anymore due to the following DMA fault. DMAR: DRHD: handling fault status reg 2 DMAR: [INTR-REMAP] Request device [09:00.0] fault index 0x8080 [fault reason 0x25] Blocked a compatibility format interrupt request Bring it back for now to avoid functional regression.
Fixes: 9cd4f1434479f ("iommu/vt-d: Fix possible recursive locking in intel_iommu_init()") Link: https://lore.kernel.org/linux-iommu/485A6EA5-6D58-42EA-B298-8571E97422DE@getmailspring.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216497 Cc: Mika Westerberg Cc: # 5.19.x Reported-and-tested-by: George Hilliard Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220920081701.3453504-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 8917a32173c4..d81a51978d01 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -65,7 +65,6 @@ struct dmar_pci_notify_info { extern struct rw_semaphore dmar_global_lock; extern struct list_head dmar_drhd_units; -extern int intel_iommu_enabled; #define for_each_drhd_unit(drhd) \ list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ @@ -89,8 +88,7 @@ extern int intel_iommu_enabled; static inline bool dmar_rcu_check(void) { return rwsem_is_locked(&dmar_global_lock) || - system_state == SYSTEM_BOOTING || - (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); + system_state == SYSTEM_BOOTING; } #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -- cgit v1.2.3 From 65394169bdae073bfb2c6816f5bf095bd7d53e61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= Date: Wed, 29 Jun 2022 14:57:34 +0200 Subject: mtd: track maximum number of bitflips for each read request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mtd_read_oob() callers are currently oblivious to the details of ECC errors detected during the read operation - they only learn (through the return value) whether any corrected bitflips or uncorrectable errors occurred. More detailed ECC information can be useful to user-space applications for making better-informed choices about moving data around. Extend struct mtd_oob_ops with a pointer to a newly-introduced struct mtd_req_stats and set its 'max_bitflips' field to the maximum number of bitflips found in a single ECC step during the read operation performed by mtd_read_oob(). This is a prerequisite for ultimately passing that value back to user space. 
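A sketch of how a caller opts in to the new statistics (illustrative; error handling trimmed):

/* Pass a stats buffer through mtd_oob_ops; a NULL .stats pointer
 * keeps the old behaviour. */
int read_with_stats(struct mtd_info *mtd, loff_t from, size_t len, u8 *buf)
{
	struct mtd_req_stats stats = { };
	struct mtd_oob_ops ops = {
		.mode	= MTD_OPS_PLACE_OOB,
		.len	= len,
		.datbuf	= buf,
		.stats	= &stats,
	};
	int ret = mtd_read_oob(mtd, from, &ops);

	pr_info("max bitflips in a single ECC step: %u\n",
		stats.max_bitflips);
	return ret;
}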
Suggested-by: Boris Brezillon Signed-off-by: Michał Kępień Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-2-kernel@kempniu.pl --- include/linux/mtd/mtd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 955aee14b0f7..fccad1766458 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -40,6 +40,10 @@ struct mtd_erase_region_info { unsigned long *lockmap; /* If keeping bitmap of locks */ }; +struct mtd_req_stats { + unsigned int max_bitflips; +}; + /** * struct mtd_oob_ops - oob operation operands * @mode: operation mode @@ -70,6 +74,7 @@ struct mtd_oob_ops { uint32_t ooboffs; uint8_t *datbuf; uint8_t *oobbuf; + struct mtd_req_stats *stats; }; /** -- cgit v1.2.3 From 7bea6056927727f98f4efdd338f112f7517f05b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= Date: Wed, 29 Jun 2022 14:57:36 +0200 Subject: mtd: add ECC error accounting for each read request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend struct mtd_req_stats with two new fields holding the number of corrected bitflips and uncorrectable errors detected during a read operation. This is a prerequisite for ultimately passing those counters to user space, where they can be useful to applications for making better-informed choices about moving data around. Unlike 'max_bitflips' (which is set - in a common code path - to the return value of a function called while the MTD device's mutex is held), these counters have to be maintained in each MTD driver which defines the '_read_oob' callback because the statistics need to be calculated while the MTD device's mutex is held. Suggested-by: Boris Brezillon Signed-off-by: Michał Kępień Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-4-kernel@kempniu.pl --- include/linux/mtd/mtd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index fccad1766458..c12a5930f32c 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -41,6 +41,8 @@ struct mtd_erase_region_info { }; struct mtd_req_stats { + unsigned int uncorrectable_errors; + unsigned int corrected_bitflips; unsigned int max_bitflips; }; -- cgit v1.2.3 From b2bed51a5261f4266ecb857bba680a7f668d3ddf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Sep 2022 13:06:26 -0700 Subject: block: Fix the enum blk_eh_timer_return documentation The documentation of the blk_eh_timer_return enumeration values does not reflect correctly how e.g. the SCSI core uses these values. Fix the documentation. 
Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Fixes: 88b0cfad2888 ("block: document the blk_eh_timer_return values") Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220920200626.3422296-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 74b99d716b0b..e21ff85751cf 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,9 +268,16 @@ static inline void rq_list_move(struct request **src, struct request **dst, rq_list_add(dst, rq); } +/** + * enum blk_eh_timer_return - How the timeout handler should proceed + * @BLK_EH_DONE: The block driver completed the command or will complete it at + * a later time. + * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the + * request to complete. + */ enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ + BLK_EH_DONE, + BLK_EH_RESET_TIMER, }; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -- cgit v1.2.3 From 9f0deaa12d832f488500a5afe9b912e9b3cfc432 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 16 Aug 2022 06:59:59 -0700 Subject: eventfd: guard wake_up in eventfd fs calls as well Guard wakeups that the user can trigger, and that may end up triggering a call back into eventfd_signal. This is in addition to the current approach that only guards in eventfd_signal. Rename in_eventfd_signal -> in_eventfd at the same time to reflect this. Without this there would be a deadlock in the following code using libaio: int main() { struct io_context *ctx = NULL; struct iocb iocb; struct iocb *iocbs[] = { &iocb }; int evfd; uint64_t val = 1; evfd = eventfd(0, EFD_CLOEXEC); assert(!io_setup(2, &ctx)); io_prep_poll(&iocb, evfd, POLLIN); io_set_eventfd(&iocb, evfd); assert(1 == io_submit(ctx, 1, iocbs)); write(evfd, &val, 8); } Signed-off-by: Dylan Yudaken Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20220816135959.1490641-1-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/eventfd.h | 2 +- include/linux/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 305d5f19093b..30eb30d6909b 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -46,7 +46,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); static inline bool eventfd_signal_allowed(void) { - return !current->in_eventfd_signal; + return !current->in_eventfd; } #else /* CONFIG_EVENTFD */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..8d82d6d32670 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -936,7 +936,7 @@ struct task_struct { #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ - unsigned in_eventfd_signal:1; + unsigned in_eventfd:1; #endif #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; -- cgit v1.2.3 From c0e0d6ba25f180ab76d3c18f8b360a119dffa634 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:10 -0700 Subject: io_uring: add IORING_SETUP_DEFER_TASKRUN Allow deferring async tasks until the user calls io_uring_enter(2) with the IORING_ENTER_GETEVENTS flag. Enable this mode with a flag at io_uring_setup time. 
This functionality requires that the later io_uring_enter call is made from the same submission task; therefore, restrict this flag to work only when IORING_SETUP_SINGLE_ISSUER is also set. Being able to hand-pick when task work is run prevents the problem where task work runs even though there is current work to be done. For example, a common workload would obtain a batch of CQEs, and process each one. Interrupting this with additional task work would add latency but gain nothing. If instead task work is deferred to just before more CQEs are obtained then no additional latency is added. The way this is implemented is by trying to keep task work local to an io_ring_ctx, rather than to the submission task. This is required, as the application will want to wake up only a single io_ring_ctx at a time to process work, and so the lists of work have to be kept separate. This has some other benefits like not having to check the task continually in handle_tw_list (and potentially unlocking/locking those), and reducing locks in the submit & process completions path. There are networking cases where using this option can reduce request latency by 50%. For example, a contrived benchmark using [1], where the client sends 2k of data and receives the same data back while doing some system calls (to trigger task work), shows this reduction. The reason ends up being that if sending responses is delayed by processing task work, then the client side sits idle. Whereas reordering the sends first means that the client runs its workload in parallel with the local task work. [1]: Using https://github.com/DylanZA/netbench/tree/defer_run Client: ./netbench --client_only 1 --control_port 10000 --host --tx "epoll --threads 16 --per_thread 1 --size 2048 --resp 2048 --workload 1000" Server: ./netbench --server_only 1 --control_port 10000 --rx "io_uring --defer_taskrun 0 --workload 100" --rx "io_uring --defer_taskrun 1 --workload 100" Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-5-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 677a25d44d7f..d56ff2185168 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -301,6 +301,8 @@ struct io_ring_ctx { struct io_hash_table cancel_table; bool poll_multi_queue; + struct llist_head work_llist; + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 21a091b970cdbcf3e8ff829234b51be6f9192766 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:12 -0700 Subject: io_uring: signal registered eventfd to process deferred task work Some workloads rely on a registered eventfd (via io_uring_register_eventfd(3)) in order to wake up and process the io_uring. In the case of a ring setup with IORING_SETUP_DEFER_TASKRUN, that eventfd also needs to be signalled when there are tasks to run. This changes an old behaviour which assumed 1 eventfd signal implied at least 1 CQE, though only when this new flag is set (so old users will not notice). This should be expected with the IORING_SETUP_DEFER_TASKRUN flag as it is not guaranteed that every task will result in a CQE.
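From userspace, the two features combine roughly as follows (a liburing-based sketch; exact flag availability depends on the kernel and liburing versions in use):

#include <liburing.h>

/* Single-issuer ring whose task work is only run when the submitting
 * thread enters the kernel to reap completions. */
int setup_deferred_ring(struct io_uring *ring)
{
	int ret = io_uring_queue_init(64, ring,
				      IORING_SETUP_SINGLE_ISSUER |
				      IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0)
		return ret;

	/* Submissions go here; the deferred task work is processed when
	 * completions are waited for (IORING_ENTER_GETEVENTS), and a
	 * registered eventfd, if any, is signalled when work is queued. */
	return 0;
}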
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-7-dylany@fb.com [axboe: fold in call_rcu() serialization fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d56ff2185168..aa4d90a53866 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -184,6 +184,8 @@ struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; unsigned int eventfd_async: 1; struct rcu_head rcu; + atomic_t refs; + atomic_t ops; }; struct io_alloc_cache { -- cgit v1.2.3 From de27e18e86173b704beaa19f0ee376f3305c4794 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:40 +0530 Subject: fs: add file_operations->uring_cmd_iopoll io_uring will invoke this to do completion polling on uring-cmd operations. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220823161443.49436-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..d6badd19784f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2132,6 +2132,7 @@ struct file_operations { loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); + int (*uring_cmd_iopoll)(struct io_uring_cmd *ioucmd); } __randomize_layout; struct inode_operations { -- cgit v1.2.3 From 5756a3a7e713bcab705a5f0c810a2b1f7f4ecfaa Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:41 +0530 Subject: io_uring: add iopoll infrastructure for io_uring_cmd Put this up in the same way as iopoll is done for regular read/write IO. Make place for storing a cookie into struct io_uring_cmd on submission. Perform the completion using the ->uring_cmd_iopoll handler. Signed-off-by: Kanchan Joshi Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220823161443.49436-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 4a2f6cc5a492..58676c0a398f 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -20,8 +20,12 @@ enum io_uring_cmd_flags { struct io_uring_cmd { struct file *file; const void *cmd; - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd); + union { + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + /* used for polled completion */ + void *cookie; + }; u32 cmd_op; u32 pad; u8 pdu[32]; /* available inline for free use */ -- cgit v1.2.3 From c6e99ea482e2a9e1fef2488891242f9749584225 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 23 Aug 2022 21:44:42 +0530 Subject: block: export blk_rq_is_poll This is in preparation to support iopoll for nvme passthrough. 
Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220823161443.49436-4-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92294a5fb083..de384f5d2c37 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -980,6 +980,7 @@ int blk_rq_map_kern(struct request_queue *, struct request *, void *, int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); +bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; -- cgit v1.2.3 From de97fcb30316410a2c46be102f074a454ecc6cf1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Sep 2022 15:18:05 -0600 Subject: fs: add batch and poll flags to the uring_cmd_iopoll() handler We need the poll_flags to know how to poll for the IO, and we should have the batch structure in preparation for supporting batched completions with iopoll. Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d6badd19784f..01681d061a6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2132,7 +2132,8 @@ struct file_operations { loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); - int (*uring_cmd_iopoll)(struct io_uring_cmd *ioucmd); + int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, + unsigned int poll_flags); } __randomize_layout; struct inode_operations { -- cgit v1.2.3 From 1d7b61c06dc310421911dac7c5d2d15b754c8b63 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Wed, 21 Sep 2022 15:50:44 +0200 Subject: remoteproc: virtio: Create platform device for the remoteproc_virtio Define a platform driver to manage the remoteproc virtio device as a platform device. The platform device allows passing rproc_vdev_data platform data to specify properties that are stored in the rproc_vdev structure. This approach preserves legacy remoteproc virtio device creation while also allowing the device to be probed via the device tree mechanism. remoteproc_virtio.c update: - Add the rproc_virtio_driver platform driver. The probe ops replaces the rproc_rvdev_add_device function. - All references to rvdev->dev have been updated to rvdev->pdev->dev. - rproc_rvdev_release is removed, as it was associated with the rvdev device. - The use of the rvdev->kref counter is replaced by get/put_device on the remoteproc virtio platform device. - The vdev device no longer increments the rproc device counter. Increment/decrement is done in the rproc_virtio_probe/rproc_virtio_remove functions in charge of the vring allocation/free. remoteproc_core.c update: Migrate from the rvdev device to the rvdev platform device. From this patch, when a vdev resource is found in the resource table, the remoteproc core registers a platform device.
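The core-side registration then amounts to something like the following (a sketch; the rproc_vdev_data fields shown are assumptions for illustration, not the exact structure from this series):

/* Sketch: hand the parsed vdev resource to the "rproc-virtio"
 * platform driver instead of open-coding a bare struct device. */
struct rproc_vdev_data vdev_data = {
	.id    = rsc->id,	/* assumed field */
	.index = index,		/* assumed field */
};
struct platform_device *pdev;

pdev = platform_device_register_data(&rproc->dev, "rproc-virtio", index,
				     &vdev_data, sizeof(vdev_data));
if (IS_ERR(pdev))
	return PTR_ERR(pdev);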
Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20220921135044.917140-5-arnaud.pouliquen@foss.st.com Signed-off-by: Mathieu Poirier --- include/linux/remoteproc.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index aea79c77db0f..1abf56ad02da 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -616,9 +616,8 @@ struct rproc_vring { /** * struct rproc_vdev - remoteproc state for a supported virtio device - * @refcount: reference counter for the vdev and vring allocations * @subdev: handle for registering the vdev as a rproc subdevice - * @dev: device struct used for reference count semantics + * @pdev: remoteproc virtio platform device * @id: virtio device id (as in virtio_ids.h) * @node: list node * @rproc: the rproc handle @@ -627,10 +626,9 @@ struct rproc_vring { * @index: vdev position versus other vdev declared in resource table */ struct rproc_vdev { - struct kref refcount; struct rproc_subdev subdev; - struct device dev; + struct platform_device *pdev; unsigned int id; struct list_head node; -- cgit v1.2.3 From 14a99e130f2758bc826a7e7a8bdf6f7400b54f0f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:28 -0700 Subject: lib/find_bit: create find_first_zero_bit_le() find_first_zero_bit_le() is an alias to find_next_zero_bit_le(), despite that 'next' is known to be slower than 'first' version. Now that we have common FIND_FIRST_BIT() macro helper, it's trivial to implement find_first_zero_bit_le() as a real function. Reviewed-by: Valentin Schneider Signed-off-by: Yury Norov --- include/linux/find.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 424ef67d4a42..2464bff5de04 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -17,6 +17,10 @@ extern unsigned long _find_first_and_bit(const unsigned long *addr1, extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); +#ifdef __BIG_ENDIAN +unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); +#endif + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -251,6 +255,20 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned } #endif +#ifndef find_first_zero_bit_le +static inline +unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); + + return val == ~0UL ? 
size : ffz(val); + } + + return _find_first_zero_bit_le(addr, size); +} +#endif + #ifndef find_next_bit_le static inline unsigned long find_next_bit_le(const void *addr, unsigned @@ -270,11 +288,6 @@ unsigned long find_next_bit_le(const void *addr, unsigned } #endif -#ifndef find_first_zero_bit_le -#define find_first_zero_bit_le(addr, size) \ - find_next_zero_bit_le((addr), (size), 0) -#endif - #else #error "Please fix " #endif -- cgit v1.2.3 From e79864f3164c573afce09ec4b72b75ebe061c14d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:29 -0700 Subject: lib/find_bit: optimize find_next_bit() functions Over the past couple years, the function _find_next_bit() was extended with parameters that modify its behavior to implement and- zero- and le- flavors. The parameters are passed at compile time, but current design prevents a compiler from optimizing out the conditionals. As find_next_bit() API grows, I expect that more parameters will be added. Current design would require more conditional code in _find_next_bit(), which would bloat the helper even more and make it barely readable. This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds a set of wrappers, so that the compile-time optimizations become possible. The common logic is moved to the new macro, and all flavors may be generated by providing a FETCH macro parameter, like in this example: #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ... find_next_xornot_and_bit(addr1, addr2, addr3, size, start) { return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx], /* nop */, size, start); } The FETCH may be of any complexity, as soon as it only refers the bitmap(s) and an iterator idx. MUNGE is here to support _le code generation for BE builds. May be empty. I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top of 6.0-rc2 + this series. The results for kvm/x86_64 are: v6.0-rc2 Optimized Difference Z-score Random dense bitmap ns ns ns % find_next_bit: 787735 670546 117189 14.9 3.97 find_next_zero_bit: 777492 664208 113284 14.6 10.51 find_last_bit: 830925 687573 143352 17.3 2.35 find_first_bit: 3874366 3306635 567731 14.7 1.84 find_first_and_bit: 40677125 37739887 2937238 7.2 1.36 find_next_and_bit: 347865 304456 43409 12.5 1.35 Random sparse bitmap find_next_bit: 19816 14021 5795 29.2 6.10 find_next_zero_bit: 1318901 1223794 95107 7.2 1.41 find_last_bit: 14573 13514 1059 7.3 6.92 find_first_bit: 1313321 1249024 64297 4.9 1.53 find_first_and_bit: 8921 8098 823 9.2 4.56 find_next_and_bit: 9796 7176 2620 26.7 5.39 Where the statistics is significant (z-score > 3), the improvement is ~15%. 
According to the bloat-o-meter, the Image size is 10-11K less: x86_64/defconfig: add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177) arm64/defconfig: add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948) Suggested-by: Linus Torvalds Signed-off-by: Yury Norov --- include/linux/find.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 2464bff5de04..dead6f53a97b 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -8,9 +8,12 @@ #include -extern unsigned long _find_next_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert, unsigned long le); +unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, + unsigned long start); +unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start); +unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, + unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); @@ -19,6 +22,10 @@ extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long siz #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); +unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned + long size, unsigned long offset); +unsigned long _find_next_bit_le(const unsigned long *addr, unsigned + long size, unsigned long offset); #endif #ifndef find_next_bit @@ -45,7 +52,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, return val ? __ffs(val) : size; } - return _find_next_bit(addr, NULL, size, offset, 0UL, 0); + return _find_next_bit(addr, size, offset); } #endif @@ -75,7 +82,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, return val ? __ffs(val) : size; } - return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); + return _find_next_and_bit(addr1, addr2, size, offset); } #endif @@ -103,7 +110,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, return val == ~0UL ? size : ffz(val); } - return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); + return _find_next_zero_bit(addr, size, offset); } #endif @@ -251,7 +258,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned return val == ~0UL ? size : ffz(val); } - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); + return _find_next_zero_bit_le(addr, size, offset); } #endif @@ -284,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned return val ? __ffs(val) : size; } - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); + return _find_next_bit_le(addr, size, offset); } #endif -- cgit v1.2.3 From cb9ff3f3b84c95867856c3be42de73972feb1249 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:47 +0800 Subject: vfio: Add helpers for unifying vfio_device life cycle The idea is to let the vfio core manage the vfio_device life cycle instead of duplicating the logic across drivers. This is also a preparatory step for adding struct device into vfio_device.
New pair of helpers together with a kref in vfio_device: - vfio_alloc_device() - vfio_put_device() Drivers can register @init/@release callbacks to manage any private state wrapping the vfio_device. However, vfio-ccw doesn't fit this model due to a life cycle mess: its private structure mixes both parent and mdev info and hence must be allocated/freed outside of the life cycle of the vfio device. Per prior discussions, this won't be fixed in the short term by the IBM folks. Instead of waiting for those modifications, introduce another helper vfio_init_device() so ccw can call it to initialize a pre-allocated vfio_device. A further implication of the ccw trick is that vfio_device cannot be freed uniformly in the vfio core. Instead, require *EVERY* driver to implement @release and free the vfio_device inside it. Then ccw can choose to delay the free at its own discretion. Another trick down the road is that kvzalloc() is used to accommodate the need of gvt, which uses vzalloc(), while all others use kzalloc(). So drivers should call a helper vfio_free_device() to free the vfio_device instead of assuming that kfree() or vfree() is applicable. Later, once the ccw mess is fixed, we can remove those tricks and fully handle structure alloc/free in the vfio core. Existing vfio_{un}init_group_dev() will be deprecated after all existing usages are converted to the new model. Suggested-by: Jason Gunthorpe Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-2-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0e2826559091..f67cac700e6f 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - refcount_t refcount; + struct kref kref; /* object life cycle */ + refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; struct list_head group_next; @@ -57,6 +58,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @init: initialize private fields in device structure + * @release: Reclaim private fields in device structure + * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -74,6 +77,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*init)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -161,6 +166,24 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops, return 1; } +struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, + const struct vfio_device_ops *ops); +#define vfio_alloc_device(dev_struct, member, dev, ops) \ + container_of(_vfio_alloc_device(sizeof(struct dev_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct dev_struct, member)), \ + dev, ops), \ + struct dev_struct, member) + +int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); +void vfio_free_device(struct
vfio_device *device); +void vfio_device_release(struct kref *kref); +static inline void vfio_put_device(struct vfio_device *device) +{ + kref_put(&device->kref, vfio_device_release); +} + void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); -- cgit v1.2.3 From 63d7c77989de98d3e92611dbb858028b74dca377 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:48 +0800 Subject: vfio/pci: Use the new device life cycle helpers Also introduce two pci core helpers as @init/@release for pci drivers: - vfio_pci_core_init_dev() - vfio_pci_core_release_dev() Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-3-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 089b603bcfdc..0499ea836058 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -109,6 +109,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev); void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_init_dev(struct vfio_device *core_vdev); +void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); -- cgit v1.2.3 From 27aeb915595b87165a3004aab05b2b837d01e6ed Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:50 +0800 Subject: vfio/hisi_acc: Use the new device life cycle helpers Tidy up @probe so all migration specific initialization logic is moved to migration specific @init callback. Remove vfio_pci_core_{un}init_device() given no user now. 
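Taken together, the helpers above suggest the following driver-side pattern. This is a hedged sketch only: struct my_vdev, its callbacks, and the probe flow are hypothetical, with just vfio_alloc_device()/vfio_free_device()/vfio_put_device() and the @init/@release ops taken from the patches shown above.

	#include <linux/vfio.h>

	struct my_vdev {
		struct vfio_device vdev;	/* must be the first member */
		void *priv;			/* driver private state */
	};

	static int my_init(struct vfio_device *core)
	{
		struct my_vdev *mv = container_of(core, struct my_vdev, vdev);

		mv->priv = NULL;		/* init private fields */
		return 0;
	}

	static void my_release(struct vfio_device *core)
	{
		/* Every driver must free the vfio_device in @release. */
		vfio_free_device(core);
	}

	static const struct vfio_device_ops my_ops = {
		.name		= "my-vfio-drv",
		.init		= my_init,
		.release	= my_release,
	};

	static int my_probe(struct device *dev)
	{
		struct my_vdev *mv;

		mv = vfio_alloc_device(my_vdev, vdev, dev, &my_ops);
		if (IS_ERR(mv))
			return PTR_ERR(mv);
		/* ... register; vfio_put_device(&mv->vdev) on teardown ... */
		return 0;
	}

The BUILD_BUG_ON_ZERO(offsetof(...)) in the vfio_alloc_device() macro enforces at build time that the embedded vfio_device sits at offset zero, which is what makes the container_of() and the IS_ERR() check above well-defined.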
Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220921104401.38898-5-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 0499ea836058..367fd79226a3 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -106,13 +106,9 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops); int vfio_pci_core_init_dev(struct vfio_device *core_vdev); void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); extern const struct pci_error_handlers vfio_pci_core_err_handlers; int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, -- cgit v1.2.3 From ebb72b765fb49685c4603d2bff47a4ab5d2580a9 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:59 +0800 Subject: vfio/ccw: Use the new device life cycle helpers ccw is the only exception which cannot use vfio_alloc_device(), because its private device structure is designed to serve both mdev and parent. The life cycle of the parent is managed by css_driver, so vfio_ccw_private must be allocated/freed in the css_driver probe/remove path instead of conforming to the vfio core life cycle for mdev. Given that, use a wait/completion scheme so the mdev remove path waits after vfio_put_device() until receiving a completion notification from @release. The completion indicates that all active references on the vfio_device have been released. After that point, although the free of vfio_ccw_private is delayed to css_driver, it is at least guaranteed that no other code path holds a parallel reference to the released vfio device part. The memset() in @probe is removed. vfio_device is either already cleared when probed for the first time or cleared in @release from the last probe. The right fix is to introduce separate structures for mdev and parent, but this won't happen in the short term per prior discussions. Remove vfio_init/uninit_group_dev() as they have no users now.
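A hedged sketch of the wait/completion scheme just described; the field and function names are illustrative, not the actual vfio-ccw code.

	#include <linux/completion.h>
	#include <linux/mdev.h>
	#include <linux/vfio.h>

	struct vfio_ccw_private {
		struct vfio_device vdev;	/* set up via vfio_init_device() */
		struct completion release_comp;
		/* ... mixed parent and mdev state ... */
	};

	static void vfio_ccw_release(struct vfio_device *vdev)
	{
		struct vfio_ccw_private *private =
			container_of(vdev, struct vfio_ccw_private, vdev);

		/* Do not free here: the structure lives until css_driver remove. */
		complete(&private->release_comp);
	}

	static void vfio_ccw_mdev_remove(struct mdev_device *mdev)
	{
		struct vfio_ccw_private *private =
			dev_get_drvdata(mdev->dev.parent);	/* assumed lookup */

		vfio_unregister_group_dev(&private->vdev);
		vfio_put_device(&private->vdev);
		/* Wait until @release signals that all references are gone. */
		wait_for_completion(&private->release_comp);
		/* Freeing vfio_ccw_private itself is deferred to css_driver. */
	}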
Suggested-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220921104401.38898-14-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f67cac700e6f..3cf857b1eec7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -184,9 +184,6 @@ static inline void vfio_put_device(struct vfio_device *device) kref_put(&device->kref, vfio_device_release); } -void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); -void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -- cgit v1.2.3 From 3c28a76124b25882411f005924be73795b6ef078 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:44:01 +0800 Subject: vfio: Add struct device to vfio_device and replace kref. With it a 'vfio-dev/vfioX' node is created under the sysfs path of the parent, indicating the device is bound to a vfio driver, e.g.: /sys/devices/pci0000\:6f/0000\:6f\:01.0/vfio-dev/vfio0 It is also a preparatory step toward adding cdev for supporting future device-oriented uAPI. Add Documentation/ABI/testing/sysfs-devices-vfio-dev. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-16-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 3cf857b1eec7..ee399a768070 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - struct kref kref; /* object life cycle */ + unsigned int index; + struct device device; /* device.kref covers object life cycle */ refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; @@ -178,10 +179,9 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, int vfio_init_device(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); -void vfio_device_release(struct kref *kref); static inline void vfio_put_device(struct vfio_device *device) { - kref_put(&device->kref, vfio_device_release); + put_device(&device->device); } int vfio_register_group_dev(struct vfio_device *device); -- cgit v1.2.3 From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer.
This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we add one or more BPF helpers. A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) -- cgit v1.2.3 From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_user_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs, which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL. Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained.
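To make the helper's shape concrete, here is a hedged BPF-side sketch of a consumer; the map name, ring size, attach point, and callback body are assumptions for illustration, while the BPF_MAP_TYPE_USER_RINGBUF map type, the bpf_user_ringbuf_drain() signature, and the callback signature come from the commit above.

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
		__uint(max_entries, 256 * 1024);	/* ring size in bytes */
	} user_rb SEC(".maps");

	static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
	{
		long *drained = ctx;

		(*drained)++;
		/* bpf_dynptr_read() could copy the sample payload out here. */
		return 0;	/* 0 continues draining, non-zero stops */
	}

	SEC("kprobe/do_nanosleep")	/* arbitrary attach point for the sketch */
	int drain_user_rb(void *ctx)
	{
		long drained = 0;

		/* Runs handle_sample() on each published sample; flags = 0
		 * keeps the default wakeup behavior described above.
		 */
		bpf_user_ringbuf_drain(&user_rb, handle_sample, &drained, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";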
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- include/linux/bpf.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain(). + */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; -- cgit v1.2.3 From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:42 +0200 Subject: btf: Allow dynamic pointer parameters in kfuncs Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as parameters in kfuncs. Also, ensure that dynamic pointers passed as arguments are valid and initialized, are pointers to the stack, and are of the local type. More dynamic pointer types can be supported in the future. To properly detect whether a parameter is of the desired type, introduce the stringify_struct() macro to compare the returned structure name with the desired name. In addition, protect against structure renames by halting the build with BUILD_BUG_ON(), so that developers have to revisit the code. To check if a dynamic pointer passed to the kfunc is valid and initialized, and if its type is local, export the existing functions is_dynptr_reg_valid_init() and is_dynptr_type_expected().
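A hedged sketch of the convention this enables: a kfunc that takes a dynptr parameter typed struct bpf_dynptr_kern *, registered through the BTF kfunc ID-set API of this era. The kfunc name and the registration set are hypothetical; bpf_dynptr_get_size() is the accessor exported by the following patch in this series.

	#include <linux/bpf.h>
	#include <linux/btf.h>
	#include <linux/btf_ids.h>
	#include <linux/init.h>
	#include <linux/module.h>

	/* The verifier checks the argument is a valid, initialized, local
	 * (on-stack) dynptr before this function is ever called.
	 */
	noinline u32 bpf_demo_dynptr_size(struct bpf_dynptr_kern *ptr)
	{
		return bpf_dynptr_get_size(ptr);
	}

	BTF_SET8_START(demo_kfunc_ids)
	BTF_ID_FLAGS(func, bpf_demo_dynptr_size)
	BTF_SET8_END(demo_kfunc_ids)

	static const struct btf_kfunc_id_set demo_kfunc_set = {
		.owner	= THIS_MODULE,
		.set	= &demo_kfunc_ids,
	};

	static int __init demo_kfunc_init(void)
	{
		return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
						 &demo_kfunc_set);
	}
	late_initcall(demo_kfunc_init);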
Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ include/linux/btf.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e197f8fb27e2..9e1e6965f407 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1fcc833a8690..f9aababc5d78 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -52,6 +52,15 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +/* + * Return the name of the passed struct, if exists, or halt the build if for + * example the structure gets renamed. In this way, developers have to revisit + * the code using that structure name, and update it accordingly. + */ +#define stringify_struct(x) \ + ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ + __stringify(x); }) + struct btf; struct btf_member; struct btf_type; -- cgit v1.2.3 From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:43 +0200 Subject: bpf: Export bpf_dynptr_get_size() Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic pointers can obtain the real size of data carried by this data structure. Signed-off-by: Roberto Sassu Reviewed-by: Joanne Koong Acked-by: KP Singh Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33e543b86e1a..6535fb1e21b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); -- cgit v1.2.3 From 90fd8f26edd47942203639bf3a5dde8fa1931a0e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:44 +0200 Subject: KEYS: Move KEY_LOOKUP_ to include/linux/key.h and define KEY_LOOKUP_ALL In preparation for the patch that introduces the bpf_lookup_user_key() eBPF kfunc, move KEY_LOOKUP_ definitions to include/linux/key.h, to be able to validate the kfunc parameters. Add them to enum key_lookup_flag, so that all the current ones and the ones defined in the future are automatically exported through BTF and available to eBPF programs. 
Also, add KEY_LOOKUP_ALL to the enum, with the logical OR of the currently defined flags as its value, to facilitate checking whether a variable contains only those flags. Signed-off-by: Roberto Sassu Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20220920075951.929132-7-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/key.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 7febc4881363..d27477faf00d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -88,6 +88,12 @@ enum key_need_perm { KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; +enum key_lookup_flag { + KEY_LOOKUP_CREATE = 0x01, + KEY_LOOKUP_PARTIAL = 0x02, + KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), +}; + struct seq_file; struct user_struct; struct signal_struct; -- cgit v1.2.3 From f3cf4134c5c6c47b9b5c7aa3cb2d67e107887a7b Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:45 +0200 Subject: bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs Add the bpf_lookup_user_key(), bpf_lookup_system_key() and bpf_key_put() kfuncs, to respectively search a key with a given key handle serial number and flags, obtain a key from a pre-determined ID defined in include/linux/verification.h, and clean up. Introduce system_keyring_id_check() to validate the keyring ID parameter of bpf_lookup_system_key(). Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Acked-by: Song Liu Link: https://lore.kernel.org/r/20220920075951.929132-8-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ include/linux/verification.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6535fb1e21b9..a1435b019aca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2664,4 +2664,12 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ +struct key; + +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ #endif /* _LINUX_BPF_H */ diff --git a/include/linux/verification.h b/include/linux/verification.h index a655923335ae..f34e50ebcf60 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -17,6 +17,14 @@ #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL) #define VERIFY_USE_PLATFORM_KEYRING ((struct key *)2UL) +static inline int system_keyring_id_check(u64 id) +{ + if (id > (unsigned long)VERIFY_USE_PLATFORM_KEYRING) + return -EINVAL; + + return 0; +} + /* * The use to which an asymmetric key is being put. */ -- cgit v1.2.3 From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Sep 2022 09:19:14 +0200 Subject: bpf: Prevent bpf program recursion for raw tracepoint probes We got a report from syzbot [1] about warnings that were caused by a bpf program attached to the contention_begin raw tracepoint triggering the same tracepoint by using the bpf_trace_printk helper, which takes the trace_printk_lock lock. Call Trace:
? trace_event_raw_event_bpf_trace_printk+0x5f/0x90
bpf_trace_printk+0x2b/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
bpf_trace_printk+0x3f/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
bpf_trace_printk+0x3f/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
bpf_trace_printk+0x3f/0xe0
bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
bpf_trace_run2+0x26/0x90
native_queued_spin_lock_slowpath+0x1c6/0x2b0
_raw_spin_lock_irqsave+0x44/0x50
__unfreeze_partials+0x5b/0x160
...

This can be reproduced by attaching a bpf program as a raw tracepoint on the contention_begin tracepoint. The bpf prog calls the bpf_trace_printk helper. Then, by running perf bench, the spin lock code is forced to take the slow path and call the contention_begin tracepoint. Fix this by skipping execution of the bpf program if it's already running, using the bpf prog 'active' field, which is currently used by trampoline programs for the same reason. Move bpf_prog_inc_misses_counter() to syscall.c because trampoline.c is compiled in just for the CONFIG_BPF_JIT option. Reviewed-by: Stanislav Fomichev Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com [1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1435b019aca..edd43edb27d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } + +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void) { return false; } + +static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -- cgit v1.2.3 From d7e7b9af104c7b389a0c21eb26532511bce4b510 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Sep 2022 12:32:06 -0700 Subject: fscrypt: stop using keyrings subsystem for fscrypt_master_key The approach of fs/crypto/ internally managing the fscrypt_master_key structs as the payloads of "struct key" objects contained in a "struct key" keyring has outlived its usefulness. The original idea was to simplify the code by reusing code from the keyrings subsystem. However, several issues have arisen that can't easily be resolved: - When a master key struct is destroyed, blk_crypto_evict_key() must be called on any per-mode keys embedded in it. (This started being the case when inline encryption support was added.) Yet, the keyrings subsystem can arbitrarily delay the destruction of keys, even past the time the filesystem was unmounted. Therefore, currently there is no easy way to call blk_crypto_evict_key() when a master key is destroyed. Currently, this is worked around by holding an extra reference to the filesystem's request_queue(s).
But it was overlooked that the request_queue reference is *not* guaranteed to pin the corresponding blk_crypto_profile too; for device-mapper devices that support inline crypto, it doesn't. This can cause a use-after-free. - When the last inode that was using an incompletely-removed master key is evicted, the master key removal is completed by removing the key struct from the keyring. Currently this is done via key_invalidate(). Yet, key_invalidate() takes the key semaphore. This can deadlock when called from the shrinker, since in fscrypt_ioctl_add_key(), memory is allocated with GFP_KERNEL under the same semaphore. - More generally, the fact that the keyrings subsystem can arbitrarily delay the destruction of keys (via garbage collection delay, or via random processes getting temporary key references) is undesirable, as it means we can't strictly guarantee that all secrets are ever wiped. - Doing the master key lookups via the keyrings subsystem results in the key_permission LSM hook being called. fscrypt doesn't want this, as all access control for encrypted files is designed to happen via the files themselves, like any other files. The workaround which SELinux users are using is to change their SELinux policy to grant key search access to all domains. This works, but it is an odd extra step that shouldn't really have to be done. The fix for all these issues is to change the implementation to what I should have done originally: don't use the keyrings subsystem to keep track of the filesystem's fscrypt_master_key structs. Instead, just store them in a regular kernel data structure, and rework the reference counting, locking, and lifetime accordingly. Retain support for RCU-mode key lookups by using a hash table. Replace fscrypt_sb_free() with fscrypt_sb_delete(), which releases the keys synchronously and runs a bit earlier during unmount, so that block devices are still available. A side effect of this patch is that neither the master keys themselves nor the filesystem keyrings will be listed in /proc/keys anymore. ("Master key users" and the master key users keyrings will still be listed.) However, this was mostly an implementation detail, and it was intended just for debugging purposes. I don't know of anyone using it. This patch does *not* change how "master key users" (->mk_users) works; that still uses the keyrings subsystem. That is still needed for key quotas, and changing that isn't necessary to solve the issues listed above. If we decide to change that too, it would be a separate patch. I've marked this as fixing the original commit that added the fscrypt keyring, but as noted above the most important issue that this patch fixes wasn't introduced until the addition of inline encryption support. 
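A hedged sketch of the replacement described above: fscrypt_master_key structs kept in a plain hash table with RCU-mode lookup, instead of keyring payloads. All names except the sb->s_master_keys field (whose new type appears in the diff below) are illustrative, not the actual fs/crypto implementation.

	#include <linux/fs.h>
	#include <linux/rculist.h>
	#include <linux/refcount.h>

	struct fscrypt_keyring {
		spinlock_t lock;			/* protects adds/removes */
		struct hlist_head key_hashtable[128];	/* buckets of master keys */
	};

	static struct fscrypt_master_key *
	fscrypt_find_master_key(struct super_block *sb,
				const struct fscrypt_key_specifier *mk_spec)
	{
		struct fscrypt_keyring *keyring = sb->s_master_keys;
		struct fscrypt_master_key *mk;
		struct hlist_head *bucket;

		bucket = &keyring->key_hashtable[fscrypt_mk_hash(mk_spec)]; /* assumed */
		rcu_read_lock();
		hlist_for_each_entry_rcu(mk, bucket, mk_node) {
			if (fscrypt_mk_matches(mk, mk_spec) &&		/* assumed */
			    refcount_inc_not_zero(&mk->mk_active_refs))
				break;	/* got a stable reference */
		}
		rcu_read_unlock();
		return mk;	/* NULL if the loop completed without a match */
	}

Because the filesystem owns this structure outright, it can release all keys synchronously at unmount (the fscrypt_sb_delete() rework below), which is what makes the timely blk_crypto_evict_key() calls possible.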
Fixes: 22d94f493bfb ("fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220901193208.138056-2-ebiggers@kernel.org --- include/linux/fs.h | 2 +- include/linux/fscrypt.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..0830486f47ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1472,7 +1472,7 @@ struct super_block { const struct xattr_handler **s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; - struct key *s_master_keys; /* master crypto keys in use */ + struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 488fd8c8f8af..db5bb5650bf2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -310,7 +310,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) } /* keyring.c */ -void fscrypt_sb_free(struct super_block *sb); +void fscrypt_sb_delete(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_add_test_dummy_key(struct super_block *sb, const struct fscrypt_dummy_policy *dummy_policy); @@ -524,7 +524,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) } /* keyring.c */ -static inline void fscrypt_sb_free(struct super_block *sb) +static inline void fscrypt_sb_delete(struct super_block *sb) { } -- cgit v1.2.3 From 0e91fc1e0f5c70ce575451103ec66c2ec21f1a6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Sep 2022 12:32:08 -0700 Subject: fscrypt: work on block_devices instead of request_queues request_queues are a block layer implementation detail that should not leak into file systems. Change the fscrypt inline crypto code to retrieve block devices instead of request_queues from the file system. As part of that, clean up the interaction with multi-device file systems by returning both the number of devices and the actual device array in a single method call. Signed-off-by: Christoph Hellwig [ebiggers: bug fixes and minor tweaks] Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220901193208.138056-4-ebiggers@kernel.org --- include/linux/fscrypt.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index db5bb5650bf2..1f12ebb4a69d 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -161,24 +161,21 @@ struct fscrypt_operations { int *ino_bits_ret, int *lblk_bits_ret); /* - * Return the number of block devices to which the filesystem may write - * encrypted file contents. + * Return an array of pointers to the block devices to which the + * filesystem may write encrypted file contents, NULL if the filesystem + * only has a single such block device, or an ERR_PTR() on error. + * + * On successful non-NULL return, *num_devs is set to the number of + * devices in the returned array. The caller must free the returned + * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. 
*/ - int (*get_num_devices)(struct super_block *sb); - - /* - * If ->get_num_devices() returns a value greater than 1, then this - * function is called to get the array of request_queues that the - * filesystem is using -- one per block device. (There may be duplicate - * entries in this array, as block devices can share a request_queue.) - */ - void (*get_devices)(struct super_block *sb, - struct request_queue **devs); + struct block_device **(*get_devices)(struct super_block *sb, + unsigned int *num_devs); }; static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) -- cgit v1.2.3 From d7f06bdd6ee87fbefa05af5f57361d85e7715b11 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 6 Sep 2022 16:35:42 -0400 Subject: drivers/base: Fix unsigned comparison to -1 in CPUMAP_FILE_MAX_BYTES As PAGE_SIZE is unsigned long, -1 > PAGE_SIZE when NR_CPUS <= 3. This leads to very large file sizes:

topology$ ls -l
total 0
-r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 core_cpus
-r--r--r-- 1 root root 4096 Sep 5 11:59 core_cpus_list
-r--r--r-- 1 root root 4096 Sep 5 10:58 core_id
-r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 core_siblings
-r--r--r-- 1 root root 4096 Sep 5 11:59 core_siblings_list
-r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 die_cpus
-r--r--r-- 1 root root 4096 Sep 5 11:59 die_cpus_list
-r--r--r-- 1 root root 4096 Sep 5 11:59 die_id
-r--r--r-- 1 root root 18446744073709551615 Sep 5 11:59 package_cpus
-r--r--r-- 1 root root 4096 Sep 5 11:59 package_cpus_list
-r--r--r-- 1 root root 4096 Sep 5 10:58 physical_package_id
-r--r--r-- 1 root root 18446744073709551615 Sep 5 10:10 thread_siblings
-r--r--r-- 1 root root 4096 Sep 5 11:59 thread_siblings_list

Adjust the inequality to catch the case when NR_CPUS is configured to a small value. Fixes: 7ee951acd31a ("drivers/base: fix userspace break from using bin_attributes for cpumap and cpulist") Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Yury Norov Cc: stable@vger.kernel.org Cc: feng xiangjun Reported-by: feng xiangjun Signed-off-by: Phil Auld Signed-off-by: Yury Norov Link: https://lore.kernel.org/r/20220906203542.1796629-1-pauld@redhat.com Signed-off-by: Greg Kroah-Hartman --- include/linux/cpumask.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bd047864c7ac..e8ad12b5b9d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1127,9 +1127,10 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * - * Use PAGE_SIZE as a minimum for smaller configurations. + * Use PAGE_SIZE as a minimum for smaller configurations while avoiding + * unsigned comparison to -1. */ -#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \ +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) -- cgit v1.2.3 From cca1fd41ab2862465d75443822d751e4f9a112ee Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 22 Sep 2022 07:20:57 -0400 Subject: counter: Realign counter_comp comment block to 80 characters The member documentation comment lines for struct counter_comp extend past the 80-characters column boundary due to extra indentation at the start of each section.
This patch realigns the comment block within the 80-characters boundary by removing these superfluous indents. Reviewed-by: Yanteng Si Link: https://lore.kernel.org/r/20220902120839.4260-1-william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/8294b04153c33602e9c3dd21ac90c1e99bd0fdaf.1663844776.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 1fe17f5adb09..a81234bc8ea8 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -38,64 +38,64 @@ enum counter_comp_type { * @type: Counter component data type * @name: device-specific component name * @priv: component-relevant data - * @action_read: Synapse action mode read callback. The read value of the + * @action_read: Synapse action mode read callback. The read value of the * respective Synapse action mode should be passed back via * the action parameter. - * @device_u8_read: Device u8 component read callback. The read value of the + * @device_u8_read: Device u8 component read callback. The read value of the * respective Device u8 component should be passed back via * the val parameter. - * @count_u8_read: Count u8 component read callback. The read value of the + * @count_u8_read: Count u8 component read callback. The read value of the * respective Count u8 component should be passed back via * the val parameter. - * @signal_u8_read: Signal u8 component read callback. The read value of the + * @signal_u8_read: Signal u8 component read callback. The read value of the * respective Signal u8 component should be passed back via * the val parameter. - * @device_u32_read: Device u32 component read callback. The read value of + * @device_u32_read: Device u32 component read callback. The read value of * the respective Device u32 component should be passed * back via the val parameter. - * @count_u32_read: Count u32 component read callback. The read value of the + * @count_u32_read: Count u32 component read callback. The read value of the * respective Count u32 component should be passed back via * the val parameter. - * @signal_u32_read: Signal u32 component read callback. The read value of + * @signal_u32_read: Signal u32 component read callback. The read value of * the respective Signal u32 component should be passed * back via the val parameter. - * @device_u64_read: Device u64 component read callback. The read value of + * @device_u64_read: Device u64 component read callback. The read value of * the respective Device u64 component should be passed * back via the val parameter. - * @count_u64_read: Count u64 component read callback. The read value of the + * @count_u64_read: Count u64 component read callback. The read value of the * respective Count u64 component should be passed back via * the val parameter. - * @signal_u64_read: Signal u64 component read callback. The read value of + * @signal_u64_read: Signal u64 component read callback. The read value of * the respective Signal u64 component should be passed * back via the val parameter. - * @action_write: Synapse action mode write callback. The write value of + * @action_write: Synapse action mode write callback. The write value of * the respective Synapse action mode is passed via the * action parameter. - * @device_u8_write: Device u8 component write callback. 
The write value of + the respective Device u8 component is passed via the val * parameter. - * @count_u8_write: Count u8 component write callback. The write value of + * @count_u8_write: Count u8 component write callback. The write value of * the respective Count u8 component is passed via the val * parameter. - * @signal_u8_write: Signal u8 component write callback. The write value of + * @signal_u8_write: Signal u8 component write callback. The write value of * the respective Signal u8 component is passed via the val * parameter. - * @device_u32_write: Device u32 component write callback. The write value of + * @device_u32_write: Device u32 component write callback. The write value of * the respective Device u32 component is passed via the * val parameter. - * @count_u32_write: Count u32 component write callback. The write value of + * @count_u32_write: Count u32 component write callback. The write value of * the respective Count u32 component is passed via the val * parameter. - * @signal_u32_write: Signal u32 component write callback. The write value of + * @signal_u32_write: Signal u32 component write callback. The write value of * the respective Signal u32 component is passed via the * val parameter. - * @device_u64_write: Device u64 component write callback. The write value of + * @device_u64_write: Device u64 component write callback. The write value of * the respective Device u64 component is passed via the * val parameter. - * @count_u64_write: Count u64 component write callback. The write value of + * @count_u64_write: Count u64 component write callback. The write value of * the respective Count u64 component is passed via the val * parameter. - * @signal_u64_write: Signal u64 component write callback. The write value of + * @signal_u64_write: Signal u64 component write callback. The write value of * the respective Signal u64 component is passed via the * val parameter. */ -- cgit v1.2.3 From 4d269ed485298e8a09485a664e7b35b370ab4ada Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:09 +0000 Subject: x86/resctrl: Kill off alloc_enabled rdt_resources_all[] used to have extra entries for L2CODE/L2DATA. These were hidden from resctrl by the alloc_enabled value. Now that the L2/L2CODE/L2DATA resources have been merged together, alloc_enabled doesn't mean anything: it always has the same value as alloc_capable, which indicates allocation is supported by this resource. Remove alloc_enabled and its helpers.
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-2-james.morse@arm.com --- include/linux/resctrl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 21deb5212bbd..386ab3a41500 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -130,7 +130,6 @@ struct resctrl_schema; /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource - * @alloc_enabled: Is allocation enabled on this machine * @mon_enabled: Is monitoring enabled for this feature * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine @@ -150,7 +149,6 @@ struct resctrl_schema; */ struct rdt_resource { int rid; - bool alloc_enabled; bool mon_enabled; bool alloc_capable; bool mon_capable; -- cgit v1.2.3 From bab6ee736873becc0216ba5fd159394e272d01b2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:10 +0000 Subject: x86/resctrl: Merge mon_capable and mon_enabled mon_enabled and mon_capable are always set as a pair by rdt_get_mon_l3_config(). There is no point having two values. Merge them together. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-3-james.morse@arm.com --- include/linux/resctrl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 386ab3a41500..8180c539800d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -130,7 +130,6 @@ struct resctrl_schema; /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource - * @mon_enabled: Is monitoring enabled for this feature * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine * @num_rmid: Number of RMIDs available @@ -149,7 +148,6 @@ struct resctrl_schema; */ struct rdt_resource { int rid; - bool mon_enabled; bool alloc_capable; bool mon_capable; int num_rmid; -- cgit v1.2.3 From de84a090d99a3b991bd89cd86a94b65d15bd1bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Sep 2022 12:11:21 +0200 Subject: net: ethernet: mtk_eth_wed: add wed support for mt7986 chipset Introduce Wireless Ethernet Dispatcher support on the transmission side for the mt7986 chipset. Tested-by: Daniel Golle Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 7e00cca06709..592221a7149b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -14,6 +14,7 @@ struct mtk_wdma_desc; struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; + u32 desc_size; int size; u32 reg_base; @@ -45,10 +46,17 @@ struct mtk_wed_device { struct pci_dev *pci_dev; u32 wpdma_phys; + u32 wpdma_int; + u32 wpdma_mask; + u32
wpdma_tx; + u32 wpdma_txfree; u16 token_start; unsigned int nbuf; + u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 txfree_tbit; + u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); -- cgit v1.2.3 From 2b2ba3ecb2411c5e2a0d670be5e9ded2c93351e9 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Sep 2022 12:11:22 +0200 Subject: net: ethernet: mtk_eth_wed: add axi bus support In addition to the PCIe bus, introduce support for the AXI bus in the mtk wed driver. The AXI bus is used to connect the mt7986-wmac SoC chip available on mt7986 devices. Tested-by: Daniel Golle Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 592221a7149b..4450c8b7a1cb 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,11 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_bus_tye { + MTK_WED_BUS_PCIE, + MTK_WED_BUS_AXI, +}; + struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; @@ -43,7 +48,11 @@ struct mtk_wed_device { /* filled by driver: */ struct { - struct pci_dev *pci_dev; + union { + struct platform_device *platform_dev; + struct pci_dev *pci_dev; + }; + enum mtk_wed_bus_tye bus_type; u32 wpdma_phys; u32 wpdma_int; -- cgit v1.2.3 From 3a7232cdf19e39e7f24c493117b373788b348af2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:11 +0000 Subject: x86/resctrl: Add domain online callback for resctrl work Because domains are exposed to user-space via resctrl, the filesystem must update its state when CPU hotplug callbacks are triggered. Some of this work is common to any architecture that would support resctrl, but the work is tied up with the architecture code to allocate the memory. Move domain_setup_mon_state(), the monitor subdir creation call and the mbm/limbo workers into a new resctrl_online_domain() call. These bits are not specific to the architecture. Grouping them in one function allows that code to be moved to /fs/ and re-used by another architecture.
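A hedged sketch of how an architecture's CPU-online path might hand a freshly set-up domain to the filesystem hook above; the surrounding function, resource list, and domain allocator are illustrative assumptions, with only resctrl_online_domain() coming from the patch.

	#include <linux/resctrl.h>

	static int resctrl_arch_online_cpu(unsigned int cpu)
	{
		struct rdt_resource *r;
		struct rdt_domain *d;
		int err;

		list_for_each_entry(r, &rdt_all_resources, list) { /* assumed list */
			/* Arch side: allocate/extend the domain's memory. */
			d = arch_find_or_alloc_domain(r, cpu);	/* hypothetical */
			if (IS_ERR(d))
				return PTR_ERR(d);

			/* Filesystem side: mon subdir, mbm/limbo workers. */
			err = resctrl_online_domain(r, d);
			if (err)
				return err;
		}
		return 0;
	}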
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-4-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 8180c539800d..d512455b4c3a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -192,5 +192,6 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); #endif /* _RESCTRL_H */ -- cgit v1.2.3 From 798fd4b9ac37fec571f55fb8592497b0dd5f7a73 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:13 +0000 Subject: x86/resctrl: Add domain offline callback for resctrl work Because domains are exposed to user-space via resctrl, the filesystem must update its state when CPU hotplug callbacks are triggered. Some of this work is common to any architecture that would support resctrl, but the work is tied up with the architecture code to free the memory. Move the monitor subdir removal and the cancelling of the mbm/limbo works into a new resctrl_offline_domain() call. These bits are not specific to the architecture. Grouping them in one function allows that code to be moved to /fs/ and re-used by another architecture. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-6-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d512455b4c3a..5d283bdd6162 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -193,5 +193,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); #endif /* _RESCTRL_H */ -- cgit v1.2.3 From 7a4e0d2c7fb8e28bb8ce0687925c9cf91d65f2a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Fri, 16 Sep 2022 03:54:59 +0200 Subject: tty: remove TTY_MAGIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Greg, in the context of magic numbers as defined in magic-number.rst, "the tty layer should not need this and I'll gladly take patches" Acked-by: Jiri Slaby Signed-off-by: Ahelenia Ziemiańska Ref: https://lore.kernel.org/linux-doc/YyMlovoskUcHLEb7@kroah.com/ Link: https://lore.kernel.org/r/476d024cd6b04160a5de381ea2b9856b60088cbd.1663288066.git.nabijaczleweli@nabijaczleweli.xyz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index ae41893f8653..730c3301d710 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -122,8 +122,6 @@ struct 
tty_operations; /** * struct tty_struct - state associated with a tty while open * - * @magic: magic value set early in @alloc_tty_struct to %TTY_MAGIC, for - * debugging purposes * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) @@ -193,7 +191,6 @@ struct tty_operations; * &struct tty_port. */ struct tty_struct { - int magic; struct kref kref; struct device *dev; struct tty_driver *driver; @@ -260,9 +257,6 @@ struct tty_file_private { struct list_head list; }; -/* tty magic number */ -#define TTY_MAGIC 0x5401 - /** * DOC: TTY Struct Flags * -- cgit v1.2.3 From 5052df99d3bc3cd281222bbcba44323b2d0937d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Fri, 16 Sep 2022 03:55:05 +0200 Subject: tty: remove TTY_DRIVER_MAGIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Greg, in the context of magic numbers as defined in magic-number.rst, "the tty layer should not need this and I'll gladly take patches" Acked-by: Jiri Slaby Signed-off-by: Ahelenia Ziemiańska Ref: https://lore.kernel.org/linux-doc/YyMlovoskUcHLEb7@kroah.com/ Link: https://lore.kernel.org/r/723478a270a3858f27843cbec621df4d5d44efcc.1663288066.git.nabijaczleweli@nabijaczleweli.xyz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index f961164a5274..e00034118c7b 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -397,7 +397,6 @@ struct tty_operations { /** * struct tty_driver -- driver for TTY devices * - * @magic: set to %TTY_DRIVER_MAGIC in __tty_alloc_driver() * @kref: reference counting. Reaching zero frees all the internals and the * driver. * @cdevs: allocated/registered character /dev devices @@ -433,7 +432,6 @@ struct tty_operations { * @driver_name, @name, @type, @subtype, @init_termios, and @ops. */ struct tty_driver { - int magic; struct kref kref; struct cdev **cdevs; struct module *owner; @@ -490,9 +488,6 @@ static inline void tty_set_operations(struct tty_driver *driver, driver->ops = op; } -/* tty driver magic number */ -#define TTY_DRIVER_MAGIC 0x5402 - /** * DOC: TTY Driver Flags * -- cgit v1.2.3 From 9906890c89e4dbd900ed87ad3040080339a7f411 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Sep 2022 00:35:32 +0100 Subject: serial: 8250: Let drivers request full 16550A feature probing A SERIAL_8250_16550A_VARIANTS configuration option has been recently defined that lets one request the 8250 driver not to probe for 16550A device features so as to reduce the driver's device startup time in virtual machines. Some actual hardware devices require these features to have been fully determined however for their driver to work correctly, so define a flag to let drivers request full 16550A feature probing on a device-by-device basis if required regardless of the SERIAL_8250_16550A_VARIANTS option setting chosen. Fixes: dc56ecb81a0a ("serial: 8250: Support disabling mdelay-filled probes of 16550A variants") Cc: stable@vger.kernel.org # v5.6+ Reported-by: Anders Blomdell Signed-off-by: Maciej W. 
Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209202357520.41633@angie.orcam.me.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 02a4299b7d42..2ae182f0f1de 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -422,7 +422,7 @@ struct uart_icount { __u32 buf_overrun; }; -typedef unsigned int __bitwise upf_t; +typedef u64 __bitwise upf_t; typedef unsigned int __bitwise upstat_t; struct uart_port { @@ -530,6 +530,7 @@ struct uart_port { #define UPF_FIXED_PORT ((__force upf_t) (1 << 29)) #define UPF_DEAD ((__force upf_t) (1 << 30)) #define UPF_IOREMAP ((__force upf_t) (1 << 31)) +#define UPF_FULL_PROBE ((__force upf_t) (1ULL << 32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) -- cgit v1.2.3 From 46a8973c4d9d7b12e0e4dd9f589d08d420fb6c0d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 21 Sep 2022 00:35:42 +0100 Subject: serial: 8250: Switch UART port flags to using BIT_ULL Use BIT_ULL rather than encoding bits explicitly where applicable with UART port flags. This makes a (__force upf_t) cast redundant, but keep it for visual consistency with the flags defined in terms of userspace macros. Signed-off-by: Maciej W. Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209210007030.41633@angie.orcam.me.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 2ae182f0f1de..9e0a9d379390 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -513,24 +513,24 @@ struct uart_port { #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) -#define UPF_NO_THRE_TEST ((__force upf_t) (1 << 19)) +#define UPF_NO_THRE_TEST ((__force upf_t) BIT_ULL(19)) /* Port has hardware-assisted h/w flow control */ -#define UPF_AUTO_CTS ((__force upf_t) (1 << 20)) -#define UPF_AUTO_RTS ((__force upf_t) (1 << 21)) +#define UPF_AUTO_CTS ((__force upf_t) BIT_ULL(20)) +#define UPF_AUTO_RTS ((__force upf_t) BIT_ULL(21)) #define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ -#define UPF_SOFT_FLOW ((__force upf_t) (1 << 22)) -#define UPF_CONS_FLOW ((__force upf_t) (1 << 23)) -#define UPF_SHARE_IRQ ((__force upf_t) (1 << 24)) -#define UPF_EXAR_EFR ((__force upf_t) (1 << 25)) -#define UPF_BUG_THRE ((__force upf_t) (1 << 26)) +#define UPF_SOFT_FLOW ((__force upf_t) BIT_ULL(22)) +#define UPF_CONS_FLOW ((__force upf_t) BIT_ULL(23)) +#define UPF_SHARE_IRQ ((__force upf_t) BIT_ULL(24)) +#define UPF_EXAR_EFR ((__force upf_t) BIT_ULL(25)) +#define UPF_BUG_THRE ((__force upf_t) BIT_ULL(26)) /* The exact UART type is known and should not be probed. 
*/ -#define UPF_FIXED_TYPE ((__force upf_t) (1 << 27)) -#define UPF_BOOT_AUTOCONF ((__force upf_t) (1 << 28)) -#define UPF_FIXED_PORT ((__force upf_t) (1 << 29)) -#define UPF_DEAD ((__force upf_t) (1 << 30)) -#define UPF_IOREMAP ((__force upf_t) (1 << 31)) -#define UPF_FULL_PROBE ((__force upf_t) (1ULL << 32)) +#define UPF_FIXED_TYPE ((__force upf_t) BIT_ULL(27)) +#define UPF_BOOT_AUTOCONF ((__force upf_t) BIT_ULL(28)) +#define UPF_FIXED_PORT ((__force upf_t) BIT_ULL(29)) +#define UPF_DEAD ((__force upf_t) BIT_ULL(30)) +#define UPF_IOREMAP ((__force upf_t) BIT_ULL(31)) +#define UPF_FULL_PROBE ((__force upf_t) BIT_ULL(32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) -- cgit v1.2.3 From 039d4926379b1d1c17b51cf21c500a5eed86899e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 22 Sep 2022 10:00:05 +0300 Subject: serial: 8250: Toggle IER bits on only after irq has been set up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invoking TIOCVHANGUP on an 8250_mid port on Ice Lake-D and then reopening the port triggers these faults during serial8250_do_startup(): DMAR: DRHD: handling fault status reg 3 DMAR: [DMA Write NO_PASID] Request device [00:1a.0] fault addr 0x0 [fault reason 0x05] PTE Write access is not set If the IRQ hasn't been set up yet, the UART will have zeroes in its MSI address/data registers. Disabling the IRQ at the interrupt controller won't stop the UART from performing a DMA write to the address programmed in its MSI address register (zero) when it wants to signal an interrupt. The UARTs (in Ice Lake-D) implement PCI 2.1 style MSI without masking capability, so there is no way to mask the interrupt at the source PCI function level, except disabling the MSI capability entirely, but that would cause it to fall back to INTx# assertion, and the PCI specification prohibits disabling the MSI capability as a way to mask a function's interrupt service request. The MSI address register is zeroed by the hangup as the irq is freed. The interrupt is signalled during serial8250_do_startup() performing a THRE test that temporarily toggles THRI in IER. The THRE test currently occurs before the UART's irq (and MSI address) is properly set up. Refactor serial8250_do_startup() such that the irq is set up before the THRE test. The current irq setup code is intermixed with the timer setup code. As the THRE test must be performed prior to the timer setup, extract the timer setup into its own function and call it only after the THRE test. The ->setup_timer() needs to be part of struct uart_8250_ops in order not to create a circular dependency between the 8250 and 8250_base modules.
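As a rough sketch of the startup ordering this describes (only setup_irq() and setup_timer() come from struct uart_8250_ops; the wrapper and the THRE-test helper below are illustrative placeholders, not the driver's actual code):

static int startup_order_sketch(struct uart_8250_port *up)
{
	int retval;

	/* Program the irq first, so the MSI address/data are valid */
	retval = up->ops->setup_irq(up);
	if (retval)
		return retval;

	/*
	 * Only now is the THRE test safe: an interrupt signalled while
	 * THRI is toggled no longer becomes a DMA write to address zero.
	 */
	thre_test_sketch(up);

	/* Backup timer setup, split out of the irq setup, runs last */
	if (up->ops->setup_timer)
		up->ops->setup_timer(up);

	return 0;
}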
Fixes: 40b36daad0ac ("[PATCH] 8250 UART backup timer") Reported-by: Lennert Buytenhek Tested-by: Lennert Buytenhek Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220922070005.2965-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 43edd10e8c96..19376bee9667 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -74,6 +74,7 @@ struct uart_8250_port; struct uart_8250_ops { int (*setup_irq)(struct uart_8250_port *); void (*release_irq)(struct uart_8250_port *); + void (*setup_timer)(struct uart_8250_port *); }; struct uart_8250_em485 { -- cgit v1.2.3 From 781096d971dfe3c5f9401a300bdb0b148a600584 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:16 +0000 Subject: x86/resctrl: Create mba_sc configuration in the rdt_domain To support resctrl's MBA software controller, the architecture must provide a second configuration array to hold the mbps_val[] from user-space. This complicates the interface between the architecture specific code and the filesystem portions of resctrl that will move to /fs/, to allow multiple architectures to support resctrl. Make the filesystem parts of resctrl create an array for the mba_sc values. The software controller can be changed to use this, allowing the architecture code to only consider the values configured in hardware. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-9-james.morse@arm.com --- include/linux/resctrl.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5d283bdd6162..93dfe553b364 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -15,6 +15,9 @@ int proc_resctrl_show(struct seq_file *m, #endif +/* max value for struct rdt_domain's mbps_val */ +#define MBA_MAX_MBPS U32_MAX + /** * enum resctrl_conf_type - The type of configuration. * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. @@ -53,6 +56,9 @@ struct resctrl_staged_config { * @cqm_work_cpu: worker CPU for CQM h/w counters * @plr: pseudo-locked region (if any) associated with domain * @staged_config: parsed configuration to be applied + * @mbps_val: When mba_sc is enabled, this holds the array of user + * specified control values for mba_sc in MBps, indexed + * by closid */ struct rdt_domain { struct list_head list; @@ -67,6 +73,7 @@ struct rdt_domain { int cqm_work_cpu; struct pseudo_lock_region *plr; struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; + u32 *mbps_val; }; /** -- cgit v1.2.3 From ff6357bb50023af2a1dc8f113930082c5252c753 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:19 +0000 Subject: x86/resctrl: Allow update_mba_bw() to update controls directly update_mba_bw() calculates a new control value for the MBA resource based on the user provided mbps_val and the current measured bandwidth. Some control values need remapping by delay_bw_map(). It does this by calling wrmsrl() directly. This needs splitting up to be done by an architecture specific helper, so that the remainder can eventually be moved to /fs/. 
Add resctrl_arch_update_one() to apply one configuration value to the provided resource and domain. This avoids the staging and cross-calling that is only needed with changes made by user-space. delay_bw_map() moves to be part of the arch code, to maintain the 'percentage control' view of MBA resources in resctrl. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-12-james.morse@arm.com --- include/linux/resctrl.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 93dfe553b364..f4c9101df461 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -197,6 +197,14 @@ struct resctrl_schema { /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); + +/* + * Update the ctrl_val and apply this config right now. + * Must be called on one of the domain's CPUs. + */ +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val); + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); -- cgit v1.2.3 From 21803630c4ffb433bab2951fbe0ee7b8dbcc8bcb Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:46 -0700 Subject: net/mlx5: Fix fields name prefix in MACsec Fix the ifc field names to be consistent with the device spec document. Fixes: 8385c51ff5bc ("net/mlx5: Introduce MACsec Connect-X offload hardware bits and structures") Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8decbf9a7bdd..da0ed11fcebd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11585,15 +11585,15 @@ struct mlx5_ifc_macsec_offload_obj_bits { u8 confidentiality_en[0x1]; u8 reserved_at_41[0x1]; - u8 esn_en[0x1]; - u8 esn_overlap[0x1]; + u8 epn_en[0x1]; + u8 epn_overlap[0x1]; u8 reserved_at_44[0x2]; u8 confidentiality_offset[0x2]; u8 reserved_at_48[0x4]; u8 aso_return_reg[0x4]; u8 reserved_at_50[0x10]; - u8 esn_msb[0x20]; + u8 epn_msb[0x20]; u8 reserved_at_80[0x8]; u8 dekn[0x18]; -- cgit v1.2.3 From 23cc83c6ca87c9a4da62b9ddf4d0fc09f3054f81 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:49 -0700 Subject: net/mlx5: Add ifc bits for MACsec extended packet number (EPN) and replay protection Add ifc bits related to advanced steering operations (ASO) and general object modify for macsec to use as part of offloading EPN and replay protection features.
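As a hedged sketch of how a driver might program the (now epn-prefixed) fields with the standard mlx5 ifc accessors (the helper is illustrative; obj is assumed to point at a struct mlx5_ifc_macsec_offload_obj_bits layout inside a command buffer):

static void macsec_obj_set_epn_sketch(void *obj, u32 epn_msb, bool overlap)
{
	MLX5_SET(macsec_offload_obj, obj, epn_en, 1);
	MLX5_SET(macsec_offload_obj, obj, epn_overlap, overlap);
	MLX5_SET(macsec_offload_obj, obj, epn_msb, epn_msb);
}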
Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index da0ed11fcebd..bd577b99b146 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11558,6 +11558,20 @@ struct mlx5_ifc_modify_ipsec_obj_in_bits { struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; +enum { + MLX5_MACSEC_ASO_REPLAY_PROTECTION = 0x1, +}; + +enum { + MLX5_MACSEC_ASO_REPLAY_WIN_32BIT = 0x0, + MLX5_MACSEC_ASO_REPLAY_WIN_64BIT = 0x1, + MLX5_MACSEC_ASO_REPLAY_WIN_128BIT = 0x2, + MLX5_MACSEC_ASO_REPLAY_WIN_256BIT = 0x3, +}; + +#define MLX5_MACSEC_ASO_INC_SN 0x2 +#define MLX5_MACSEC_ASO_REG_C_4_5 0x2 + struct mlx5_ifc_macsec_aso_bits { u8 valid[0x1]; u8 reserved_at_1[0x1]; @@ -11619,6 +11633,21 @@ struct mlx5_ifc_create_macsec_obj_in_bits { struct mlx5_ifc_macsec_offload_obj_bits macsec_object; }; +struct mlx5_ifc_modify_macsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + +enum { + MLX5_MODIFY_MACSEC_BITMASK_EPN_OVERLAP = BIT(0), + MLX5_MODIFY_MACSEC_BITMASK_EPN_MSB = BIT(1), +}; + +struct mlx5_ifc_query_macsec_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + struct mlx5_ifc_encryption_key_obj_bits { u8 modify_field_select[0x40]; -- cgit v1.2.3 From 4411a6c0abd3e55b4a4fb9432b3a0553f12337c2 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:53 -0700 Subject: net/mlx5e: Support MACsec offload extended packet number (EPN) MACsec EPN splits the packet number (PN) into two 32-bit fields, epn_lsb (the 32 least significant bits (LSBs) of the PN) and epn_msb (the 32 most significant bits (MSBs) of the PN). The epn_msb bits are managed by SW; for that, HW is required to send an object change event of type EPN notifying SW to update the epn_msb. In addition, once epn_msb is updated, SW updates HW with the new epn_msb value so that HW can perform replay protection. To prevent HW from stopping while handling the event, SW manages another bit for HW called epn_overlap; HW uses the latter to get an indication of how to read the epn_msb value correctly while still receiving packets. Add epn event handling that updates the epn_overlap and epn_msb for every 2^31 packets according to the following logic: if epn_lsb crosses 2^31 (half sequence number wraparound) upon the relevant HW event, SW updates the epn_overlap value to OLD (value = 1). When epn_lsb crosses 2^32 (full sequence number wraparound) upon the relevant HW event, SW updates epn_overlap to NEW (value = 0) and increments epn_msb. When using MACsec EPN, a salt and a short secure channel id (ssci) need to be provided by the user; when offloading EPN, this salt and ssci must be passed to the HW to be used in the initial vector (IV) calculations.
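The per-event update condenses to a small helper; a minimal sketch, assuming the event handler has already extracted the current epn_lsb (the struct and function below are illustrative, not the driver's code):

struct epn_state_sketch {
	u32 epn_msb;
	u8 epn_overlap;
};

/* Called on each HW EPN event, i.e. every 2^31 packets */
static void epn_event_sketch(struct epn_state_sketch *st, u32 epn_lsb)
{
	if (epn_lsb & BIT(31)) {
		/* crossed 2^31: current epn_msb now describes the OLD window */
		st->epn_overlap = 1;
	} else {
		/* wrapped past 2^32: start the NEW window */
		st->epn_overlap = 0;
		st->epn_msb++;
	}
}

The overlap flag is what lets HW keep reading epn_msb correctly while packets from both sides of the wraparound are still in flight.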
Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Reported-by: kernel test robot Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2927810f172b..dcd60fb9e6b4 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -325,6 +325,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, MLX5_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, MLX5_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27, MLX5_EVENT_TYPE_INTERNAL_ERROR = 0x08, MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, @@ -699,6 +700,12 @@ struct mlx5_eqe_temp_warning { __be64 sensor_warning_lsb; } __packed; +struct mlx5_eqe_obj_change { + u8 rsvd0[2]; + __be16 obj_type; + __be32 obj_id; +} __packed; + #define SYNC_RST_STATE_MASK 0xf enum sync_rst_state_type { @@ -737,6 +744,7 @@ union ev_data { struct mlx5_eqe_xrq_err xrq_err; struct mlx5_eqe_sync_fw_update sync_fw_update; struct mlx5_eqe_vhca_state vhca_state; + struct mlx5_eqe_obj_change obj_change; } __packed; struct mlx5_eqe { -- cgit v1.2.3 From 6edf2576a6cc46460c164831517a36064eb8109c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 13 Sep 2022 14:54:20 +0800 Subject: mm/slub: enable debugging memory wasting of kmalloc kmalloc's API family is critical for mm, with one nature that it will round up the request size to a fixed one (mostly a power of 2). Say when a user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes could be allocated, so in the worst case there is around 50% memory space waste. The wastage is not a big issue for requests that get allocated/freed quickly, but may cause problems for objects with a longer lifetime. We've met a kernel boot OOM panic (v5.10), and from the dumped slab info: [ 26.062145] kmalloc-2k 814056KB 814056KB From debug we found there was a huge number of 'struct iova_magazine' objects, whose size is 1032 bytes (1024 + 8), so each allocation wastes 1016 bytes. Though the issue was solved by giving the right (bigger) size of RAM, it is still nice to optimize the size (either use a kmalloc friendly size or create a dedicated slab for it). And from the lkml archive, there was another crash kernel OOM case [1] back in 2019, which seems to be related to a similar slab waste situation, as the log is similar: [ 4.332648] iommu: Adding device 0000:20:02.0 to group 16 [ 4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0 ... [ 4.857565] kmalloc-2048 59164KB 59164KB The crash kernel only has 256M memory, and 59M is pretty big here. (Note: the related code has been changed and optimised in recent kernels [2]; these logs are just picked to demo the problem, and a patch changing its size to 1024 bytes has been merged) So add a way to track each kmalloc's memory waste info, and leverage the existing SLUB debug framework (specifically SLUB_STORE_USER) to show the call stack of the original allocation, so that users can evaluate the waste situation, identify some hot spots and optimize accordingly, for better utilization of memory.
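A concrete userspace illustration of the rounding waste described above (the power-of-two rounding here approximates kmalloc's size classes for sizes above 512 bytes; it is not the allocator's actual code):

#include <stdio.h>

static unsigned long rounded_size(unsigned long size)
{
	unsigned long n = 512;

	while (n < size)
		n <<= 1;
	return n;
}

int main(void)
{
	unsigned long req = 1032;	/* the iova_magazine case above */
	unsigned long got = rounded_size(req);

	/* prints: requested 1032, allocated 2048, wasted 1016 */
	printf("requested %lu, allocated %lu, wasted %lu\n",
	       req, got, got - req);
	return 0;
}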
The waste info is integrated into the existing interface: '/sys/kernel/debug/slab/kmalloc-xx/alloc_traces'; one example of 'kmalloc-4k' after boot is: 126 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] waste=233856/1856 age=280763/281414/282065 pid=1330 cpus=32 nodes=1 __kmem_cache_alloc_node+0x11f/0x4e0 __kmalloc_node+0x4e/0x140 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] ixgbe_init_interrupt_scheme+0x2ae/0xc90 [ixgbe] ixgbe_probe+0x165f/0x1d20 [ixgbe] local_pci_probe+0x78/0xc0 work_for_cpu_fn+0x26/0x40 ... which means that in the 'kmalloc-4k' slab, there are 126 requests of 2240 bytes which got a 4KB space (wasting 1856 bytes each and 233856 bytes in total), from ixgbe_alloc_q_vector(). And when the system starts some real workload like multiple docker instances, the waste could be more severe. [1]. https://lkml.org/lkml/2019/8/12/266 [2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/ [Thanks Hyeonggon for pointing out several bugs about sorting/format] [Thanks Vlastimil for suggesting a way to reduce memory usage of orig_size and keep it only for kmalloc objects] Signed-off-by: Feng Tang Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Robin Murphy Cc: John Garry Cc: Kefeng Wang Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..a713b0e5bbcd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -29,6 +29,8 @@ #define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ #define SLAB_POISON ((slab_flags_t __force)0x00000800U) +/* Indicate a kmalloc slab */ +#define SLAB_KMALLOC ((slab_flags_t __force)0x00001000U) /* Align objs on cache lines */ #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ -- cgit v1.2.3 From fea62d370d7a1ba288d71d0cae7ad47c2a02b839 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:22 +0000 Subject: x86/resctrl: Allow per-rmid arch private storage to be reset To abstract the rmid counters into a helper that returns the number of bytes counted, architecture specific per-rmid state is needed. It needs to be possible to reset this hidden state, as the values may outlive the life of an rmid, or the mount time of the filesystem. mon_event_read() is called with first = true when an rmid is first allocated in mkdir_mondata_subdir(). Add resctrl_arch_reset_rmid() and call it from __mon_event_count()'s rr->first check. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-15-james.morse@arm.com --- include/linux/resctrl.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f4c9101df461..818456770176 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -32,6 +32,16 @@ enum resctrl_conf_type { #define CDP_NUM_TYPES (CDP_DATA + 1) +/* + * Event IDs, the values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems.
+ */ +enum resctrl_event_id { + QOS_L3_OCCUP_EVENT_ID = 0x01, + QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, + QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +}; + /** * struct resctrl_staged_config - parsed configuration to be applied * @new_ctrl: new ctrl value to be loaded @@ -210,4 +220,17 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +/** + * resctrl_arch_reset_rmid() - Reset any private state associated with rmid + * and eventid. + * @r: The domain's resource. + * @d: The rmid's domain. + * @rmid: The rmid whose counter values should be reset. + * @eventid: The eventid whose counter values should be reset. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid); + #endif /* _RESCTRL_H */ -- cgit v1.2.3 From 72bc36956f73ac54f19a7ac7302fb274069bec18 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:28 -0400 Subject: net: phylink: Document MAC_(A)SYM_PAUSE This documents the possible MLO_PAUSE_* settings which can result from different combinations of MAC_(A)SYM_PAUSE. Special note is paid to settings which can result from user configuration (MLO_PAUSE_AN). The autonegotiation results are more-or-less a direct consequence of IEEE 802.3 Table 28B-2. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 6d06896fc20d..1f997e14bf80 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -21,6 +21,35 @@ enum { MLO_AN_FIXED, /* Fixed-link mode */ MLO_AN_INBAND, /* In-band protocol */ + /* MAC_SYM_PAUSE and MAC_ASYM_PAUSE are used when configuring our + * autonegotiation advertisement. They correspond to the PAUSE and + * ASM_DIR bits defined by 802.3, respectively. + * + * The following table lists the values of tx_pause and rx_pause which + * might be requested in mac_link_up. The exact values depend on either + * the results of autonegotiation (if MLO_PAUSE_AN is set) or user + * configuration (if MLO_PAUSE_AN is not set). + * + * MAC_SYM_PAUSE MAC_ASYM_PAUSE MLO_PAUSE_AN tx_pause/rx_pause + * ============= ============== ============ ================== + * 0 0 0 0/0 + * 0 0 1 0/0 + * 0 1 0 0/0, 0/1, 1/0, 1/1 + * 0 1 1 0/0, 1/0 + * 1 0 0 0/0, 1/1 + * 1 0 1 0/0, 1/1 + * 1 1 0 0/0, 0/1, 1/0, 1/1 + * 1 1 1 0/0, 0/1, 1/1 + * + * If you set MAC_ASYM_PAUSE, the user may request any combination of + * tx_pause and rx_pause. You do not have to support these + * combinations. + * + * However, you should support combinations of tx_pause and rx_pause + * which might be the result of autonegotiation. For example, don't set + * MAC_SYM_PAUSE unless your device can support tx_pause and rx_pause + * at the same time. + */ MAC_SYM_PAUSE = BIT(0), MAC_ASYM_PAUSE = BIT(1), MAC_10HD = BIT(2), -- cgit v1.2.3 From 606116529ab2d12e93bf751f74ed50a621b46846 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:29 -0400 Subject: net: phylink: Export phylink_caps_to_linkmodes This function is convenient for MAC drivers. They can use it to add or remove particular link modes based on capabilities (such as if half duplex is not supported for a particular interface mode).
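A hedged sketch of the consumer pattern this enables; the foo_ names are illustrative and the chosen MAC_* capability bits are just an example set from phylink.h:

static void foo_mac_validate(struct phylink_config *config,
			     unsigned long *supported,
			     struct phylink_link_state *state)
{
	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
	/* full-duplex only: the half-duplex MAC_*HD bits are left out */
	unsigned long caps = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
			     MAC_10FD | MAC_100FD | MAC_1000FD;

	phylink_caps_to_linkmodes(mask, caps);
	linkmode_and(supported, supported, mask);
}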
Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1f997e14bf80..7cf26d7a522d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -547,6 +547,7 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); #endif +void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, unsigned long mac_capabilities); void phylink_generic_validate(struct phylink_config *config, -- cgit v1.2.3 From 3e6eab8f3ef93cd78cd4b67f497ef6035eb073ad Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:30 -0400 Subject: net: phylink: Generate caps and convert to linkmodes separately If we call phylink_caps_to_linkmodes directly from phylink_get_linkmodes, it is difficult to re-use this functionality in MAC drivers. This is because MAC drivers must then work with an ethtool linkmode bitmap, instead of with mac capabilities. Instead, let the caller of phylink_get_linkmodes do the conversion. To reflect this change, rename the function to phylink_get_capabilities. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 7cf26d7a522d..cc039ae7e80c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -548,8 +548,8 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, #endif void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); -void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, - unsigned long mac_capabilities); +unsigned long phylink_get_capabilities(phy_interface_t interface, + unsigned long mac_capabilities); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 0c3e10cb44232833a50cb8e3e784c432906a60c1 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:31 -0400 Subject: net: phy: Add support for rate matching This adds support for rate matching (also known as rate adaptation) to the phy subsystem. The general idea is that the phy interface runs at one speed, and the MAC throttles the rate at which it sends packets to the link speed. There's a good overview of several techniques for achieving this at [1]. This patch adds support for three: pause-frame based (such as in Aquantia phys), CRS-based (such as in 10PASS-TS and 2BASE-TL), and open-loop-based (such as in 10GBASE-W). This patch makes a few assumptions and a few non assumptions about the types of rate matching available. First, it assumes that different phys may use different forms of rate matching. Second, it assumes that phys can use rate matching for any of their supported link speeds (e.g. if a phy supports 10BASE-T and XGMII, then it can adapt XGMII to 10BASE-T). Third, it does not assume that all interface modes will use the same form of rate matching. Fourth, it does not assume that all phy devices will support rate matching (even if some do). Relaxing or strengthening these (non-)assumptions could result in a different API. 
For example, if all interface modes were assumed to use the same form of rate matching, then a bitmask of interface modes supporting rate matching would suffice. For better visibility into the process, the current rate matching mode is exposed as part of the ethtool ksettings. For the moment, only read access is supported. I'm not sure what userspace might want to configure yet (disable it altogether, disable just one mode, specify the mode to use, etc.). For the moment, since only pause-based rate adaptation support is added in the next few commits, rate matching can be disabled altogether by adjusting the advertisement. 802.3 calls this feature "rate adaptation" in clause 49 (10GBASE-R) and "rate matching" in clause 61 (10PASS-TS and 2BASE-TL). Aquantia also calls this feature "rate adaptation". I chose "rate matching" because it is shorter, and because Russell doesn't think "adaptation" is correct in this context. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 337230c135f7..9c66f357f489 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -280,7 +280,6 @@ static inline const char *phy_modes(phy_interface_t interface) } } - #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 @@ -574,6 +573,7 @@ struct macsec_ops; * @lp_advertising: Current link partner advertised linkmodes * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used + * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover @@ -641,6 +641,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + int rate_matching; + enum phy_state state; u32 dev_flags; @@ -805,6 +807,21 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @get_rate_matching: Get the supported type of rate matching for a + * particular phy interface. This is used by phy consumers to determine + * whether to advertise lower-speed modes for that interface. It is + * assumed that if a rate matching mode is supported on an interface, + * then that interface's rate can be adapted to all slower link speeds + * supported by the phy. If iface is %PHY_INTERFACE_MODE_NA, and the phy + * supports any kind of rate matching for any interface, then it must + * return that rate matching mode (preferring %RATE_MATCH_PAUSE to + * %RATE_MATCH_CRS). If the interface is not supported, this should + * return %RATE_MATCH_NONE.
+ */ + int (*get_rate_matching)(struct phy_device *phydev, + phy_interface_t iface); + /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); @@ -971,6 +988,7 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); @@ -1687,6 +1705,8 @@ int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); +int phy_get_rate_matching(struct phy_device *phydev, + phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); -- cgit v1.2.3 From ae0e4bb2a0e0e434e9f98fb4994093ec2ee71997 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:32 -0400 Subject: net: phylink: Adjust link settings based on rate matching If the phy is configured to use pause-based rate matching, ensure that the link is full duplex with pause frame reception enabled. As suggested, if pause-based rate matching is enabled by the phy, then pause reception is unconditionally enabled. The interface duplex is determined based on the rate matching type. When rate matching is enabled, so is the speed: we assume the maximum interface speed is used. This is only relevant for MLO_AN_PHY. For MLO_AN_INBAND, the MAC/PCS's view of the interface speed will be used. Although there are no RATE_MATCH_CRS phys in-tree, it has been added for comparison (and the implementation is quite simple). Co-developed-by: Russell King Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index cc039ae7e80c..5c99c21e42b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -88,6 +88,10 @@ static inline bool phylink_autoneg_inband(unsigned int mode) * @speed: link speed, one of the SPEED_* constants. * @duplex: link duplex mode, one of DUPLEX_* constants. * @pause: link pause state, described by MLO_PAUSE_* constants. + * @rate_matching: rate matching being performed, one of the RATE_MATCH_* + * constants. If rate matching is taking place, then the speed/duplex of + * the medium link mode (@speed and @duplex) and the speed/duplex of the phy + * interface mode (@interface) are different. * @link: true if the link is up. * @an_enabled: true if autonegotiation is enabled/desired. * @an_complete: true if autonegotiation has completed. @@ -99,6 +103,7 @@ struct phylink_link_state { int speed; int duplex; int pause; + int rate_matching; unsigned int link:1; unsigned int an_enabled:1; unsigned int an_complete:1; -- cgit v1.2.3 From b7e9294885b610791fcebc799bf2a9e219446fd6 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:33 -0400 Subject: net: phylink: Adjust advertisement based on rate matching This adds support for adjusting the advertisement for pause-based rate matching. This may result in a lossy link, since the final link settings are not adjusted. Asymmetric pause support is necessary.
It would be possible for a MAC supporting only symmetric pause to use pause-based rate adaptation, but only if pause reception was enabled as well. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5c99c21e42b5..664dd409feb9 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -554,7 +554,8 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, - unsigned long mac_capabilities); + unsigned long mac_capabilities, + int rate_matching); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 4d044c521a63b2cd394ea6e3547012032145e47e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:23 +0000 Subject: x86/resctrl: Abstract __rmid_read() __rmid_read() selects the specified eventid and returns the counter value from the MSR. The error handling is architecture specific, and handled by the callers, rdtgroup_mondata_show() and __mon_event_count(). Error handling should be handled by architecture specific code, as a different architecture may have different requirements. MPAM's counters can report that they are 'not ready', requiring a second read after a short delay. This should be hidden from resctrl. Make __rmid_read() the architecture specific function for reading a counter. Rename it resctrl_arch_rmid_read() and move the error handling into it. A read from a counter that hardware supports but resctrl does not now returns -EINVAL instead of -EIO from the default case in __mon_event_count(). It isn't possible for user-space to see this change as resctrl doesn't expose counters it doesn't support. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-16-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 818456770176..efe60dd7fd21 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -219,6 +219,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *res); /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid -- cgit v1.2.3 From 8286618aca331bf17323ff3023ca831ac6e4b86f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:24 +0000 Subject: x86/resctrl: Pass the required parameters into resctrl_arch_rmid_read() resctrl_arch_rmid_read() is intended as the function that an architecture agnostic resctrl filesystem driver can use to read a value in bytes from a hardware register. Currently the function returns the MBM values in chunks directly from hardware. To convert this to bytes, some correction and overflow calculations are needed. 
These depend on the resource and domain structures. Overflow detection requires the old chunks value. None of this is available to resctrl_arch_rmid_read(). MPAM requires the resource and domain structures to find the MMIO device that holds the registers. Pass the resource and domain to resctrl_arch_rmid_read(). This makes rmid_dirty() too big. Instead merge it with its only caller, and the name is kept as a local variable. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-17-james.morse@arm.com --- include/linux/resctrl.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index efe60dd7fd21..7ccfa0d1bb34 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -219,7 +219,23 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); -int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *res); + +/** + * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid + * for this resource and domain. + * @r: resource that the counter should be read from. + * @d: domain that the counter should be read from. + * @rmid: rmid of the counter to read. + * @eventid: eventid to read, e.g. L3 occupancy. + * @val: result of the counter read in chunks. + * + * Call from process context on a CPU that belongs to domain @d. + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val); /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid -- cgit v1.2.3 From ae2328b52962531c2d7c6b531022a3eb2d680f17 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:27 +0000 Subject: x86/resctrl: Rename and change the units of resctrl_cqm_threshold resctrl_cqm_threshold is stored in a hardware specific chunk size, but exposed to user-space as bytes. This means the filesystem parts of resctrl need to know how the hardware counts, to convert the user provided byte value to chunks. The interface between the architecture's resctrl code and the filesystem ought to treat everything as bytes. Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read() still returns its value in chunks, so this needs converting to bytes. As all the users have been touched, rename the variable to resctrl_rmid_realloc_threshold, which describes what the value is for. Neither r->num_rmid nor hw_res->mon_scale are guaranteed to be a power of 2, so the existing code introduces a rounding error from resctrl's theoretical fraction of the cache usage. This behaviour is kept as it ensures the user visible value matches the value read from hardware when the rmid will be reallocated. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-20-james.morse@arm.com --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 7ccfa0d1bb34..9995d043650a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -250,4 +250,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, u32 rmid, enum resctrl_event_id eventid); +extern unsigned int resctrl_rmid_realloc_threshold; + #endif /* _RESCTRL_H */ -- cgit v1.2.3 From d80975e264c8f01518890f3d91ab5bada8fa7f5e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:28 +0000 Subject: x86/resctrl: Add resctrl_rmid_realloc_limit to abstract x86's boot_cpu_data resctrl_rmid_realloc_threshold can be set by user-space. The maximum value is specified by the architecture. Currently max_threshold_occ_write() reads the maximum value from boot_cpu_data.x86_cache_size, which is not portable to another architecture. Add resctrl_rmid_realloc_limit to describe the maximum size in bytes that user-space can set the threshold to. Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-21-james.morse@arm.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9995d043650a..cb857f753322 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -251,5 +251,6 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, u32 rmid, enum resctrl_event_id eventid); extern unsigned int resctrl_rmid_realloc_threshold; +extern unsigned int resctrl_rmid_realloc_limit; #endif /* _RESCTRL_H */ -- cgit v1.2.3 From f7b1843eca6fe295ba0c71fc02a3291954078f2b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Sep 2022 15:48:29 +0000 Subject: x86/resctrl: Make resctrl_arch_rmid_read() return values in bytes resctrl_arch_rmid_read() returns a value in chunks, as read from the hardware. This needs scaling to bytes by mon_scale, as provided by the architecture code. Now that resctrl_arch_rmid_read() performs the overflow and corrections itself, it may as well return a value in bytes directly. This allows the accesses to the architecture specific 'hw' structure to be removed. Move the mon_scale conversion into resctrl_arch_rmid_read(). mbm_bw_count() is updated to calculate bandwidth from bytes. 
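In outline, the conversion that now lives behind resctrl_arch_rmid_read() is a multiply by the architecture's chunk size; a simplified sketch (the real x86 code also applies MBM overflow correction first, and mon_scale is assumed here to be the x86 rdt_hw_resource field holding bytes per chunk):

static u64 chunks_to_bytes_sketch(u64 chunks, unsigned int mon_scale)
{
	/* hardware counts in chunks; callers now see bytes */
	return chunks * mon_scale;
}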
Signed-off-by: James Morse Signed-off-by: Borislav Petkov Reviewed-by: Jamie Iles Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Tested-by: Xin Hao Tested-by: Shaopeng Tan Tested-by: Cristian Marussi Link: https://lore.kernel.org/r/20220902154829.30399-22-james.morse@arm.com --- include/linux/resctrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index cb857f753322..0cf5b20c6ddf 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -227,7 +227,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @d: domain that the counter should be read from. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. - * @val: result of the counter read in chunks. + * @val: result of the counter read in bytes. * * Call from process context on a CPU that belongs to domain @d. * -- cgit v1.2.3 From 1e1f26635e5459db4134952369b76b8d59c50438 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 19 Sep 2022 19:58:03 -0700 Subject: ASoC: ssm2518: drop support for platform data There are currently no users of this driver's platform data in the mainline kernel, so let's drop it. Newer devices should use DT, ACPI, or static software properties to describe the hardware. Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20220920025804.1788667-1-dmitry.torokhov@gmail.com Signed-off-by: Mark Brown --- include/linux/platform_data/ssm2518.h | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 include/linux/platform_data/ssm2518.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ssm2518.h b/include/linux/platform_data/ssm2518.h deleted file mode 100644 index 3f9e632d6f63..000000000000 --- a/include/linux/platform_data/ssm2518.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSM2518 amplifier audio driver - * - * Copyright 2013 Analog Devices Inc. - * Author: Lars-Peter Clausen - */ - -#ifndef __LINUX_PLATFORM_DATA_SSM2518_H__ -#define __LINUX_PLATFORM_DATA_SSM2518_H__ - -/** - * struct ssm2518_platform_data - Platform data for the ssm2518 driver - * @enable_gpio: GPIO connected to the nSD pin. Set to -1 if the nSD pin is - * hardwired. - */ -struct ssm2518_platform_data { - int enable_gpio; -}; - -#endif -- cgit v1.2.3 From 22873deac9e7b273bbf17eee515c8170510d861a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 06:59:59 +0200 Subject: vfs: add vfs_tmpfile_open() helper This helper unifies tmpfile creation with opening. Existing vfs_tmpfile() callers outside of fs/namei.c will be converted to using this helper. There are two such callers: cachefile and overlayfs. The cachefiles code currently uses the open_with_fake_path() helper to open the tmpfile, presumably to disable accounting of the open file. Overlayfs uses tmpfile for copy_up, which means these struct file instances will be short lived, hence it doesn't really matter if they are accounted or not. Disable accounting in this helper too, which should be okay for both callers. Add MAY_OPEN permission checking for consistency. Like for create(2) read/write permissions are not checked. 
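A hedged sketch of a caller, in the style of the copy-up use mentioned above (the wrapper is illustrative; the returned file is unaccounted and already open):

static struct file *tmpfile_for_copy_up_sketch(struct user_namespace *userns,
					       const struct path *parent)
{
	struct file *file;

	file = vfs_tmpfile_open(userns, parent, S_IFREG | 0600,
				O_WRONLY, current_cred());
	if (IS_ERR(file))
		return file;

	/* ... write the data, then link in or discard; fput() when done */
	return file;
}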
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..15fafda95dd3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2007,6 +2007,10 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns, struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode, int open_flag); +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred); + int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); -- cgit v1.2.3 From 3e9d4c593558ea86f49e10e62373a54c7f5a63e4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: vfs: make vfs_tmpfile() static No callers outside of fs/namei.c anymore. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 15fafda95dd3..02646542f6bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2004,9 +2004,6 @@ static inline int vfs_whiteout(struct user_namespace *mnt_userns, WHITEOUT_DEV); } -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag); - struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); -- cgit v1.2.3 From 9751b338656f05a0ce918befd5118fcd970c71c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: vfs: move open right after ->tmpfile() Create a helper finish_open_simple() that opens the file with the original dentry. Handle the error case here as well to simplify callers. Call this helper right after ->tmpfile() is called. Next patch will change the tmpfile API and move this call into tmpfile instances. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 02646542f6bb..a3c50869e79b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2780,6 +2780,15 @@ extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); +/* Helper for the simple case when original dentry is used */ +static inline int finish_open_simple(struct file *file, int error) +{ + if (error) + return error; + + return finish_open(file, file->f_path.dentry, NULL); +} + /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); -- cgit v1.2.3 From 863f144f12add1f4eab80b70561a90857c524a8b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: vfs: open inside ->tmpfile() This is in preparation for adding tmpfile support to fuse, which requires that the tmpfile creation and opening are done as a single operation. Replace the 'struct dentry *' argument of i_op->tmpfile with 'struct file *'. Call finish_open_simple() as the last thing in ->tmpfile() instances (may be omitted in the error case). Change d_tmpfile() argument to 'struct file *' as well to make callers more readable. 
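A sketch of a ->tmpfile() instance under the new prototype (foo_new_inode() is a placeholder for a filesystem's inode allocation helper; error handling is reduced to the essentials):

static int foo_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
		       struct file *file, umode_t mode)
{
	struct inode *inode = foo_new_inode(mnt_userns, dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	d_tmpfile(file, inode);
	return finish_open_simple(file, 0);
}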
Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/dcache.h | 3 ++- include/linux/fs.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c78ed02b54..bde9f8ff8869 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -16,6 +16,7 @@ #include struct path; +struct file; struct vfsmount; /* @@ -250,7 +251,7 @@ extern struct dentry * d_make_root(struct inode *); /* - the ramfs-type tree */ extern void d_genocide(struct dentry *); -extern void d_tmpfile(struct dentry *, struct inode *); +extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); diff --git a/include/linux/fs.h b/include/linux/fs.h index a3c50869e79b..8218d9964ff8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2168,7 +2168,7 @@ struct inode_operations { struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct user_namespace *, struct inode *, - struct dentry *, umode_t); + struct file *, umode_t); int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int); int (*fileattr_set)(struct user_namespace *mnt_userns, -- cgit v1.2.3 From 4a575865c1ea67018d96acda9b43e5d3d25b2366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 16 Sep 2022 13:20:49 +0100 Subject: mtd: allow getting MTD device associated with a specific DT node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MTD subsystem API allows interacting with MTD devices (e.g. reading, writing, handling bad blocks). So far a random driver could get an MTD device only by its name (get_mtd_device_nm()). This change allows getting one by its DT node as well. This API is required for drivers handling DT defined MTD partitions in a specific way (e.g. U-Boot (sub)partition with environment variables). Acked-by: Miquel Raynal Signed-off-by: Rafał Miłecki Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220916122100.170016-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mtd/mtd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 955aee14b0f7..6fc841ceef31 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -677,6 +677,7 @@ extern int mtd_device_unregister(struct mtd_info *master); extern struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num); extern int __get_mtd_device(struct mtd_info *mtd); extern void __put_mtd_device(struct mtd_info *mtd); +extern struct mtd_info *of_get_mtd_device_by_node(struct device_node *np); extern struct mtd_info *get_mtd_device_nm(const char *name); extern void put_mtd_device(struct mtd_info *mtd); -- cgit v1.2.3 From aade55c86033bee868a93e4bf3843c9c99e84526 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 22 Sep 2022 16:54:10 +0300 Subject: device property: Add const qualifier to device_get_match_data() parameter Add a const qualifier to the device_get_match_data() parameter. Some of the future users may utilize this function without forcing the type. All the same, dev_fwnode() may be used with a const qualifier.
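A minimal sketch of the const-friendly pattern this allows (struct foo_info, the property name, and foo_setup() are illustrative):

struct foo_info {
	int variant;
};

static int foo_setup(const struct device *dev)
{
	const struct foo_info *info = device_get_match_data(dev);

	if (!info)
		return -ENODEV;

	/* dev_fwnode() also accepts the const pointer now */
	return fwnode_property_present(dev_fwnode(dev), "enable-foo") ?
	       0 : -EINVAL;
}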
Reported-by: kernel test robot Acked-by: Heikki Krogerus Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220922135410.49694-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index a5b429d623f6..117cc200c656 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -32,7 +32,7 @@ enum dev_dma_attr { DEV_DMA_COHERENT, }; -struct fwnode_handle *dev_fwnode(struct device *dev); +struct fwnode_handle *dev_fwnode(const struct device *dev); bool device_property_present(struct device *dev, const char *propname); int device_property_read_u8_array(struct device *dev, const char *propname, @@ -387,7 +387,7 @@ bool device_dma_supported(struct device *dev); enum dev_dma_attr device_get_dma_attr(struct device *dev); -const void *device_get_match_data(struct device *dev); +const void *device_get_match_data(const struct device *dev); int device_get_phy_mode(struct device *dev); int fwnode_get_phy_mode(struct fwnode_handle *fwnode); -- cgit v1.2.3 From bf2ee8d0c385f883a00473768b67faf2189b2410 Mon Sep 17 00:00:00 2001 From: Jianmin Lv Date: Sun, 11 Sep 2022 17:06:34 +0800 Subject: ACPI: scan: Support multiple DMA windows with different offsets In DT system configurations, of_dma_get_range() returns struct bus_dma_region DMA regions; they are used to set up device DMA windows with different offsets available for translation between DMA address and CPU address. In ACPI system configurations, acpi_dma_get_range() does not return DMA regions yet, and that precludes setting up the dev->dma_range_map pointer and therefore DMA regions with multiple offsets. Update acpi_dma_get_range() to return struct bus_dma_region DMA regions like of_dma_get_range() does. After updating acpi_dma_get_range(), acpi_arch_dma_setup() is changed for ARM64: the original dma_addr and size arguments are removed as now redundant, and 0 and U64_MAX are passed for the dma_base and size of arch_setup_dma_ops(); this is a simplification consistent with what other ACPI architectures also pass to iommu_setup_dma_ops(). Reviewed-by: Robin Murphy Signed-off-by: Jianmin Lv Reviewed-by: Lorenzo Pieralisi Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..bb41623dab77 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -281,12 +281,12 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); -void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size); +void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } static inline void -acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { } +acpi_arch_dma_setup(struct device *dev) { } #endif int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); @@ -977,8 +977,7 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } -static inline int acpi_dma_get_range(struct device *dev, u64 *dma_addr, - u64 *offset, u64 *size) +static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) { return -ENODEV; } -- cgit v1.2.3 From c78c43fe7d42524c8f364aaf95ef3652e7f1186b Mon Sep 17 00:00:00 2001 From: Jianmin Lv Date: Sun, 11 Sep 2022 17:06:35 +0800 Subject: LoongArch: Use acpi_arch_dma_setup() and remove ARCH_HAS_PHYS_TO_DMA Use _DMA as defined in the ACPI spec for translation between DMA addresses and CPU addresses, and implement acpi_arch_dma_setup() to initialize dev->dma_range_map, where acpi_dma_get_range() is called to parse _DMA. E.g. if we have two DMA ranges:

  cpu address     dma address   size         offset
  0x200080000000  0x2080000000  0x400000000  0x1fe000000000
  0x400080000000  0x4080000000  0x400000000  0x3fc000000000

then _DMA for PCI devices should be declared in the host bridge as follows:

  Name (_DMA, ResourceTemplate() {
      QWordMemory (ResourceProducer, PosDecode, MinFixed, MaxFixed,
          NonCacheable, ReadWrite, 0x0, 0x4080000000, 0x447fffffff,
          0x3fc000000000, 0x400000000, , , )
      QWordMemory (ResourceProducer, PosDecode, MinFixed, MaxFixed,
          NonCacheable, ReadWrite, 0x0, 0x2080000000, 0x247fffffff,
          0x1fe000000000, 0x400000000, , , )
  })

Acked-by: Huacai Chen Signed-off-by: Jianmin Lv Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bb41623dab77..a71d73a0d43e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -279,14 +279,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); +#if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) +void acpi_arch_dma_setup(struct device *dev); +#else +static inline void acpi_arch_dma_setup(struct device *dev) { } +#endif + #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); -void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } -static inline void -acpi_arch_dma_setup(struct device *dev) { } #endif int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); -- cgit v1.2.3 From 43cf36974d760a3d1c705a83de89ac58059e5f0b Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Thu, 22 Sep 2022 00:04:37 +0100 Subject: platform/x86: int3472: Support multiple clock consumers At present, the tps68470.c only supports a single clock consumer when passing platform data to the clock driver. In some devices multiple sensors depend on the clock provided by a single TPS68470 and so all need to be able to acquire the clock. Support passing multiple consumers as platform data. Reviewed-by: Hans de Goede Signed-off-by: Daniel Scally Reviewed-by: Stephen Boyd Acked-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki --- include/linux/platform_data/tps68470.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/tps68470.h b/include/linux/platform_data/tps68470.h index 126d082c3f2e..e605a2cab07f 100644 --- a/include/linux/platform_data/tps68470.h +++ b/include/linux/platform_data/tps68470.h @@ -27,9 +27,14 @@ struct tps68470_regulator_platform_data { const struct regulator_init_data *reg_init_data[TPS68470_NUM_REGULATORS]; }; -struct tps68470_clk_platform_data { +struct tps68470_clk_consumer { const char *consumer_dev_name; const char *consumer_con_id; }; +struct tps68470_clk_platform_data { + unsigned int n_consumers; + struct tps68470_clk_consumer consumers[]; +}; + #endif -- cgit v1.2.3 From 7c7f9bc986e698873b489c371a08f206979d06b7 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 22 Sep 2022 18:27:33 +0200 Subject: serial: Deassert Transmit Enable on probe in driver-specific way MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a UART port is newly registered, uart_configure_port() seeks to deassert RS485 Transmit Enable by setting the RTS bit in port->mctrl. However a number of UART drivers interpret a set RTS bit as *assertion* instead of deassertion: Affected drivers include those using serial8250_em485_config() (except 8250_bcm2835aux.c) and some using mctrl_gpio (e.g. imx.c). Since the interpretation of the RTS bit is driver-specific, it is not suitable as a means to centrally deassert Transmit Enable in the serial core. Instead, the serial core must call on drivers to deassert it in their driver-specific way. One way to achieve that is to call ->rs485_config(). It implicitly deasserts Transmit Enable. So amend uart_configure_port() and uart_resume_port() to invoke uart_rs485_config(). 
That allows removing calls to uart_rs485_config() from drivers' ->probe() hooks and declaring the function static. Skip any invocation of ->set_mctrl() if RS485 is enabled. RS485 has no hardware flow control, so the modem control lines are irrelevant and need not be touched. When leaving RS485 mode, reset the modem control lines to the state stored in port->mctrl. That way, UARTs which are muxed between RS485 and RS232 transceivers drive the lines correctly when switched to RS232. (serial8250_do_startup() historically raises the OUT1 modem signal because otherwise interrupts are not signaled on ancient PC UARTs, but I believe that no longer applies to modern, RS485-capable UARTs and is thus safe to be skipped.) imx.c modifies port->mctrl whenever Transmit Enable is asserted and deasserted. Stop it from doing that so port->mctrl reflects the RS232 line state. 8250_omap.c deasserts Transmit Enable on ->runtime_resume() by calling ->set_mctrl(). Because that is now a no-op in RS485 mode, amend the function to call serial8250_em485_stop_tx(). fsl_lpuart.c retrieves and applies the RS485 device tree properties after registering the UART port. Because applying now happens on registration in uart_configure_port(), move retrieval of the properties ahead of uart_add_one_port(). Link: https://lore.kernel.org/all/20220329085050.311408-1-matthias.schiffer@ew.tq-group.com/ Link: https://lore.kernel.org/all/8f538a8903795f22f9acc94a9a31b03c9c4ccacb.camel@ginzinger.com/ Fixes: d3b3404df318 ("serial: Fix incorrect rs485 polarity on uart open") Cc: stable@vger.kernel.org # v4.14+ Reported-by: Matthias Schiffer Reported-by: Roosen Henri Tested-by: Matthias Schiffer Reviewed-by: Ilpo Järvinen Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/2de36eba3fbe11278d5002e4e501afe0ceaca039.1663863805.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index b4c21f84ad79..d657f2a42a7b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -951,5 +951,4 @@ static inline int uart_handle_break(struct uart_port *port) !((cflag) & CLOCAL)) int uart_get_rs485_mode(struct uart_port *port); -int uart_rs485_config(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From 1a77dd1c2bb5d4a58c16d198cf593720787c02e4 Mon Sep 17 00:00:00 2001 From: Arun Easi Date: Wed, 7 Sep 2022 16:33:08 -0700 Subject: scsi: tracing: Fix compile error in trace_array calls when TRACING is disabled Fix this compilation error seen when CONFIG_TRACING is not enabled: drivers/scsi/qla2xxx/qla_os.c: In function 'qla_trace_init': drivers/scsi/qla2xxx/qla_os.c:2854:25: error: implicit declaration of function 'trace_array_get_by_name'; did you mean 'trace_array_set_clr_event'? [-Werror=implicit-function-declaration] 2854 | qla_trc_array = trace_array_get_by_name("qla2xxx"); | ^~~~~~~~~~~~~~~~~~~~~~~ | trace_array_set_clr_event drivers/scsi/qla2xxx/qla_os.c: In function 'qla_trace_uninit': drivers/scsi/qla2xxx/qla_os.c:2869:9: error: implicit declaration of function 'trace_array_put' [-Werror=implicit-function-declaration] 2869 | trace_array_put(qla_trc_array); | ^~~~~~~~~~~~~~~ Link: https://lore.kernel.org/r/20220907233308.4153-2-aeasi@marvell.com Reported-by: kernel test robot Reviewed-by: Steven Rostedt (Google) Signed-off-by: Arun Easi Signed-off-by: Martin K. 
Petersen --- include/linux/trace.h | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace.h b/include/linux/trace.h index bf169612ffe1..b5e16e438448 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -2,8 +2,6 @@ #ifndef _LINUX_TRACE_H #define _LINUX_TRACE_H -#ifdef CONFIG_TRACING - #define TRACE_EXPORT_FUNCTION BIT(0) #define TRACE_EXPORT_EVENT BIT(1) #define TRACE_EXPORT_MARKER BIT(2) @@ -28,6 +26,8 @@ struct trace_export { int flags; }; +#ifdef CONFIG_TRACING + int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); @@ -48,6 +48,38 @@ void osnoise_arch_unregister(void); void osnoise_trace_irq_entry(int id); void osnoise_trace_irq_exit(int id, const char *desc); +#else /* CONFIG_TRACING */ +static inline int register_ftrace_export(struct trace_export *export) +{ + return -EINVAL; +} +static inline int unregister_ftrace_export(struct trace_export *export) +{ + return 0; +} +static inline void trace_printk_init_buffers(void) +{ +} +static inline int trace_array_printk(struct trace_array *tr, unsigned long ip, + const char *fmt, ...) +{ + return 0; +} +static inline int trace_array_init_printk(struct trace_array *tr) +{ + return -EINVAL; +} +static inline void trace_array_put(struct trace_array *tr) +{ +} +static inline struct trace_array *trace_array_get_by_name(const char *name) +{ + return NULL; +} +static inline int trace_array_destroy(struct trace_array *tr) +{ + return 0; +} #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ -- cgit v1.2.3 From 38622010a6de3a62cc72688348548854ed82dcf5 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 15 Aug 2022 13:54:28 -0700 Subject: btrfs: send: add support for fs-verity Preserve the fs-verity status of a btrfs file across send/recv. There is no facility for installing the Merkle tree contents directly on the receiving filesystem, so we package up the parameters used to enable verity found in the verity descriptor. This gives the receive side enough information to properly enable verity again. Note that this means that receive will have to re-compute the whole Merkle tree, similar to how compression worked before encoded_write. Since the file becomes read-only after verity is enabled, it is important that verity is added to the send stream after any file writes. Therefore, when we process a verity item, merely note that it happened, then actually create the command in the send stream during 'finish_inode_if_needed'. This also creates V3 of the send stream format, without any format changes besides adding the new commands and attributes. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- include/linux/fsverity.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 7af030fa3c36..40f14e5fed9d 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -22,6 +22,9 @@ */ #define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE +/* Arbitrary limit to bound the kmalloc() size. Can be changed. 
*/ +#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 + /* Verity operations for filesystems */ struct fsverity_operations { -- cgit v1.2.3 From 691cdf016d3be6f66a3ea384809be229e0f9c590 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Sep 2022 11:34:45 +0200 Subject: powerpc: Rely on generic definition of hugepd_t and is_hugepd when unused CONFIG_ARCH_HAS_HUGEPD is used to tell core mm when huge page directories are used. When they are not used, there is no need to provide hugepd_t or is_hugepd(); just rely on the core mm fallback definition. For that, change core mm behaviour so that CONFIG_ARCH_HAS_HUGEPD is used instead of testing indirectly for the existence of the is_hugepd macro. powerpc being the only user of huge page directories, there is no impact on other architectures. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/da81462d93069bb90fe5e762dd3283a644318937.1662543243.git.christophe.leroy@csgroup.eu --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ec981a0d8b3..1ec1535be04f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -17,7 +17,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; -#ifndef is_hugepd +#ifndef CONFIG_ARCH_HAS_HUGEPD typedef struct { unsigned long pd; } hugepd_t; #define is_hugepd(hugepd) (0) #define __hugepd(x) ((hugepd_t) { (x) }) -- cgit v1.2.3 From 4f58330fcc8482aa90674e1f40f601e82f18ed4a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 13 Sep 2022 12:47:20 +0100 Subject: iommu/iova: Fix module config properly IOMMU_IOVA is intended to be an optional library for users to select as and when they desire. Since it can be a module now, this means that built-in code which has chosen not to select it should not fail to link if it happens to have been selected as a module by someone else. Replace IS_ENABLED() with IS_REACHABLE() to do the right thing. CC: Thierry Reding Reported-by: John Garry Fixes: 15bbdec3931e ("iommu: Make the iova library a module") Signed-off-by: Robin Murphy Reviewed-by: Thierry Reding Link: https://lore.kernel.org/r/548c2f683ca379aface59639a8f0cccc3a1ac050.1663069227.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iova.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index c6ba6d95d79c..83c00fac2acb 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -75,7 +75,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } -#if IS_ENABLED(CONFIG_IOMMU_IOVA) +#if IS_REACHABLE(CONFIG_IOMMU_IOVA) int iova_cache_get(void); void iova_cache_put(void); -- cgit v1.2.3 From dc09fe1c5edd9c27a52cb6dc5a7bb4452d45c71c Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 16 Sep 2022 11:41:51 +0200 Subject: iommu/io-pgtable-dart: Add DART PTE support for t6000 The DARTs present in the M1 Pro/Max/Ultra SoCs use a different PTE format. They support a 42-bit physical address space by shifting the paddr and extending its mask inside the PTE. They also come with mandatory sub-page protection now, which we just configure to always allow access to the entire page. This feature is already present but optional on the previous DARTs, which allows it to be configured unconditionally.
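For context, a hedged sketch of how a DART driver might pick the new format (the is_t6000 flag and priv cookie are invented; APPLE_DART2 and alloc_io_pgtable_ops() come from the real io-pgtable API):

  #include <linux/io-pgtable.h>

  static struct io_pgtable_ops *dart_alloc_ops(struct io_pgtable_cfg *cfg,
                                               void *priv, bool is_t6000)
  {
          /* t6000-class DARTs need the shifted-paddr PTE layout. */
          enum io_pgtable_fmt fmt = is_t6000 ? APPLE_DART2 : APPLE_DART;

          return alloc_io_pgtable_ops(fmt, cfg, priv);
  }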
Signed-off-by: Sven Peter Co-developed-by: Janne Grunau Signed-off-by: Janne Grunau Reviewed-by: Rob Herring Acked-by: Hector Martin Link: https://lore.kernel.org/r/20220916094152.87137-5-j@jannau.net Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ca98aeadcc80..b768937382cd 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -17,6 +17,7 @@ enum io_pgtable_fmt { ARM_MALI_LPAE, AMD_IOMMU_V1, APPLE_DART, + APPLE_DART2, IO_PGTABLE_NUM_FMTS, }; -- cgit v1.2.3 From c59fb127583869350256656b7ed848c398bef879 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 21 Sep 2022 00:32:01 +0000 Subject: KVM: remove KVM_REQ_UNHALT KVM_REQ_UNHALT is now unnecessary because it is replaced by the return value of kvm_vcpu_block/kvm_vcpu_halt. Remove it. No functional change intended. Signed-off-by: Paolo Bonzini Signed-off-by: Sean Christopherson Acked-by: Marc Zyngier Message-Id: <20220921003201.1441511-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 04c7e5f2f727..32f259fa5801 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -151,12 +151,11 @@ static inline bool is_error_page(struct page *page) #define KVM_REQUEST_NO_ACTION BIT(10) /* * Architecture-independent vcpu->requests bit members - * Bits 4-7 are reserved for more arch-independent bits. + * Bits 3-7 are reserved for more arch-independent bits. */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 -#define KVM_REQ_UNHALT 3 #define KVM_REQUEST_ARCH_BASE 8 /* -- cgit v1.2.3 From 9fca7115827b2e5f48d84e50bceb4edfd4cb6375 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:45 -0700 Subject: cfi: Remove CONFIG_CFI_CLANG_SHADOW In preparation to switching to -fsanitize=kcfi, remove support for the CFI module shadow that will no longer be needed. 
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-4-samitolvanen@google.com --- include/linux/cfi.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index c6dfc1ed0626..4ab51c067007 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -20,18 +20,6 @@ extern void __cfi_check(uint64_t id, void *ptr, void *diag); #define __CFI_ADDRESSABLE(fn, __attr) \ const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn -#ifdef CONFIG_CFI_CLANG_SHADOW - -extern void cfi_module_add(struct module *mod, unsigned long base_addr); -extern void cfi_module_remove(struct module *mod, unsigned long base_addr); - -#else - -static inline void cfi_module_add(struct module *mod, unsigned long base_addr) {} -static inline void cfi_module_remove(struct module *mod, unsigned long base_addr) {} - -#endif /* CONFIG_CFI_CLANG_SHADOW */ - #else /* !CONFIG_CFI_CLANG */ #ifdef CONFIG_X86_KERNEL_IBT -- cgit v1.2.3 From 92efda8eb15295a07f450828b2db14485bfc09c2 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:46 -0700 Subject: cfi: Drop __CFI_ADDRESSABLE The __CFI_ADDRESSABLE macro is used for init_module and cleanup_module to ensure we have the address of the CFI jump table, and with CONFIG_X86_KERNEL_IBT to ensure LTO won't optimize away the symbols. As __CFI_ADDRESSABLE is no longer necessary with -fsanitize=kcfi, add a more flexible version of the __ADDRESSABLE macro and always ensure these symbols won't be dropped. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-5-samitolvanen@google.com --- include/linux/cfi.h | 20 -------------------- include/linux/compiler.h | 6 ++++-- include/linux/module.h | 4 ++-- 3 files changed, 6 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 4ab51c067007..2cdbc0fbd0ab 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -13,26 +13,6 @@ typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag); /* Compiler-generated function in each module, and the kernel */ extern void __cfi_check(uint64_t id, void *ptr, void *diag); -/* - * Force the compiler to generate a CFI jump table entry for a function - * and store the jump table address to __cfi_jt_. - */ -#define __CFI_ADDRESSABLE(fn, __attr) \ - const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn - -#else /* !CONFIG_CFI_CLANG */ - -#ifdef CONFIG_X86_KERNEL_IBT - -#define __CFI_ADDRESSABLE(fn, __attr) \ - const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn - -#endif /* CONFIG_X86_KERNEL_IBT */ - #endif /* CONFIG_CFI_CLANG */ -#ifndef __CFI_ADDRESSABLE -#define __CFI_ADDRESSABLE(fn, __attr) -#endif - #endif /* _LINUX_CFI_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7713d7bcdaea..7bfafc69172a 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -221,9 +221,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. 
*/ -#define __ADDRESSABLE(sym) \ - static void * __section(".discard.addressable") __used \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym; +#define __ADDRESSABLE(sym) \ + ___ADDRESSABLE(sym, __section(".discard.addressable")) /** * offset_to_ptr - convert a relative memory offset to an absolute pointer diff --git a/include/linux/module.h b/include/linux/module.h index 518296ea7f73..8937b020ec04 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -132,7 +132,7 @@ extern void cleanup_module(void); { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ - __CFI_ADDRESSABLE(init_module, __initdata); + ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ @@ -140,7 +140,7 @@ extern void cleanup_module(void); { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ - __CFI_ADDRESSABLE(cleanup_module, __exitdata); + ___ADDRESSABLE(cleanup_module, __exitdata); #endif -- cgit v1.2.3 From 89245600941e4e0f87d77f60ee269b5e61ef4e49 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:47 -0700 Subject: cfi: Switch to -fsanitize=kcfi Switch from Clang's original forward-edge control-flow integrity implementation to -fsanitize=kcfi, which is better suited for the kernel, as it doesn't require LTO, doesn't use a jump table that requires altering function references, and won't break cross-module function address equality. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-6-samitolvanen@google.com --- include/linux/cfi.h | 29 +++++++++++++++++++++++++---- include/linux/compiler-clang.h | 14 +++----------- include/linux/module.h | 6 +++--- 3 files changed, 31 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 2cdbc0fbd0ab..5e134f4ce8b7 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -2,17 +2,38 @@ /* * Clang Control Flow Integrity (CFI) support. 
* - * Copyright (C) 2021 Google LLC + * Copyright (C) 2022 Google LLC */ #ifndef _LINUX_CFI_H #define _LINUX_CFI_H +#include +#include + #ifdef CONFIG_CFI_CLANG -typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag); +enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, + unsigned long *target, u32 type); -/* Compiler-generated function in each module, and the kernel */ -extern void __cfi_check(uint64_t id, void *ptr, void *diag); +static inline enum bug_trap_type report_cfi_failure_noaddr(struct pt_regs *regs, + unsigned long addr) +{ + return report_cfi_failure(regs, addr, NULL, 0); +} +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +bool is_cfi_trap(unsigned long addr); +#endif #endif /* CONFIG_CFI_CLANG */ +#ifdef CONFIG_MODULES +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod); +#else +static inline void module_cfi_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) {} +#endif /* CONFIG_ARCH_USES_CFI_TRAPS */ +#endif /* CONFIG_MODULES */ + #endif /* _LINUX_CFI_H */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445..42e55579d649 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -66,17 +66,9 @@ # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#define __nocfi __attribute__((__no_sanitize__("cfi"))) -#define __cficanonical __attribute__((__cfi_canonical_jump_table__)) - -#if defined(CONFIG_CFI_CLANG) -/* - * With CONFIG_CFI_CLANG, the compiler replaces function address - * references with the address of the function's CFI jump table - * entry. The function_nocfi macro always returns the address of the - * actual function instead. - */ -#define function_nocfi(x) __builtin_function_start(x) +#if __has_feature(kcfi) +/* Disable CFI checking inside a function. */ +#define __nocfi __attribute__((__no_sanitize__("kcfi"))) #endif /* diff --git a/include/linux/module.h b/include/linux/module.h index 8937b020ec04..ec61fb53979a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -387,8 +386,9 @@ struct module { const s32 *crcs; unsigned int num_syms; -#ifdef CONFIG_CFI_CLANG - cfi_check_fn cfi_check; +#ifdef CONFIG_ARCH_USES_CFI_TRAPS + s32 *kcfi_traps; + s32 *kcfi_traps_end; #endif /* Kernel parameters. */ -- cgit v1.2.3 From e84e008e7b02c015047e76261726da1550130a59 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:48 -0700 Subject: cfi: Add type helper macros With CONFIG_CFI_CLANG, assembly functions called indirectly from C code must be annotated with type identifiers to pass CFI checking. In order to make this easier, the compiler emits a __kcfi_typeid_ symbol for each address-taken function declaration in C, which contains the expected type identifier that we can refer to in assembly code. Add a typed version of SYM_FUNC_START, which emits the type identifier before the function. Architectures that support KCFI can define their own __CFI_TYPE macro to override the default preamble format. 
As an example, for the x86_64 blowfish_dec_blk function, the compiler emits the following type symbol: $ readelf -sW vmlinux | grep __kcfi_typeid_blowfish_dec_blk 120204: 00000000ef478db5 0 NOTYPE WEAK DEFAULT ABS __kcfi_typeid_blowfish_dec_blk And SYM_TYPED_FUNC_START will generate the following preamble based on the __CFI_TYPE definition for the architecture: $ objdump -dr arch/x86/crypto/blowfish-x86_64-asm_64.o ... 0000000000000400 <__cfi_blowfish_dec_blk>: ... 40b: b8 00 00 00 00 mov $0x0,%eax 40c: R_X86_64_32 __kcfi_typeid_blowfish_dec_blk 0000000000000410 : ... Note that the address of all assembly functions annotated with SYM_TYPED_FUNC_START must be taken in C code that's linked into the binary or the missing __kcfi_typeid_ symbol will result in a linker error with CONFIG_CFI_CLANG. If the code that contains the indirect call is not always compiled in, __ADDRESSABLE(functionname) can be used to ensure that the __kcfi_typeid_ symbol is emitted. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-7-samitolvanen@google.com --- include/linux/cfi_types.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 include/linux/cfi_types.h (limited to 'include/linux') diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h new file mode 100644 index 000000000000..6b8713675765 --- /dev/null +++ b/include/linux/cfi_types.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Clang Control Flow Integrity (CFI) type definitions. + */ +#ifndef _LINUX_CFI_TYPES_H +#define _LINUX_CFI_TYPES_H + +#ifdef __ASSEMBLY__ +#include + +#ifdef CONFIG_CFI_CLANG +/* + * Use the __kcfi_typeid_ type identifier symbol to + * annotate indirectly called assembly functions. The compiler emits + * these symbols for all address-taken function declarations in C + * code. + */ +#ifndef __CFI_TYPE +#define __CFI_TYPE(name) \ + .4byte __kcfi_typeid_##name +#endif + +#define SYM_TYPED_ENTRY(name, linkage, align...) \ + linkage(name) ASM_NL \ + align ASM_NL \ + __CFI_TYPE(name) ASM_NL \ + name: + +#define SYM_TYPED_START(name, linkage, align...) \ + SYM_TYPED_ENTRY(name, linkage, align) + +#else /* CONFIG_CFI_CLANG */ + +#define SYM_TYPED_START(name, linkage, align...) \ + SYM_START(name, linkage, align) + +#endif /* CONFIG_CFI_CLANG */ + +#ifndef SYM_TYPED_FUNC_START +#define SYM_TYPED_FUNC_START(name) \ + SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _LINUX_CFI_TYPES_H */ -- cgit v1.2.3 From 5dbbb3eaa2a784342f3206b77381b516181d089c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:54 -0700 Subject: init: Drop __nocfi from __init It's no longer necessary to disable CFI checking for all __init functions. Drop the __nocfi attribute from __init. 
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-13-samitolvanen@google.com --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010..88f2964097f5 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -47,7 +47,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(".init.text") __cold __latent_entropy __noinitretpoline __nocfi +#define __init __section(".init.text") __cold __latent_entropy __noinitretpoline #define __initdata __section(".init.data") #define __initconst __section(".init.rodata") #define __exitdata __section(".exit.data") -- cgit v1.2.3 From 607289a7cd7a3ca42b8a6877fcb6072e6eb20c34 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:55 -0700 Subject: treewide: Drop function_nocfi With -fsanitize=kcfi, we no longer need function_nocfi() as the compiler won't change function references to point to a jump table. Remove all implementations and uses of the macro. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-14-samitolvanen@google.com --- include/linux/compiler.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7bfafc69172a..973a1bfd7ef5 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -203,16 +203,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) -/* - * With CONFIG_CFI_CLANG, the compiler replaces function addresses in - * instrumented C code with jump table addresses. Architectures that - * support CFI can define this macro to return the actual function address - * when needed. - */ -#ifndef function_nocfi -#define function_nocfi(x) (x) -#endif - #endif /* __KERNEL__ */ /* -- cgit v1.2.3 From 5659b598b4dcb352b1a567c55fc5a658bc80076c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Sep 2022 14:54:57 -0700 Subject: treewide: Drop __cficanonical CONFIG_CFI_CLANG doesn't use a jump table anymore and therefore, won't change function references to point elsewhere. Remove the __cficanonical attribute and all uses of it. Note that the Clang definition of the attribute was removed earlier, just clean up the no-op definition and users. 
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220908215504.3686827-16-samitolvanen@google.com --- include/linux/compiler_types.h | 4 ---- include/linux/init.h | 4 ++-- include/linux/pci.h | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..6f2ec0976e2d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -265,10 +265,6 @@ struct ftrace_likely_data { # define __nocfi #endif -#ifndef __cficanonical -# define __cficanonical -#endif - /* * Any place that could be marked with the "alloc_size" attribute is also * a place to be marked with the "malloc" attribute. Do this as part of the diff --git a/include/linux/init.h b/include/linux/init.h index 88f2964097f5..a0a90cd73ebe 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -220,8 +220,8 @@ extern bool initcall_debug; __initcall_name(initstub, __iid, id) #define __define_initcall_stub(__stub, fn) \ - int __init __cficanonical __stub(void); \ - int __init __cficanonical __stub(void) \ + int __init __stub(void); \ + int __init __stub(void) \ { \ return fn(); \ } \ diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd..5da0846aa3c1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2019,8 +2019,8 @@ enum pci_fixup_pass { #ifdef CONFIG_LTO_CLANG #define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ class_shift, hook, stub) \ - void __cficanonical stub(struct pci_dev *dev); \ - void __cficanonical stub(struct pci_dev *dev) \ + void stub(struct pci_dev *dev); \ + void stub(struct pci_dev *dev) \ { \ hook(dev); \ } \ -- cgit v1.2.3 From fa35198f39571bbdae53c5b321020021eaad6bd2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 19 Sep 2022 16:33:33 -0700 Subject: fortify: Explicitly check bounds are compile-time constants In preparation for replacing __builtin_object_size() with __builtin_dynamic_object_size(), all the compile-time size checks need to check that the bounds comparisons are, in fact, known at compile-time. Enforce what was guaranteed with __bos(). In other words, since all uses of __bos() were constant expressions, it was not required to test for this. When these change to __bdos(), they _may_ be constant expressions, and the checks are only valid when the prior condition holds. This results in no binary differences. 
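A short behavioral sketch of the new predicate (dst and get_len() are invented): the compile-time diagnostic now fires only when the comparison itself is provable, and everything else quietly falls through to the run-time check.

  char dst[8];
  size_t dyn = get_len();                            /* run-time value */

  bool a = __compiletime_lessthan(sizeof(dst), 16);  /* true: provably too small */
  bool b = __compiletime_lessthan(sizeof(dst), 4);   /* false: provably fits */
  bool c = __compiletime_lessthan(sizeof(dst), dyn); /* false: not a constant
                                                        expression, so only the
                                                        run-time check applies */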
Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/lkml/20220920192202.190793-3-keescook@chromium.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 49 ++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index ff879efe94ed..1c582224c525 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -80,6 +80,11 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) +#define __compiletime_lessthan(bounds, length) ( \ + __builtin_constant_p((bounds) < (length)) && \ + (bounds) < (length) \ +) + /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * @@ -117,7 +122,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 1); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(__func__); @@ -224,7 +229,7 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ - if (__builtin_constant_p(size) && size > p_size) + if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* @@ -281,15 +286,16 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where - * buffer sizes are known. + * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ - if (p_size > p_size_field && p_size < size) + if (__compiletime_lessthan(p_size_field, p_size) && + __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ - if (p_size_field < size) + if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* @@ -365,25 +371,28 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where - * buffer sizes are known. + * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ - if (p_size > p_size_field && p_size < size) + if (__compiletime_lessthan(p_size_field, p_size) && + __compiletime_lessthan(p_size, size)) __write_overflow(); - if (q_size > q_size_field && q_size < size) + if (__compiletime_lessthan(q_size_field, q_size) && + __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ - if (p_size_field < size) + if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. 
*/ - if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || p_size_field < size) && - q_size_field < size) + if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || + __compiletime_lessthan(p_size_field, size)) && + __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* @@ -494,7 +503,7 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -508,9 +517,9 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size_t q_size = __builtin_object_size(q, 0); if (__builtin_constant_p(size)) { - if (p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); - if (q_size < size) + if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size || q_size < size) @@ -523,7 +532,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -535,7 +544,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -547,7 +556,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp { size_t p_size = __builtin_object_size(p, 0); - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); @@ -563,11 +572,13 @@ char *strcpy(char * const POS p, const char * const POS q) size_t size; /* If neither buffer size is known, immediately give up. */ - if (p_size == SIZE_MAX && q_size == SIZE_MAX) + if (__builtin_constant_p(p_size) && + __builtin_constant_p(q_size) && + p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ - if (__builtin_constant_p(size) && p_size < size) + if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) -- cgit v1.2.3 From 9f7d69c5cd23904a29178a7ecc4eee9c1cfba04b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 19 Sep 2022 19:50:32 -0700 Subject: fortify: Convert to struct vs member helpers In preparation for adding support for __builtin_dynamic_object_size(), wrap each instance of __builtin_object_size(p, N) with either the new __struct_size(p) as __bos(p, 0), or __member_size(p) as __bos(p, 1). This will allow us to replace the definitions with __bdos() next. There are no binary differences from this change. 
Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: linux-hardening@vger.kernel.org Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/lkml/20220920192202.190793-4-keescook@chromium.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 68 ++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 1c582224c525..b62c90cfafaf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -20,7 +20,7 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" ({ \ unsigned char *__p = (unsigned char *)(p); \ size_t __ret = SIZE_MAX; \ - size_t __p_size = __builtin_object_size(p, 1); \ + size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ @@ -72,13 +72,15 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __underlying_memcpy(dst, src, bytes) /* - * Clang's use of __builtin_object_size() within inlines needs hinting via - * __pass_object_size(). The preference is to only ever use type 1 (member + * Clang's use of __builtin_*object_size() within inlines needs hinting via + * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ -#define POS __pass_object_size(1) -#define POS0 __pass_object_size(0) +#define POS __pass_object_size(1) +#define POS0 __pass_object_size(0) +#define __struct_size(p) __builtin_object_size(p, 0) +#define __member_size(p) __builtin_object_size(p, 1) #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ @@ -120,7 +122,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); @@ -132,7 +134,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); if (p_size == SIZE_MAX) return __underlying_strcat(p, q); @@ -144,7 +146,7 @@ char *strcat(char * const POS p, const char *q) extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); size_t p_len = __compiletime_strlen(p); size_t ret; @@ -174,7 +176,7 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { __kernel_size_t ret; - size_t p_size = __builtin_object_size(p, 1); + size_t p_size = __member_size(p); /* Give up if we don't know how large p is. 
*/ if (p_size == SIZE_MAX) @@ -189,8 +191,8 @@ __kernel_size_t __fortify_strlen(const char * const POS p) extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) { - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); size_t q_len; /* Full count of source string length. */ size_t len; /* Count of characters going into destination. */ @@ -218,8 +220,8 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s { size_t len; /* Use string size rather than possible enclosing struct size. */ - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) @@ -264,8 +266,8 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { size_t p_len, copy_len; - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); @@ -323,11 +325,11 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, }) /* - * __builtin_object_size() must be captured here to avoid evaluating argument - * side-effects further into the macro layers. + * __struct_size() vs __member_size() must be captured here to avoid + * evaluating argument side-effects further into the macro layers. */ #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ - __builtin_object_size(p, 0), __builtin_object_size(p, 1)) + __struct_size(p), __member_size(p)) /* * To make sure the compiler can enforce protection against buffer overflows, @@ -420,7 +422,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, * fake flexible arrays, until they are all converted to * proper flexible arrays. * - * The implementation of __builtin_object_size() behaves + * The implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be @@ -486,22 +488,22 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, */ /* - * __builtin_object_size() must be captured here to avoid evaluating argument - * side-effects further into the macro layers. + * __struct_size() vs __member_size() must be captured here to avoid + * evaluating argument side-effects further into the macro layers. 
*/ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ - __builtin_object_size(p, 0), __builtin_object_size(q, 0), \ - __builtin_object_size(p, 1), __builtin_object_size(q, 1), \ + __struct_size(p), __struct_size(q), \ + __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ - __builtin_object_size(p, 0), __builtin_object_size(q, 0), \ - __builtin_object_size(p, 1), __builtin_object_size(q, 1), \ + __struct_size(p), __struct_size(q), \ + __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -513,8 +515,8 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); - size_t q_size = __builtin_object_size(q, 0); + size_t p_size = __struct_size(p); + size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) @@ -530,7 +532,7 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -542,7 +544,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -554,7 +556,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) { - size_t p_size = __builtin_object_size(p, 0); + size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); @@ -567,8 +569,8 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { - size_t p_size = __builtin_object_size(p, 1); - size_t q_size = __builtin_object_size(q, 1); + size_t p_size = __member_size(p); + size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ -- cgit v1.2.3 From 90bfc37b5ab91c1a6165e3e5cfc49bf04571b762 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Sep 2022 15:09:53 -0400 Subject: SUNRPC: Fix svcxdr_init_decode's end-of-buffer calculation Ensure that stream-based argument decoding can't go past the actual end of the receive buffer. xdr_init_decode's calculation of the value of xdr->end over-estimates the end of the buffer because the Linux kernel RPC server code does not remove the size of the RPC header from rqstp->rq_arg before calling the upper layer's dispatcher. 
The server-side still uses the svc_getnl() macros to decode the RPC call header. These macros reduce the length of the head iov but do not update the total length of the message in the buffer (buf->len). A proper fix for this would be to replace the use of svc_getnl() and friends in the RPC header decoder, but that would be a large and invasive change that would be difficult to backport. Fixes: 5191955d6fc6 ("SUNRPC: Prepare for xdr_stream-style decoding on the server-side") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index daecb009c05b..5a830b66f059 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -544,16 +544,27 @@ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) } /** - * svcxdr_init_decode - Prepare an xdr_stream for svc Call decoding + * svcxdr_init_decode - Prepare an xdr_stream for Call decoding * @rqstp: controlling server RPC transaction context * + * This function currently assumes the RPC header in rq_arg has + * already been decoded. Upon return, xdr->p points to the + * location of the upper layer header. */ static inline void svcxdr_init_decode(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; - struct kvec *argv = rqstp->rq_arg.head; + struct xdr_buf *buf = &rqstp->rq_arg; + struct kvec *argv = buf->head; - xdr_init_decode(xdr, &rqstp->rq_arg, argv->iov_base, NULL); + /* + * svc_getnl() and friends do not keep the xdr_buf's ::len + * field up to date. Refresh that field before initializing + * the argument decoding stream. + */ + buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len; + + xdr_init_decode(xdr, buf, argv->iov_base, NULL); xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); } -- cgit v1.2.3 From 1242a87da0d8cd2a428e96ca68e7ea899b0f4624 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Sep 2022 15:09:59 -0400 Subject: SUNRPC: Fix svcxdr_init_encode's buflen calculation Commit 2825a7f90753 ("nfsd4: allow encoding across page boundaries") added an explicit computation of the remaining length in the rq_res XDR buffer. The computation appears to suffer from an "off-by-one" bug. Because buflen is too large by one page, XDR encoding can run off the end of the send buffer by eventually trying to use the struct page address in rq_page_end, which always contains NULL. 
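A worked example of the off-by-one (the array indices are invented for illustration):

  /*
   * Suppose the reply owns four whole pages:
   *
   *    buf->pages         == &rqstp->rq_pages[1]
   *    rqstp->rq_page_end == &rqstp->rq_pages[5]   <- sentinel slot, NULL
   *
   * Correct count:   rq_page_end - buf->pages     == 4 pages
   * Old expression:  1 + rq_page_end - buf->pages == 5 pages
   *
   * The extra page let the encoder advance onto the NULL sentinel at
   * rq_pages[5] and run off the end of the send buffer.
   */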
Fixes: bddfdbcddbe2 ("NFSD: Extract the svcxdr_init_encode() helper") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5a830b66f059..0ca8a8ffb47e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -587,7 +587,7 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp) xdr->end = resv->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; buf->len = resv->iov_len; xdr->page_ptr = buf->pages - 1; - buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages); + buf->buflen = PAGE_SIZE * (rqstp->rq_page_end - buf->pages); buf->buflen -= rqstp->rq_auth_slack; xdr->rqst = NULL; } -- cgit v1.2.3 From 103cc1fafee48adb91fca0e19deb869fd23e46ab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Sep 2022 17:22:38 -0400 Subject: SUNRPC: Parametrize how much of argsize should be zeroed Currently, SUNRPC clears the whole of .pc_argsize before processing each incoming RPC transaction. Add an extra parameter to struct svc_procedure to enable upper layers to reduce the amount of each operation's argument structure that is zeroed by SUNRPC. The size of struct nfsd4_compoundargs, in particular, is a lot to clear on each incoming RPC Call. A subsequent patch will cut this down to something closer to what NFSv2 and NFSv3 uses. This patch should cause no behavior changes. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0ca8a8ffb47e..88de45491376 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -472,6 +472,7 @@ struct svc_procedure { /* XDR free result: */ void (*pc_release)(struct svc_rqst *); unsigned int pc_argsize; /* argument struct size */ + unsigned int pc_argzero; /* how much of argument to clear */ unsigned int pc_ressize; /* result struct size */ unsigned int pc_cachetype; /* cache info (NFS) */ unsigned int pc_xdrressize; /* maximum size of XDR reply */ -- cgit v1.2.3 From 98124f5bd6c76699d514fbe491dd95265369cc99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Sep 2022 17:22:56 -0400 Subject: NFSD: Refactor common code out of dirlist helpers The dust has settled a bit and it's become obvious what code is totally common between nfsd_init_dirlist_pages() and nfsd3_init_dirlist_pages(). Move that common code to SUNRPC. The new helper brackets the existing xdr_init_decode_pages() API. 
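A hedged sketch of a caller of the new bracket helper (the function shown is illustrative, not the actual nfsd code; xdr_init_encode_pages() has the prototype added in the diff below):

  /* Illustrative only: set up a reply stream whose payload lands in
   * the xdr_buf's page vector. */
  static void init_dirlist_stream(struct svc_rqst *rqstp,
                                  struct xdr_stream *xdr)
  {
          struct xdr_buf *buf = &rqstp->rq_res;

          xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
  }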
Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 69175029abbb..f84e2a1358e1 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -240,6 +240,8 @@ typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); +extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, struct rpc_rqst *rqst); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes); -- cgit v1.2.3 From 24291caf8447f6fc060c8d00136bdc30ee207f38 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:12 -0700 Subject: lib/bitmap: add bitmap_weight_and() The function calculates the Hamming weight of (bitmap1 & bitmap2). Currently we have to do it like this:

  tmp = bitmap_alloc(nbits);
  bitmap_and(tmp, map1, map2, nbits);
  weight = bitmap_weight(tmp, nbits);
  bitmap_free(tmp);

This requires additional memory, adds pressure on the alloc subsystem, and is far less cache-friendly than just:

  weight = bitmap_weight_and(map1, map2, nbits);

The following patches apply it to the cpumask functions. Signed-off-by: Yury Norov --- include/linux/bitmap.h | 12 ++++++++++++ include/linux/cpumask.h | 11 +++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index f65410a49fda..b2aef45af0db 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -51,6 +51,7 @@ struct device; * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits + * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area @@ -164,6 +165,8 @@ bool __bitmap_intersects(const unsigned long *bitmap1, bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); +unsigned int __bitmap_weight_and(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); @@ -439,6 +442,15 @@ unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) return __bitmap_weight(src, nbits); } +static __always_inline +unsigned long bitmap_weight_and(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); + return __bitmap_weight_and(src1, src2, nbits); +} + static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1b442fb2001f..9a71af8097ca 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -586,6 +586,17 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) + * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. + * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. + */ +static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +} + /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result -- cgit v1.2.3 From 3cea8d475327756066e2a54f0b651bb7284dd448 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:13 -0700 Subject: lib: add find_nth{,_and,_andnot}_bit() The kernel lacks a function that searches for the Nth set bit in a bitmap. Usually people do it like this: for_each_set_bit(bit, mask, size) if (n-- == 0) return bit; We can do it more efficiently if we: 1. find the word containing the Nth bit, using hweight(); and 2. find the bit within that word, using a helper fns() that works similarly to __ffs() and ffz(). fns() is implemented as a simple loop. For x86_64, there is the PDEP instruction to do that: ret = clz(pdep(1 << idx, num)). However, for large bitmaps most of the improvement comes from using hweight(), so I kept fns() simple. The new find_nth_bit() is ~70 times faster on x86_64/kvm in the find_bit benchmark: find_nth_bit: 7154190 ns, 16411 iterations for_each_bit: 505493126 ns, 16315 iterations With all that, a family of three new functions is added and used where appropriate in the following patches.
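For illustration, a self-contained user-space sketch of the two-step search described above; compiler builtins stand in for the kernel's hweight_long() and __ffs(), and the sketch assumes size is a multiple of the word size:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* find the n'th (0-based) set bit in a word, like the kernel's fns() */
static unsigned int fns(unsigned long word, unsigned int n)
{
	while (word) {
		unsigned int bit = __builtin_ctzl(word);

		if (n-- == 0)
			return bit;
		word &= word - 1;	/* clear lowest set bit */
	}
	return BITS_PER_LONG;
}

static unsigned long find_nth_bit(const unsigned long *addr,
				  unsigned long size, unsigned long n)
{
	for (unsigned long i = 0; i * BITS_PER_LONG < size; i++) {
		unsigned int w = __builtin_popcountl(addr[i]);

		if (n < w)	/* the Nth bit lives in this word */
			return i * BITS_PER_LONG + fns(addr[i], n);
		n -= w;		/* skip the whole word */
	}
	return size;		/* no such bit */
}

int main(void)
{
	unsigned long map[2] = { 0xf0f0UL, 0x1UL };

	/* bits 0..7 are in map[0]; the 8th set bit is bit 0 of map[1] */
	printf("%lu\n", find_nth_bit(map, 2 * BITS_PER_LONG, 8)); /* 64 */
	return 0;
}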
Signed-off-by: Yury Norov --- include/linux/bitops.h | 19 +++++++++++ include/linux/find.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 3b89c64bcfd8..d7dd83fafeba 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -247,6 +247,25 @@ static inline unsigned long __ffs64(u64 word) return __ffs((unsigned long)word); } +/** + * fns - find N'th set bit in a word + * @word: The word to search + * @n: Bit to find + */ +static inline unsigned long fns(unsigned long word, unsigned int n) +{ + unsigned int bit; + + while (word) { + bit = __ffs(word); + if (n-- == 0) + return bit; + __clear_bit(bit, &word); + } + + return BITS_PER_LONG; +} + /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set diff --git a/include/linux/find.h b/include/linux/find.h index dead6f53a97b..b100944daba0 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -15,6 +15,11 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); +unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n); +unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); @@ -136,6 +141,87 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) } #endif +/** + * find_nth_bit - find N'th set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * The following is semantically equivalent: + * idx = find_nth_bit(addr, size, 0); + * idx = find_first_bit(addr, size); + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static inline +unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? fns(val, n) : size; + } + + return __find_nth_bit(addr, size, n); +} + +/** + * find_nth_and_bit - find N'th set bit in 2 memory regions + * @addr1: The 1st address to start the search at + * @addr2: The 2nd address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static inline +unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? 
fns(val, n) : size; + } + + return __find_nth_and_bit(addr1, addr2, size, n); +} + +/** + * find_nth_andnot_bit - find N'th set bit in 2 memory regions, + * flipping bits in 2nd region + * @addr1: The 1st address to start the search at + * @addr2: The 2nd address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static inline +unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); + + return val ? fns(val, n) : size; + } + + return __find_nth_andnot_bit(addr1, addr2, size, n); +} + #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions -- cgit v1.2.3 From 97848c10f9f8a8ce4296b149d06cab424eba05b3 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:15 -0700 Subject: lib/bitmap: remove bitmap_ord_to_pos Now that we have find_nth_bit(), we can drop bitmap_ord_to_pos(). Signed-off-by: Yury Norov --- include/linux/bitmap.h | 1 - include/linux/nodemask.h | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index b2aef45af0db..7d6d73b78147 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -225,7 +225,6 @@ void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int n #else #define bitmap_copy_le bitmap_copy #endif -unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 4b71a96190a8..0c45fb066caa 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -508,8 +508,7 @@ static inline int node_random(const nodemask_t *maskp) w = nodes_weight(*maskp); if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() % w); return bit; #else return 0; -- cgit v1.2.3 From 944c417daeb63fa345fe0f754c57a5a23ca6d701 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 17 Sep 2022 20:07:16 -0700 Subject: cpumask: add cpumask_nth_{,and,andnot} Add cpumask_nth_{,and,andnot} as wrappers around corresponding find functions, and use it in cpumask_local_spread(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9a71af8097ca..e4f9136a4a63 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -337,6 +337,50 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) return i; } +/** + * cpumask_nth - get the first cpu in a cpumask + * @srcp: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. 
+ */ +static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) +{ + return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); +} + +/** + * cpumask_nth_and - get the first cpu in 2 cpumasks + * @srcp1: the cpumask pointer + * @srcp2: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +static inline +unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), + nr_cpumask_bits, cpumask_check(cpu)); +} + +/** + * cpumask_nth_andnot - get the first cpu set in 1st cpumask, and clear in 2nd. + * @srcp1: the cpumask pointer + * @srcp2: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +static inline +unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), + nr_cpumask_bits, cpumask_check(cpu)); +} + #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ -- cgit v1.2.3 From eab3126571ed1e3e57ce0f066b566af472ebc47a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 3 Jun 2022 15:29:22 +0200 Subject: efi: libstub: simplify efi_get_memory_map() and struct efi_boot_memmap Currently, struct efi_boot_memmap is a struct that is passed around between callers of efi_get_memory_map() and the users of the resulting data, and which carries pointers to various variables whose values are provided by the EFI GetMemoryMap() boot service. This is overly complex, and it is much easier to carry these values in the struct itself. So turn the struct into one that carries these data items directly, including a flex array for the variable number of EFI memory descriptors that the boot service may return. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index d2b84c2fec39..f1b3e0d1b3fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -518,6 +518,15 @@ typedef union { efi_system_table_32_t mixed_mode; } efi_system_table_t; +struct efi_boot_memmap { + unsigned long map_size; + unsigned long desc_size; + u32 desc_ver; + unsigned long map_key; + unsigned long buff_size; + efi_memory_desc_t map[]; +}; + /* * Architecture independent structure for describing a memory map for the * benefit of efi_memmap_init_early(), and for passing context between -- cgit v1.2.3 From de185b56e8a62822d4e1cdb3e068b38ca709aa47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:05:00 +0200 Subject: blk-cgroup: pass a gendisk to blkcg_schedule_throttle Pass the gendisk to blkcg_schedule_throttle as part of moving the blk-cgroup infrastructure to be gendisk based. Remove the unused !BLK_CGROUP stub while we're at it. 
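Returning to the efi_boot_memmap conversion above: the key change is that the header and the variable number of descriptors now live in one allocation, via a flexible array member. A minimal user-space sketch of that layout (plain malloc() and an invented struct desc stand in for the EFI boot services and efi_memory_desc_t):

#include <stdio.h>
#include <stdlib.h>

struct desc { unsigned long phys, pages; };

struct boot_memmap {
	unsigned long map_size;		/* bytes used in map[] */
	unsigned long desc_size;	/* size of one descriptor */
	struct desc map[];		/* trailing descriptors */
};

static struct boot_memmap *alloc_memmap(unsigned long nr)
{
	struct boot_memmap *m;

	/* one allocation carries the header and the descriptors together */
	m = malloc(sizeof(*m) + nr * sizeof(m->map[0]));
	if (m) {
		m->desc_size = sizeof(m->map[0]);
		m->map_size = nr * m->desc_size;
	}
	return m;
}

int main(void)
{
	struct boot_memmap *m = alloc_memmap(4);

	if (!m)
		return 1;
	printf("%lu descriptors\n", m->map_size / m->desc_size);
	free(m);
	return 0;
}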
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9f40dbc65f82..dd5841a42c33 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -18,14 +18,14 @@ struct bio; struct cgroup_subsys_state; -struct request_queue; +struct gendisk; #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay); void blkcg_maybe_throttle_current(void); bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); @@ -39,7 +39,6 @@ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } -static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { return NULL; -- cgit v1.2.3 From 21c9e90ab9a4c991d21dd15cc5163c99a885d4a8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 30 Aug 2022 20:36:01 +0800 Subject: mm, hwpoison: use num_poisoned_pages_sub() to decrease num_poisoned_pages Use num_poisoned_pages_sub() to combine multiple atomic ops into one. Also, num_poisoned_pages_dec() can be killed, as it has no callers now. Link: https://lkml.kernel.org/r/20220830123604.25763-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swapops.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a3d435bf9f97..88825d1785d2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -485,11 +485,6 @@ static inline void num_poisoned_pages_inc(void) atomic_long_inc(&num_poisoned_pages); } -static inline void num_poisoned_pages_dec(void) -{ - atomic_long_dec(&num_poisoned_pages); -} - static inline void num_poisoned_pages_sub(long i) { atomic_long_sub(i, &num_poisoned_pages); -- cgit v1.2.3 From eba4d770efc86a3710e36b828190858abfa3bb74 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:26 -0400 Subject: mm/swap: comment all the ifdef in swapops.h swapops.h contains quite a few layers of ifdefs; some of the "else" and "endif" lines don't carry a comment naming the macro they pair with, so it's hard to follow what they refer to. Add the comments. Link: https://lkml.kernel.org/r/20220811161331.37055-3-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Nadav Amit Reviewed-by: Huang Ying Reviewed-by: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A .
Shutemov" Cc: Minchan Kim Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapops.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 88825d1785d2..7d1b74046520 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -247,8 +247,8 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, #ifdef CONFIG_HUGETLB_PAGE extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); -#endif -#else +#endif /* CONFIG_HUGETLB_PAGE */ +#else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); @@ -276,7 +276,7 @@ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, #ifdef CONFIG_HUGETLB_PAGE static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } -#endif +#endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; @@ -286,7 +286,7 @@ static inline int is_readable_migration_entry(swp_entry_t entry) return 0; } -#endif +#endif /* CONFIG_MIGRATION */ typedef unsigned long pte_marker; @@ -426,7 +426,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) { return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } -#else +#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { @@ -455,7 +455,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } -#endif +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -490,7 +490,7 @@ static inline void num_poisoned_pages_sub(long i) atomic_long_sub(i, &num_poisoned_pages); } -#else +#else /* CONFIG_MEMORY_FAILURE */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -509,7 +509,7 @@ static inline void num_poisoned_pages_inc(void) static inline void num_poisoned_pages_sub(long i) { } -#endif +#endif /* CONFIG_MEMORY_FAILURE */ static inline int non_swap_entry(swp_entry_t entry) { -- cgit v1.2.3 From 0d206b5d2e0d7d7f09ac9540e3ab3e35a34f536e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:27 -0400 Subject: mm/swap: add swp_offset_pfn() to fetch PFN from swap entry We've got a bunch of special swap entries that stores PFN inside the swap offset fields. To fetch the PFN, normally the user just calls swp_offset() assuming that'll be the PFN. Add a helper swp_offset_pfn() to fetch the PFN instead, fetching only the max possible length of a PFN on the host, meanwhile doing proper check with MAX_PHYSMEM_BITS to make sure the swap offsets can actually store the PFNs properly always using the BUILD_BUG_ON() in is_pfn_swap_entry(). One reason to do so is we never tried to sanitize whether swap offset can really fit for storing PFN. At the meantime, this patch also prepares us with the future possibility to store more information inside the swp offset field, so assuming "swp_offset(entry)" to be the PFN will not stand any more very soon. Replace many of the swp_offset() callers to use swp_offset_pfn() where proper. 
Note that many of the existing users are not candidates for the replacement, e.g.: (1) when the swap entry is not a pfn swap entry at all, or (2) when we want to keep the whole swp_offset but only change the swp type. The latter can happen when fork() triggers on a write-migration swap entry pte: we may want to change only the migration type from write to read but keep the rest, so it's not "fetching the PFN" but "changing the swap type only". Those cases are left aside so that, when there is more information within the swp offset, it will be carried over naturally. While at it, drop hwpoison_entry_to_pfn(), because that's exactly what the new swp_offset_pfn() is about. Link: https://lkml.kernel.org/r/20220811161331.37055-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapops.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 7d1b74046520..578212fbf2be 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -23,6 +23,20 @@ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) +/* + * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries + * can use the extra bits to store other information besides PFN. + */ +#ifdef MAX_PHYSMEM_BITS +#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#else /* MAX_PHYSMEM_BITS */ +#define SWP_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#endif /* MAX_PHYSMEM_BITS */ +#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) + +static inline bool is_pfn_swap_entry(swp_entry_t entry); + /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -64,6 +78,17 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } +/* + * This should only be called upon a pfn swap entry to get the PFN stored + * in the swap entry. Please refers to is_pfn_swap_entry() for definition
+ */ +static inline unsigned long swp_offset_pfn(swp_entry_t entry) +{ + VM_BUG_ON(!is_pfn_swap_entry(entry)); + return swp_offset(entry) & SWP_PFN_MASK; +} + /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { @@ -369,7 +394,7 @@ static inline int pte_none_mostly(pte_t pte) static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { - struct page *p = pfn_to_page(swp_offset(entry)); + struct page *p = pfn_to_page(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the @@ -387,6 +412,9 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { + /* Make sure the swp offset can always store the needed fields */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + return is_migration_entry(entry) || is_device_private_entry(entry) || is_device_exclusive_entry(entry); } @@ -475,11 +503,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) -{ - return swp_offset(entry); -} - static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); -- cgit v1.2.3 From 2e3468778dbe3ec389a10c21a703bb8e5be5cfbc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:29 -0400 Subject: mm: remember young/dirty bit for page migrations When page migration happens, we always ignore the young/dirty bit settings in the old pgtable, mark the page as old in the new page table using either pte_mkold() or pmd_mkold(), and keep the pte clean. That's fine functionally, but it's not friendly to page reclaim, because the page being moved may be actively accessed during the procedure. Not to mention that hardware setting the young bit can bring quite some overhead on some systems, e.g. x86_64 needs a few hundred nanoseconds to set the bit. The same slowdown applies to the dirty bit when the memory is first written after the page migration. Actually we can easily remember the A/D bit configuration and recover the information after the page is migrated. To achieve it, define a new set of bits in the migration swap offset field to cache the A/D bits of the old pte. Then, when removing/recovering the migration entry, we can recover the A/D bits even if the page has changed. One thing to mention is that here we use max_swapfile_size() to detect how many swp offset bits we have, and we only enable this feature if we know the swp offset is big enough to store both the PFN value and the A/D bits. Otherwise the A/D bits are dropped like before. Link: https://lkml.kernel.org/r/20220811161331.37055-6-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapops.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 578212fbf2be..11b874f212a2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -8,6 +8,10 @@ #ifdef CONFIG_MMU +#ifdef CONFIG_SWAP +#include <linux/swapfile.h> +#endif /* CONFIG_SWAP */ + /* * swapcache pages are stored in the swapper_space radix tree.
We want to * get good packing density in that tree, so the index should be dense in @@ -35,6 +39,31 @@ #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) +/** + * Migration swap entry specific bitfield definitions. Layout: + * + * |----------+--------------------| + * | swp_type | swp_offset | + * |----------+--------+-+-+-------| + * | | resv |D|A| PFN | + * |----------+--------+-+-+-------| + * + * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) + * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) + * + * Note: A/D bits will be stored in migration entries iff there're enough + * free bits in arch specific swp offset. By default we'll ignore A/D bits + * when migrating a page. Please refer to migration_entry_supports_ad() + * for more information. If there're more bits besides PFN and A/D bits, + * they should be reserved and always be zeros. + */ +#define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) +#define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) +#define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) + +#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) +#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) + static inline bool is_pfn_swap_entry(swp_entry_t entry); /* Clear all flags but only keep swp_entry_t related information */ @@ -265,6 +294,57 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(SWP_MIGRATION_WRITE, offset); } +/* + * Returns whether the host has large enough swap offset field to support + * carrying over pgtable A/D bits for page migrations. The result is + * pretty much arch specific. + */ +static inline bool migration_entry_supports_ad(void) +{ + /* + * max_swapfile_size() returns the max supported swp-offset plus 1. + * We can support the migration A/D bits iff the pfn swap entry has + * the offset large enough to cover all of them (PFN, A & D bits). 
+ */ +#ifdef CONFIG_SWAP + return max_swapfile_size() >= (1UL << SWP_MIG_TOTAL_BITS); +#else /* CONFIG_SWAP */ + return false; +#endif /* CONFIG_SWAP */ +} + +static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_entry(swp_type(entry), + swp_offset(entry) | SWP_MIG_YOUNG); + return entry; +} + +static inline bool is_migration_entry_young(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_entry(swp_type(entry), + swp_offset(entry) | SWP_MIG_DIRTY); + return entry; +} + +static inline bool is_migration_entry_dirty(swp_entry_t entry) +{ + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, @@ -311,6 +391,25 @@ static inline int is_readable_migration_entry(swp_entry_t entry) return 0; } +static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) +{ + return entry; +} + +static inline bool is_migration_entry_young(swp_entry_t entry) +{ + return false; +} + +static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) +{ + return entry; +} + +static inline bool is_migration_entry_dirty(swp_entry_t entry) +{ + return false; +} #endif /* CONFIG_MIGRATION */ typedef unsigned long pte_marker; -- cgit v1.2.3 From be45a4902c7caa717fee6b2f671e59b396ed395c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:30 -0400 Subject: mm/swap: cache maximum swapfile size when init swap We used to have max_swapfile_size() fetching the maximum swapfile size per-arch. As the callers of max_swapfile_size() grow, this patch introduces a variable "swapfile_maximum_size" to cache the value of the old max_swapfile_size(), so that we don't need to calculate the value every time. Caching the value in swapfile_init() is safe because by the time we reach that phase all the relevant information should have been initialized. Here the major arch to take care of is x86, which defines the max swapfile size based on the L1TF mitigation. Both X86_BUG_L1TF and l1tf_mitigation should have been set up properly when reaching swapfile_init(). As a reference, the code path looks like this for x86: - start_kernel - setup_arch - early_cpu_init - early_identify_cpu --> setup X86_BUG_L1TF - parse_early_param - l1tf_cmdline --> set l1tf_mitigation - check_bugs - l1tf_select_mitigation --> set l1tf_mitigation - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) The swapfile size only depends on the swp pte format on non-x86 archs, so caching it is safe too. While at it, rename max_swapfile_size() to arch_max_swapfile_size(), because an arch can define its own version, so it's more straightforward to have "arch_" as its prefix. At the same time, export swapfile_maximum_size to replace the old usages of max_swapfile_size().
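To make the size check above concrete, here is a user-space sketch of the offset layout it guards: a PFN in the low bits with the young/dirty flags stashed just above it. The widths below are invented for the example (it assumes a 64-bit unsigned long); only the encode/decode pattern matches the patch:

#include <assert.h>
#include <stdio.h>

#define PFN_BITS	40UL			/* made-up width */
#define MIG_YOUNG	(1UL << PFN_BITS)
#define MIG_DIRTY	(1UL << (PFN_BITS + 1))
#define PFN_MASK	(MIG_YOUNG - 1)

static unsigned long mk_young(unsigned long off) { return off | MIG_YOUNG; }
static unsigned long mk_dirty(unsigned long off) { return off | MIG_DIRTY; }
static int is_young(unsigned long off) { return !!(off & MIG_YOUNG); }
static unsigned long off_pfn(unsigned long off) { return off & PFN_MASK; }

int main(void)
{
	unsigned long off = 0x12345UL;	/* pretend PFN */

	/* flags ride along in the offset; the PFN is still recoverable */
	off = mk_dirty(mk_young(off));
	assert(is_young(off));
	printf("pfn=%#lx young=%d\n", off_pfn(off), is_young(off));
	return 0;
}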
[peterx@redhat.com: declare arch_max_swapfile_size() in swapfile.h] Link: https://lkml.kernel.org/r/YxTh1GuC6ro5fKL5@xz-m1.local Link: https://lkml.kernel.org/r/20220811161331.37055-7-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapfile.h | 5 ++++- include/linux/swapops.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 54078542134c..e2d11ae4e73d 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -8,6 +8,9 @@ */ extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); -extern unsigned long max_swapfile_size(void); +unsigned long arch_max_swapfile_size(void); + +/* Maximum swapfile size supported for the arch (not inclusive). */ +extern unsigned long swapfile_maximum_size; #endif /* _LINUX_SWAPFILE_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 11b874f212a2..027b4095e132 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -307,7 +307,7 @@ static inline bool migration_entry_supports_ad(void) * the offset large enough to cover all of them (PFN, A & D bits). */ #ifdef CONFIG_SWAP - return max_swapfile_size() >= (1UL << SWP_MIG_TOTAL_BITS); + return swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS); #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ -- cgit v1.2.3 From 5154e607967d3f587fda84a40abbf900275016c9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:31 -0400 Subject: mm/swap: cache swap migration A/D bits support Introduce a variable swap_migration_ad_supported to cache whether the arch supports swap migration A/D bits. One thing to mention here is that SWP_MIG_TOTAL_BITS will internally reference the other macro MAX_PHYSMEM_BITS, which is a function call on x86 (and a constant on all the rest of the archs). It's safe to reference it in swapfile_init() because by the time we reach it we're already at initcall level 4, so the 5-level pgtable for x86_64 must have been initialized (right after early_identify_cpu() finishes). - start_kernel - setup_arch - early_cpu_init - get_cpu_cap --> fetch from CPUID (including X86_FEATURE_LA57) - early_identify_cpu --> clear X86_FEATURE_LA57 (if early lvl5 not enabled (USE_EARLY_PGTABLE_L5)) - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) This should slightly speed up the migration swap entry handling. Link: https://lkml.kernel.org/r/20220811161331.37055-8-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Huang Ying Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/swapfile.h | 2 ++ include/linux/swapops.h | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e2d11ae4e73d..7ed529a77c5b 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -12,5 +12,7 @@ unsigned long arch_max_swapfile_size(void); /* Maximum swapfile size supported for the arch (not inclusive).
*/ extern unsigned long swapfile_maximum_size; +/* Whether swap migration entry supports storing A/D bits for the arch */ +extern bool swap_migration_ad_supported; #endif /* _LINUX_SWAPFILE_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 027b4095e132..86b95ccb81bb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -301,13 +301,8 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) */ static inline bool migration_entry_supports_ad(void) { - /* - * max_swapfile_size() returns the max supported swp-offset plus 1. - * We can support the migration A/D bits iff the pfn swap entry has - * the offset large enough to cover all of them (PFN, A & D bits). - */ #ifdef CONFIG_SWAP - return swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS); + return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ -- cgit v1.2.3 From aa1cf99b87e934e761b46ce2b925335a398980da Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 15 Aug 2022 07:11:35 +0000 Subject: delayacct: support re-entrance detection of thrashing accounting Once upon a time, we only supported accounting thrashing of the page cache. Then Joonsoo introduced workingset detection for anonymous pages and we gained the ability to account their thrashing too[1]. For page cache thrashing accounting, there is no suitable place to do it at the fs level like swap_readpage(), so we have to do it in folio_wait_bit_common(). Then, for anonymous page thrashing accounting, we have to do it in both swap_readpage() and folio_wait_bit_common(). This is like PSI, so thrashing accounting should support re-entrance detection. This patch prepares for complete thrashing accounting, and is based on the patch "filemap: make the accounting of thrashing more consistent".
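The re-entrance scheme mirrors what the delayacct diff below does with the new bool *in_thrashing parameter: the caller-provided flag records whether we were already inside a thrashing section, so nested start/end pairs only account the outermost one. A user-space sketch of the pattern (the task struct and the accounting calls are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static struct { bool in_thrashing; } current_task;

static void thrashing_start(bool *was_in_thrashing)
{
	*was_in_thrashing = current_task.in_thrashing;
	current_task.in_thrashing = true;
	if (!*was_in_thrashing)
		puts("start accounting");	/* outermost entry only */
}

static void thrashing_end(bool *was_in_thrashing)
{
	if (!*was_in_thrashing) {
		current_task.in_thrashing = false;
		puts("stop accounting");	/* matches the outermost start */
	}
}

int main(void)
{
	bool outer, inner;

	thrashing_start(&outer);	/* e.g. folio_wait_bit_common() */
	thrashing_start(&inner);	/* nested, e.g. swap_readpage() */
	thrashing_end(&inner);		/* no-op: still inside */
	thrashing_end(&outer);
	return 0;
}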
[1] commit aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU") Link: https://lkml.kernel.org/r/20220815071134.74551-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Signed-off-by: CGEL ZTE Reviewed-by: Ran Xiaokai Reviewed-by: wangyong Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 16 ++++++++-------- include/linux/sched.h | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 58aea2d7385c..0da97dba9ef8 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -73,8 +73,8 @@ extern int delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); -extern void __delayacct_thrashing_start(void); -extern void __delayacct_thrashing_end(void); +extern void __delayacct_thrashing_start(bool *in_thrashing); +extern void __delayacct_thrashing_end(bool *in_thrashing); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); extern void __delayacct_compact_start(void); @@ -143,22 +143,22 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); } -static inline void delayacct_thrashing_start(void) +static inline void delayacct_thrashing_start(bool *in_thrashing) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) - __delayacct_thrashing_start(); + __delayacct_thrashing_start(in_thrashing); } -static inline void delayacct_thrashing_end(void) +static inline void delayacct_thrashing_end(bool *in_thrashing) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) - __delayacct_thrashing_end(); + __delayacct_thrashing_end(in_thrashing); } static inline void delayacct_swapin_start(void) @@ -237,9 +237,9 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} -static inline void delayacct_thrashing_start(void) +static inline void delayacct_thrashing_start(bool *in_thrashing) {} -static inline void delayacct_thrashing_end(void) +static inline void delayacct_thrashing_end(bool *in_thrashing) {} static inline void delayacct_swapin_start(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..d9a2466664f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -944,6 +944,10 @@ struct task_struct { #ifdef CONFIG_CPU_SUP_INTEL unsigned reported_split_lock:1; #endif +#ifdef CONFIG_TASK_DELAY_ACCT + /* delay due to memory thrashing */ + unsigned in_thrashing:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ -- cgit v1.2.3 From e1fd09e3d1dd4a1a8b3b33bc1fd647eee9f4e475 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 01:59:58 -0600 Subject: mm: x86, arm64: add arch_has_hw_pte_young() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Multi-Gen LRU Framework", v14. What's new ========== 1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS, Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15. 2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs) machines. The old direct reclaim backoff, which tries to enforce a minimum fairness among all eligible memcgs, over-swapped by about (total_mem>>DEF_PRIORITY)-nr_to_reclaim. 
The new backoff, which pulls the plug on swapping once the target is met, trades some fairness for curtailed latency: https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/ 3. Fixed minior build warnings and conflicts. More comments and nits. TLDR ==== The current page reclaim is too expensive in terms of CPU usage and it often makes poor choices about what to evict. This patchset offers an alternative solution that is performant, versatile and straightforward. Patchset overview ================= The design and implementation overview is in patch 14: https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/ 01. mm: x86, arm64: add arch_has_hw_pte_young() 02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG Take advantage of hardware features when trying to clear the accessed bit in many PTEs. 03. mm/vmscan.c: refactor shrink_node() 04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into its sole caller" Minor refactors to improve readability for the following patches. 05. mm: multi-gen LRU: groundwork Adds the basic data structure and the functions that insert pages to and remove pages from the multi-gen LRU (MGLRU) lists. 06. mm: multi-gen LRU: minimal implementation A minimal implementation without optimizations. 07. mm: multi-gen LRU: exploit locality in rmap Exploits spatial locality to improve efficiency when using the rmap. 08. mm: multi-gen LRU: support page table walks Further exploits spatial locality by optionally scanning page tables. 09. mm: multi-gen LRU: optimize multiple memcgs Optimizes the overall performance for multiple memcgs running mixed types of workloads. 10. mm: multi-gen LRU: kill switch Adds a kill switch to enable or disable MGLRU at runtime. 11. mm: multi-gen LRU: thrashing prevention 12. mm: multi-gen LRU: debugfs interface Provide userspace with features like thrashing prevention, working set estimation and proactive reclaim. 13. mm: multi-gen LRU: admin guide 14. mm: multi-gen LRU: design doc Add an admin guide and a design doc. Benchmark results ================= Independent lab results ----------------------- Based on the popularity of searches [01] and the memory usage in Google's public cloud, the most popular open-source memory-hungry applications, in alphabetical order, are: Apache Cassandra Memcached Apache Hadoop MongoDB Apache Spark PostgreSQL MariaDB (MySQL) Redis An independent lab evaluated MGLRU with the most widely used benchmark suites for the above applications. They posted 960 data points along with kernel metrics and perf profiles collected over more than 500 hours of total benchmark time. Their final reports show that, with 95% confidence intervals (CIs), the above applications all performed significantly better for at least part of their benchmark matrices. On 5.14: 1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]% less wall time to sort three billion random integers, respectively, under the medium- and the high-concurrency conditions, when overcommitting memory. There were no statistically significant changes in wall time for the rest of the benchmark matrix. 2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]% more transactions per minute (TPM), respectively, under the medium- and the high-concurrency conditions, when overcommitting memory. There were no statistically significant changes in TPM for the rest of the benchmark matrix. 3. 
Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]% and [21.59, 30.02]% more operations per second (OPS), respectively, for sequential access, random access and Gaussian (distribution) access, when THP=always; 95% CIs [13.85, 15.97]% and [23.94, 29.92]% more OPS, respectively, for random access and Gaussian access, when THP=never. There were no statistically significant changes in OPS for the rest of the benchmark matrix. 4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and [2.16, 3.55]% more operations per second (OPS), respectively, for exponential (distribution) access, random access and Zipfian (distribution) access, when underutilizing memory; 95% CIs [8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS, respectively, for exponential access, random access and Zipfian access, when overcommitting memory. On 5.15: 5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]% and [4.11, 7.50]% more operations per second (OPS), respectively, for exponential (distribution) access, random access and Zipfian (distribution) access, when swap was off; 95% CIs [0.50, 2.60]%, [6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for exponential access, random access and Zipfian access, when swap was on. 6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]% less average wall time to finish twelve parallel TeraSort jobs, respectively, under the medium- and the high-concurrency conditions, when swap was on. There were no statistically significant changes in average wall time for the rest of the benchmark matrix. 7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per minute (TPM) under the high-concurrency condition, when swap was off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM, respectively, under the medium- and the high-concurrency conditions, when swap was on. There were no statistically significant changes in TPM for the rest of the benchmark matrix. 8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and [11.47, 19.36]% more total operations per second (OPS), respectively, for sequential access, random access and Gaussian (distribution) access, when THP=always; 95% CIs [1.27, 3.54]%, [10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively, for sequential access, random access and Gaussian access, when THP=never. Our lab results --------------- To supplement the above results, we ran the following benchmark suites on 5.16-rc7 and found no regressions [10]. fs_fio_bench_hdd_mq pft fs_lmbench pgsql-hammerdb fs_parallelio redis fs_postmark stream hackbench sysbenchthread kernbench tpcc_spark memcached unixbench multichase vm-scalability mutilate will-it-scale nginx [01] https://trends.google.com [02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/ [03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/ [04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/ [05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/ [06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/ [07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/ [08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/ [09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/ [10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/ Read-world applications ======================= Third-party testimonials ------------------------ Konstantin reported [11]: I have Archlinux with 8G RAM + zswap + swap. 
While developing, I have lots of apps opened such as multiple LSP-servers for different langs, chats, two browsers, etc... Usually, my system gets quickly to a point of SWAP-storms, where I have to kill LSP-servers, restart browsers to free memory, etc, otherwise the system lags heavily and is barely usable. 1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU patchset, and I started up by opening lots of apps to create memory pressure, and worked for a day like this. Till now I had not a single SWAP-storm, and mind you I got 3.4G in SWAP. I was never getting to the point of 3G in SWAP before without a single SWAP-storm. Vaibhav from IBM reported [12]: In a synthetic MongoDB Benchmark, seeing an average of ~19% throughput improvement on POWER10(Radix MMU + 64K Page Size) with MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across three different request distributions, namely, Exponential, Uniform and Zipfan. Shuang from U of Rochester reported [13]: With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]% and [9.26, 10.36]% higher throughput, respectively, for random access, Zipfian (distribution) access and Gaussian (distribution) access, when the average number of jobs per CPU is 1; 95% CIs [42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher throughput, respectively, for random access, Zipfian access and Gaussian access, when the average number of jobs per CPU is 2. Daniel from Michigan Tech reported [14]: With Memcached allocating ~100GB of byte-addressable Optante, performance improvement in terms of throughput (measured as queries per second) was about 10% for a series of workloads. Large-scale deployments ----------------------- We've rolled out MGLRU to tens of millions of ChromeOS users and about a million Android users. Google's fleetwide profiling [15] shows an overall 40% decrease in kswapd CPU usage, in addition to improvements in other UX metrics, e.g., an 85% decrease in the number of low-memory kills at the 75th percentile and an 18% decrease in app launch time at the 50th percentile. The downstream kernels that have been using MGLRU include: 1. Android [16] 2. Arch Linux Zen [17] 3. Armbian [18] 4. ChromeOS [19] 5. Liquorix [20] 6. OpenWrt [21] 7. post-factum [22] 8. XanMod [23] [11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/ [12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/ [13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/ [14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/ [15] https://dl.acm.org/doi/10.1145/2749469.2750392 [16] https://android.com [17] https://archlinux.org [18] https://armbian.com [19] https://chromium.org [20] https://liquorix.net [21] https://openwrt.org [22] https://codeberg.org/pf-kernel [23] https://xanmod.org Summary ======= The facts are: 1. The independent lab results and the real-world applications indicate substantial improvements; there are no known regressions. 2. Thrashing prevention, working set estimation and proactive reclaim work out of the box; there are no equivalent solutions. 3. There is a lot of new code; no smaller changes have been demonstrated similar effects. Our options, accordingly, are: 1. Given the amount of evidence, the reported improvements will likely materialize for a wide range of workloads. 2. Gauging the interest from the past discussions, the new features will likely be put to use for both personal computers and data centers. 3. 
Based on Google's track record, the new code will likely be well maintained in the long term. It'd be more difficult if not impossible to achieve similar effects with other approaches. This patch (of 14): Some architectures automatically set the accessed bit in PTEs, e.g., x86 and arm64 v8.2. On architectures that do not have this capability, clearing the accessed bit in a PTE usually triggers a page fault following the TLB miss of this PTE (to emulate the accessed bit). Being aware of this capability can help make better decisions, e.g., whether to spread the work out over a period of time to reduce bursty page faults when trying to clear the accessed bit in many PTEs. Note that theoretically this capability can be unreliable, e.g., hotplugged CPUs might be different from builtin ones. Therefore it should not be used in architecture-independent code that involves correctness, e.g., to determine whether TLB flushes are required (in combination with the accessed bit). Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Acked-by: Will Deacon Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: linux-arm-kernel@lists.infradead.org Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Miaohe Lin Cc: Mike Rapoport Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d13b4f7cc5be..375e8e7e64f4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef arch_has_hw_pte_young +/* + * Return whether the accessed bit is supported on the local CPU. + * + * This stub assumes accessing through an old PTE triggers a page fault. + * Architectures that automatically set the access bit should overwrite it. + */ +static inline bool arch_has_hw_pte_young(void) +{ + return false; +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From eed9a328aa1ae6ac1edaa026957e6882f57de0dd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 01:59:59 -0600 Subject: mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures support the accessed bit in non-leaf PMD entries, e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it as part of linear address translation [1]. Page table walkers that clear the accessed bit may use this capability to reduce their search space. Note that: 1. Although an inline function is preferable, this capability is added as a configuration option for consistency with the existing macros. 2. 
Due to the little interest in other varieties, this capability was only tested on Intel and AMD CPUs. Thanks to the following developers for their efforts [2][3]. Randy Dunlap Stephen Rothwell [1]: Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3 (June 2021), section 4.8 [2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/ [3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/ Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 375e8e7e64f4..a108b60a6962 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, BUILD_BUG(); return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -- cgit v1.2.3 From aa1b67903a19e026d1749241fad177f6185c2d42 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:01 -0600 Subject: Revert "include/linux/mm_inline.h: fold __update_lru_size() into its sole caller" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch undoes the following refactor: commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller") The upcoming changes to include/linux/mm_inline.h will reuse __update_lru_size(). 
Link: https://lkml.kernel.org/r/20220918080010.2920238-5-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Miaohe Lin Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7b25b53c474a..fb8aadb81cd6 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page) return folio_is_file_lru(page_folio(page)); } -static __always_inline void update_lru_size(struct lruvec *lruvec, +static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { @@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, enum zone_type zid, + long nr_pages) +{ + __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif -- cgit v1.2.3 From ec1c86b25f4bdd9dce6436c0539d2a6ae676e1c4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:02 -0600 Subject: mm: multi-gen LRU: groundwork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Evictable pages are divided into multiple generations for each lruvec. The youngest generation number is stored in lrugen->max_seq for both anon and file types as they are aged on an equal footing. The oldest generation numbers are stored in lrugen->min_seq[] separately for anon and file types as clean file pages can be evicted regardless of swap constraints. These three variables are monotonically increasing. Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into the gen counter in folio->flags. Each truncated generation number is an index to lrugen->lists[]. The sliding window technique is used to track at least MIN_NR_GENS and at most MAX_NR_GENS generations. The gen counter stores a value within [1, MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it stores 0. There are two conceptually independent procedures: "the aging", which produces young generations, and "the eviction", which consumes old generations. They form a closed-loop system, i.e., "the page reclaim". Both procedures can be invoked from userspace for the purposes of working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2]. 
To avoid confusion, the terms "hot" and "cold" will be applied to the multi-gen LRU, as a new convention; the terms "active" and "inactive" will be applied to the active/inactive LRU, as usual. The protection of hot pages and the selection of cold pages are based on page access channels and patterns. There are two access channels: one through page tables and the other through file descriptors. The protection of the former channel is by design stronger because: 1. The uncertainty in determining the access patterns of the former channel is higher due to the approximation of the accessed bit. 2. The cost of evicting the former channel is higher due to the TLB flushes required and the likelihood of encountering the dirty bit. 3. The penalty of underprotecting the former channel is higher because applications usually do not prepare themselves for major page faults like they do for blocked I/O. E.g., GUI applications commonly use dedicated I/O threads to avoid blocking rendering threads. There are also two access patterns: one with temporal locality and the other without. For the reasons listed above, the former channel is assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is present; the latter channel is assumed to follow the latter pattern unless outlying refaults have been observed [3][4]. The next patch will address the "outlying refaults". Three macros, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in this patch to make the entire patchset less diffy. A page is added to the youngest generation on faulting. The aging needs to check the accessed bit at least twice before handing this page over to the eviction. The first check takes care of the accessed bit set on the initial fault; the second check makes sure this page has not been used since then. This protocol, AKA second chance, requires a minimum of two generations, hence MIN_NR_GENS. 
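A toy model of the second-chance protocol just described (names are illustrative; the kernel's aging lives in mm/vmscan.c and is not part of this header-only diff):

  #include <stdbool.h>

  struct toy_page {
          bool accessed;		/* models the hardware accessed bit */
          unsigned long seq;	/* the generation the page lives in */
  };

  /* one aging pass over a page */
  static void age_page(struct toy_page *p, unsigned long max_seq)
  {
          if (p->accessed) {
                  p->accessed = false;	/* clear, so a later pass can re-test */
                  p->seq = max_seq;	/* hand the page a fresh generation */
          }
          /* else: seq stays put and the page drifts toward min_seq */
  }

Two consecutive passes must find the page idle before it reaches the oldest generation, which is exactly why MIN_NR_GENS is 2.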
[1] https://dl.acm.org/doi/10.1145/3297858.3304053 [2] https://dl.acm.org/doi/10.1145/3503222.3507731 [3] https://lwn.net/Articles/495543/ [4] https://lwn.net/Articles/815342/ Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 102 ++++++++++++++++++++++ include/linux/page-flags-layout.h | 13 +-- include/linux/page-flags.h | 4 +- include/linux/sched.h | 4 + 5 files changed, 291 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index fb8aadb81cd6..2ff703900fd0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); + lockdep_assert_held(&lruvec->lru_lock); + WARN_ON_ONCE(nr_pages != (int)nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); @@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) return lru; } +#ifdef CONFIG_LRU_GEN + +static inline bool lru_gen_enabled(void) +{ + return true; +} + +static inline bool lru_gen_in_fault(void) +{ + return current->in_lru_fault; +} + +static inline int lru_gen_from_seq(unsigned long seq) +{ + return seq % MAX_NR_GENS; +} + +static inline int folio_lru_gen(struct folio *folio) +{ + unsigned long flags = READ_ONCE(folio->flags); + + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +{ + unsigned long max_seq = lruvec->lrugen.max_seq; + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + + /* see the comment on MIN_NR_GENS */ + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); +} + +static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, + int old_gen, int new_gen) +{ + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + enum lru_list lru = type * LRU_INACTIVE_FILE; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); + + if (old_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], + lrugen->nr_pages[old_gen][type][zone] - delta); + if (new_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], + lrugen->nr_pages[new_gen][type][zone] + delta); + + /* addition */ + if (old_gen < 0) { + if (lru_gen_is_active(lruvec, new_gen)) + lru += LRU_ACTIVE; + 
__update_lru_size(lruvec, lru, zone, delta); + return; + } + + /* deletion */ + if (new_gen < 0) { + if (lru_gen_is_active(lruvec, old_gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, -delta); + return; + } +} + +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + unsigned long seq; + unsigned long flags; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); + + if (folio_test_unevictable(folio)) + return false; + /* + * There are three common cases for this page: + * 1. If it's hot, e.g., freshly faulted in or previously hot and + * migrated, add it to the youngest generation. + * 2. If it's cold but can't be evicted immediately, i.e., an anon page + * not in swapcache or a dirty page pending writeback, add it to the + * second oldest generation. + * 3. Everything else (clean, cold) is added to the oldest generation. + */ + if (folio_test_active(folio)) + seq = lrugen->max_seq; + else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + seq = lrugen->min_seq[type] + 1; + else + seq = lrugen->min_seq[type]; + + gen = lru_gen_from_seq(seq); + flags = (gen + 1UL) << LRU_GEN_PGOFF; + /* see the comment on MIN_NR_GENS about PG_active */ + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + + lru_gen_update_size(lruvec, folio, -1, gen); + /* for folio_rotate_reclaimable() */ + if (reclaiming) + list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + else + list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + + return true; +} + +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + unsigned long flags; + int gen = folio_lru_gen(folio); + + if (gen < 0) + return false; + + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + + /* for folio_migrate_flags() */ + flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? 
BIT(PG_active) : 0; + flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); + gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + lru_gen_update_size(lruvec, folio, gen, -1); + list_del(&folio->lru); + + return true; +} + +#else /* !CONFIG_LRU_GEN */ + +static inline bool lru_gen_enabled(void) +{ + return false; +} + +static inline bool lru_gen_in_fault(void) +{ + return false; +} + +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + return false; +} + +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_add_folio(lruvec, folio, false)) + return; + update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) @@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_add_folio(lruvec, folio, true)) + return; + update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ @@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_del_folio(lruvec, folio, false)) + return; + if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 18cf0fc5ce67..6f4ea078d90f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -317,6 +317,102 @@ enum lruvec_flags { */ }; +#endif /* !__GENERATING_BOUNDS_H */ + +/* + * Evictable pages are divided into multiple generations. The youngest and the + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the + * corresponding generation. The gen counter in folio->flags stores gen+1 while + * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the + * eviction. The first check takes care of the accessed bit set on the initial + * fault; the second check makes sure this page hasn't been used since then. + * This process, AKA second chance, requires a minimum of two generations, + * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive + * LRU, e.g., /proc/vmstat, these two generations are considered active; the + * rest of generations, if they exist, are considered inactive. See + * lru_gen_is_active(). + * + * PG_active is always cleared while a page is on one of lrugen->lists[] so that + * the aging needs not to worry about it. And it's set again when a page + * considered active is isolated for non-reclaiming purposes, e.g., migration. + * See lru_gen_add_folio() and lru_gen_del_folio(). + * + * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the + * number of categories of the active/inactive LRU when keeping track of + * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits + * in folio->flags. 
+ */ +#define MIN_NR_GENS 2U +#define MAX_NR_GENS 4U + +#ifndef __GENERATING_BOUNDS_H + +struct lruvec; + +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) + +#ifdef CONFIG_LRU_GEN + +enum { + LRU_GEN_ANON, + LRU_GEN_FILE, +}; + +/* + * The youngest generation number is stored in max_seq for both anon and file + * types as they are aged on an equal footing. The oldest generation numbers are + * stored in min_seq[] separately for anon and file types as clean file pages + * can be evicted regardless of swap constraints. + * + * Normally anon and file min_seq are in sync. But if swapping is constrained, + * e.g., out of swap space, file min_seq is allowed to advance and leave anon + * min_seq behind. + * + * The number of pages in each generation is eventually consistent and therefore + * can be transiently negative. + */ +struct lru_gen_struct { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; + /* the multi-gen LRU lists, lazily sorted on eviction */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; +}; + +void lru_gen_init_lruvec(struct lruvec *lruvec); + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg); +void lru_gen_exit_memcg(struct mem_cgroup *memcg); +#endif + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_init_lruvec(struct lruvec *lruvec) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ +} +#endif + +#endif /* CONFIG_LRU_GEN */ + struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ @@ -334,6 +430,10 @@ struct lruvec { unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ + struct lru_gen_struct lrugen; +#endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -749,6 +849,8 @@ static inline bool zone_is_empty(struct zone *zone) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. 
For non-existent diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index ef1e3e736e14..240905407a18 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -55,7 +55,8 @@ #define SECTIONS_WIDTH 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT #elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" @@ -89,8 +90,8 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ - <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 @@ -100,10 +101,12 @@ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS #error "Not enough bits in page flags" #endif +#define LRU_REFS_WIDTH 0 + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 465ff35a8c00..0b0ae5084e60 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED) + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. @@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (PAGEFLAGS_MASK & ~__PG_HWPOISON) + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/linux/sched.h b/include/linux/sched.h index d9a2466664f7..a2dcfb91df03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -914,6 +914,10 @@ struct task_struct { #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif -- cgit v1.2.3 From ac35a490237446b71e3b4b782b1596967edd0aa8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:03 -0600 Subject: mm: multi-gen LRU: minimal implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid confusion, the terms "promotion" and "demotion" will be applied to the multi-gen LRU, as a new convention; the terms "activation" and "deactivation" will be applied to the active/inactive LRU, as usual. The aging produces young generations. Given an lruvec, it increments max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. 
The aging promotes hot pages to the youngest generation when it finds them accessed through page tables; the demotion of cold pages happens consequently when it increments max_seq. Promotion in the aging path does not involve any LRU list operations, only the updates of the gen counter and lrugen->nr_pages[]; demotion, unless as the result of the increment of max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The aging has the complexity O(nr_hot_pages), since it is only interested in hot pages. The eviction consumes old generations. Given an lruvec, it increments min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty. A feedback loop modeled after the PID controller monitors refaults over anon and file types and decides which type to evict when both types are available from the same generation. The protection of pages accessed multiple times through file descriptors takes place in the eviction path. Each generation is divided into multiple tiers. A page accessed N times through file descriptors is in tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only bits in folio->flags. The aforementioned feedback loop also monitors refaults over all tiers and decides when to protect pages in which tiers (N>1), using the first tier (N=0,1) as a baseline. The first tier contains single-use unmapped clean pages, which are most likely the best choices. In contrast to promotion in the aging path, the protection of a page in the eviction path is achieved by moving this page to the next generation, i.e., min_seq+1, if the feedback loop decides so. This approach has the following advantages: 1. It removes the cost of activation in the buffered access path by inferring whether pages accessed multiple times through file descriptors are statistically hot and thus worth protecting in the eviction path. 2. It takes pages accessed through page tables into account and avoids overprotecting pages accessed multiple times through file descriptors. (Pages accessed through page tables are in the first tier, since N=0.) 3. More tiers provide better protection for pages accessed more than twice through file descriptors, when under heavy buffered I/O workloads. Server benchmark results: Single workload: fio (buffered I/O): +[30, 32]% IOPS BW 5.19-rc1: 2673k 10.2GiB/s patch1-6: 3491k 13.3GiB/s Single workload: memcached (anon): -[4, 6]% Ops/sec KB/sec 5.19-rc1: 1161501.04 45177.25 patch1-6: 1106168.46 43025.04 Configurations: CPU: two Xeon 6154 Mem: total 256G Node 1 was only used as a ram disk to reduce the variance in the results. 
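An aside before the reproduction scripts: a worked example of the tier mapping described above, mirroring the lru_tier_from_refs()/folio_lru_refs() helpers added in the diff further down (the table itself is illustrative):

  /*
   * refs is the number of accesses beyond PG_referenced, i.e., N - 1
   * for N > 1 total accesses through file descriptors:
   *
   *   N (total accesses)   refs     tier = order_base_2(refs + 1)
   *   0 or 1               0        0
   *   2                    1        1
   *   3 or 4               2 or 3   2
   *   5 to 8               4 to 7   3
   *
   * With MAX_NR_TIERS == 4, tier 3 is the last; how far refs can count
   * is bounded by the LRU_REFS_WIDTH bits available in folio->flags.
   */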
patch drivers/block/brd.c < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE; > page = alloc_pages_node(1, gfp_flags, 0); EOF
cat >>/etc/systemd/system.conf <>/etc/memcached.conf </sys/fs/cgroup/user.slice/test/memory.max
echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
    --buffered=1 --ioengine=io_uring --iodepth=128 \
    --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
    --rw=randread --random_distribution=random --norandommap \
    --time_based --ramp_time=10m --runtime=5m --group_reporting
cat memcached.sh
modprobe brd rd_nr=1 rd_size=113246208
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0
memtier_benchmark -S /var/run/memcached/memcached.sock \
    -P memcache_binary -n allkeys --key-minimum=1 \
    --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
    --ratio 1:0 --pipeline 8 -d 2000
memtier_benchmark -S /var/run/memcached/memcached.sock \
    -P memcache_binary -n allkeys --key-minimum=1 \
    --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
    --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
Client benchmark results:
kswapd profiles:
  5.19-rc1
    40.33%  page_vma_mapped_walk (overhead)
    21.80%  lzo1x_1_do_compress (real work)
     7.53%  do_raw_spin_lock
     3.95%  _raw_spin_unlock_irq
     2.52%  vma_interval_tree_iter_next
     2.37%  folio_referenced_one
     2.28%  vma_interval_tree_subtree_search
     1.97%  anon_vma_interval_tree_iter_first
     1.60%  ptep_clear_flush
     1.06%  __zram_bvec_write
  patch1-6
    39.03%  lzo1x_1_do_compress (real work)
    18.47%  page_vma_mapped_walk (overhead)
     6.74%  _raw_spin_unlock_irq
     3.97%  do_raw_spin_lock
     2.49%  ptep_clear_flush
     2.48%  anon_vma_interval_tree_iter_first
     1.92%  folio_referenced_one
     1.88%  __zram_bvec_write
     1.48%  memmove
     1.31%  vma_interval_tree_iter_next
Configurations: CPU: single Snapdragon 7c Mem: total 4G ChromeOS MemoryPressure [1] [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/ Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 36 ++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 41 +++++++++++++++++++++++++++++++++++++++ include/linux/page-flags-layout.h | 5 ++++- 3 files changed, 81 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2ff703900fd0..f2b2296a42f9 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -121,6 +121,33 @@ static inline int lru_gen_from_seq(unsigned long seq) return seq % MAX_NR_GENS; } +static inline int lru_hist_from_seq(unsigned long seq) +{ + return seq % NR_HIST_GENS; +} + +static inline int lru_tier_from_refs(int refs) +{ + VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); + + /* see the comment in folio_lru_refs() */ + return order_base_2(refs + 1); +} + 
+static inline int folio_lru_refs(struct folio *folio) +{ + unsigned long flags = READ_ONCE(folio->flags); + bool workingset = flags & BIT(PG_workingset); + + /* + * Return the number of accesses beyond PG_referenced, i.e., N-1 if the + * total number of accesses is N>1, since N=0,1 both map to the first + * tier. lru_tier_from_refs() will account for this off-by-one. Also see + * the comment on MAX_NR_TIERS. + */ + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; +} + static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); @@ -173,6 +200,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli __update_lru_size(lruvec, lru, zone, -delta); return; } + + /* promotion */ + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { + __update_lru_size(lruvec, lru, zone, -delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); + } + + /* demotion requires isolation, e.g., lru_deactivate_fn() */ + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6f4ea078d90f..7e343420bfb1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -350,6 +350,28 @@ enum lruvec_flags { #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U +/* + * Each generation is divided into multiple tiers. A page accessed N times + * through file descriptors is in tier order_base_2(N). A page in the first tier + * (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A page in any other tier (N>1) is marked by + * PG_referenced and PG_workingset. This implies a minimum of two tiers is + * supported without using additional bits in folio->flags. + * + * In contrast to moving across generations which requires the LRU lock, moving + * across tiers only involves atomic operations on folio->flags and therefore + * has a negligible cost in the buffered access path. In the eviction path, + * comparisons of refaulted/(evicted+protected) from the first tier and the + * rest infer whether pages accessed multiple times through file descriptors + * are statistically hot and thus worth protecting. + * + * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the + * number of categories of the active/inactive LRU when keeping track of + * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in + * folio->flags. + */ +#define MAX_NR_TIERS 4U + #ifndef __GENERATING_BOUNDS_H struct lruvec; @@ -364,6 +386,16 @@ enum { LRU_GEN_FILE, }; +#define MIN_LRU_BATCH BITS_PER_LONG +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) + +/* whether to keep historical stats from evicted generations */ +#ifdef CONFIG_LRU_GEN_STATS +#define NR_HIST_GENS MAX_NR_GENS +#else +#define NR_HIST_GENS 1U +#endif + /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. 
The oldest generation numbers are @@ -386,6 +418,15 @@ struct lru_gen_struct { struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; + /* the exponential moving average of evicted+protected */ + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; + /* the first tier doesn't need protection, hence the minus one */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can be modified without holding the LRU lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; }; void lru_gen_init_lruvec(struct lruvec *lruvec); diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 240905407a18..7d79818dc065 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -106,7 +106,10 @@ #error "Not enough bits in page flags" #endif -#define LRU_REFS_WIDTH 0 +/* see the comment on MAX_NR_TIERS */ +#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ + ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ + NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -- cgit v1.2.3 From 018ee47f14893d500131dfca2ff9f3ff8ebd4ed2 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:04 -0600 Subject: mm: multi-gen LRU: exploit locality in rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Searching the rmap for PTEs mapping each page on an LRU list (to test and clear the accessed bit) can be expensive because pages from different VMAs (PA space) are not cache friendly to the rmap (VA space). For workloads mostly using mapped pages, searching the rmap can incur the highest CPU cost in the reclaim path. This patch exploits spatial locality to reduce the trips into the rmap. When shrink_page_list() walks the rmap and finds a young PTE, a new function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent PTEs. On finding another young PTE, it clears the accessed bit and updates the gen counter of the page mapped by this PTE to (max_seq%MAX_NR_GENS)+1. 
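The idea can be sketched as a simplified stand-alone model (hypothetical names; the real lru_gen_look_around() added by this patch lives in mm/vmscan.c and operates on actual PTEs):

  #include <stdbool.h>

  #define MAX_NR_GENS	4U
  #define BITS_PER_LONG	64	/* assumed 64-bit for this sketch */

  struct pte_slot {
          bool young;		/* models the PTE accessed bit */
          unsigned long gen;	/* gen counter of the mapped page */
  };

  /*
   * On a hit at index 'hit', visit up to BITS_PER_LONG - 1 neighboring
   * slots in the same table; every young neighbor is aged on the spot,
   * saving a later trip into the rmap for its page.
   */
  static void look_around(struct pte_slot *table, int nr, int hit,
                          unsigned long max_seq)
  {
          int i, scanned = 0;

          for (i = 0; i < nr && scanned < BITS_PER_LONG - 1; i++) {
                  if (i == hit)
                          continue;
                  scanned++;
                  if (!table[i].young)
                          continue;
                  table[i].young = false;
                  /* the stored gen counter is (max_seq % MAX_NR_GENS) + 1 */
                  table[i].gen = (max_seq % MAX_NR_GENS) + 1;
          }
  }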
Server benchmark results: Single workload: fio (buffered I/O): no change Single workload: memcached (anon): +[3, 5]% Ops/sec KB/sec patch1-6: 1106168.46 43025.04 patch1-7: 1147696.57 44640.29 Configurations: no change Client benchmark results: kswapd profiles: patch1-6 39.03% lzo1x_1_do_compress (real work) 18.47% page_vma_mapped_walk (overhead) 6.74% _raw_spin_unlock_irq 3.97% do_raw_spin_lock 2.49% ptep_clear_flush 2.48% anon_vma_interval_tree_iter_first 1.92% folio_referenced_one 1.88% __zram_bvec_write 1.48% memmove 1.31% vma_interval_tree_iter_next patch1-7 48.16% lzo1x_1_do_compress (real work) 8.20% page_vma_mapped_walk (overhead) 7.06% _raw_spin_unlock_irq 2.92% ptep_clear_flush 2.53% __zram_bvec_write 2.11% do_raw_spin_lock 2.02% memmove 1.93% lru_gen_look_around 1.56% free_unref_page_list 1.40% memset Configurations: no change Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 31 +++++++++++++++++++++++++++++++ include/linux/mm.h | 5 +++++ include/linux/mmzone.h | 6 ++++++ 3 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a2461f9a8738..9b8ab121d948 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -445,6 +445,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. @@ -506,6 +507,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. 
@@ -960,6 +962,23 @@ void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); +/* try to stablize folio_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) @@ -1434,6 +1453,18 @@ static inline void folio_memcg_unlock(struct folio *folio) { } +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match folio_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + static inline void mem_cgroup_handle_over_high(void) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a5ad9d050bf..7cc9ffc19e7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,11 @@ static inline unsigned long folio_pfn(struct folio *folio) return page_to_pfn(&folio->page); } +static inline struct folio *pfn_folio(unsigned long pfn) +{ + return page_folio(pfn_to_page(pfn)); +} + static inline atomic_t *folio_pincount_ptr(struct folio *folio) { return &folio_page(folio, 1)->compound_pincount; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e343420bfb1..9ef5aa37c60c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -375,6 +375,7 @@ enum lruvec_flags { #ifndef __GENERATING_BOUNDS_H struct lruvec; +struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) @@ -430,6 +431,7 @@ struct lru_gen_struct { }; void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg); @@ -442,6 +444,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ +} + #ifdef CONFIG_MEMCG static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { -- cgit v1.2.3 From bd74fdaea146029e4fa12c6de89adbe0779348a9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:05 -0600 Subject: mm: multi-gen LRU: support page table walks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To further exploit spatial locality, the aging prefers to walk page tables to search for young PTEs and promote hot pages. A kill switch will be added in the next patch to disable this behavior. When disabled, the aging relies on the rmap only. NB: this behavior has nothing similar with the page table scanning in the 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages to swapcache and unmaps them. To avoid confusion, the term "iteration" specifically means the traversal of an entire mm_struct list; the term "walk" will be applied to page tables and the rmap, as usual. An mm_struct list is maintained for each memcg, and an mm_struct follows its owner task to the new memcg when this task is migrated. 
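As the next paragraph explains, concurrent page table walkers each claim a distinct mm_struct from this list. A toy model of that hand-off (illustrative only; the kernel's lru_gen_mm_list/lru_gen_mm_state, declared in the diff below, also track per-iteration head and tail):

  #include <pthread.h>
  #include <stddef.h>

  struct toy_mm { struct toy_mm *next; };

  struct toy_mm_list {
          struct toy_mm *head;	/* where the current iteration continues */
          pthread_mutex_t lock;	/* protects head */
  };

  /* each walker claims a unique mm; NULL means the iteration is done */
  static struct toy_mm *claim_next(struct toy_mm_list *l)
  {
          struct toy_mm *mm;

          pthread_mutex_lock(&l->lock);
          mm = l->head;
          if (mm)
                  l->head = mm->next;
          pthread_mutex_unlock(&l->lock);
          return mm;
  }

Because claiming is serialized but processing is not, the walkers proceed concurrently on different mm_structs.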
Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls walk_page_range() with each mm_struct on this list to promote hot pages before it increments max_seq. When multiple page table walkers iterate the same list, each of them gets a unique mm_struct; therefore they can run concurrently. Page table walkers ignore any misplaced pages, e.g., if an mm_struct was migrated, pages it left in the previous memcg will not be promoted when its current memcg is under reclaim. Similarly, page table walkers will not promote pages from nodes other than the one under reclaim. This patch uses the following optimizations when walking page tables: 1. It tracks the usage of mm_struct's between context switches so that page table walkers can skip processes that have been sleeping since the last iteration. 2. It uses generational Bloom filters to record populated branches so that page table walkers can reduce their search space based on the query results, e.g., to skip page tables containing mostly holes or misplaced pages. 3. It takes advantage of the accessed bit in non-leaf PMD entries when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. 4. It does not zigzag between a PGD table and the same PMD table spanning multiple VMAs. IOW, it finishes all the VMAs within the range of the same PMD table before it returns to a PGD table. This improves the cache performance for workloads that have large numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5. Server benchmark results: Single workload: fio (buffered I/O): no change Single workload: memcached (anon): +[8, 10]% Ops/sec KB/sec patch1-7: 1147696.57 44640.29 patch1-8: 1245274.91 48435.66 Configurations: no change Client benchmark results: kswapd profiles: patch1-7 48.16% lzo1x_1_do_compress (real work) 8.20% page_vma_mapped_walk (overhead) 7.06% _raw_spin_unlock_irq 2.92% ptep_clear_flush 2.53% __zram_bvec_write 2.11% do_raw_spin_lock 2.02% memmove 1.93% lru_gen_look_around 1.56% free_unref_page_list 1.40% memset patch1-8 49.44% lzo1x_1_do_compress (real work) 6.19% page_vma_mapped_walk (overhead) 5.97% _raw_spin_unlock_irq 3.13% get_pfn_folio 2.85% ptep_clear_flush 2.42% __zram_bvec_write 2.08% do_raw_spin_lock 1.92% memmove 1.44% alloc_zspage 1.36% memset Configurations: no change Thanks to the following developers for their efforts [3]. 
kernel test robot [1] https://lwn.net/Articles/23732/ [2] https://llvm.org/docs/ScudoHardenedAllocator.html [3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++ include/linux/mm_types.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 56 +++++++++++++++++++++++++++++++++- include/linux/swap.h | 4 +++ 4 files changed, 140 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b8ab121d948..344022f102c2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -350,6 +350,11 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_LRU_GEN + /* per-memcg mm_struct list */ + struct lru_gen_mm_list mm_list; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf97f3884fda..e1797813cc2c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -672,6 +672,22 @@ struct mm_struct { */ unsigned long ksm_merging_pages; #endif +#ifdef CONFIG_LRU_GEN + struct { + /* this mm_struct is on lru_gen_mm_list */ + struct list_head list; + /* + * Set when switching to this mm_struct, as a hint of + * whether it has been used since the last time per-node + * page table walkers cleared the corresponding bits. + */ + unsigned long bitmap; +#ifdef CONFIG_MEMCG + /* points to the memcg of "owner" above */ + struct mem_cgroup *memcg; +#endif + } lru_gen; +#endif /* CONFIG_LRU_GEN */ } __randomize_layout; /* @@ -698,6 +714,66 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +#ifdef CONFIG_LRU_GEN + +struct lru_gen_mm_list { + /* mm_struct list for page table walkers */ + struct list_head fifo; + /* protects the list above */ + spinlock_t lock; +}; + +void lru_gen_add_mm(struct mm_struct *mm); +void lru_gen_del_mm(struct mm_struct *mm); +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm); +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ + INIT_LIST_HEAD(&mm->lru_gen.list); + mm->lru_gen.bitmap = 0; +#ifdef CONFIG_MEMCG + mm->lru_gen.memcg = NULL; +#endif +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ + /* + * When the bitmap is set, page reclaim knows this mm_struct has been + * used since the last time it cleared the bitmap. So it might be worth + * walking the page tables of this mm_struct to clear the accessed bit. 
+ */ + WRITE_ONCE(mm->lru_gen.bitmap, -1); +} + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_add_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_del_mm(struct mm_struct *mm) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_migrate_mm(struct mm_struct *mm) +{ +} +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ +} + +#endif /* CONFIG_LRU_GEN */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9ef5aa37c60c..b1635c4020dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -408,7 +408,7 @@ enum { * min_seq behind. * * The number of pages in each generation is eventually consistent and therefore - * can be transiently negative. + * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_struct { /* the aging increments the youngest generation number */ @@ -430,6 +430,53 @@ struct lru_gen_struct { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; }; +enum { + MM_LEAF_TOTAL, /* total leaf entries */ + MM_LEAF_OLD, /* old leaf entries */ + MM_LEAF_YOUNG, /* young leaf entries */ + MM_NONLEAF_TOTAL, /* total non-leaf entries */ + MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ + MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ + NR_MM_STATS +}; + +/* double-buffering Bloom filters */ +#define NR_BLOOM_FILTERS 2 + +struct lru_gen_mm_state { + /* set to max_seq after each iteration */ + unsigned long seq; + /* where the current iteration continues (inclusive) */ + struct list_head *head; + /* where the last iteration ended (exclusive) */ + struct list_head *tail; + /* to wait for the last page table walker to finish */ + struct wait_queue_head wait; + /* Bloom filters flip after each iteration */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* the mm stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; + /* the number of concurrent page table walkers */ + int nr_walkers; +}; + +struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; + /* unstable max_seq from lru_gen_struct */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; + /* to batch promoted pages */ + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* to batch the mm stats */ + int mm_stats[NR_MM_STATS]; + /* total batched items */ + int batched; + bool can_swap; + bool force_scan; +}; + void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); @@ -480,6 +527,8 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_struct lrugen; + /* to concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; #endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; @@ -1176,6 +1225,11 @@ typedef struct pglist_data { unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* kswap mm walk data */ + struct lru_gen_mm_walk mm_walk; +#endif + ZONE_PADDING(_pad2_) /* Per-node vmstats */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 43150b9bbc5c..6308150b234a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -162,6 +162,10 @@ union swap_header { */ struct reclaim_state { unsigned long reclaimed_slab; +#ifdef CONFIG_LRU_GEN + /* 
per-thread mm walk data */ + struct lru_gen_mm_walk *mm_walk; +#endif }; #ifdef __KERNEL__ -- cgit v1.2.3 From 354ed597442952fb680c9cafc7e4eb8a76f9514c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:07 -0600 Subject: mm: multi-gen LRU: kill switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that can be disabled include: 0x0001: the multi-gen LRU core 0x0002: walking page table, when arch_has_hw_pte_young() returns true 0x0004: clearing the accessed bit in non-leaf PMD entries, when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y [yYnN]: apply to all the components above E.g., echo y >/sys/kernel/mm/lru_gen/enabled cat /sys/kernel/mm/lru_gen/enabled 0x0007 echo 5 >/sys/kernel/mm/lru_gen/enabled cat /sys/kernel/mm/lru_gen/enabled 0x0005 NB: the page table walks happen on the scale of seconds under heavy memory pressure, in which case the mmap_lock contention is a lesser concern, compared with the LRU lock contention and the I/O congestion. So far the only well-known case of the mmap_lock contention happens on Android, due to Scudo [1] which allocates several thousand VMAs for merely a few hundred MBs. The SPF and the Maple Tree also have provided their own assessments [2][3]. However, if walking page tables does worsen the mmap_lock contention, the kill switch can be used to disable it. In this case the multi-gen LRU will suffer a minor performance degradation, as shown previously. Clearing the accessed bit in non-leaf PMD entries can also be disabled, since this behavior was not tested on x86 varieties other than Intel and AMD. [1] https://source.android.com/devices/tech/debug/scudo [2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ [3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 15 ++++++++++++++- include/linux/mm_inline.h | 15 +++++++++++++-- include/linux/mmzone.h | 9 +++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ac5d0515680e..9179463c3c9f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } +extern struct mutex cgroup_mutex; + +static inline void cgroup_lock(void) +{ + mutex_lock(&cgroup_mutex); +} + +static inline void cgroup_unlock(void) +{ + mutex_unlock(&cgroup_mutex); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for @@ -446,7 +458,6 @@ static inline void cgroup_put(struct 
cgroup *cgrp) * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ @@ -708,6 +719,8 @@ struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} +static inline void cgroup_lock(void) {} +static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f2b2296a42f9..4949eda9a9a2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) #ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { - return true; + DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); + + return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); +} +#else +static inline bool lru_gen_enabled(void) +{ + DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); + + return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } +#endif static inline bool lru_gen_in_fault(void) { @@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* * There are three common cases for this page: diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b1635c4020dc..95c58c7fbdff 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -387,6 +387,13 @@ enum { LRU_GEN_FILE, }; +enum { + LRU_GEN_CORE, + LRU_GEN_MM_WALK, + LRU_GEN_NONLEAF_YOUNG, + NR_LRU_GEN_CAPS +}; + #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) @@ -428,6 +435,8 @@ struct lru_gen_struct { /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multi-gen LRU is enabled */ + bool enabled; }; enum { -- cgit v1.2.3 From 1332a809d95a4fc763cabe5ecb6d4fb6a6d941b2 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:08 -0600 Subject: mm: multi-gen LRU: thrashing prevention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as requested by many desktop users [1]. When set to value N, it prevents the working set of N milliseconds from getting evicted. The OOM killer is triggered if this working set cannot be kept in memory. Based on the average human detectable lag (~100ms), N=1000 usually eliminates intolerable lags due to thrashing. Larger values like N=3000 make lags less noticeable at the risk of premature OOM kills. Compared with the size-based approach [2], this time-based approach has the following advantages: 1. It is easier to configure because it is agnostic to applications and memory sizes. 2. It is more reliable because it is directly wired to the OOM killer. 
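A minimal sketch of the time-based cutoff described above (assumed names; the real check sits in the aging path in mm/vmscan.c, using the per-generation timestamps added below):

  #include <stdbool.h>

  /*
   * birth is the timestamp of the oldest generation; if every evictable
   * generation is younger than min_ttl, reclaim must not proceed and
   * the OOM killer is invoked instead.
   */
  static bool protected_by_min_ttl(unsigned long birth, unsigned long now,
                                   unsigned long min_ttl)
  {
          return now - birth < min_ttl;	/* all in the same time unit */
  }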
[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/ [2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-12-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 95c58c7fbdff..87347945270b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -422,6 +422,8 @@ struct lru_gen_struct { unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ -- cgit v1.2.3 From d6c3af7d8a2ba5602c28841248c551a712ac50f5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:09 -0600 Subject: mm: multi-gen LRU: debugfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /sys/kernel/debug/lru_gen for working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2]. Compared with the page table-based approach and the PFN-based approach, this lruvec-based approach has the following advantages: 1. It offers better choices because it is aware of memcgs, NUMA nodes, shared mappings and unmapped page cache. 2. It is more scalable because it is O(nr_hot_pages), whereas the PFN-based approach is O(nr_total_pages). Add /sys/kernel/debug/lru_gen_full for debugging. 
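For a sense of how these files are driven from userspace, a small sketch; the command strings follow the format this series documents in its admin guide, so treat them here as an assumption rather than a reference:

  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

          if (!f) {
                  perror("lru_gen");
                  return 1;
          }
          /* assumed: "+ memcg_id node_id max_seq [can_swap [force_scan]]" ages */
          fputs("+ 1 0 100\n", f);
          /* assumed: "- memcg_id node_id min_seq [swappiness [nr_to_reclaim]]" evicts */
          fputs("- 1 0 99 0 1000\n", f);
          fclose(f);
          return 0;
  }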
[1] https://dl.acm.org/doi/10.1145/3297858.3304053 [2] https://dl.acm.org/doi/10.1145/3503222.3507731 Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Qi Zheng Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 4b71a96190a8..3a0eec9f2faa 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) +#define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U -- cgit v1.2.3 From 992bf77591cb7e696fcc59aa7e64d1200b673513 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:33 +0530 Subject: mm/demotion: add support for explicit memory tiers Patch series "mm/demotion: Memory tiers and demotion", v15. The current kernel has the basic memory tiering support: Inactive pages on a higher tier NUMA node can be migrated (demoted) to a lower tier NUMA node to make room for new allocations on the higher tier NUMA node. Frequently accessed pages on a lower tier NUMA node can be migrated (promoted) to a higher tier NUMA node to improve the performance. In the current kernel, memory tiers are defined implicitly via a demotion path relationship between NUMA nodes, which is created during the kernel initialization and updated when a NUMA node is hot-added or hot-removed. The current implementation puts all nodes with CPU into the highest tier, and builds the tier hierarchy tier-by-tier by establishing the per-node demotion targets based on the distances between nodes. This current memory tier kernel implementation needs to be improved for several important use cases: * The current tier initialization code always initializes each memory-only NUMA node into a lower tier. But a memory-only NUMA node may have a high performance memory device (e.g. a DRAM-backed memory-only node on a virtual machine) and that should be put into a higher tier. * The current tier hierarchy always puts CPU nodes into the top tier. But on a system with HBM (e.g. GPU memory) devices, these memory-only HBM NUMA nodes should be in the top tier, and DRAM nodes with CPUs are better to be placed into the next lower tier. * Also because the current tier hierarchy always puts CPU nodes into the top tier, when a CPU is hot-added (or hot-removed) and triggers a memory node from CPU-less into a CPU node (or vice versa), the memory tier hierarchy gets changed, even though no memory node is added or removed. This can make the tier hierarchy unstable and make it difficult to support tier-based memory accounting. 
* A higher tier node can only be demoted to nodes with the shortest distance on the next lower tier as defined by the demotion path, not any other node from any lower tier. This strict demotion order does not work in all use cases (e.g. some use cases may want to allow cross-socket demotion to another node in the same demotion tier as a fallback when the preferred demotion node is out of space), and has resulted in the feature request for an interface to override the system-wide, per-node demotion order from userspace. This demotion order is also inconsistent with the page allocation fallback order when all the nodes in a higher tier are out of space: The page allocation can fall back to any node from any lower tier, whereas the demotion order doesn't allow that. This patch series makes the creation of memory tiers explicit, under the control of device drivers. Memory Tier Initialization ========================== The Linux kernel presents memory devices as NUMA nodes and each memory device is of a specific type. The memory type of a device is represented by its abstract distance. A memory tier corresponds to a range of abstract distance. This allows for classifying memory devices with a specific performance range into a memory tier. By default, all memory nodes are assigned to the default tier with abstract distance 512. A device driver can move its memory nodes from the default tier. For example, PMEM can move its memory nodes below the default tier, whereas GPU can move its memory nodes above the default tier. The kernel initialization code makes the decision on which exact tier a memory node should be assigned to based on the requests from the device drivers as well as the memory device hardware information provided by the firmware. Hot-adding/removing CPUs doesn't affect the memory tier hierarchy. This patch (of 10): In the current kernel, memory tiers are defined implicitly via a demotion path relationship between NUMA nodes, which is created during the kernel initialization and updated when a NUMA node is hot-added or hot-removed. The current implementation puts all nodes with CPU into the highest tier, and builds the tier hierarchy by establishing the per-node demotion targets based on the distances between nodes. This current memory tier kernel implementation needs to be improved for several important use cases. The current tier initialization code always initializes each memory-only NUMA node into a lower tier. But a memory-only NUMA node may have a high performance memory device (e.g. a DRAM-backed memory-only node on a virtual machine) that should be put into a higher tier. The current tier hierarchy always puts CPU nodes into the top tier. But on a system with HBM or GPU devices, the memory-only NUMA nodes mapping these devices should be in the top tier, and DRAM nodes with CPUs are better to be placed into the next lower tier. With the current kernel, a higher tier node can only be demoted to nodes with the shortest distance on the next lower tier as defined by the demotion path, not any other node from any lower tier. This strict demotion order does not work in all use cases (e.g. some use cases may want to allow cross-socket demotion to another node in the same demotion tier as a fallback when the preferred demotion node is out of space). This demotion order is also inconsistent with the page allocation fallback order when all the nodes in a higher tier are out of space: The page allocation can fall back to any node from any lower tier, whereas the demotion order doesn't allow that.
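To make the tier assignment concrete: a memory tier is a 128-wide window of abstract distance, so a tier index can be derived by dropping the low seven bits. The helper below is a hypothetical sketch (the chunk constants are spelled out in the next paragraphs and in the diff; memtier_index() is not a kernel function):

	#define MEMTIER_CHUNK_BITS	7
	#define MEMTIER_CHUNK_SIZE	(1 << MEMTIER_CHUNK_BITS)	/* 128 */

	/* hypothetical: which tier window an abstract distance falls into */
	static inline int memtier_index(int adistance)
	{
		return adistance >> MEMTIER_CHUNK_BITS;
	}

With the default DRAM abstract distance of 512, memtier_index() returns 4 for every distance in 512-639, so devices slightly faster or slower than plain DRAM can still share the DRAM tier.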
This patch series addresses the above by defining memory tiers explicitly. The Linux kernel presents memory devices as NUMA nodes and each memory device is of a specific type. The memory type of a device is represented by its abstract distance. A memory tier corresponds to a range of abstract distance. This allows for classifying memory devices with a specific performance range into a memory tier. This patch configures the range/chunk size to be 128. The default DRAM abstract distance is 512. We can have 4 memory tiers below the default DRAM tier with abstract distance ranges 0 - 127, 128 - 255, 256 - 383, 384 - 511. Faster memory devices can be placed in these faster (higher) memory tiers. Slower memory devices like persistent memory will have an abstract distance higher than the default DRAM level. [akpm@linux-foundation.org: fix comment, per Aneesh] Link: https://lkml.kernel.org/r/20220818131042.113280-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: Jagdish Gediya Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/linux/memory-tiers.h (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h new file mode 100644 index 000000000000..ecada7bf4091 --- /dev/null +++ b/include/linux/memory-tiers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_TIERS_H +#define _LINUX_MEMORY_TIERS_H + +/* + * Each tier covers an abstract distance chunk of size 128 + */ +#define MEMTIER_CHUNK_BITS 7 +#define MEMTIER_CHUNK_SIZE (1 << MEMTIER_CHUNK_BITS) +/* + * Smaller abstract distance values imply faster (higher) memory tiers. Offset + * the DRAM adistance so that we can accommodate devices with a slightly lower + * adistance value (slightly faster) than default DRAM adistance to be part of + * the same memory tier. + */ +#define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) + +#endif /* _LINUX_MEMORY_TIERS_H */ -- cgit v1.2.3 From 9195244022788935eac0df16132394ffa5613542 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:34 +0530 Subject: mm/demotion: move memory demotion related code This moves memory demotion related code to mm/memory-tiers.c. No functional change in this patch.
Link: https://lkml.kernel.org/r/20220818131042.113280-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 8 ++++++++ include/linux/migrate.h | 2 -- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index ecada7bf4091..5f24396da76c 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -15,4 +15,12 @@ */ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) +#ifdef CONFIG_NUMA +#include +extern bool numa_demotion_enabled; + +#else + +#define numa_demotion_enabled false +#endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 22c0a0cf5e0c..96f8c84413fe 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -103,7 +103,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA) extern void set_migration_target_nodes(void); extern void migrate_on_reclaim_init(void); -extern bool numa_demotion_enabled; extern int next_demotion_node(int node); #else static inline void set_migration_target_nodes(void) {} @@ -112,7 +111,6 @@ static inline int next_demotion_node(int node) { return NUMA_NO_NODE; } -#define numa_demotion_enabled false #endif #ifdef CONFIG_COMPACTION -- cgit v1.2.3 From c6123a19c9f040e597f55f856c679651c26b31d1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:35 +0530 Subject: mm/demotion: add hotplug callbacks to handle new numa node onlined If a newly onlined NUMA node doesn't have an abstract distance assigned, the kernel adds it to the default memory tier. [aneesh.kumar@linux.ibm.com: fix kernel error with memory hotplug] Link: https://lkml.kernel.org/r/20220825092019.379069-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 5f24396da76c..7ae6308b2191 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -14,6 +14,7 @@ * the same memory tier.
*/ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) +#define MEMTIER_HOTPLUG_PRIO 100 #ifdef CONFIG_NUMA #include -- cgit v1.2.3 From 7b88bda3761b95856cf97822efe8281c8100067b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:36 +0530 Subject: mm/demotion/dax/kmem: set node's abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE By default, all nodes are assigned to the default memory tier, which is the memory tier designated for nodes with DRAM. Set the dax kmem device node's tier to a slower memory tier by setting its abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE. Low-level drivers like papr_scm or ACPI NFIT can initialize the memory device type to a more accurate value based on device tree details or HMAT. If the kernel doesn't find the memory type initialized, a default slower memory type is assigned by the kmem driver. [aneesh.kumar@linux.ibm.com: assign correct memory type for multiple dax devices with the same node affinity] Link: https://lkml.kernel.org/r/20220826100224.542312-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7ae6308b2191..49d281866dca 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -2,6 +2,9 @@ #ifndef _LINUX_MEMORY_TIERS_H #define _LINUX_MEMORY_TIERS_H +#include +#include +#include /* * Each tier covers an abstract distance chunk of size 128 */ @@ -16,12 +19,49 @@ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) #define MEMTIER_HOTPLUG_PRIO 100 +struct memory_tier; +struct memory_dev_type { + /* list of memory types that are part of same tier as this type */ + struct list_head tier_sibiling; + /* abstract distance for this specific memory type */ + int adistance; + /* Nodes of same abstract distance */ + nodemask_t nodes; + struct kref kref; + struct memory_tier *memtier; +}; + #ifdef CONFIG_NUMA -#include extern bool numa_demotion_enabled; +struct memory_dev_type *alloc_memory_type(int adistance); +void destroy_memory_type(struct memory_dev_type *memtype); +void init_node_memory_type(int node, struct memory_dev_type *default_type); +void clear_node_memory_type(int node, struct memory_dev_type *memtype); #else #define numa_demotion_enabled false +/* + * The CONFIG_NUMA implementation returns a non-NULL value or an error + * pointer; these !CONFIG_NUMA stubs return NULL.
+ */ +static inline struct memory_dev_type *alloc_memory_type(int adistance) +{ + return NULL; +} + +static inline void destroy_memory_type(struct memory_dev_type *memtype) +{ + +} + +static inline void init_node_memory_type(int node, struct memory_dev_type *default_type) +{ + +} + +static inline void clear_node_memory_type(int node, struct memory_dev_type *memtype) +{ + +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ -- cgit v1.2.3 From 6c542ab75714fe90dae292aeb3e91ac53f5ff599 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:37 +0530 Subject: mm/demotion: build demotion targets based on explicit memory tiers This patch switches the demotion target building logic to use memory tiers instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the default memory tier and additional memory tiers will be added by drivers like dax kmem. This patch builds the demotion target for a NUMA node by looking at all memory tiers below the tier to which the NUMA node belongs. The closest node in the immediately following memory tier is used as a demotion target. Since we are now only building demotion targets for N_MEMORY NUMA nodes, the CPU hotplug calls are removed in this patch. Link: https://lkml.kernel.org/r/20220818131042.113280-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 13 +++++++++++++ include/linux/migrate.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 49d281866dca..ce9c6bac6725 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -37,6 +37,14 @@ struct memory_dev_type *alloc_memory_type(int adistance); void destroy_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); +#ifdef CONFIG_MIGRATION +int next_demotion_node(int node); +#else +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} +#endif #else @@ -63,5 +71,10 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt { } + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 96f8c84413fe..704a04f5a074 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -100,19 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ -#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA) -extern void set_migration_target_nodes(void); -extern void migrate_on_reclaim_init(void); -extern int next_demotion_node(int node); -#else -static inline void set_migration_target_nodes(void) {} -static inline void migrate_on_reclaim_init(void) {} -static inline int next_demotion_node(int node) -{ - return NUMA_NO_NODE; -} -#endif - #ifdef CONFIG_COMPACTION bool PageMovable(struct page *page); void __SetPageMovable(struct page *page, const struct movable_operations *ops); -- cgit v1.2.3 From
7766cf7a7e7545ab434a16c6f9531b09efe14dc1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:38 +0530 Subject: mm/demotion: add pg_data_t member to track node memory tier details Also update the different helpers to use NODE_DATA()->memtier. Since the node-specific memtier can change when a NUMA node is reassigned to a different memory tier, accessing NODE_DATA()->memtier needs to happen under an RCU read lock or memory_tier_lock. Link: https://lkml.kernel.org/r/20220818131042.113280-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 87347945270b..e335a492c2eb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1246,6 +1246,9 @@ typedef struct pglist_data { /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; +#ifdef CONFIG_NUMA + struct memory_tier __rcu *memtier; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) -- cgit v1.2.3 From b26ac6f3ba38fac83db2d72551e6d994d0e0516f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:39 +0530 Subject: mm/demotion: drop memtier from memtype Now that we track node-specific memtier in pg_data_t, we can drop memtier from memtype. Link: https://lkml.kernel.org/r/20220818131042.113280-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index ce9c6bac6725..7ca52ad2789f 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -28,7 +28,6 @@ struct memory_dev_type { /* Nodes of same abstract distance */ nodemask_t nodes; struct kref kref; - struct memory_tier *memtier; }; -- cgit v1.2.3 From 32008027289239100d8d2876f50b15d92bde1855 Mon Sep 17 00:00:00 2001 From: Jagdish Gediya Date: Thu, 18 Aug 2022 18:40:40 +0530 Subject: mm/demotion: demote pages according to allocation fallback order Currently, a higher tier node can only be demoted to selected nodes on the next lower tier as defined by the demotion path. This strict demotion order does not work in all use cases (e.g. some use cases may want to allow cross-socket demotion to another node in the same demotion tier as a fallback when the preferred demotion node is out of space). This demotion order is also inconsistent with the page allocation fallback order when all the nodes in a higher tier are out of space: The page allocation can fall back to any node from any lower tier, whereas the demotion order doesn't allow that currently. This patch adds support to get all the allowed demotion targets for a memory tier.
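As an illustration of how these two interfaces fit together, here is a hedged sketch of a caller; pick_demotion_target() is a hypothetical name, and the real demote_page_list() instead passes the allowed mask to the migration allocation callback rather than picking a node up front:

	/* hypothetical caller, for illustration only */
	static int pick_demotion_target(int node)
	{
		nodemask_t allowed;
		int target = next_demotion_node(node);	/* preferred target */

		if (target != NUMA_NO_NODE)
			return target;

		/* fall back to any allowed node in the lower tiers */
		node_get_allowed_targets(NODE_DATA(node), &allowed);
		return node_random(&allowed);	/* NUMA_NO_NODE if empty */
	}
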
The demote_page_list() function is now modified to use this allowed node mask as the fallback allocation mask. Link: https://lkml.kernel.org/r/20220818131042.113280-9-aneesh.kumar@linux.ibm.com Signed-off-by: Jagdish Gediya Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7ca52ad2789f..42791554b9b9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * Each tier covers an abstract distance chunk of size 128 */ @@ -38,11 +39,17 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); +void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); #else static inline int next_demotion_node(int node) { return NUMA_NO_NODE; } + +static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) +{ + *targets = NODE_MASK_NONE; +} #endif #else @@ -75,5 +82,10 @@ static inline int next_demotion_node(int node) { return NUMA_NO_NODE; } + +static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) +{ + *targets = NODE_MASK_NONE; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ -- cgit v1.2.3 From 467b171af881282fc627328e6c164f044a6df888 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:41 +0530 Subject: mm/demotion: update node_is_toptier to work with memory tiers With memory tier support we can have memory-only NUMA nodes in the top tier, for which we want to avoid NUMA-fault promotion tracking. Update node_is_toptier to work with memory tiers. All NUMA nodes are by default top tier nodes.
With lower (slower) memory tiers added, we consider all memory tiers above a memory tier that has CPU NUMA nodes to be top memory tiers. [sj@kernel.org: include missed header file, memory-tiers.h] Link: https://lkml.kernel.org/r/20220820190720.248704-1-sj@kernel.org [akpm@linux-foundation.org: mm/memory.c needs linux/memory-tiers.h] [aneesh.kumar@linux.ibm.com: make toptier_distance inclusive upper bound of toptiers] Link: https://lkml.kernel.org/r/20220830081457.118960-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20220818131042.113280-10-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 11 +++++++++++ include/linux/node.h | 5 ----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 42791554b9b9..965009aa01d7 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -40,6 +40,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); +bool node_is_toptier(int node); #else static inline int next_demotion_node(int node) { @@ -50,6 +51,11 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target { *targets = NODE_MASK_NONE; } + +static inline bool node_is_toptier(int node) +{ + return true; +} #endif #else @@ -87,5 +93,10 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target { *targets = NODE_MASK_NONE; } + +static inline bool node_is_toptier(int node) +{ + return true; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/node.h b/include/linux/node.h index 40d641a8bfb0..9ec680dd607f 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -185,9 +185,4 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg, #define to_node(device) container_of(device, struct node, dev) -static inline bool node_is_toptier(int node) -{ - return node_state(node, N_CPU); -} - #endif /* _LINUX_NODE_H_ */ -- cgit v1.2.3 From 3e061d924fe9c7b487ca45935f92fcd414ce6f1a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 18 Aug 2022 18:40:42 +0530 Subject: lib/nodemask: optimize node_random for nodemask with single NUMA node The most common case for certain node_random usage (demotion nodemask) is with nodemask weight 1. We can avoid calling get_random_int() in that case and always return the only node set in the nodemask.
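For instance, a demotion nodemask typically has exactly one node set, so with this change the call below returns that node without consulting the random number generator (a hedged illustration, not taken from the patch):

	nodemask_t nmask = NODE_MASK_NONE;
	int target;

	node_set(1, &nmask);		/* typical demotion mask: one node */
	target = node_random(&nmask);	/* returns node 1, no get_random_int() */
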
A simple test as below: before = rdtsc_ordered(); for (i = 0; i < 100; i++) { rand = node_random(&nmask); } after = rdtsc_ordered(); Without the fix: after - before = 16438. With the fix: after - before = 816. Link: https://lkml.kernel.org/r/20220818131042.113280-11-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: "Huang, Ying" Acked-by: Wei Xu Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Cc: Yang Shi Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 3a0eec9f2faa..e66742db741c 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -505,12 +505,21 @@ static inline int num_node_state(enum node_states state) static inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) - int w, bit = NUMA_NO_NODE; + int w, bit; w = nodes_weight(*maskp); - if (w) + switch (w) { + case 0: + bit = NUMA_NO_NODE; + break; + case 1: + bit = first_node(*maskp); + break; + default: bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); + get_random_int() % w, MAX_NUMNODES); + break; + } return bit; #else return 0; -- cgit v1.2.3 From 54a611b605901c7d5d05b6b8f5d04a6ceb0962aa Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:39 +0000 Subject: Maple Tree: add new data structure Patch series "Introducing the Maple Tree" The maple tree is an RCU-safe range-based B-tree designed to use modern processor cache efficiently. There are a number of places in the kernel that a non-overlapping range-based tree would be beneficial, especially one with a simple interface. If you use an rbtree with other data structures to improve performance or an interval tree to track non-overlapping ranges, then this is for you. The tree has a branching factor of 10 for non-leaf nodes and 16 for leaf nodes. With the increased branching factor, it is significantly shorter than the rbtree so it has fewer cache misses. The removal of the linked list between subsequent entries also reduces the cache misses and the need to pull in the previous and next VMA during many tree alterations. The first user that is covered in this patch set is the vm_area_struct, where three data structures are replaced by the maple tree: the augmented rbtree, the vma cache, and the linked list of VMAs in the mm_struct. The long term goal is to reduce or remove the mmap_lock contention. The plan is to get to the point where we use the maple tree in RCU mode. Readers will not block for writers. A single write operation will be allowed at a time. A reader re-walks if stale data is encountered. VMAs would be RCU enabled and this mode would be entered once multiple tasks are using the mm_struct. Davidlohr said : Yes I like the maple tree, and at this stage I don't think we can ask for : more from this series wrt the MM - albeit there seems to still be some : folks reporting breakage. Fundamentally I see Liam's work to (re)move : complexity out of the MM (not to say that the actual maple tree is not : complex) by consolidating the three complimentary data structures very : much worth it considering performance does not take a hit.
This was very : much a turn off with the range locking approach, which worst case scenario : incurred in prohibitive overhead. Also as Liam and Matthew have : mentioned, RCU opens up a lot of nice performance opportunities, and in : addition academia[1] has shown outstanding scalability of address spaces : with the foundation of replacing the locked rbtree with RCU aware trees. Similar work has been discovered in the academic press: https://pdos.csail.mit.edu/papers/rcuvm:asplos12.pdf Sheer coincidence. We designed our tree with the intention of solving the hardest problem first. Upon settling on a b-tree variant and a rough outline, we researched range-based b-trees and RCU b-trees and did find that article. So it was nice to find reassurances that we were on the right path, but our design choice of using ranges made that paper unusable for us. This patch (of 70): The maple tree is an RCU-safe range-based B-tree designed to use modern processor cache efficiently. There are a number of places in the kernel that a non-overlapping range-based tree would be beneficial, especially one with a simple interface. If you use an rbtree with other data structures to improve performance or an interval tree to track non-overlapping ranges, then this is for you. The tree has a branching factor of 10 for non-leaf nodes and 16 for leaf nodes. With the increased branching factor, it is significantly shorter than the rbtree so it has fewer cache misses. The removal of the linked list between subsequent entries also reduces the cache misses and the need to pull in the previous and next VMA during many tree alterations. The first user that is covered in this patch set is the vm_area_struct, where three data structures are replaced by the maple tree: the augmented rbtree, the vma cache, and the linked list of VMAs in the mm_struct. The long term goal is to reduce or remove the mmap_lock contention. The plan is to get to the point where we use the maple tree in RCU mode. Readers will not block for writers. A single write operation will be allowed at a time. A reader re-walks if stale data is encountered. VMAs would be RCU enabled and this mode would be entered once multiple tasks are using the mm_struct. There are additional BUG_ON() calls added within the tree, most of which are in debug code. These will be replaced with a WARN_ON() call in the future. There are also additional BUG_ON() calls within the code which will also be reduced in number at a later date. These exist to catch things such as out-of-range accesses which would crash anyway. Link: https://lkml.kernel.org/r/20220906194824.2110408-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220906194824.2110408-2-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: David Howells Tested-by: Sven Schnelle Tested-by: Yu Zhao Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Catalin Marinas Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 685 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 include/linux/maple_tree.h (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h new file mode 100644 index 000000000000..2effab72add1 --- /dev/null +++ b/include/linux/maple_tree.h @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_MAPLE_TREE_H +#define _LINUX_MAPLE_TREE_H +/* + * Maple Tree - An RCU-safe adaptive tree for storing ranges + * Copyright (c) 2018-2022 Oracle + * Authors: Liam R. Howlett + * Matthew Wilcox + */ + +#include +#include +#include +/* #define CONFIG_MAPLE_RCU_DISABLED */ +/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ + +/* + * Allocated nodes are mutable until they have been inserted into the tree, + * at which time they cannot change their type until they have been removed + * from the tree and an RCU grace period has passed. + * + * Removed nodes have their ->parent set to point to themselves. RCU readers + * check ->parent before relying on the value that they loaded from the + * slots array. This lets us reuse the slots array for the RCU head. + * + * Nodes in the tree point to their parent unless bit 0 is set. + */ +#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) +/* 64bit sizes */ +#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) +#else +/* 32bit sizes */ +#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) +#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ + +#define MAPLE_NODE_MASK 255UL + +/* + * The node->parent of the root node has bit 0 set and the rest of the pointer + * is a pointer to the tree itself. No more bits are available in this pointer + * (on m68k, the data structure may only be 2-byte aligned). + * + * Internal non-root nodes can only have maple_range_* nodes as parents. The + * parent pointer is 256B aligned like all other tree nodes. When storing 32 + * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an + * extra bit to store the offset. This extra bit comes from a reuse of the last + * bit in the node type. This is possible by using bit 1 to indicate if bit 2 + * is part of the type or the slot. + * + * Once the type is decided, the decision of an allocation range type or a range + * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE + * flag. 
+ * + * Node types: + * 0x??1 = Root + * 0x?00 = 16 bit nodes + * 0x010 = 32 bit nodes + * 0x110 = 64 bit nodes + * + * Slot size and location in the parent pointer: + * type : slot location + * 0x??1 : Root + * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0x010 : 32 bit values, type in 0-2, slot in 3-6 + * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + */ + +/* + * This metadata is used to optimize the gap updating code and in reverse + * searching for gaps or any other code that needs to find the end of the data. + */ +struct maple_metadata { + unsigned char end; + unsigned char gap; +}; + +/* + * Leaf nodes do not store pointers to nodes, they store user data. Users may + * store almost any bit pattern. As noted above, the optimisation of storing an + * entry at 0 in the root pointer cannot be done for data which have the bottom + * two bits set to '10'. We also reserve values with the bottom two bits set to + * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs + * return errnos as a negative errno shifted left by two bits and the bottom + * two bits set to '10', and while choosing to store these values in the array + * is not an error, it may lead to confusion if you're testing for an error with + * mas_is_err(). + * + * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits + * 3-6); bit 2 is reserved. That leaves bits 0-1 unused for now. + * + * In regular B-Tree terms, pivots are called keys. The term pivot is used to + * indicate that the tree is specifying ranges. Pivots may appear in the + * subtree with an entry attached to the value, whereas keys are unique to a + * specific position of a B-tree. Pivot values are inclusive of the slot with + * the same index. + */ + +struct maple_range_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; + union { + void __rcu *slot[MAPLE_RANGE64_SLOTS]; + struct { + void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; + struct maple_metadata meta; + }; + }; +}; + +/* + * At tree creation time, the user can specify that they're willing to trade off + * storing fewer entries in a tree in return for storing more information in + * each node. + * + * The maple tree supports recording the largest range of NULL entries available + * in this node, also called gaps. This optimises the tree for allocating a + * range. 
+ */ +struct maple_arange_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; + void __rcu *slot[MAPLE_ARANGE64_SLOTS]; + unsigned long gap[MAPLE_ARANGE64_SLOTS]; + struct maple_metadata meta; +}; + +struct maple_alloc { + unsigned long total; + unsigned char node_count; + unsigned int request_count; + struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; +}; + +struct maple_topiary { + struct maple_pnode *parent; + struct maple_enode *next; /* Overlaps the pivot */ +}; + +enum maple_type { + maple_dense, + maple_leaf_64, + maple_range_64, + maple_arange_64, +}; + + +/** + * DOC: Maple tree flags + * + * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree + * * MT_FLAGS_USE_RCU - Operate in RCU mode + * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags + * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value + * * MT_FLAGS_LOCK_MASK - How the mt_lock is used + * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe + * * MT_FLAGS_LOCK_BH - Acquired bh-safe + * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used + * + * MAPLE_HEIGHT_MAX The largest height that can be stored + */ +#define MT_FLAGS_ALLOC_RANGE 0x01 +#define MT_FLAGS_USE_RCU 0x02 +#define MT_FLAGS_HEIGHT_OFFSET 0x02 +#define MT_FLAGS_HEIGHT_MASK 0x7C +#define MT_FLAGS_LOCK_MASK 0x300 +#define MT_FLAGS_LOCK_IRQ 0x100 +#define MT_FLAGS_LOCK_BH 0x200 +#define MT_FLAGS_LOCK_EXTERN 0x300 + +#define MAPLE_HEIGHT_MAX 31 + + +#define MAPLE_NODE_TYPE_MASK 0x0F +#define MAPLE_NODE_TYPE_SHIFT 0x03 + +#define MAPLE_RESERVED_RANGE 4096 + +#ifdef CONFIG_LOCKDEP +typedef struct lockdep_map *lockdep_map_p; +#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock) +#define mt_set_external_lock(mt, lock) \ + (mt)->ma_external_lock = &(lock)->dep_map +#else +typedef struct { /* nothing */ } lockdep_map_p; +#define mt_lock_is_held(mt) 1 +#define mt_set_external_lock(mt, lock) do { } while (0) +#endif + +/* + * If the tree contains a single entry at index 0, it is usually stored in + * tree->ma_root. To optimise for the page cache, an entry which ends in '00', + * '01' or '11' is stored in the root, but an entry which ends in '10' will be + * stored in a node. Bits 3-6 are used to store enum maple_type. + * + * The flags are used both to store some immutable information about this tree + * (set at tree creation time) and dynamic information set under the spinlock. + * + * Another use of flags is to indicate global states of the tree. This is the + * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in + * RCU mode. This mode was added to allow the tree to reuse nodes instead of + * re-allocating and RCU freeing nodes when there is a single user. + */ +struct maple_tree { + union { + spinlock_t ma_lock; + lockdep_map_p ma_external_lock; + }; + void __rcu *ma_root; + unsigned int ma_flags; +}; + +/** + * MTREE_INIT() - Initialize a maple tree + * @name: The maple tree name + * @__flags: The maple tree flags + * + */ +#define MTREE_INIT(name, __flags) { \ + .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ + .ma_flags = __flags, \ + .ma_root = NULL, \ +} + +/** + * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. 
+ * @name: The tree name + * @__flags: The maple tree flags + * @__lock: The external lock + */ +#ifdef CONFIG_LOCKDEP +#define MTREE_INIT_EXT(name, __flags, __lock) { \ + .ma_external_lock = &(__lock).dep_map, \ + .ma_flags = (__flags), \ + .ma_root = NULL, \ +} +#else +#define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) +#endif + +#define DEFINE_MTREE(name) \ + struct maple_tree name = MTREE_INIT(name, 0) + +#define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) + +/* + * The Maple Tree squeezes various bits in at various points which aren't + * necessarily obvious. Usually, this is done by observing that pointers are + * N-byte aligned and thus the bottom log_2(N) bits are available for use. We + * don't use the high bits of pointers to store additional information because + * we don't know what bits are unused on any given architecture. + * + * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 + * low bits for our own purposes. Nodes are currently of 4 types: + * 1. Single pointer (Range is 0-0) + * 2. Non-leaf Allocation Range nodes + * 3. Non-leaf Range nodes + * 4. Leaf Range nodes. + * All nodes consist of a number of node slots, + * pivots, and a parent pointer. + */ + +struct maple_node { + union { + struct { + struct maple_pnode *parent; + void __rcu *slot[MAPLE_NODE_SLOTS]; + }; + struct { + void *pad; + struct rcu_head rcu; + struct maple_enode *piv_parent; + unsigned char parent_slot; + enum maple_type type; + unsigned char slot_len; + unsigned int ma_flags; + }; + struct maple_range_64 mr64; + struct maple_arange_64 ma64; + struct maple_alloc alloc; + }; +}; + +/* + * More complicated stores can cause two nodes to become one or three and + * potentially alter the height of the tree. Either half of the tree may need + * to be rebalanced against the other. The ma_topiary struct is used to track + * which nodes have been 'cut' from the tree so that the change can be done + * safely at a later date. This is done to support RCU. + */ +struct ma_topiary { + struct maple_enode *head; + struct maple_enode *tail; + struct maple_tree *mtree; +}; + +void *mtree_load(struct maple_tree *mt, unsigned long index); + +int mtree_insert(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +int mtree_insert_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); +int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); + +int mtree_store_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_store(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +void *mtree_erase(struct maple_tree *mt, unsigned long index); + +void mtree_destroy(struct maple_tree *mt); +void __mt_destroy(struct maple_tree *mt); + +/** + * mtree_empty() - Determine if a tree has any present entries. + * @mt: Maple Tree. + * + * Context: Any context. + * Return: %true if the tree contains only NULL pointers. 
+ */ +static inline bool mtree_empty(const struct maple_tree *mt) +{ + return mt->ma_root == NULL; +} + +/* Advanced API */ + +/* + * The maple state is defined in the struct ma_state and is used to keep track + * of information during operations, and even between operations when using the + * advanced API. + * + * If state->node has bit 0 set then it references a tree location which is not + * a node (eg the root). If bit 1 is set, the rest of the bits are a negative + * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the + * node type. + * + * state->alloc either has a requested number of nodes or an allocated node. If + * state->alloc has a requested number of nodes, the first bit will be set (0x1) + * and the remaining bits are the value. If state->alloc is a node, then the + * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for + * storing more allocated nodes, a total number of nodes allocated, and the + * node_count in this node. node_count is the number of allocated nodes in this + * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further + * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc + * by removing a node from the state->alloc node until state->alloc->node_count + * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted + * to state->alloc. Nodes are pushed onto state->alloc by putting the current + * state->alloc into the pushed node's slot[0]. + * + * The state also contains the implied min/max of the state->node, the depth of + * this search, and the offset. The implied min/max are either from the parent + * node or are 0-oo for the root node. The depth is incremented or decremented + * every time a node is walked down or up. The offset is the slot/pivot of + * interest in the node - either for reading or writing. + * + * When returning a value the maple state index and last respectively contain + * the start and end of the range for the entry. Ranges are inclusive in the + * Maple Tree. + */ +struct ma_state { + struct maple_tree *tree; /* The tree we're operating in */ + unsigned long index; /* The index we're operating on - range start */ + unsigned long last; /* The last index we're operating on - range end */ + struct maple_enode *node; /* The node containing this entry */ + unsigned long min; /* The minimum index of this node - implied pivot min */ + unsigned long max; /* The maximum index of this node - implied pivot max */ + struct maple_alloc *alloc; /* Allocated nodes for this operation */ + unsigned char depth; /* depth of tree descent during write */ + unsigned char offset; + unsigned char mas_flags; +}; + +struct ma_wr_state { + struct ma_state *mas; + struct maple_node *node; /* Decoded mas->node */ + unsigned long r_min; /* range min */ + unsigned long r_max; /* range max */ + enum maple_type type; /* mas->node type */ + unsigned char offset_end; /* The offset where the write ends */ + unsigned char node_end; /* mas->node end */ + unsigned long *pivots; /* mas->node->pivots pointer */ + unsigned long end_piv; /* The pivot at the offset end */ + void __rcu **slots; /* mas->node->slots pointer */ + void *entry; /* The entry to write */ + void *content; /* The existing entry that is being overwritten */ +}; + +#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) + + +/* + * Special values for ma_state.node. + * MAS_START means we have not searched the tree. 
+ * MAS_ROOT means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * MAS_NONE means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * MA_ERROR represents an errno. After dropping the lock and attempting + * to resolve the error, the walk would have to be restarted from the + * top of the tree as the tree may have been modified. + */ +#define MAS_START ((struct maple_enode *)1UL) +#define MAS_ROOT ((struct maple_enode *)5UL) +#define MAS_NONE ((struct maple_enode *)9UL) +#define MAS_PAUSE ((struct maple_enode *)17UL) +#define MA_ERROR(err) \ + ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) + +#define MA_STATE(name, mt, first, end) \ + struct ma_state name = { \ + .tree = mt, \ + .index = first, \ + .last = end, \ + .node = MAS_START, \ + .min = 0, \ + .max = ULONG_MAX, \ + .alloc = NULL, \ + } + +#define MA_WR_STATE(name, ma_state, wr_entry) \ + struct ma_wr_state name = { \ + .mas = ma_state, \ + .content = NULL, \ + .entry = wr_entry, \ + } + +#define MA_TOPIARY(name, tree) \ + struct ma_topiary name = { \ + .head = NULL, \ + .tail = NULL, \ + .mtree = tree, \ + } + +void *mas_walk(struct ma_state *mas); +void *mas_store(struct ma_state *mas, void *entry); +void *mas_erase(struct ma_state *mas); +int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); +void mas_store_prealloc(struct ma_state *mas, void *entry); +void *mas_find(struct ma_state *mas, unsigned long max); +void *mas_find_rev(struct ma_state *mas, unsigned long min); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +bool mas_is_err(struct ma_state *mas); + +bool mas_nomem(struct ma_state *mas, gfp_t gfp); +void mas_pause(struct ma_state *mas); +void maple_tree_init(void); +void mas_destroy(struct ma_state *mas); +int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); + +void *mas_prev(struct ma_state *mas, unsigned long min); +void *mas_next(struct ma_state *mas, unsigned long max); + +int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, + unsigned long size); + +/* Checks if a mas has not found anything */ +static inline bool mas_is_none(struct ma_state *mas) +{ + return mas->node == MAS_NONE; +} + +/* Checks if a mas has been paused */ +static inline bool mas_is_paused(struct ma_state *mas) +{ + return mas->node == MAS_PAUSE; +} + +void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); +void mas_dup_store(struct ma_state *mas, void *entry); + +/* + * This finds an empty area from the highest address to the lowest. + * AKA the "Topdown" version. + */ +int mas_empty_area_rev(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size); +/** + * mas_reset() - Reset a Maple Tree operation state. + * @mas: Maple Tree operation state. + * + * Resets the error or walk state of the @mas so future walks of the + * tree will start from the root. Use this if you have dropped the + * lock and want to reuse the ma_state. + * + * Context: Any context. + */ +static inline void mas_reset(struct ma_state *mas) +{ + mas->node = MAS_START; +} + +/** + * mas_for_each() - Iterate over a range of the maple tree. 
+ * @__mas: Maple Tree operation state (maple_state) + * @__entry: Entry retrieved from the tree + * @__max: maximum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + * + */ +#define mas_for_each(__mas, __entry, __max) \ + while (((__entry) = mas_find((__mas), (__max))) != NULL) + + +/** + * mas_set_range() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * Move the operation state to refer to a different range. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline +void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) +{ + mas->index = start; + mas->last = last; + mas->node = MAS_START; +} + +/** + * mas_set() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @index: New index into the Maple Tree. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline void mas_set(struct ma_state *mas, unsigned long index) +{ + + mas_set_range(mas, index, index); +} + +static inline bool mt_external_lock(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; +} + +/** + * mt_init_flags() - Initialise an empty maple tree with flags. + * @mt: Maple Tree + * @flags: maple tree flags. + * + * If you need to initialise a Maple Tree with special flags (eg, an + * allocation tree), use this function. + * + * Context: Any context. + */ +static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt->ma_flags = flags; + if (!mt_external_lock(mt)) + spin_lock_init(&mt->ma_lock); + rcu_assign_pointer(mt->ma_root, NULL); +} + +/** + * mt_init() - Initialise an empty maple tree. + * @mt: Maple Tree + * + * An empty Maple Tree. + * + * Context: Any context. + */ +static inline void mt_init(struct maple_tree *mt) +{ + mt_init_flags(mt, 0); +} + +static inline bool mt_in_rcu(struct maple_tree *mt) +{ +#ifdef CONFIG_MAPLE_RCU_DISABLED + return false; +#endif + return mt->ma_flags & MT_FLAGS_USE_RCU; +} + +/** + * mt_clear_in_rcu() - Switch the tree to non-RCU mode. + * @mt: The Maple Tree + */ +static inline void mt_clear_in_rcu(struct maple_tree *mt) +{ + if (!mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +/** + * mt_set_in_rcu() - Switch the tree to RCU safe mode. 
+ * @mt: The Maple Tree + */ +static inline void mt_set_in_rcu(struct maple_tree *mt) +{ + if (mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags |= MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags |= MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); +void *mt_find_after(struct maple_tree *mt, unsigned long *index, + unsigned long max); +void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); +void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); + +/** + * mt_for_each - Iterate over each entry starting at index until max. + * @__tree: The Maple Tree + * @__entry: The current entry + * @__index: The index to update to track the location in the tree + * @__max: The maximum limit for @index + * + * Note: Will not return the zero entry. + */ +#define mt_for_each(__tree, __entry, __index, __max) \ + for (__entry = mt_find(__tree, &(__index), __max); \ + __entry; __entry = mt_find_after(__tree, &(__index), __max)) + + +#ifdef CONFIG_DEBUG_MAPLE_TREE +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt); +void mt_validate(struct maple_tree *mt); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) +#else +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + +#endif /*_LINUX_MAPLE_TREE_H */ -- cgit v1.2.3 From d4af56c5c7c6781ca6ca8075e2cf5bc119ed33d1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:45 +0000 Subject: mm: start tracking VMAs with maple tree Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree. The maple tree is added to the mm_struct, including the mm_init struct; support is added in the required mm/mmap functions; tracking is added in kernel/fork for process forking; and the tree is used to find the unmapped_area and is checked against what the rbtree finds. This also moves the mmap_lock() in exit_mmap(), since the oom reaper does walk the VMAs; otherwise lockdep will be unhappy if oom happens. When splitting a vma fails due to a maple tree node allocation failure, the error path in __split_vma() calls new->vm_ops->close(new). The page accounting for hugetlb is actually in the close() operation, so it accounts for the removal of 1/2 of the VMA which was not adjusted. This results in a negative exit value. To avoid the negative charge, set vm_start = vm_end and vm_pgoff = 0. There is also a potential accounting issue in special mappings from insert_vm_struct() failing to allocate, so reverse the charge there in the failure scenario. Link: https://lkml.kernel.org/r/20220906194824.2110408-9-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/mm_types.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cc9ffc19e7f..896d04248e66 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2567,6 +2567,8 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); +/* mmap.c */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, @@ -2630,6 +2632,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); + static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1797813cc2c..425bc5f7d477 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,7 @@ struct kioctx_table; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ + struct maple_tree mm_mt; struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU @@ -697,6 +699,7 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */ -- cgit v1.2.3 From f39af05949a4280b9f04d5dd0f606b81aac3dae8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:46 +0000 Subject: mm: add VMA iterator This thin layer of abstraction over the maple tree state is for iterating over VMAs. You can go forwards, go backwards or ask where the iterator is. Rename the existing vma_next() to __vma_next() -- it will be removed by the end of this series. Link: https://lkml.kernel.org/r/20220906194824.2110408-10-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 21 +++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 896d04248e66..3701da1fac5f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -661,6 +661,38 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses vma_find() to get the first VMA when the iterator starts. 
+ * Calling mas_next() could skip the first entry. + */ + return vma_find(vmi, ULONG_MAX); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 425bc5f7d477..d0b51fbdf5d4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,6 +777,27 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) #endif /* CONFIG_LRU_GEN */ +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = MAS_START, \ + }, \ + } + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + vmi->mas.tree = &mm->mm_mt; + vmi->mas.index = addr; + vmi->mas.node = MAS_START; +} + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); -- cgit v1.2.3 From c9dbe82cb99db5b6029c6bc43fcf7881d3f50268 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:47 +0000 Subject: kernel/fork: use maple tree for dup_mmap() during forking The maple tree was already tracking VMAs in this function by an earlier commit, but the rbtree iterator was being used to iterate the list. Change the iterator to use a maple tree native iterator and switch to the maple tree advanced API to avoid multiple walks of the tree during insert operations. Unexport the now-unused vma_mas_store() function. For performance reasons we bulk allocate the maple tree nodes. The node calculations are done internally to the tree and use the VMA count and assume the worst-case node requirements. The VM_DONTCOPY flag does not allow for the most efficient copy method of the tree and so a bulk loading algorithm is used. Link: https://lkml.kernel.org/r/20220906194824.2110408-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3701da1fac5f..646ea4d3bd74 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2599,8 +2599,6 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* mmap.c */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, -- cgit v1.2.3 From 524e00b36e8c547f5582eef3fb645a8d9fc5e3df Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:48 +0000 Subject: mm: remove rb tree.
Remove the RB tree and start using the maple tree for vm_area_struct tracking. Drop validate_mm() calls in expand_upwards() and expand_downwards() as the lock is not held. Link: https://lkml.kernel.org/r/20220906194824.2110408-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- include/linux/mm_types.h | 14 -------------- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 646ea4d3bd74..dfce1aaa7a64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2654,8 +2654,6 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *, extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d0b51fbdf5d4..ac747273c4d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,19 +410,6 @@ struct vm_area_struct { /* linked list of VM areas per task, sorted by address */ struct vm_area_struct *vm_next, *vm_prev; - - struct rb_node vm_rb; - - /* - * Largest free memory gap in bytes to the left of this VMA. - * Either between this VMA and vma->vm_prev, or between one of the - * VMAs below us in the VMA rbtree and its ->vm_prev. This helps - * get_unmapped_area find a free area of the right size. - */ - unsigned long rb_subtree_gap; - - /* Second cache line starts here. */ - struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -488,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, -- cgit v1.2.3 From dc8635b25e87232f62276c02899b9d21dd0793c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: mm: optimize find_exact_vma() to use vma_lookup() Use vma_lookup() to walk the tree to the start value requested. If the vma at the start does not match, then the answer is NULL and there is no need to look at the next vma the way that find_vma() would. Link: https://lkml.kernel.org/r/20220906194824.2110408-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dfce1aaa7a64..a80083091f53 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2850,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; -- cgit v1.2.3 From abdba2dda0c477ca708a939b02f9b2e74666ed2d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: mm: use maple tree operations for find_vma_intersection() Move find_vma_intersection() to mmap.c and change implementation to maple tree. When searching for a vma within a range, it is easier to use the maple tree interface. Export find_vma_intersection() for the kvm module. Link: https://lkml.kernel.org/r/20220906194824.2110408-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a80083091f53..06a6b8db75b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2778,26 +2778,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/** - * find_vma_intersection() - Look up the first VMA which intersects the interval - * @mm: The process address space. - * @start_addr: The inclusive start user address. - * @end_addr: The exclusive end user address. - * - * Returns: The first VMA within the provided range, %NULL otherwise. Assumes - * start_addr < end_addr. +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - struct vm_area_struct *vma = find_vma(mm, start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address -- cgit v1.2.3 From 7964cf8caa4dfa42c4149f3833d3878713cda3dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: mm: remove vmacache By using the maple tree and the maple tree state, the vmacache is no longer beneficial and is complicating the VMA code. Remove the vmacache to reduce the work in keeping it up to date and code complexity. Link: https://lkml.kernel.org/r/20220906194824.2110408-26-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 1 - include/linux/mm_types_task.h | 12 ------------ include/linux/sched.h | 1 - include/linux/vm_event_item.h | 4 ---- include/linux/vmacache.h | 28 ---------------------------- include/linux/vmstat.h | 6 ------ 6 files changed, 52 deletions(-) delete mode 100644 include/linux/vmacache.h (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ac747273c4d6..4541b74b1bdb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -475,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125c..0bb4b6da9993 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -24,18 +24,6 @@ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -/* - * The per task VMA cache array: - */ -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - -struct vmacache { - u64 seqnum; - struct vm_area_struct *vmas[VMACACHE_SIZE]; -}; - /* * When updating this, please also update struct resident_page_types[] in * kernel/fork.c diff --git a/include/linux/sched.h b/include/linux/sched.h index a2dcfb91df03..fbac3c19fe35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,7 +861,6 @@ struct task_struct { struct mm_struct *active_mm; /* Per-thread vma caching: */ - struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f3fc36cd2276..3518dba1e02f 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,10 +129,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - VMACACHE_FIND_CALLS, - VMACACHE_FIND_HITS, -#endif #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h deleted file mode 100644 index 6fce268a4588..000000000000 --- a/include/linux/vmacache.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_VMACACHE_H -#define __LINUX_VMACACHE_H - -#include -#include - -static inline void vmacache_flush(struct task_struct *tsk) -{ - memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); -} - -extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); -extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, - unsigned long addr); - -#ifndef CONFIG_MMU -extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end); -#endif - -static inline void vmacache_invalidate(struct mm_struct *mm) -{ - mm->vmacache_seqnum++; -} - -#endif /* __LINUX_VMACACHE_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index bfe38869498d..19cf5b6892ce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ 
-125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) -- cgit v1.2.3 From d7c62295570f012e1d386ae6ed472b36baf037ad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: mm: convert vma_lookup() to use mtree_load() Unlike the rbtree, the Maple Tree will return a NULL if there's nothing at a particular address. Since the previous commit dropped the vmacache, it is now possible to consult the tree directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06a6b8db75b7..49a58807719b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2795,12 +2795,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && addr < vma->vm_start) - vma = NULL; - - return vma; + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) -- cgit v1.2.3 From 11f9a21ab65542189372b7d64bb2d2937dfdc9dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: mm/mmap: reorganize munmap to use maple states Remove __do_munmap() in favour of do_munmap(), do_mas_munmap(), and do_mas_align_munmap(). do_munmap() is a wrapper to create a maple state for any callers that have not been converted to the maple tree. do_mas_munmap() takes a maple state to munmap a range. This is just a small function which checks for error conditions and aligns the end of the range. do_mas_align_munmap() uses the aligned range to munmap a range. do_mas_align_munmap() starts with the first VMA in the range, then finds the last VMA in the range. Both start and end are split if necessary. Then the VMAs are removed from the linked list and the mm mlock count is updated at the same time. This is followed by a single tree operation that overwrites the area with a NULL entry. Finally, the detached list is unmapped and freed. By reorganizing the munmap calls as outlined, it is now possible to avoid the extra work of aligning pre-aligned callers which are known to be safe, and to avoid extra VMA lookups or tree walks for modifications. detach_vmas_to_be_unmapped() is no longer used, so drop this code. vm_brk_flags() can just call do_mas_munmap() as it checks for intersecting VMAs directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-29-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 49a58807719b..579449d6c23b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2710,8 +2710,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); -- cgit v1.2.3 From 69dbe6daf1041e32e003f966d71f70f20c63af53 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:57 +0000 Subject: userfaultfd: use maple tree iterator to iterate VMAs Don't use the mm_struct linked list or the vma->vm_next in prep for removal. Link: https://lkml.kernel.org/r/20220906194824.2110408-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1b8a915e9e9..f07e6998bb68 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -175,9 +175,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf); +extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); @@ -258,7 +257,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, +static inline int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf) { -- cgit v1.2.3 From 763ecb035029f500d7e6dc99acd1ad299b7726a1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: mm: remove the vma linked list Replace any vm_next use with vma_find(). Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the maple tree. Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap(). At the same time, alter the loop to be more compact. Now that free_pgtables() and unmap_vmas() take a maple tree as an argument, rearrange do_mas_align_munmap() to use the new tree to hold the vmas to remove. 
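As a rough sketch of the conversion pattern this series applies throughout the tree (illustrative only; handle_vma() is a hypothetical callback, and real call sites differ), a linked-list walk becomes a maple tree walk using the VMA_ITERATOR() and for_each_vma() helpers introduced earlier in this series:

	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* before: for (vma = mm->mmap; vma; vma = vma->vm_next) */
	for_each_vma(vmi, vma)
		handle_vma(vma);	/* hypothetical per-VMA work */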
Remove __vma_link_list() and __vma_unlink_list() as they are exclusively used to update the linked list. Drop linked list update from __insert_vm_struct(). Rework validation of the tree as it was depending on the linked list. [yang.lee@linux.alibaba.com: fix one kernel-doc comment] Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=1949 Link: https://lkml.kernel.org/r/20220824021918.94116-1-yang.lee@linux.alibaba.com Link: https://lkml.kernel.org/r/20220906194824.2110408-69-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Yang Li Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++-- include/linux/mm_types.h | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 579449d6c23b..37384a84f71a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1857,8 +1857,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4541b74b1bdb..5e32211cb5a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -408,8 +408,6 @@ struct vm_area_struct { unsigned long vm_end; /* The first byte after our end address within vm_mm. */ - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -473,7 +471,6 @@ struct kioctx_table; struct mm_struct { struct { - struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, @@ -488,7 +485,6 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER -- cgit v1.2.3 From bf3980c85212fc71512d27a46f5aab66f46ca284 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:30:59 -0700 Subject: mm: drop oom code from exit_mmap The primary reason to invoke the oom reaper from the exit_mmap path used to be to prevent excessive oom killing if the oom victim's exit races with the oom reaper (see [1] for more details). The invocation has moved around since then because of the interaction with the munlock logic, but the underlying reason has remained the same (see [2]). The munlock code is no longer a problem since [3], and there shouldn't be any blocking operation before the memory is unmapped by exit_mmap, so the oom reaper invocation can be dropped. The unmapping part can be done with the non-exclusive mmap_sem, and the exclusive one is only required when page tables are freed. Remove the oom_reaper from exit_mmap, which will make the code easier to read.
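The resulting locking scheme in exit_mmap() then looks roughly like this (a simplified sketch using the pre-series unmap_vmas()/free_pgtables() signatures; tlb setup and the surrounding teardown are elided):

	mmap_read_lock(mm);
	/* no blocking operations happen before the memory is unmapped */
	unmap_vmas(&tlb, vma, 0, -1);
	mmap_read_unlock(mm);

	/* freeing the page tables requires the exclusive lock */
	mmap_write_lock(mm);
	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
	mmap_write_unlock(mm);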
This is really unlikely to make any observable difference although some microbenchmarks could benefit from one less branch that needs to be evaluated even though it almost never is true. [1] 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") [2] 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") [3] a213e5cf71cb ("mm/munlock: delete munlock_vma_pages_all(), allow oomreap") [akpm@linux-foundation.org: restore Suren's mmap_read_lock() optimization] Link: https://lkml.kernel.org/r/20220531223100.510392-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Jann Horn Cc: Johannes Weiner Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/oom.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 02d1e7bbd8cd..6cdde62b078b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -106,8 +106,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) return 0; } -bool __oom_reap_task_mm(struct mm_struct *mm); - long oom_badness(struct task_struct *p, unsigned long totalpages); -- cgit v1.2.3 From b3541d912a84dc40cabb516f2deeac9ae6fa30da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:31:00 -0700 Subject: mm: delete unused MMF_OOM_VICTIM flag With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is now unused and can be removed. [akpm@linux-foundation.org: remove comment about now-removed mm_is_oom_victim()] Link: https://lkml.kernel.org/r/20220531223100.510392-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Minchan Kim Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Peter Xu Cc: John Hubbard Cc: Shuah Khan Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/oom.h | 9 --------- include/linux/sched/coredump.h | 7 +++---- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 6cdde62b078b..7d0c9c48a0c5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } -/* - * Use this helper if tsk->mm != mm and the victim mm needs a special - * handling. This is guaranteed to stay true after once set. - */ -static inline bool mm_is_oom_victim(struct mm_struct *mm) -{ - return test_bit(MMF_OOM_VICTIM, &mm->flags); -} - /* * Checks whether a page fault on the given mm is still reliable. 
* This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d0a5be28b70..8270ad7ae14c 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into @@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm) * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ -#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ -- cgit v1.2.3 From 474098edac262ae26bfab1c48445877075a31cbd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:57 +0200 Subject: mm/gup: replace FOLL_NUMA by gup_can_follow_protnone() Patch series "mm: minor cleanups around NUMA hinting". Working on some GUP cleanups (e.g., getting rid of some FOLL_ flags) and preparing for other GUP changes (getting rid of FOLL_FORCE|FOLL_WRITE for taking an R/O longterm pin), this is something I can easily send out independently. Get rid of FOLL_NUMA, allow FOLL_FORCE access to PROT_NONE mapped pages in GUP-fast, and fix up some documentation around NUMA hinting. This patch (of 3): No need for a special flag that is not even properly documented to be internal-only. Let's just factor this check out and get rid of this flag. The separate function has the nice benefit that we can centralize comments.
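For illustration, the caller side in mm/gup.c then boils down to a pattern like this (a simplified sketch of the follow-page path, not an exact hunk from this patch):

	if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
		goto no_page;	/* let a NUMA hinting fault resolve it */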
Link: https://lkml.kernel.org/r/20220825164659.89824-2-david@redhat.com Link: https://lkml.kernel.org/r/20220825164659.89824-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 37384a84f71a..eb25cae06c55 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2933,7 +2933,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ @@ -3054,6 +3053,21 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) return !PageAnonExclusive(page); } +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. + */ +static inline bool gup_can_follow_protnone(unsigned int flags) +{ + /* + * FOLL_FORCE has to be able to make progress even if the VMA is + * inaccessible. Further, FOLL_FORCE access usually does not represent + * application behaviour and we should avoid triggering NUMA hinting + * faults. + */ + return flags & FOLL_FORCE; +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -- cgit v1.2.3 From 7014887a01587d8c50871d5985cd572ca08b29c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:59 +0200 Subject: mm: fixup documentation regarding pte_numa() and PROT_NUMA pte_numa() no longer exists -- replaced by pte_protnone() -- and PROT_NUMA probably never existed: MM_CP_PROT_NUMA also ends up using PROT_NONE. Let's fixup the doc. Link: https://lkml.kernel.org/r/20220825164659.89824-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e32211cb5a9..26573ba485f3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -614,22 +614,22 @@ struct mm_struct { #endif #ifdef CONFIG_NUMA_BALANCING /* - * numa_next_scan is the next time that the PTEs will be marked - * pte_numa. NUMA hinting faults will gather statistics and - * migrate pages to new nodes if necessary. + * numa_next_scan is the next time that PTEs will be remapped + * PROT_NONE to trigger NUMA hinting faults; such faults gather + * statistics and migrate pages to new nodes if necessary. */ unsigned long numa_next_scan; - /* Restart point for scanning and setting pte_numa */ + /* Restart point for scanning and remapping PTEs. 
*/ unsigned long numa_scan_offset; - /* numa_scan_seq prevents two threads setting pte_numa */ + /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when - * moving a PROT_NONE or PROT_NUMA mapped page. + * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -- cgit v1.2.3 From 974f4367dd315acc15ad4a6453f8304aea60dfbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 23 Aug 2022 11:22:30 +0200 Subject: mm: reduce noise in show_mem for lowmem allocations While discussing early DMA pool pre-allocation failure with Christoph [1] I have realized that the allocation failure warning is rather noisy for constrained allocations like GFP_DMA{32}. Those zones are often not populated on all nodes, as their memory ranges are constrained. This is an attempt to reduce the ballast that doesn't provide any relevant information for investigating those allocation failures. Please note that I have only compile tested it (in my default config setup) and I am throwing it out mostly to see what people think about it. [1] http://lkml.kernel.org/r/20220817060647.1032426-1-hch@lst.de [mhocko@suse.com: update] Link: https://lkml.kernel.org/r/Yw29bmJTIkKogTiW@dhcp22.suse.cz [mhocko@suse.com: fix build] [akpm@linux-foundation.org: fix it for mapletree] [akpm@linux-foundation.org: update it for Michal's update] [mhocko@suse.com: fix arch/powerpc/xmon/xmon.c] Link: https://lkml.kernel.org/r/Ywh3C4dKB9B93jIy@dhcp22.suse.cz [akpm@linux-foundation.org: fix arch/sparc/kernel/setup_32.c] Link: https://lkml.kernel.org/r/YwScVmVofIZkopkF@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb25cae06c55..e56dd8f7eae1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1838,7 +1838,11 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) +{ + __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); +} #ifdef CONFIG_MMU extern bool can_do_mlock(void); @@ -2578,7 +2582,12 @@ extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags, nodemask_t *nodemask); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +{ + __show_mem(flags, nodemask, MAX_NR_ZONES - 1); +} extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -- cgit v1.2.3 From e6ad640bc404eb298dd1880113131768ddf5c6a8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 26 Aug 2022 23:06:42 +0000 Subject: mm: deduplicate cacheline padding code There are three users (mmzone.h, memcontrol.h, page_counter.h) using
similar code for forcing cacheline padding between fields of different structures. Dedup that code. Link: https://lkml.kernel.org/r/20220826230642.566725-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Feng Tang Reviewed-by: Feng Tang Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/cache.h | 13 +++++++++++++ include/linux/memcontrol.h | 13 ++----------- include/linux/mmzone.h | 24 +++++------------------- include/linux/page_counter.h | 13 ++----------- 4 files changed, 22 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cache.h b/include/linux/cache.h index d742c57eaee5..5da1bbd96154 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,4 +85,17 @@ #define cache_line_size() L1_CACHE_BYTES #endif +/* + * Helper to add padding within a struct to ensure data fall into separate + * cachelines. + */ +#if defined(CONFIG_SMP) +struct cacheline_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define CACHELINE_PADDING(name) struct cacheline_padding name +#else +#define CACHELINE_PADDING(name) +#endif + #endif /* __LINUX_CACHE_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 344022f102c2..60545e4a1c03 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -185,15 +185,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -#if defined(CONFIG_SMP) -struct memcg_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define MEMCG_PADDING(name) struct memcg_padding name -#else -#define MEMCG_PADDING(name) -#endif - /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss @@ -304,7 +295,7 @@ struct mem_cgroup { spinlock_t move_lock; unsigned long move_lock_flags; - MEMCG_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* memory.stat */ struct memcg_vmstats vmstats; @@ -326,7 +317,7 @@ struct mem_cgroup { struct list_head objcg_list; #endif - MEMCG_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); /* * set > 0 if pages under this cgroup are moving to other cgroup. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e335a492c2eb..c69c08156822 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -121,20 +121,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) struct pglist_data; -/* - * Add a wild amount of padding here to ensure data fall into separate - * cachelines. There are very few zone structures in the machine, so space - * consumption is not a concern here. - */ -#if defined(CONFIG_SMP) -struct zone_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define ZONE_PADDING(name) struct zone_padding name; -#else -#define ZONE_PADDING(name) -#endif - #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ @@ -837,7 +823,7 @@ struct zone { int initialized; /* Write-intensive fields used from the page allocator */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; @@ -849,7 +835,7 @@ struct zone { spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. 
*/ - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken @@ -886,7 +872,7 @@ struct zone { bool contiguous; - ZONE_PADDING(_pad3_) + CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; @@ -1196,7 +1182,7 @@ typedef struct pglist_data { #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -1241,7 +1227,7 @@ typedef struct pglist_data { struct lru_gen_mm_walk mm_walk; #endif - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 78a1c934e416..c141ea9a95ef 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,22 +7,13 @@ #include #include -#if defined(CONFIG_SMP) -struct pc_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define PC_PADDING(name) struct pc_padding name -#else -#define PC_PADDING(name) -#endif - struct page_counter { /* * Make sure 'usage' does not share cacheline with any other field. The * memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; - PC_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -38,7 +29,7 @@ struct page_counter { unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ - PC_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); unsigned long min; unsigned long low; -- cgit v1.2.3 From cb4df4cae4f2bd8cf7a32eff81178fce31600f7c Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 30 Aug 2022 14:38:38 +0000 Subject: ksm: count allocated ksm rmap_items for each process Patch series "ksm: count allocated rmap_items and update documentation", v5. KSM can save memory by merging identical pages, but can also consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. To help determine how beneficial the ksm-policy (like madvise) they are using is, we add a new interface /proc/<pid>/ksm_stat for each process. The value "ksm_rmap_items" in it indicates the total allocated ksm rmap_items of this process. The detailed description can be seen in the following patches' commit message. This patch (of 2): KSM can save memory by merging identical pages, but can also consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. Some of these pages may be merged, but some may not be able to be merged even after being checked several times; the memory consumed for those is unprofitable. Whether KSM saves or consumes memory system-wide can be determined by a comprehensive calculation over pages_sharing, pages_shared, pages_unshared and pages_volatile. A simple approximate calculation: profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * sizeof(rmap_item); where all_rmap_items equals the sum of pages_sharing, pages_shared, pages_unshared and pages_volatile. But we cannot calculate this kind of ksm profit for a single process, because the number of ksm rmap_items allocated by a process is not available.
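To make the scale concrete (illustrative numbers only, assuming 4 KiB pages and a rmap_item of roughly 64 bytes on 64-bit): with pages_sharing = 1000 and all_rmap_items = 1000, profit =~ 1000 * 4096 - 1000 * 64 = 4032000 bytes, i.e. about 3.8 MiB saved; the profit turns negative when most rmap_items guard pages that never end up merged.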
If user applications can obtain this kind of information, it helps them know how beneficial the ksm-policy (like madvise) they are using is, and then optimize their application code. For example, if one application madvises 1000 pages as MERGEABLE while only a few pages are really merged, then it's not cost-efficient. So we add a new interface /proc/<pid>/ksm_stat for each process, in which only the value of ksm_rmap_items is shown for now, so that more values can be added in the future. So similarly, we can calculate the ksm profit approximately for a single process by: profit =~ ksm_merging_pages * sizeof(page) - ksm_rmap_items * sizeof(rmap_item); where ksm_merging_pages is shown at /proc/<pid>/ksm_merging_pages, and ksm_rmap_items is shown in /proc/<pid>/ksm_stat. Link: https://lkml.kernel.org/r/20220830143731.299702-1-xu.xin16@zte.com.cn Link: https://lkml.kernel.org/r/20220830143838.299758-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: CGEL ZTE Cc: Alexey Dobriyan Cc: Bagas Sanjaya Cc: Hugh Dickins Cc: Izik Eidus Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26573ba485f3..8f30f262431c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -654,6 +654,11 @@ struct mm_struct { * merging. */ unsigned long ksm_merging_pages; + /* + * Represent how many pages are checked for ksm merging + * including merged and not merged. + */ + unsigned long ksm_rmap_items; #endif #ifdef CONFIG_LRU_GEN struct { -- cgit v1.2.3 From bf7a87f1075f67c286f794519f0fedfa8b0b18cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:35 +0200 Subject: kprobes: Add new KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag Add the KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag to indicate that the attach address is on a function entry. This is used in the following changes to the get_func_ip helper to return the correct function address. Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 55041d2f884d..a0b92be98984 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -103,6 +103,7 @@ struct kprobe { * this flag is only for optimized_kprobe. */ #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ +#define KPROBE_FLAG_ON_FUNC_ENTRY 16 /* probe is on the function entry */ /* Has this kprobe gone ? */ static inline bool kprobe_gone(struct kprobe *p) -- cgit v1.2.3 From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:38 -0700 Subject: bpf: use bpf_prog_pack for bpf_dispatcher Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher can share pages with bpf programs. arch_prepare_bpf_dispatcher() is updated to provide an RW buffer as working area for arch code to write to. This also fixes CPA W^X warning like: CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ...
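For arch implementations, the contract of the new signature is roughly the following (a sketch only; emit_dispatcher_code() is a hypothetical emitter, and the x86 JIT is the real reference): instructions are written through the writable buffer, while branch offsets are computed against the final read-only+executable image address.

	int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
	{
		/* emit into 'buf' (RW working area); 'image' is only the
		 * target address used for offset calculations, since it is
		 * mapped RO+X */
		return emit_dispatcher_code(buf, image, funcs, num_funcs);
	}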
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/linux/filter.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edd43edb27d6..9ae155c75014 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -946,6 +946,7 @@ struct bpf_dispatcher { struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; + void *rw_image; u32 image_off; struct bpf_ksym ksym; }; @@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 98e28126c24b..efc42a6e3aed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_prog_pack_free(struct bpf_binary_header *hdr); + static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || -- cgit v1.2.3 From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:39 -0700 Subject: bpf: Enforce W^X for bpf trampoline Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that the trampoline follows the W^X rule strictly. This will turn off warnings like CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Also remove bpf_jit_alloc_exec_page(), since it is not used any more. Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9ae155c75014..5161fac0513f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs.
*/ -void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); -- cgit v1.2.3 From 1c32a8012b7fabe469b6af826edfd4ae2a6201d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Sep 2022 15:38:58 +0200 Subject: nvme: improve the NVME_CONNECT_AUTHREQ* definitions Mark them as unsigned so that we don't need extra casts, and define them relative to cdword0 instead of requiring extra shifts. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ae53d74f3696..050d7d0cd81b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1482,8 +1482,8 @@ struct nvmf_connect_command { }; enum { - NVME_CONNECT_AUTHREQ_ASCR = (1 << 2), - NVME_CONNECT_AUTHREQ_ATR = (1 << 1), + NVME_CONNECT_AUTHREQ_ASCR = (1U << 18), + NVME_CONNECT_AUTHREQ_ATR = (1U << 17), }; struct nvmf_connect_data { -- cgit v1.2.3 From f4dc7fffa9873db50ec25624572f8217a6225de8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 16 Sep 2022 14:03:06 +0200 Subject: efi: libstub: unify initrd loading between architectures Use an EFI configuration table to pass the initrd to the core kernel, instead of per-arch methods. This cleans up the code considerably, and should make it easier for architectures to get rid of their reliance on DT for doing EFI boot in the future. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index f1b3e0d1b3fa..778ddb22f7da 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1330,6 +1330,11 @@ struct linux_efi_coco_secret_area { u64 size; }; +struct linux_efi_initrd { + unsigned long base; + unsigned long size; +}; + /* Header of a populated EFI secret area */ #define EFI_SECRET_TABLE_HEADER_GUID EFI_GUID(0x1e74f542, 0x71dd, 0x4d66, 0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b) -- cgit v1.2.3 From 171539f5a90e3fdf7d17f5396fac79d7e44ad68e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 15 Sep 2022 23:20:06 +0200 Subject: efi: libstub: install boot-time memory map as config table Expose the EFI boot time memory map to the kernel via a configuration table. This is arch agnostic and enables future changes that remove the dependency on DT on architectures that don't otherwise rely on it.
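Consumers can then locate the map like any other Linux-specific EFI table (a minimal sketch modeled on the stub's get_efi_config_table() helper; it assumes native word size and elides error handling):

	efi_config_table_t *tbl = (efi_config_table_t *)systab->tables;
	unsigned long i;

	for (i = 0; i < systab->nr_tables; i++)
		if (!efi_guidcmp(tbl[i].guid, LINUX_EFI_BOOT_MEMMAP_GUID))
			return (void *)tbl[i].table;	/* the boot memory map */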
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 778ddb22f7da..252b9b328577 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -410,6 +410,7 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +#define LINUX_EFI_BOOT_MEMMAP_GUID EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) #define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) -- cgit v1.2.3 From 3c6edd9034240ce9582be3392112321336bd25bb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 22 Sep 2022 12:03:52 +0200 Subject: efi: zboot: create MemoryMapped() device path for the parent if needed LoadImage() is supposed to install an instance of the protocol EFI_LOADED_IMAGE_DEVICE_PATH_PROTOCOL onto the loaded image's handle so that the program can figure out where it was loaded from. The reference implementation even does this (with a NULL protocol pointer) if the call to LoadImage() used the source buffer and size arguments, and passed NULL for the image device path. Hand rolled implementations of LoadImage may behave differently, though, and so it is better to tolerate situations where the protocol is missing. And actually, concatenating an Offset() node to a NULL device path (as we do currently) is not great either. So in cases where the protocol is absent, or when it points to NULL, construct a MemoryMapped() device node as the base node that describes the parent image's footprint in memory. Cc: Daan De Meyer Cc: Jeremy Linton Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 89f16ec3ebab..da3974bf05d3 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1004,6 +1004,13 @@ struct efi_rel_offset_dev_path { u64 ending_offset; } __packed; +struct efi_mem_mapped_dev_path { + struct efi_generic_dev_path header; + u32 memory_type; + u64 starting_addr; + u64 ending_addr; +} __packed; + struct efi_dev_path { union { struct efi_generic_dev_path header; -- cgit v1.2.3 From 4bf207d7a54d49637da94dbc00d2c025b74764d1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 1 Sep 2022 11:20:53 -0300 Subject: net/mlx5: Add IFC bits for mkey ATS Allows telling a mkey to use PCI ATS for DMA that flows through it. 
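On the driver side, consuming the new bits might look like this (an illustrative sketch only; whether to enable ATS for a given mkey remains driver policy):

	/* only set the mkey context field when the device reports support */
	if (MLX5_CAP_GEN(mdev, ats))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);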
Link: https://lore.kernel.org/r/1-v1-bd147097458e+ede-umem_dmabuf_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..92602e33a82c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1707,7 +1707,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 steering_format_version[0x4]; u8 create_qp_start_hint[0x18]; - u8 reserved_at_460[0x3]; + u8 reserved_at_460[0x1]; + u8 ats[0x1]; + u8 reserved_at_462[0x1]; u8 log_max_uctx[0x5]; u8 reserved_at_468[0x2]; u8 ipsec_offload[0x1]; @@ -3873,7 +3875,9 @@ struct mlx5_ifc_mkc_bits { u8 lw[0x1]; u8 lr[0x1]; u8 access_mode_1_0[0x2]; - u8 reserved_at_18[0x8]; + u8 reserved_at_18[0x2]; + u8 ma_translation_mode[0x2]; + u8 reserved_at_1c[0x4]; u8 qpn[0x18]; u8 mkey_7_0[0x8]; @@ -11134,7 +11138,8 @@ struct mlx5_ifc_dealloc_memic_out_bits { struct mlx5_ifc_umem_bits { u8 reserved_at_0[0x80]; - u8 reserved_at_80[0x1b]; + u8 ats[0x1]; + u8 reserved_at_81[0x1a]; u8 log_page_size[0x5]; u8 page_offset[0x20]; -- cgit v1.2.3 From 987f20a9dcce3989e48d87cff3952c095c994445 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Sep 2022 17:15:32 -0500 Subject: a.out: Remove the a.out implementation In commit 19e8b701e258 ("a.out: Stop building a.out/osf1 support on alpha and m68k") the last users of a.out were disabled. As nothing has turned up to cause this change to be reverted, let's remove the code implementing a.out support as well. There may be userspace users of the uapi bits left, so the uapi headers have been left untouched. Signed-off-by: "Eric W. Biederman" Acked-by: Arnd Bergmann # arm defconfigs Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/871qrx3hq3.fsf@email.froward.int.ebiederm.org --- include/linux/a.out.h | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 include/linux/a.out.h (limited to 'include/linux') diff --git a/include/linux/a.out.h b/include/linux/a.out.h deleted file mode 100644 index 600cf45645c6..000000000000 --- a/include/linux/a.out.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __A_OUT_GNU_H__ -#define __A_OUT_GNU_H__ - -#include - -#ifndef __ASSEMBLY__ -#ifdef linux -#include -#if defined(__i386__) || defined(__mc68000__) -#else -#ifndef SEGMENT_SIZE -#define SEGMENT_SIZE PAGE_SIZE -#endif -#endif -#endif -#endif /*__ASSEMBLY__ */ -#endif /* __A_OUT_GNU_H__ */ -- cgit v1.2.3 From 568ec936bf1384fc15873908c96a9aeb62536edb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 09:58:15 +0200 Subject: block: replace blk_queue_nowait with bdev_nowait Replace blk_queue_nowait with a bdev_nowait helper that takes the block_device, given that the I/O submission path should not have to look into the request_queue.
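A typical caller-side conversion in the submission path then reads (an illustrative sketch; obtaining 'bdev' from the iocb is elided):

	/* before: blk_queue_nowait(bdev_get_queue(bdev)) */
	if ((iocb->ki_flags & IOCB_NOWAIT) && !bdev_nowait(bdev))
		return -EOPNOTSUPP;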
Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220927075815.269694-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..4750772ef228 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,7 +618,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) -#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); @@ -1280,6 +1279,11 @@ static inline bool bdev_fua(struct block_device *bdev) return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); } +static inline bool bdev_nowait(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 644c4229559257cadc4267fc36c2dc22ee9c040f Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 21 Sep 2022 02:44:58 +0200 Subject: clk: qcom: smd: Add SM6375 clocks Add support for controlling SMD RPM clocks on SM6375. Signed-off-by: Konrad Dybcio Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220921004458.151842-3-konrad.dybcio@somainline.org --- include/linux/soc/qcom/smd-rpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index 82c9d489833a..3ab8c07f71c0 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -41,6 +41,7 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_HWKM_CLK 0x6d6b7768 #define QCOM_SMD_RPM_PKA_CLK 0x616b70 #define QCOM_SMD_RPM_MCFG_CLK 0x6766636d +#define QCOM_SMD_RPM_MMXI_CLK 0x69786d6d int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm, int state, -- cgit v1.2.3 From 3008119a3dd8052b7e5c07488e20a7abb0d287f2 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 23 Sep 2022 17:00:12 +0800 Subject: ftrace: Remove obsoleted code from ftrace and task_struct The trace of "struct task_struct" was no longer used since commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do"), and the functions about flags for current->trace is useless, so remove them. 
Link: https://lkml.kernel.org/r/20220923090012.505990-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 41 ----------------------------------------- include/linux/sched.h | 3 --- 2 files changed, 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b61371e287b..62557d4bffc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1122,47 +1122,6 @@ static inline void unpause_graph_tracing(void) { } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_TRACING - -/* flags for current->trace */ -enum { - TSK_TRACE_FL_TRACE_BIT = 0, - TSK_TRACE_FL_GRAPH_BIT = 1, -}; -enum { - TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, - TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, -}; - -static inline void set_tsk_trace_trace(struct task_struct *tsk) -{ - set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); -} - -static inline void clear_tsk_trace_trace(struct task_struct *tsk) -{ - clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); -} - -static inline int test_tsk_trace_trace(struct task_struct *tsk) -{ - return tsk->trace & TSK_TRACE_FL_TRACE; -} - -static inline void set_tsk_trace_graph(struct task_struct *tsk) -{ - set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); -} - -static inline void clear_tsk_trace_graph(struct task_struct *tsk) -{ - clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); -} - -static inline int test_tsk_trace_graph(struct task_struct *tsk) -{ - return tsk->trace & TSK_TRACE_FL_GRAPH; -} - enum ftrace_dump_mode; extern enum ftrace_dump_mode ftrace_dump_on_oops; diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..c7ee04e9147a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1381,9 +1381,6 @@ struct task_struct { #endif #ifdef CONFIG_TRACING - /* State flags for use by tracers: */ - unsigned long trace; - /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -- cgit v1.2.3 From 976a859c9c68fdf160379bfc154431489f318292 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 7 Sep 2022 16:36:23 -0700 Subject: net/mlx5: Expose NPPS related registers Add management capability bits indicating firmware may support N pulses per second. Add corresponding fields in MTPPS register. 
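A consumer might read the new capability fields from an MTPPS query roughly as follows; mlx5_core_access_reg() and the IFC accessors are the existing interfaces, while the buffer handling, names, and the interpretation of the field are sketch-only assumptions:

	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {};
	u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {};
	int err;

	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
				   MLX5_REG_MTPPS, 0, 0);
	if (!err) {
		u8 log_min_period = MLX5_GET(mtpps_reg, out,
					     cap_log_min_npps_period);
		/* minimal NPPS period derived from log_min_period
		 * (unit and encoding assumed for this sketch) */
	}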
Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 06eab92b9fb3..e929b0dcd985 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9792,7 +9792,9 @@ struct mlx5_ifc_pcam_reg_bits { struct mlx5_ifc_mcam_enhanced_features_bits { u8 reserved_at_0[0x5d]; u8 mcia_32dwords[0x1]; - u8 reserved_at_5e[0xc]; + u8 out_pulse_duration_ns[0x1]; + u8 npps_period[0x1]; + u8 reserved_at_60[0xa]; u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; @@ -10292,7 +10294,12 @@ struct mlx5_ifc_mtpps_reg_bits { u8 reserved_at_18[0x4]; u8 cap_max_num_of_pps_out_pins[0x4]; - u8 reserved_at_20[0x24]; + u8 reserved_at_20[0x13]; + u8 cap_log_min_npps_period[0x5]; + u8 reserved_at_38[0x3]; + u8 cap_log_min_out_pulse_duration_ns[0x5]; + + u8 reserved_at_40[0x4]; u8 cap_pin_3_mode[0x4]; u8 reserved_at_48[0x4]; u8 cap_pin_2_mode[0x4]; @@ -10311,7 +10318,9 @@ struct mlx5_ifc_mtpps_reg_bits { u8 cap_pin_4_mode[0x4]; u8 field_select[0x20]; - u8 reserved_at_a0[0x60]; + u8 reserved_at_a0[0x20]; + + u8 npps_period[0x40]; u8 enable[0x1]; u8 reserved_at_101[0xb]; @@ -10320,7 +10329,8 @@ struct mlx5_ifc_mtpps_reg_bits { u8 pin_mode[0x4]; u8 pin[0x8]; - u8 reserved_at_120[0x20]; + u8 reserved_at_120[0x2]; + u8 out_pulse_duration_ns[0x1e]; u8 time_stamp[0x40]; -- cgit v1.2.3 From f0462bc3e9024e9738fcd1a026456238317d84c7 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 7 Sep 2022 16:36:24 -0700 Subject: net/mlx5: Add support for NPPS with real time mode Add support for setting NPPS. NPPS is currently available in REAL_TIME_CLOCK mode only. In addition allow the user to set the pulse duration. When NPPS pulse duration is not set explicitly by the user, driver set it to 50% of the NPPS period. Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 954af9cafb1d..481506376344 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -698,6 +698,8 @@ struct mlx5_pps { struct work_struct out_work; u64 start[MAX_PIN_NUM]; u8 enabled; + u64 min_npps_period; + u64 min_out_pulse_duration_ns; }; struct mlx5_timer { -- cgit v1.2.3 From 8d1ac895fff96a228db20db92243e93687659ef7 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Wed, 7 Sep 2022 16:36:25 -0700 Subject: net/mlx5: add IFC bits for bypassing port select flow table port_select_flow_table_bypass - When set, device supports bypass port select flow table. active_port - Bitmask indicates the current active ports in PORT_SELECT_FT LAG. MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION - op_mod to operate PORT_SELECTION_Capabilities. 
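For illustration, a LAG-side check of the new bit could look like the following; MLX5_CAP_PORT_SELECTION() is assumed to be the accessor for this capability section, and the flag it feeds is hypothetical:

	/* Sketch: only let explicit tx affinity coexist with the port
	 * select flow table when the device can bypass that table.
	 */
	if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass))
		allow_explicit_affinity = false;	/* hypothetical flag */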
Signed-off-by: Liu, Changcheng Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e929b0dcd985..b7f4e93df0f2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -68,6 +68,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, + MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, }; enum { @@ -814,7 +815,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits { struct mlx5_ifc_port_selection_cap_bits { u8 reserved_at_0[0x10]; u8 port_select_flow_table[0x1]; - u8 reserved_at_11[0xf]; + u8 reserved_at_11[0x1]; + u8 port_select_flow_table_bypass[0x1]; + u8 reserved_at_13[0xd]; u8 reserved_at_20[0x1e0]; @@ -10933,7 +10936,9 @@ struct mlx5_ifc_lagc_bits { u8 reserved_at_18[0x5]; u8 lag_state[0x3]; - u8 reserved_at_20[0x14]; + u8 reserved_at_20[0xc]; + u8 active_port[0x4]; + u8 reserved_at_30[0x4]; u8 tx_remap_affinity_2[0x4]; u8 reserved_at_38[0x4]; u8 tx_remap_affinity_1[0x4]; -- cgit v1.2.3 From a83bb5df2ac604ab418fbe0a8720f55de46652eb Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Wed, 7 Sep 2022 16:36:26 -0700 Subject: RDMA/mlx5: Don't set tx affinity when lag is in hash mode In hash mode, without setting tx affinity explicitly, the port select flow table decides which port is used for the traffic. If port_select_flow_table_bypass capability is supported and tx affinity is set explicitly for QP/TIS, they will be added into the explicit affinity table in FW to check which port is used for the traffic. 1. The overloaded explicit affinity table may affect performance. To avoid this, do not set tx affinity explicitly by default. 2. The packets of the same flow need to be transmitted on the same port. Because the packets of the same flow use different QPs in slow & fast path, it shouldn't set tx affinity explicitly for these QPs. 
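The intended consumer pattern is therefore roughly the following sketch; the wrapper function is hypothetical, the two predicates are the exported mlx5 helpers:

	static bool example_use_explicit_tx_affinity(struct mlx5_core_dev *mdev)
	{
		/* in hash mode the port select flow table picks the port,
		 * so do not pin QPs/TISes to a port explicitly */
		return mlx5_lag_is_active(mdev) && !mlx5_lag_mode_is_hash(mdev);
	}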
Signed-off-by: Liu, Changcheng Reviewed-by: Mark Bloch Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 481506376344..9114caceb2b5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1153,6 +1153,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 66af4fe37119dbf6a82078949a82e625155777ef Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Sep 2022 16:36:30 -0700 Subject: net/mlx5: Remove unused functions Remove functions which are no longer used in the driver: mlx5e_ipsec_is_tx_flow mlx5_health_flush get_cqe_enhanced_num_mini_cqes get_cqe_l3_hdr_type mlx5_health_flush mlx5_fs_is_ipsec_flow _mlx5_fs_is_outer_ipproto_flow mlx5_fs_is_outer_tcp_flow mlx5_fs_is_outer_udp_flow mlx5_fs_is_vxlan_flow mlx5_fs_is_outer_ipsec_flow Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 11 ---------- include/linux/mlx5/driver.h | 1 - include/linux/mlx5/fs_helpers.h | 48 ----------------------------------------- 3 files changed, 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5b41b9fb3d48..753753f3ce12 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -874,12 +874,6 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } -static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) -{ - /* num_of_mini_cqes is zero based */ - return get_cqe_opcode(cqe) + 1; -} - static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; @@ -890,11 +884,6 @@ static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe) return (cqe->l4_l3_hdr_type >> 4) & 0x7; } -static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) -{ - return (cqe->l4_l3_hdr_type >> 2) & 0x3; -} - static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { return cqe->tls_outer_l3_tunneled & 0x1; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9114caceb2b5..52f34fe47049 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1018,7 +1018,6 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); -void mlx5_health_flush(struct mlx5_core_dev *dev); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/fs_helpers.h b/include/linux/mlx5/fs_helpers.h index 9db21cd0e92c..bc5125bc0561 100644 --- a/include/linux/mlx5/fs_helpers.h +++ b/include/linux/mlx5/fs_helpers.h @@ -38,46 +38,6 @@ #define MLX5_FS_IPV4_VERSION 4 #define MLX5_FS_IPV6_VERSION 6 -static inline bool mlx5_fs_is_ipsec_flow(const u32 *match_c) -{ - void *misc_params_c = 
MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); -} - -static inline bool _mlx5_fs_is_outer_ipproto_flow(const u32 *match_c, - const u32 *match_v, u8 match) -{ - const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, - outer_headers); - const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, - outer_headers); - - return MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_protocol) == 0xff && - MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol) == match; -} - -static inline bool mlx5_fs_is_outer_tcp_flow(const u32 *match_c, - const u32 *match_v) -{ - return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_TCP); -} - -static inline bool mlx5_fs_is_outer_udp_flow(const u32 *match_c, - const u32 *match_v) -{ - return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_UDP); -} - -static inline bool mlx5_fs_is_vxlan_flow(const u32 *match_c) -{ - void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, vxlan_vni); -} - static inline bool _mlx5_fs_is_outer_ipv_flow(struct mlx5_core_dev *mdev, const u32 *match_c, const u32 *match_v, int version) @@ -131,12 +91,4 @@ mlx5_fs_is_outer_ipv6_flow(struct mlx5_core_dev *mdev, const u32 *match_c, MLX5_FS_IPV6_VERSION); } -static inline bool mlx5_fs_is_outer_ipsec_flow(const u32 *match_c) -{ - void *misc_params_c = - MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); -} - #endif -- cgit v1.2.3 From b53ff37fcd5c7038d9dd000dcf682575a8f47999 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Sep 2022 16:36:31 -0700 Subject: net/mlx5: Remove unused structs Remove structs which are no longer used in the driver: mlx5dr_cmd_qp_create_attr mlx5_fs_dr_ns mlx5_pas Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 52f34fe47049..949feb7f889f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -857,11 +857,6 @@ struct mlx5_cmd_work_ent { refcount_t refcnt; }; -struct mlx5_pas { - u64 pa; - u8 log_sz; -}; - enum phy_port_state { MLX5_AAA_111 }; -- cgit v1.2.3 From 9175d8103780084e70bc89f2040ea62dc30f6f60 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 7 Sep 2022 16:36:32 -0700 Subject: net/mlx5: Remove from FPGA IFC file not-needed definitions Move IP layout bits definitions to be close to the place that actually uses it, together with removal extra defines that not in-use. 
Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 16 ++++++++++++++++ include/linux/mlx5/mlx5_ifc_fpga.h | 24 ------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b7f4e93df0f2..c25f2ce75734 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -478,6 +478,22 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits { u8 reserved_at_6[0x1a]; }; +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_at_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_at_0[0x80]; +}; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 45c7c0d67635..0596472923ad 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -32,30 +32,6 @@ #ifndef MLX5_IFC_FPGA_H #define MLX5_IFC_FPGA_H -struct mlx5_ifc_ipv4_layout_bits { - u8 reserved_at_0[0x60]; - - u8 ipv4[0x20]; -}; - -struct mlx5_ifc_ipv6_layout_bits { - u8 ipv6[16][0x8]; -}; - -union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { - struct mlx5_ifc_ipv6_layout_bits ipv6_layout; - struct mlx5_ifc_ipv4_layout_bits ipv4_layout; - u8 reserved_at_0[0x80]; -}; - -enum { - MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX = 0x2c9, -}; - -enum { - MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, -}; - struct mlx5_ifc_fpga_shell_caps_bits { u8 max_num_qps[0x10]; u8 reserved_at_10[0x8]; -- cgit v1.2.3 From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- include/linux/perf_event.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. 
*/ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* -- cgit v1.2.3 From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could check with the NULL, follow the same rule with other fields. Remove the raw field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- include/linux/perf_event.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { -- cgit v1.2.3 From b8a94bfb33952bb17fbc65f8903d242a721c533d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 5 Apr 2021 05:03:50 +0200 Subject: kallsyms: increase maximum kernel symbol length to 512 Rust symbols can become quite long due to namespacing introduced by modules, types, traits, generics, etc. For instance, the following code: pub mod my_module { pub struct MyType; pub struct MyGenericType(T); pub trait MyTrait { fn my_method() -> u32; } impl MyTrait for MyGenericType { fn my_method() -> u32 { 42 } } } generates a symbol of length 96 when using the upcoming v0 mangling scheme: _RNvXNtCshGpAVYOtgW1_7example9my_moduleINtB2_13MyGenericTypeNtB2_6MyTypeENtB2_7MyTrait9my_method At the moment, Rust symbols may reach up to 300 in length. Setting 512 as the maximum seems like a reasonable choice to keep some headroom. 
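Code that stack-allocates name buffers keeps working as long as it sizes them with the macro, e.g. this illustrative use of the existing lookup API:

	char namebuf[KSYM_NAME_LEN];
	unsigned long size, offset;
	char *modname;
	const char *name;

	/* namebuf must hold the longest possible (Rust-mangled) symbol */
	name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);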
Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Signed-off-by: Miguel Ojeda --- include/linux/kallsyms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ad39636e0c3f..649faac31ddb 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -15,7 +15,7 @@ #include -#define KSYM_NAME_LEN 128 +#define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ -- cgit v1.2.3 From 2f7ab1267dc9b2d1f29695aff3211c87483480f3 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 16:42:57 +0200 Subject: Kbuild: add Rust support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having most of the new files in place, we now enable Rust support in the build system, including `Kconfig` entries related to Rust, the Rust configuration printer and a few other bits. Reviewed-by: Kees Cook Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reviewed-by: Greg Kroah-Hartman Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Michael Ellerman Signed-off-by: Michael Ellerman Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Co-developed-by: Douglas Su Signed-off-by: Douglas Su Co-developed-by: Dariusz Sosnowski Signed-off-by: Dariusz Sosnowski Co-developed-by: Antonio Terceiro Signed-off-by: Antonio Terceiro Co-developed-by: Daniel Xu Signed-off-by: Daniel Xu Co-developed-by: Björn Roy Baron Signed-off-by: Björn Roy Baron Co-developed-by: Martin Rodriguez Reboredo Signed-off-by: Martin Rodriguez Reboredo Signed-off-by: Miguel Ojeda --- include/linux/compiler_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..50b3f6b9502e 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -4,8 +4,12 @@ #ifndef __ASSEMBLY__ +/* + * Skipped when running bindgen due to a libclang issue; + * see https://github.com/rust-lang/rust-bindgen/issues/2244. + */ #if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ - __has_attribute(btf_type_tag) + __has_attribute(btf_type_tag) && !defined(__BINDGEN__) # define BTF_TYPE_TAG(value) __attribute__((btf_type_tag(#value))) #else # define BTF_TYPE_TAG(value) /* nothing */ -- cgit v1.2.3 From 5aec788aeb8eb74282b75ac1b317beb0fbb69a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2022 21:02:34 +0200 Subject: sched: Fix TASK_state comparisons Task state is fundamentally a bitmask; direct comparisons are probably not working as intended. 
Specifically the normal wait-state have a number of possible modifiers: TASK_UNINTERRUPTIBLE: TASK_WAKEKILL, TASK_NOLOAD, TASK_FREEZABLE TASK_INTERRUPTIBLE: TASK_FREEZABLE Specifically, the addition of TASK_FREEZABLE wrecked __wait_is_interruptible(). This however led to an audit of direct comparisons yielding the rest of the changes. Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Reported-by: Christian Borntraeger Debugged-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Tested-by: Christian Borntraeger --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 14ad8a0e9fac..7f5a51aae0a7 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -281,7 +281,7 @@ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ - state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ + (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); -- cgit v1.2.3 From 99df7a2810b6d24651d4887ab61a142e042fb235 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 26 Sep 2022 08:16:42 -0500 Subject: powerpc/pseries: block untrusted device tree changes when locked down The /proc/powerpc/ofdt interface allows the root user to freely alter the in-kernel device tree, enabling arbitrary physical address writes via drivers that could bind to malicious device nodes, thus making it possible to disable lockdown. Historically this interface has been used on the pseries platform to facilitate the runtime addition and removal of processor, memory, and device resources (aka Dynamic Logical Partitioning or DLPAR). Years ago, the processor and memory use cases were migrated to designs that happen to be lockdown-friendly: device tree updates are communicated directly to the kernel from firmware without passing through untrusted user space. I/O device DLPAR via the "drmgr" command in powerpc-utils remains the sole legitimate user of /proc/powerpc/ofdt, but it is already broken in lockdown since it uses /dev/mem to allocate argument buffers for the rtas syscall. So only illegitimate uses of the interface should see a behavior change when running on a locked down kernel. Signed-off-by: Nathan Lynch Acked-by: Paul Moore (LSM) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926131643.146502-2-nathanl@linux.ibm.com --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..7da801ceb5a4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -114,6 +114,7 @@ enum lockdown_reason { LOCKDOWN_IOPORT, LOCKDOWN_MSR, LOCKDOWN_ACPI_TABLES, + LOCKDOWN_DEVICE_TREE, LOCKDOWN_PCMCIA_CIS, LOCKDOWN_TIOCSSERIAL, LOCKDOWN_MODULE_PARAMETERS, -- cgit v1.2.3 From b8f3e48834fe8c86b4f21739c6effd160e2c2c19 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 26 Sep 2022 08:16:43 -0500 Subject: powerpc/rtas: block error injection when locked down The error injection facility on pseries VMs allows corruption of arbitrary guest memory, potentially enabling a sufficiently privileged user to disable lockdown or perform other modifications of the running kernel via the rtas syscall. Block the PAPR error injection facility from being opened or called when locked down. 
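The check itself follows the standard lockdown pattern; this is a sketch of the powerpc side, and its exact placement in the open/call paths is illustrative:

	if (security_locked_down(LOCKDOWN_RTAS_ERROR_INJECTION))
		return -EPERM;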
Signed-off-by: Nathan Lynch Acked-by: Paul Moore (LSM) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220926131643.146502-3-nathanl@linux.ibm.com --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 7da801ceb5a4..a6d67600759e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -123,6 +123,7 @@ enum lockdown_reason { LOCKDOWN_XMON_WR, LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_DBG_WRITE_KERNEL, + LOCKDOWN_RTAS_ERROR_INJECTION, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, -- cgit v1.2.3 From 141f3d6256e58103ece1c3dd2835e871f1dde240 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 24 Sep 2022 15:18:26 +0900 Subject: ata: libata-sata: Fix device queue depth control The function __ata_change_queue_depth() uses the helper ata_scsi_find_dev() to get the ata_device structure of a scsi device and set that device maximum queue depth. However, when the ata device is managed by libsas, ata_scsi_find_dev() returns NULL, turning __ata_change_queue_depth() into a nop, which prevents the user from setting the maximum queue depth of ATA devices used with libsas based HBAs. Fix this by renaming __ata_change_queue_depth() to ata_change_queue_depth() and adding a pointer to the ata_device structure of the target device as argument. This pointer is provided by ata_scsi_change_queue_depth() using ata_scsi_find_dev() in the case of a libata managed device and by sas_change_queue_depth() using sas_to_ata_dev() in the case of a libsas managed ata device. Signed-off-by: Damien Le Moal Tested-by: John Garry --- include/linux/libata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 698032e5ef2d..20765d1c5f80 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1136,8 +1136,8 @@ extern int ata_scsi_slave_config(struct scsi_device *sdev); extern void ata_scsi_slave_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); -extern int __ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev, - int queue_depth); +extern int ata_change_queue_depth(struct ata_port *ap, struct ata_device *dev, + struct scsi_device *sdev, int queue_depth); extern struct ata_device *ata_dev_pair(struct ata_device *adev); extern int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev); extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap); -- cgit v1.2.3 From f25723dcef4a38f6a39e17afeabd1adf6402230e Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Tue, 27 Sep 2022 13:21:14 +0200 Subject: spi: Save current RX and TX DMA devices Save the current RX and TX DMA devices to avoid having to duplicate the logic to pick them, since we'll need access to them in some more functions to fix a bug in the cache handling. 
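A simplified sketch of the save in the mapping path, mirroring the selection logic in __spi_map_msg() (the dma_map_dev fallback is omitted here):

	struct device *tx_dev, *rx_dev;

	tx_dev = ctlr->dma_tx ? ctlr->dma_tx->device->dev : ctlr->dev.parent;
	rx_dev = ctlr->dma_rx ? ctlr->dma_rx->device->dev : ctlr->dev.parent;

	/* remember the choice so later sync/unmap paths can reuse it */
	ctlr->cur_tx_dma_dev = tx_dev;
	ctlr->cur_rx_dma_dev = rx_dev;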
Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20220927112117.77599-2-vincent.whitchurch@axis.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 19acd4c9c7e3..4b4041e9895a 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -378,6 +378,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @cleanup: frees controller-specific state * @can_dma: determine whether this controller supports DMA * @dma_map_dev: device which can be used for DMA mapping + * @cur_rx_dma_dev: device which is currently used for RX DMA mapping + * @cur_tx_dma_dev: device which is currently used for TX DMA mapping * @queued: whether this controller is providing an internal message queue * @kworker: pointer to thread struct for message pump * @pump_messages: work struct for scheduling work to the message pump @@ -609,6 +611,8 @@ struct spi_controller { struct spi_device *spi, struct spi_transfer *xfer); struct device *dma_map_dev; + struct device *cur_rx_dma_dev; + struct device *cur_tx_dma_dev; /* * These hooks are for drivers that want to use the generic -- cgit v1.2.3 From 612d5494aef9bd2ab68d585a8c0ac2b16d12d520 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 27 Sep 2022 20:45:57 +0800 Subject: irqchip: Make irqchip_init() usable on pure ACPI systems Pure ACPI systems (e.g., LoongArch) do not need OF_IRQ, but still require irqchip_init() to perform the ACPI irqchip probing, even when OF_IRQ isn't selected. Relax the dependency to enable the generic irqchip support when ACPI_GENERIC_GSI is configured. Signed-off-by: Huacai Chen Tested-by: Tiezhu Yang [maz: revamped commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220927124557.3246737-1-chenhuacai@loongson.cn --- include/linux/of_irq.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 83fccd0c9bba..d6d3eae2f145 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -37,9 +37,8 @@ extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data); extern int of_irq_to_resource(struct device_node *dev, int index, struct resource *r); -extern void of_irq_init(const struct of_device_id *matches); - #ifdef CONFIG_OF_IRQ +extern void of_irq_init(const struct of_device_id *matches); extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); @@ -57,6 +56,9 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else +static inline void of_irq_init(const struct of_device_id *matches) +{ +} static inline int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq) { -- cgit v1.2.3 From 334f7d42db3eb0274aa6b4aba7ce14d87df3fef0 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 22 Sep 2022 11:12:42 -0500 Subject: irqchip: Allow extra fields to be passed to IRQCHIP_PLATFORM_DRIVER_END IRQCHIP_PLATFORM_DRIVER_* doesn't allow some fields (such as .pm) to be set in the platform_driver structure. Make IRQCHIP_PLATFORM_DRIVER_END variadic so that .pm or another field can be set if needed. 
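With the variadic form a driver can now write, for example (driver, compatible string, and ops names are illustrative):

	IRQCHIP_PLATFORM_DRIVER_BEGIN(example_intc)
	IRQCHIP_MATCH("vendor,example-intc", example_intc_init)
	IRQCHIP_PLATFORM_DRIVER_END(example_intc, .pm = &example_intc_pm_ops)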
Signed-off-by: Frank Li [maz: revamped commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220922161246.20586-3-Frank.Li@nxp.com --- include/linux/irqchip.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 3a091d0710ae..d5e6024cb2a8 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -44,7 +44,8 @@ static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ .data = typecheck_irq_init_cb(fn), }, -#define IRQCHIP_PLATFORM_DRIVER_END(drv_name) \ + +#define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \ {}, \ }; \ MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table); \ @@ -56,6 +57,7 @@ static struct platform_driver drv_name##_driver = { \ .owner = THIS_MODULE, \ .of_match_table = drv_name##_irqchip_match_table, \ .suppress_bind_attrs = true, \ + __VA_ARGS__ \ }, \ }; \ builtin_platform_driver(drv_name##_driver) -- cgit v1.2.3 From 2d48bfca42a6d2b5f92d24ceae4425fc4fae14ab Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Mon, 8 Aug 2022 12:38:07 -0500 Subject: mfd: rk808: Add Rockchip rk817 battery charger support Add rk817 charger support cell to rk808 mfd driver. Signed-off-by: Chris Morgan Signed-off-by: Maya Matuszczyk Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220808173809.11320-3-macroalpha82@gmail.com --- include/linux/mfd/rk808.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h index 58602032e642..9af1f3105f80 100644 --- a/include/linux/mfd/rk808.h +++ b/include/linux/mfd/rk808.h @@ -519,6 +519,77 @@ enum rk809_reg_id { #define MIC_DIFF_DIS (0x0 << 7) #define MIC_DIFF_EN (0x1 << 7) +/* RK817 Battery Registers */ +#define RK817_GAS_GAUGE_ADC_CONFIG0 0x50 +#define RK817_GG_EN (0x1 << 7) +#define RK817_SYS_VOL_ADC_EN (0x1 << 6) +#define RK817_TS_ADC_EN (0x1 << 5) +#define RK817_USB_VOL_ADC_EN (0x1 << 4) +#define RK817_BAT_VOL_ADC_EN (0x1 << 3) +#define RK817_BAT_CUR_ADC_EN (0x1 << 2) + +#define RK817_GAS_GAUGE_ADC_CONFIG1 0x55 + +#define RK817_VOL_CUR_CALIB_UPD BIT(7) + +#define RK817_GAS_GAUGE_GG_CON 0x56 +#define RK817_GAS_GAUGE_GG_STS 0x57 + +#define RK817_BAT_CON (0x1 << 4) +#define RK817_RELAX_VOL_UPD (0x3 << 2) +#define RK817_RELAX_STS (0x1 << 1) + +#define RK817_GAS_GAUGE_RELAX_THRE_H 0x58 +#define RK817_GAS_GAUGE_RELAX_THRE_L 0x59 +#define RK817_GAS_GAUGE_OCV_THRE_VOL 0x62 +#define RK817_GAS_GAUGE_OCV_VOL_H 0x63 +#define RK817_GAS_GAUGE_OCV_VOL_L 0x64 +#define RK817_GAS_GAUGE_PWRON_VOL_H 0x6b +#define RK817_GAS_GAUGE_PWRON_VOL_L 0x6c +#define RK817_GAS_GAUGE_PWRON_CUR_H 0x6d +#define RK817_GAS_GAUGE_PWRON_CUR_L 0x6e +#define RK817_GAS_GAUGE_OFF_CNT 0x6f +#define RK817_GAS_GAUGE_Q_INIT_H3 0x70 +#define RK817_GAS_GAUGE_Q_INIT_H2 0x71 +#define RK817_GAS_GAUGE_Q_INIT_L1 0x72 +#define RK817_GAS_GAUGE_Q_INIT_L0 0x73 +#define RK817_GAS_GAUGE_Q_PRES_H3 0x74 +#define RK817_GAS_GAUGE_Q_PRES_H2 0x75 +#define RK817_GAS_GAUGE_Q_PRES_L1 0x76 +#define RK817_GAS_GAUGE_Q_PRES_L0 0x77 +#define RK817_GAS_GAUGE_BAT_VOL_H 0x78 +#define RK817_GAS_GAUGE_BAT_VOL_L 0x79 +#define RK817_GAS_GAUGE_BAT_CUR_H 0x7a +#define RK817_GAS_GAUGE_BAT_CUR_L 0x7b +#define RK817_GAS_GAUGE_USB_VOL_H 0x7e +#define RK817_GAS_GAUGE_USB_VOL_L 0x7f +#define RK817_GAS_GAUGE_SYS_VOL_H 0x80 +#define RK817_GAS_GAUGE_SYS_VOL_L 0x81 +#define RK817_GAS_GAUGE_Q_MAX_H3 0x82 
+#define RK817_GAS_GAUGE_Q_MAX_H2 0x83 +#define RK817_GAS_GAUGE_Q_MAX_L1 0x84 +#define RK817_GAS_GAUGE_Q_MAX_L0 0x85 +#define RK817_GAS_GAUGE_SLEEP_CON_SAMP_CUR_H 0x8f +#define RK817_GAS_GAUGE_SLEEP_CON_SAMP_CUR_L 0x90 +#define RK817_GAS_GAUGE_CAL_OFFSET_H 0x91 +#define RK817_GAS_GAUGE_CAL_OFFSET_L 0x92 +#define RK817_GAS_GAUGE_VCALIB0_H 0x93 +#define RK817_GAS_GAUGE_VCALIB0_L 0x94 +#define RK817_GAS_GAUGE_VCALIB1_H 0x95 +#define RK817_GAS_GAUGE_VCALIB1_L 0x96 +#define RK817_GAS_GAUGE_IOFFSET_H 0x97 +#define RK817_GAS_GAUGE_IOFFSET_L 0x98 +#define RK817_GAS_GAUGE_BAT_R1 0x9a +#define RK817_GAS_GAUGE_BAT_R2 0x9b +#define RK817_GAS_GAUGE_BAT_R3 0x9c +#define RK817_GAS_GAUGE_DATA0 0x9d +#define RK817_GAS_GAUGE_DATA1 0x9e +#define RK817_GAS_GAUGE_DATA2 0x9f +#define RK817_GAS_GAUGE_DATA3 0xa0 +#define RK817_GAS_GAUGE_DATA4 0xa1 +#define RK817_GAS_GAUGE_DATA5 0xa2 +#define RK817_GAS_GAUGE_CUR_ADC_K0 0xb0 + #define RK817_POWER_EN_REG(i) (0xb1 + (i)) #define RK817_POWER_SLP_EN_REG(i) (0xb5 + (i)) @@ -544,10 +615,30 @@ enum rk809_reg_id { #define RK817_LDO_ON_VSEL_REG(idx) (0xcc + (idx) * 2) #define RK817_BOOST_OTG_CFG (0xde) +#define RK817_PMIC_CHRG_OUT 0xe4 +#define RK817_CHRG_VOL_SEL (0x07 << 4) +#define RK817_CHRG_CUR_SEL (0x07 << 0) + +#define RK817_PMIC_CHRG_IN 0xe5 +#define RK817_USB_VLIM_EN (0x01 << 7) +#define RK817_USB_VLIM_SEL (0x07 << 4) +#define RK817_USB_ILIM_EN (0x01 << 3) +#define RK817_USB_ILIM_SEL (0x07 << 0) +#define RK817_PMIC_CHRG_TERM 0xe6 +#define RK817_CHRG_TERM_ANA_DIG (0x01 << 2) +#define RK817_CHRG_TERM_ANA_SEL (0x03 << 0) +#define RK817_CHRG_EN (0x01 << 6) + +#define RK817_PMIC_CHRG_STS 0xeb +#define RK817_BAT_EXS BIT(7) +#define RK817_CHG_STS (0x07 << 4) + #define RK817_ID_MSB 0xed #define RK817_ID_LSB 0xee #define RK817_SYS_STS 0xf0 +#define RK817_PLUG_IN_STS (0x1 << 6) + #define RK817_SYS_CFG(i) (0xf1 + (i)) #define RK817_ON_SOURCE_REG 0xf5 -- cgit v1.2.3 From a47137a5134be2f0b4724fe548362a253727d8b1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 5 Sep 2022 13:58:10 +0200 Subject: mfd/omap1: htc-i2cpld: Convert to a pure GPIO driver Instead of passing GPIO numbers pertaining to ourselves through platform data, just request GPIO descriptors from our own GPIO chips and use them, and cut down on the unnecessary complexity. Cc: Aaro Koskinen Cc: Janusz Krzysztofik Cc: Tony Lindgren Cc: Cory Maccarrone Cc: linux-omap@vger.kernel.org Signed-off-by: Linus Walleij Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905115810.5987-1-linus.walleij@linaro.org --- include/linux/htcpld.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/htcpld.h b/include/linux/htcpld.h index 842fce69ac06..5f8ac9b1d724 100644 --- a/include/linux/htcpld.h +++ b/include/linux/htcpld.h @@ -13,8 +13,6 @@ struct htcpld_chip_platform_data { }; struct htcpld_core_platform_data { - unsigned int int_reset_gpio_hi; - unsigned int int_reset_gpio_lo; unsigned int i2c_adapter_id; struct htcpld_chip_platform_data *chip; -- cgit v1.2.3 From b1cab78ba392162a5ef11173d02db6e182e04edd Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 26 Sep 2022 19:59:31 +0000 Subject: Revert "net: set proper memcg for net_init hooks allocations" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1d0403d20f6c281cb3d14c5f1db5317caeec48e9. 
Anatoly Pugachev reported that the commit 1d0403d20f6c ("net: set proper memcg for net_init hooks allocations") is somehow causing the sparc64 VMs failed to boot and the VMs boot fine with that patch reverted. So, revert the patch for now and later we can debug the issue. Link: https://lore.kernel.org/all/20220918092849.GA10314@u164.east.ru/ Reported-by: Anatoly Pugachev Signed-off-by: Shakeel Butt Cc: Vasily Averin Cc: Jakub Kicinski Cc: Michal Koutný Cc: Andrew Morton Cc: cgroups@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Tested-by: Anatoly Pugachev Acked-by: Johannes Weiner Fixes: 1d0403d20f6c ("net: set proper memcg for net_init hooks allocations") Reviewed-by: Muchun Song Acked-by: Roman Gushchin Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6257867fbf95..567f12323f55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1788,42 +1788,6 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_unlock(); } -/** - * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object. - * @p: pointer to object from which memcg should be extracted. It can be NULL. - * - * Retrieves the memory group into which the memory of the pointed kernel - * object is accounted. If memcg is found, its reference is taken. - * If a passed kernel object is uncharged, or if proper memcg cannot be found, - * as well as if mem_cgroup is disabled, NULL is returned. - * - * Return: valid memcg pointer with taken reference or NULL. - */ -static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - do { - memcg = mem_cgroup_from_obj(p); - } while (memcg && !css_tryget(&memcg->css)); - rcu_read_unlock(); - return memcg; -} - -/** - * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup. - * @memcg: pointer to a valid memory cgroup or NULL. - * - * If passed argument is not NULL, returns it without any additional checks - * and changes. Otherwise, root_mem_cgroup is returned. - * - * NOTE: root_mem_cgroup can be NULL during early boot. - */ -static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) -{ - return memcg ? 
memcg : root_mem_cgroup; -} #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1880,15 +1844,6 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) -{ - return NULL; -} - -static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) -{ - return NULL; -} #endif /* CONFIG_MEMCG_KMEM */ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From 49f27f2b4bfa8b6e26f02df615e544f52648bfb2 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 28 Sep 2022 14:47:55 +0800 Subject: remoteproc: Introduce rproc features remote processor may support: - boot recovery with help from main processor - self recovery without help from main processor - iommu - etc Introduce rproc features could simplify code to avoid adding more bool flags Acked-by: Arnaud Pouliquen Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220928064756.4059662-2-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- include/linux/remoteproc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 1abf56ad02da..fe8978eb69f1 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -489,6 +489,20 @@ struct rproc_dump_segment { loff_t offset; }; +/** + * enum rproc_features - features supported + * + * @RPROC_FEAT_ATTACH_ON_RECOVERY: The remote processor does not need help + * from Linux to recover, such as firmware + * loading. Linux just needs to attach after + * recovery. + */ + +enum rproc_features { + RPROC_FEAT_ATTACH_ON_RECOVERY, + RPROC_MAX_FEATURES, +}; + /** * struct rproc - represents a physical remote processor device * @node: list node of this rproc object @@ -530,6 +544,7 @@ struct rproc_dump_segment { * @elf_machine: firmware ELF machine * @cdev: character device of the rproc * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release + * @features: indicate remoteproc features */ struct rproc { struct list_head node; @@ -570,6 +585,7 @@ struct rproc { u16 elf_machine; struct cdev cdev; bool cdev_put_on_release; + DECLARE_BITMAP(features, RPROC_MAX_FEATURES); }; /** -- cgit v1.2.3 From f3304ecd7f060db1d4197fbdce5a503259f770d3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 16 Sep 2022 15:29:53 +0900 Subject: linux/export: use inline assembler to populate symbol CRCs Since commit 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS"), the module versioning on the (non-upstreamed-yet) kvx Linux port is broken due to unexpected padding for __crc_* symbols. The kvx GCC adds padding so u32 gets 8-byte alignment instead of 4. I do not know if this happens for upstream architectures in general, but any compiler has the freedom to insert padding for faster access. Use the inline assembler to directly specify the wanted data layout. This is how we previously did before the breakage. 
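For reference, SYMBOL_CRC(abort, 0x12345678, "") now expands to roughly the following assembler (CRC value made up), which fixes __crc_abort to exactly four bytes regardless of the compiler's alignment choices:

	.section "___kcrctab+abort","a"
	__crc_abort:
	.long 0x12345678
	.previous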
Link: https://lore.kernel.org/lkml/20220817161438.32039-1-ysionneau@kalray.eu/ Link: https://lore.kernel.org/linux-kbuild/31ce5305-a76b-13d7-ea55-afca82c46cf2@kalray.eu/ Fixes: 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS") Reported-by: Yann Sionneau Signed-off-by: Masahiro Yamada Tested-by: Yann Sionneau --- include/linux/export-internal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index c2b1d4fd5987..fe7e6ba918f1 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -10,8 +10,10 @@ #include #include -/* __used is needed to keep __crc_* for LTO */ #define SYMBOL_CRC(sym, crc, sec) \ - u32 __section("___kcrctab" sec "+" #sym) __used __crc_##sym = crc + asm(".section \"___kcrctab" sec "+" #sym "\",\"a\"" "\n" \ + "__crc_" #sym ":" "\n" \ + ".long " #crc "\n" \ + ".previous" "\n") #endif /* __LINUX_EXPORT_INTERNAL_H__ */ -- cgit v1.2.3 From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, -- cgit v1.2.3 From 7e9fbbb1b776d8d7969551565bc246f74ec53b27 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 28 Sep 2022 13:39:38 -0400 Subject: ring-buffer: Add ring_buffer_wake_waiters() On closing of a file that represents a ring buffer or flushing the file, there may be waiters on the ring buffer that needs to be woken up and exit the ring_buffer_wait() function. 
Add ring_buffer_wake_waiters() to wake up the waiters on the ring buffer and allow them to exit the wait loop. Link: https://lkml.kernel.org/r/20220928133938.28dc2c27@gandalf.local.home Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code") Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dac53fd3afea..2504df9a0453 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); - +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu); #define RING_BUFFER_ALL_CPUS -1 -- cgit v1.2.3 From f3ddb74ad0790030c9592229fb14d8c451f4e9a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 27 Sep 2022 19:15:27 -0400 Subject: tracing: Wake up ring buffer waiters on closing of the file When the file that represents the ring buffer is closed, there may be waiters waiting on more input from the ring buffer. Call ring_buffer_wake_waiters() to wake up any waiters when the file is closed. Link: https://lkml.kernel.org/r/20220927231825.182416969@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8401dec93c15..20749bd9db71 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -92,6 +92,7 @@ struct trace_iterator { unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; + long wait_index; /* trace_seq for __print_flags() and __print_symbolic() etc. */ struct trace_seq tmp_seq; -- cgit v1.2.3 From 6eaab4dfdd30ea8fb0dd4ee04940676c12b728e8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Sep 2022 17:39:01 +0100 Subject: net: introduce struct ubuf_info_msgzc We're going to split struct ubuf_info and leave there only mandatory fields. Users are free to extend it. Add struct ubuf_info_msgzc, which will be an extended version for MSG_ZEROCOPY and some other users. It duplicates of struct ubuf_info for now and will be removed in a couple of patches. 
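A MSG_ZEROCOPY completion callback will then reach the extended fields through the new container_of() helper; the callback body below is illustrative, the signature matches struct ubuf_info::callback:

	static void example_zerocopy_done(struct sk_buff *skb,
					  struct ubuf_info *uarg,
					  bool success)
	{
		struct ubuf_info_msgzc *msgzc = uarg_to_msgzc(uarg);

		/* ->id, ->len, ->bytelen live in the extended structure */
		pr_debug("zerocopy notify %u..%u\n",
			 msgzc->id, msgzc->id + msgzc->len - 1);
	}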
Signed-off-by: Pavel Begunkov Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f15d5b62539b..fd7dcb977fdf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -554,7 +554,28 @@ struct ubuf_info { } mmp; }; +struct ubuf_info_msgzc { + struct ubuf_info ubuf; + + union { + struct { + unsigned long desc; + void *ctx; + }; + struct { + u32 id; + u16 len; + u16 zerocopy:1; + u32 bytelen; + }; + }; + + struct mmpin mmp; +}; + #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ + ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); -- cgit v1.2.3 From e7d2b510165fff6bedc9cca88c071ad846850c74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Sep 2022 17:39:04 +0100 Subject: net: shrink struct ubuf_info We can benefit from a smaller struct ubuf_info, so leave only mandatory fields and let users to decide how they want to extend it. Convert MSG_ZEROCOPY to struct ubuf_info_msgzc and remove duplicated fields. This reduces the size from 48 bytes to just 16. Signed-off-by: Pavel Begunkov Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fd7dcb977fdf..920eb6413fee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -533,25 +533,8 @@ enum { struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); - union { - struct { - unsigned long desc; - void *ctx; - }; - struct { - u32 id; - u16 len; - u16 zerocopy:1; - u32 bytelen; - }; - }; refcount_t refcnt; u8 flags; - - struct mmpin { - struct user_struct *user; - unsigned int num_pg; - } mmp; }; struct ubuf_info_msgzc { @@ -570,7 +553,10 @@ struct ubuf_info_msgzc { }; }; - struct mmpin mmp; + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) -- cgit v1.2.3 From b48b89f9c189d24eb5e2b4a0ac067da5a24ee86d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Sep 2022 06:27:53 -0700 Subject: net: drop the weight argument from netif_napi_add We tell driver developers to always pass NAPI_POLL_WEIGHT as the weight to netif_napi_add(). This may be confusing to newcomers, drop the weight argument, those who really need to tweak the weight can use netif_napi_add_weight(). Acked-by: Marc Kleine-Budde # for CAN Link: https://lore.kernel.org/r/20220927132753.750069-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f42fc871c3b..7a084a1c14e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2553,16 +2553,15 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * @dev: network device * @napi: NAPI context * @poll: polling function - * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. 
*/ static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) + int (*poll)(struct napi_struct *, int)) { - netif_napi_add_weight(dev, napi, poll, weight); + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void -- cgit v1.2.3 From 40b72108f9c609c5043903efb70c52d46b8aa871 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 27 Sep 2022 13:35:56 -0700 Subject: net/mlx5: Add the log_min_mkey_entity_size capability Add the capability that will allow the driver to determine the minimal MTT page size, to be able to map the smallest possible pages in XSK. Older firmwares that don't have this capability default to 12 (i.e. 4096-byte pages). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3d963c0e47e4..1ad762e22d86 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1878,7 +1878,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 max_reformat_remove_size[0x8]; u8 max_reformat_remove_offset[0x8]; - u8 reserved_at_c0[0x160]; + u8 reserved_at_c0[0xe0]; + + u8 reserved_at_1a0[0xb]; + u8 log_min_mkey_entity_size[0x5]; + u8 reserved_at_1b0[0x10]; + + u8 reserved_at_1c0[0x60]; u8 reserved_at_220[0x1]; u8 sw_vhca_id_valid[0x1]; -- cgit v1.2.3 From 9ed9cac1850a2a55674b4a17100c50b46f645921 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:07 -0700 Subject: slab: Remove __malloc attribute from realloc functions The __malloc attribute should not be applied to "realloc" functions, as the returned pointer may alias the storage of the prior pointer. Instead of splitting __malloc from __alloc_size, which would be a huge amount of churn, just create __realloc_size for the few cases where it is needed. Thanks to Geert Uytterhoeven for reporting build failures with gcc-8 in an earlier version, which tried to remove the #ifdef. While the "alloc_size" attribute is available on all GCC versions, I forgot that it gets disabled explicitly by the kernel in GCC < 9.1 due to misbehaviors. Add a note to the compiler_attributes.h entry for it. Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Marco Elver Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/compiler_attributes.h | 3 ++- include/linux/compiler_types.h | 8 +++++--- include/linux/slab.h | 12 ++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..96a4ed11b4be 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -35,7 +35,8 @@ /* * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally - * available and includes other attributes. + * available and includes other attributes. For GCC < 9.1, __alloc_size__ gets undefined + * in compiler-gcc.h, due to misbehaviors.
* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..0717534f8364 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -271,14 +271,16 @@ struct ftrace_likely_data { /* * Any place that could be marked with the "alloc_size" attribute is also - * a place to be marked with the "malloc" attribute. Do this as part of the - * __alloc_size macro to avoid redundant attributes and to avoid missing a - * __malloc marking. + * a place to be marked with the "malloc" attribute, except those that may + * be performing a _reallocation_, as that may alias the existing pointer. + * For these, use __realloc_size(). */ #ifdef __alloc_size__ # define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +# define __realloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) #else # define __alloc_size(x, ...) __malloc +# define __realloc_size(x, ...) #endif #ifndef asm_volatile_goto diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..41bd036e7551 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,7 +184,7 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -647,10 +647,10 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, - size_t new_n, - size_t new_size, - gfp_t flags) +static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -774,7 +774,7 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla } extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __alloc_size(3); + __realloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -- cgit v1.2.3 From 05a940656e1eb2026d9ee31019d5b47e9545124d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:08 -0700 Subject: slab: Introduce kmalloc_size_roundup() In the effort to help the compiler reason about buffer sizes, the __alloc_size attribute was added to allocators. This improves the scope of the compiler's ability to apply CONFIG_UBSAN_BOUNDS and (in the near future) CONFIG_FORTIFY_SOURCE. For most allocations, this works well, as the vast majority of callers are not expecting to use more memory than what they asked for. There is, however, one common exception to this: anticipatory resizing of kmalloc allocations. These cases all use ksize() to determine the actual bucket size of a given allocation (e.g. 128 when 126 was asked for). This comes in two styles in the kernel: 1) An allocation has been determined to be too small, and needs to be resized. 
Instead of the caller choosing its own next best size, it wants to minimize the number of calls to krealloc(), so it just uses ksize() plus some additional bytes, forcing the realloc into the next bucket size, from which it can learn how large it is now. For example: data = krealloc(data, ksize(data) + 1, gfp); data_len = ksize(data); 2) The minimum size of an allocation is calculated, but since it may grow in the future, just use all the space available in the chosen bucket immediately, to avoid needing to reallocate later. A good example of this is skbuff's allocators: data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ... /* kmalloc(size) might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ osize = ksize(data); size = SKB_WITH_OVERHEAD(osize); In both cases, the "how much was actually allocated?" question is answered _after_ the allocation, where the compiler hinting is not in an easy place to make the association any more. This mismatch between the compiler's view of the buffer length and the code's intention about how much it is going to actually use has already caused problems[1]. It is possible to fix this by reordering the use of the "actual size" information. We can serve the needs of users of ksize() and still have accurate buffer length hinting for the compiler by doing the bucket size calculation _before_ the allocation. Code can instead ask "how large an allocation would I get for a given size?". Introduce kmalloc_size_roundup(), to serve this function so we can start replacing the "anticipatory resizing" uses of ksize(). [1] https://github.com/ClangBuiltLinux/linux/issues/1599 https://github.com/KSPP/linux/issues/183 [ vbabka@suse.cz: add SLOB version ] Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 41bd036e7551..727640173568 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -188,7 +188,21 @@ void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __r void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); + +/** + * ksize - Report actual allocation size of associated object + * + * @objp: Pointer returned from a prior kmalloc()-family allocation. + * + * This should not be used for writing beyond the originally requested + * allocation size. Either use krealloc() or round up the allocation size + * with kmalloc_size_roundup() prior to allocation. If this is used to + * access beyond the originally requested allocation size, UBSAN_BOUNDS + * and/or FORTIFY_SOURCE may trip, since they only know about the + * originally allocated size via the __alloc_size attribute. + */ size_t ksize(const void *objp); + #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -779,6 +793,23 @@ extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); + +/** + * kmalloc_size_roundup - Report allocation bucket size for the given size + * + * @size: Number of bytes to round up from. 
+ * + * This returns the number of bytes that would be available in a kmalloc() + * allocation of @size bytes. For example, a 126 byte request would be + * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly + * for the general-purpose kmalloc()-based allocations, and is not for the + * pre-sized kmem_cache_alloc()-based allocations.) + * + * Use this to kmalloc() the full bucket size ahead of time instead of using + * ksize() to query the size after an allocation. + */ +size_t kmalloc_size_roundup(size_t size); + void __init kmem_cache_init_late(void); #if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -- cgit v1.2.3 From c60ba2d34608dc6e224440267ed392adf65e05f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Sep 2022 02:10:37 +0206 Subject: printk: Make pr_flush() static No user outside the printk code and no reason to export this. Signed-off-by: Thomas Gleixner Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220924000454.3319186-2-john.ogness@linutronix.de --- include/linux/printk.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 091fba7283e1..b70a42f94031 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,8 +170,6 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit -extern bool pr_flush(int timeout_ms, bool reset_on_progress); - /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -222,11 +220,6 @@ static inline void printk_deferred_exit(void) { } -static inline bool pr_flush(int timeout_ms, bool reset_on_progress) -{ - return true; -} - static inline int printk_ratelimit(void) { return 0; -- cgit v1.2.3 From e3f12f0602833c88ad2cbe3aff397acd0cfd2695 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Sep 2022 02:10:38 +0206 Subject: printk: Declare log_wait properly kernel/printk/printk.c:365:1: warning: symbol 'log_wait' was not declared. Should it be static? Signed-off-by: Thomas Gleixner Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Reviewed-by: Greg Kroah-Hartman Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220924000454.3319186-3-john.ogness@linutronix.de --- include/linux/syslog.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 86af908e2663..955f80e34d4f 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -8,6 +8,8 @@ #ifndef _LINUX_SYSLOG_H #define _LINUX_SYSLOG_H +#include <linux/wait.h> + /* Close the log. Currently a NOP. */ #define SYSLOG_ACTION_CLOSE 0 /* Open the log. Currently a NOP. */ #define SYSLOG_ACTION_OPEN 1 @@ -35,5 +37,6 @@ #define SYSLOG_FROM_PROC 1 int do_syslog(int type, char __user *buf, int count, int source); +extern wait_queue_head_t log_wait; #endif /* _LINUX_SYSLOG_H */ -- cgit v1.2.3 From 8cafdb5ab94cda3ebb0975be16e2d564a05132ea Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 29 Sep 2022 09:47:44 +0200 Subject: block: adapt blk_mq_plug() to not plug for writes that require a zone lock The current implementation of blk_mq_plug() disables plugging for all operations that involve a transfer to the device, as we just check the last bit of the opcode via the op_is_write() function.
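For reference, op_is_write() only tests the lowest bit of the opcode; a paraphrased sketch of the helper (see include/linux/blk_types.h):

    /* Paraphrased: the low bit of a REQ_OP_* value encodes the data
     * direction, so every to-device operation (REQ_OP_DISCARD,
     * REQ_OP_SECURE_ERASE, ...) tests true here, not only the writes
     * that can actually require a zone lock. */
    static inline bool op_is_write(blk_opf_t op)
    {
            return !!(op & 1);
    }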
Modify blk_mq_plug() to disable plugging only for REQ_OP_WRITE and REQ_OP_WRITE_ZEROES, as they might require a zone lock. Suggested-by: Christoph Hellwig Suggested-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Pankaj Raghav Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220929074745.103073-2-p.raghav@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4750772ef228..49373d002631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1304,6 +1304,15 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline bool bdev_op_is_zoned_write(struct block_device *bdev, + blk_opf_t op) +{ + if (!bdev_is_zoned(bdev)) + return false; + + return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; +} + static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 39d6d08b2edf99c4b39a689a70bf0adee065b357 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:08 -0700 Subject: tracing/user_events: Use bits vs bytes for enabled status page data User processes may require many events, and when they do, the cache performance of a byte-index status check is less ideal than a bit index. The previous per-page event limit was 4096; the new limit is 32,768. This change adds a bitwise index to the user_reg struct. Programs check whether the bit at status_bit is set within the status page(s). Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 736e05603463..592a3fbed98e 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,15 +20,6 @@ #define USER_EVENTS_SYSTEM "user_events" #define USER_EVENTS_PREFIX "u:" -/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ -#define EVENT_BIT_FTRACE 0 -#define EVENT_BIT_PERF 1 -#define EVENT_BIT_OTHER 7 - -#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) -#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) -#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) - /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) @@ -45,12 +36,12 @@ struct user_reg { /* Input: Pointer to string with event name, description and flags */ __u64 name_args; - /* Output: Byte index of the event within the status page */ - __u32 status_index; + /* Output: Bitwise index of the event within the status page */ + __u32 status_bit; /* Output: Index of the event to use when writing data */ __u32 write_index; -}; +} __attribute__((__packed__)); #define DIAG_IOC_MAGIC '*' -- cgit v1.2.3 From adfdfcbdbd32b356323a3db6d3a683270051a7e6 Mon Sep 17 00:00:00 2001 From: Jerome Neanne Date: Thu, 29 Sep 2022 15:25:25 +0200 Subject: regulator: gpio: Add input_supply support in gpio_regulator_config This is similar to fixed-regulator. It is used to extract the regulator parent from the device tree.
Without that property, the parent regulator can be shut down (if it is not always-on), leading to inappropriate behavior: On am62-SP-SK this fix is required to avoid tps65219 ldo1 (SDMMC rail) being shut down after boot completion. Signed-off-by: Jerome Neanne Link: https://lore.kernel.org/r/20220929132526.29427-2-jneanne@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/gpio-regulator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/gpio-regulator.h b/include/linux/regulator/gpio-regulator.h index fdeb312cdabd..c223e50ff9f7 100644 --- a/include/linux/regulator/gpio-regulator.h +++ b/include/linux/regulator/gpio-regulator.h @@ -42,6 +42,7 @@ struct gpio_regulator_state { /** * struct gpio_regulator_config - config structure * @supply_name: Name of the regulator supply + * @input_supply: Name of the input regulator supply * @enabled_at_boot: Whether regulator has been enabled at * boot or not. 1 = Yes, 0 = No * This is used to keep the regulator at */ struct gpio_regulator_config { const char *supply_name; + const char *input_supply; unsigned enabled_at_boot:1; unsigned startup_delay; -- cgit v1.2.3 From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:03 -0700 Subject: bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline The struct_ops prog allows using bpf to implement the functions in a struct (eg. a kernel module). The current usage is to implement the tcp_congestion ops. The kernel does not call the tcp-cc's ops (ie. the bpf prog) in a recursive way. The struct_ops shares the tracing trampoline's enter/exit function, which tracks prog->active to avoid recursion. It is needed for tracing progs. However, it turns out the struct_ops bpf prog can hit this prog->active check and unnecessarily skip running the struct_ops prog. eg. '.ssthresh' may run in_task() and then be interrupted by a softirq that runs the same '.ssthresh'. Skipping that '.ssthresh' run ends up returning a random value to the caller. The patch adds __bpf_prog_{enter,exit}_struct_ops for the struct_ops trampoline. They do not track prog->active to detect recursion. One exception is when the tcp_congestion's '.init' ops is doing bpf_setsockopt(TCP_CONGESTION) and then recurs to the same '.init' ops. This will be addressed in the following patches.
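A simplified sketch of what the new enter helper amounts to (the real implementation lives in kernel/bpf/trampoline.c); note the absence of a prog->active test before dispatch:

    /* Sketch: RCU/migration protection and a start timestamp for stats,
     * but no prog->active recursion check, so a softirq re-entering the
     * same struct_ops prog is allowed to run. */
    u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
                                            struct bpf_tramp_run_ctx *run_ctx)
    {
            rcu_read_lock();
            migrate_disable();
            run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
            return bpf_prog_start_time();
    }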
Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f3eaf3ed98c..9e7d46d16032 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); -- cgit v1.2.3 From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:06 -0700 Subject: bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur itself When a bad bpf prog '.init' calls bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop: .init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ... ... => .init => bpf_setsockopt(tcp_cc). It was prevented by the prog->active counter before, but the prog->active detection cannot be used in struct_ops, as explained in the earlier patch of the set. In this patch, the second bpf_setsockopt(tcp_cc) is not allowed, in order to break the loop. This is done by using a bit of an existing 1 byte hole in tcp_sock to check if there is an on-going bpf_setsockopt(TCP_CONGESTION) on this tcp_sock. Note that this essentially means only the first '.init' can call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. the peer does not support ECN), and the second '.init' cannot fall back to another cc. This applies even when the second bpf_setsockopt(TCP_CONGESTION) would not cause a loop. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..3bdf687e2fb3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -388,6 +388,12 @@ struct tcp_sock { u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ + u8 bpf_chg_cc_inprogress:1; /* In the middle of + * bpf_setsockopt(TCP_CONGESTION), + * it is to avoid the bpf_tcp_cc->init() + * to recur itself by calling + * bpf_setsockopt(TCP_CONGESTION, "itself"). + */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 -- cgit v1.2.3 From f62384995e4cb7703e5295779c44135c5311770d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 26 Sep 2022 17:43:14 +0200 Subject: random: split initialization into early step and later step The full RNG initialization relies on some timestamps, made possible with initialization functions like time_init() and timekeeping_init(). However, these are only available rather late in initialization. Meanwhile, other things, such as memory allocator functions, make use of the RNG much earlier.
So split RNG initialization into two phases. We can provide arch randomness very early on, and then later, after timekeeping and such are available, initialize the rest. This ensures that, for example, slabs are properly randomized if RDRAND is available. Without this, CONFIG_SLAB_FREELIST_RANDOM=y loses a degree of its security, because its random seed is potentially deterministic, since it hasn't yet incorporated RDRAND. It also makes it possible to use a better seed in kfence, which currently relies on only the cycle counter. Another positive consequence is that on systems with RDRAND, running with CONFIG_WARN_ALL_UNSEEDED_RANDOM=y results in no warnings at all. One subtle side effect of this change is that on systems with no RDRAND, RDTSC is now only queried by random_init() once, committing the moment of the function call, instead of multiple times as before. This is intentional, as the multiple RDTSCs in a loop before weren't accomplishing very much, with jitter being better provided by try_to_generate_entropy(). Plus, filling blocks with RDTSC is still being done in extract_entropy(), which is necessarily called before random bytes are served anyway. Cc: Andrew Morton Reviewed-by: Kees Cook Reviewed-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 3fec206487f6..a9e6e16f9774 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -72,7 +72,8 @@ static inline unsigned long get_random_canary(void) return get_random_long() & CANARY_MASK; } -int __init random_init(const char *command_line); +void __init random_init_early(const char *command_line); +void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); -- cgit v1.2.3 From 585cd5fe9f7378601b1d4915ad6e9088333b5e5e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 28 Sep 2022 18:47:30 +0200 Subject: random: add 8-bit and 16-bit batches There are numerous places in the kernel that would be sped up by having smaller batches. Currently those callsites do `get_random_u32() & 0xff` or similar. Since these are pretty spread out, and will require patches to multiple different trees, let's get ahead of the curve and lay the foundation for `get_random_u8()` and `get_random_u16()`, so that it's then possible to start submitting conversion patches leisurely. Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index a9e6e16f9774..2c130f8f18e5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -38,6 +38,8 @@ static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { #endif void get_random_bytes(void *buf, size_t len); +u8 get_random_u8(void); +u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned int get_random_int(void) -- cgit v1.2.3 From 4c95236a335d8b62aa8dbd587bed6a5f30d8265a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 29 Sep 2022 11:49:35 +0200 Subject: prandom: make use of smaller types in prandom_u32_max When possible at compile-time, make use of smaller types in prandom_u32_max(), so that we can use smaller batches from random.c, which in turn leads to a 2x or 4x performance boost. 
This makes a difference, for example, in kfence, which needs a fast stream of small numbers (booleans). At the same time, we use the occasion to update the old documentation on these functions. prandom_u32() and prandom_bytes() have direct replacements now in random.h, while prandom_u32_max() remains useful as a prandom.h function, since it's not cryptographically secure by virtue of not being evenly distributed. Cc: Dominik Brodowski Acked-by: Marco Elver Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index deace5fb4e62..78db003bc290 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -12,11 +12,13 @@ #include <linux/percpu.h> #include <linux/random.h> +/* Deprecated: use get_random_u32 instead. */ static inline u32 prandom_u32(void) { return get_random_u32(); } +/* Deprecated: use get_random_bytes instead. */ static inline void prandom_bytes(void *buf, size_t nbytes) { return get_random_bytes(buf, nbytes); @@ -37,17 +39,20 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) * @ep_ro: right open interval endpoint * - * Returns a pseudo-random number that is in interval [0, ep_ro). Note - * that the result depends on PRNG being well distributed in [0, ~0U] - * u32 space. Here we use maximally equidistributed combined Tausworthe - * generator, that is, prandom_u32(). This is useful when requesting a - * random index of an array containing ep_ro elements, for example. + * Returns a pseudo-random number that is in interval [0, ep_ro). This is + * useful when requesting a random index of an array containing ep_ro elements, + * for example. The result is somewhat biased when ep_ro is not a power of 2, + * so do not use this for cryptographic purposes. * * Returns: pseudo-random number in interval [0, ep_ro) */ static inline u32 prandom_u32_max(u32 ep_ro) { - return (u32)(((u64) prandom_u32() * ep_ro) >> 32); + if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8) + return (get_random_u8() * ep_ro) >> 8; + if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16) + return (get_random_u16() * ep_ro) >> 16; + return ((u64)get_random_u32() * ep_ro) >> 32; } /* -- cgit v1.2.3 From 9f4beead610c83065cc0410bfe97ff51d8e9578d Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 29 Sep 2022 22:39:03 +0200 Subject: binfmt: remove taso from linux_binprm struct With commit 987f20a9dcce ("a.out: Remove the a.out implementation"), the use of the special taso flag for alpha architectures in the linux_binprm struct is gone. Remove the definition of taso in the linux_binprm struct. No functional change. Signed-off-by: Lukas Bulwahn Reviewed-by: "Eric W. Biederman" Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220929203903.9475-1-lukas.bulwahn@gmail.com --- include/linux/binfmts.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 3dc20c4f394c..8d51f69f9f5e 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -43,9 +43,6 @@ struct linux_binprm { * original userspace.
*/ point_of_no_return:1; -#ifdef __alpha__ - unsigned int taso:1; -#endif struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3 From f5290d8e4f0caa81a491448a27dd70e726095d07 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 16 Sep 2022 09:17:38 +0300 Subject: clk: asm9260: use parent index to link the reference clock Rewrite clk-asm9260 to use a parent index to link the reference clock. During this rework two helpers are added: - clk_hw_register_mux_table_parent_data() to supplement clk_hw_register_mux_table() but using parent_data instead of parent_names - clk_hw_register_fixed_rate_parent_accuracy() to be used instead of directly calling __clk_hw_register_fixed_rate(). The latter function is an internal API, which is better not called directly. Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220916061740.87167-2-dmitry.baryshkov@linaro.org Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..86140ac2f9a5 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -439,6 +439,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), NULL, (flags), \ (fixed_rate), (fixed_accuracy), 0) +/** + * clk_hw_register_fixed_rate_parent_accuracy - register fixed-rate clock with + * the clock framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @fixed_rate: non-adjustable clock rate + */ +#define clk_hw_register_fixed_rate_parent_accuracy(dev, name, parent_data, \ + flags, fixed_rate) \ + __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ + (parent_data), (flags), (fixed_rate), 0, \ + CLK_FIXED_RATE_PARENT_ACCURACY) void clk_unregister_fixed_rate(struct clk *clk); void clk_hw_unregister_fixed_rate(struct clk_hw *hw); @@ -957,6 +971,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, (parent_names), NULL, NULL, (flags), (reg), \ (shift), (mask), (clk_mux_flags), (table), \ (lock)) +#define clk_hw_register_mux_table_parent_data(dev, name, parent_data, \ + num_parents, flags, reg, shift, mask, \ + clk_mux_flags, table, lock) \ + __clk_hw_register_mux((dev), NULL, (name), (num_parents), \ + NULL, NULL, (parent_data), (flags), (reg), \ + (shift), (mask), (clk_mux_flags), (table), \ + (lock)) #define clk_hw_register_mux(dev, name, parent_names, num_parents, flags, reg, \ shift, width, clk_mux_flags, lock) \ __clk_hw_register_mux((dev), NULL, (name), (num_parents), \ -- cgit v1.2.3 From 1d7d20658534c7d36fe6f4252f6f1a27d9631a99 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 16 Sep 2022 09:17:39 +0300 Subject: clk: fixed-rate: add devm_clk_hw_register_fixed_rate Add devm_clk_hw_register_fixed_rate(), a devres-managed helper to register a fixed-rate clock.
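A minimal usage sketch for the new helper (driver name, clock name and rate are made up for illustration):

    /* Sketch: a devres-managed fixed-rate clock registered from probe;
     * no explicit unregister is needed, devres drops it on unbind. */
    static int foo_clk_probe(struct platform_device *pdev)
    {
            struct clk_hw *hw;

            hw = devm_clk_hw_register_fixed_rate(&pdev->dev, "foo-ref",
                                                 NULL, 0, 25000000);
            if (IS_ERR(hw))
                    return PTR_ERR(hw);

            return devm_of_clk_add_hw_provider(&pdev->dev,
                                               of_clk_hw_simple_get, hw);
    }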
Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220916061740.87167-3-dmitry.baryshkov@linaro.org Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 86140ac2f9a5..49405b336cad 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -350,7 +350,7 @@ struct clk_hw *__clk_hw_register_fixed_rate(struct device *dev, const char *parent_name, const struct clk_hw *parent_hw, const struct clk_parent_data *parent_data, unsigned long flags, unsigned long fixed_rate, unsigned long fixed_accuracy, - unsigned long clk_fixed_flags); + unsigned long clk_fixed_flags, bool devm); struct clk *clk_register_fixed_rate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned long fixed_rate); @@ -365,7 +365,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, */ #define clk_hw_register_fixed_rate(dev, name, parent_name, flags, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \ - NULL, (flags), (fixed_rate), 0, 0) + NULL, (flags), (fixed_rate), 0, 0, false) + +/** + * devm_clk_hw_register_fixed_rate - register fixed-rate clock with the clock + * framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @fixed_rate: non-adjustable clock rate + */ +#define devm_clk_hw_register_fixed_rate(dev, name, parent_name, flags, fixed_rate) \ + __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \ + NULL, (flags), (fixed_rate), 0, 0, true) /** * clk_hw_register_fixed_rate_parent_hw - register fixed-rate clock with * the clock framework @@ -378,7 +391,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, #define clk_hw_register_fixed_rate_parent_hw(dev, name, parent_hw, flags, \ fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw), \ - NULL, (flags), (fixed_rate), 0, 0) + NULL, (flags), (fixed_rate), 0, 0, false) /** * clk_hw_register_fixed_rate_parent_data - register fixed-rate clock with * the clock framework @@ -392,7 +405,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), (flags), (fixed_rate), 0, \ - 0) + 0, false) /** * clk_hw_register_fixed_rate_with_accuracy - register fixed-rate clock with * the clock framework @@ -408,7 +421,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, fixed_accuracy) \ __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), \ NULL, NULL, (flags), (fixed_rate), \ - (fixed_accuracy), 0) + (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_with_accuracy_parent_hw - register fixed-rate * clock with the clock framework @@ -423,7 +436,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, parent_hw, flags, fixed_rate, fixed_accuracy) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw) \ NULL, NULL, (flags), (fixed_rate), \ - (fixed_accuracy), 0) + (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_with_accuracy_parent_data - register fixed-rate * clock with the clock framework @@ -438,7 +451,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char 
*name, parent_data, flags, fixed_rate, fixed_accuracy) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), NULL, (flags), \ - (fixed_rate), (fixed_accuracy), 0) + (fixed_rate), (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_parent_accuracy - register fixed-rate clock with * the clock framework @@ -452,7 +465,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, flags, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ (parent_data), (flags), (fixed_rate), 0, \ - CLK_FIXED_RATE_PARENT_ACCURACY) + CLK_FIXED_RATE_PARENT_ACCURACY, false) void clk_unregister_fixed_rate(struct clk *clk); void clk_hw_unregister_fixed_rate(struct clk_hw *hw); -- cgit v1.2.3 From dbae2b062824fc2d35ae2d5df2f500626c758e80 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 28 Sep 2022 10:43:09 +0200 Subject: net: skb: introduce and use a single page frag cache After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") we are observing 10-20% regressions in performance tests with small packets. The perf trace points to high pressure on the slab allocator. This change tries to improve the allocation schema for small packets using an idea originally suggested by Eric: a new per CPU page frag is introduced and used in __napi_alloc_skb to cope with small allocation requests. To ensure that the above does not lead to excessive truesize underestimation, the frag size for small allocations is inflated to 1K and all the above is restricted to builds with a 4K page size. Note that we need to update accordingly the run-time check introduced with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper"). Alex suggested a smart page refcount schema to reduce the number of atomic operations and deal properly with pfmemalloc pages. Under small-packet UDP flood, I measure a 15% peak tput increase. Suggested-by: Eric Dumazet Suggested-by: Alexander H Duyck Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/6b6f65957c59f86a353fc09a5127e83a32ab5999.1664350652.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7a084a1c14e9..64c3a5864969 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3821,6 +3821,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); -- cgit v1.2.3 From aac4daa8941ea6566563ac001e9e5d4e54a674e2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 28 Sep 2022 12:51:57 +0300 Subject: net/sched: query offload capabilities through ndo_setup_tc() When adding optional new features to Qdisc offloads, existing drivers must reject the new configuration until they are coded up to act on it. Since modifying all drivers in lockstep with the changes in the Qdisc can create problems of its own, it would be nice if there existed an automatic opt-in mechanism for offloading optional features.
Jakub proposes that we multiplex one more kind of call through ndo_setup_tc(): one where the driver populates a Qdisc-specific capability structure. The first user will be taprio, in further changes. Here we are introducing the definitions for the base functionality. Link: https://patchwork.kernel.org/project/netdevbpf/patch/20220923163310.3192733-3-vladimir.oltean@nxp.com/ Suggested-by: Jakub Kicinski Co-developed-by: Jakub Kicinski Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64c3a5864969..eddf8ee270e7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -940,6 +940,7 @@ struct net_device_path_ctx { }; enum tc_setup_type { + TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, -- cgit v1.2.3 From 5bdf402a05fafc55c876b9993b4c948bb2bc2361 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 18 Aug 2022 10:34:40 +0530 Subject: fs/buffer: make submit_bh & submit_bh_wbc return type as void submit_bh/submit_bh_wbc are non-blocking functions which just submit the bio and return. The caller of submit_bh/submit_bh_wbc needs to wait on the buffer until I/O completion and then check the buffer head's b_state field to know if there was any I/O error. Hence there is no need for these functions to return any value; even now they always return 0. Hence drop the return value and make their return type void to avoid any confusion. Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/cb66ef823374cdd94d2d03083ce13de844fffd41.1660788334.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..dcc0e90d8979 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -229,7 +229,7 @@ void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); -int submit_bh(blk_opf_t, struct buffer_head *); +void submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); -- cgit v1.2.3 From cbfecb927f429a6fa613d74b998496bd71e4438a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 25 Aug 2022 12:06:57 +0200 Subject: fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE Currently I_DIRTY_TIME will never get set if the inode already has I_DIRTY_INODE, with the assumption that it supersedes I_DIRTY_TIME. That's true; however, ext4 will only update the on-disk inode in ->dirty_inode(), not on actual writeback. As a result, if the inode already has the I_DIRTY_INODE state by the time we get to __mark_inode_dirty() only with I_DIRTY_TIME, the time was already filled into the on-disk inode and will not get updated until the next I_DIRTY_INODE update, which might never come if we crash or get a power failure. The problem can be reproduced on ext4 by running xfstest generic/622 with the -o iversion mount option. Fix it by allowing I_DIRTY_TIME to be set even if the inode already has I_DIRTY_INODE.
Also make sure that the case is properly handled in writeback_single_inode(). Additionally, changes were made in xfs_fs_dirty_inode() to accommodate the I_DIRTY_TIME flag. Thanks to Jan Kara for suggestions on how to make this work properly. Cc: Dave Chinner Cc: Christoph Hellwig Cc: stable@kernel.org Signed-off-by: Lukas Czerner Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220825100657.44217-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- include/linux/fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286e..56a4b4b02477 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2371,13 +2371,14 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself only has dirty timestamps, and the + * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. I.e. - * either I_DIRTY_TIME *or* I_DIRTY_INODE can be set in - * i_state, but not both. I_DIRTY_PAGES may still be set. + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and -- cgit v1.2.3 From d427c8999b071af2003203b42a007c4a80156c18 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Wed, 28 Sep 2022 14:55:31 +0200 Subject: net-next: skbuff: refactor pskb_pull pskb_may_pull already contains all of the checks performed by pskb_pull. Use pskb_may_pull for validation in pskb_pull, eliminating the duplication and making __pskb_pull obsolete. Replace __pskb_pull with pskb_pull where applicable. Signed-off-by: Richard Gobert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 920eb6413fee..9fcf534f2d92 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2616,20 +2616,6 @@ void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); -static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb_headlen(skb) && - !__pskb_pull_tail(skb, len - skb_headlen(skb))) - return NULL; - skb->len -= len; - return skb->data += len; -} - -static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) -{ - return unlikely(len > skb->len) ?
NULL : __pskb_pull(skb, len); - } - static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) @@ -2639,6 +2625,15 @@ static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; } +static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) +{ + if (!pskb_may_pull(skb, len)) + return NULL; + + skb->len -= len; + return skb->data += len; +} + void skb_condense(struct sk_buff *skb); /** -- cgit v1.2.3 From f4ce91ce12a7c6ead19b128ffa8cff6e3ded2a14 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Sep 2022 16:03:31 -0400 Subject: tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited This commit fixes a bug in the tracking of max_packets_out and is_cwnd_limited. This bug can cause the connection to fail to remember that is_cwnd_limited is true, causing the connection to fail to grow cwnd when it should, causing throughput to be lower than it should be. The following event sequence is an example that triggers the bug: (a) The connection is cwnd_limited, but packets_out is not at its peak due to TSO deferral deciding not to send another skb yet. In such cases the connection can advance max_packets_seq and set tp->is_cwnd_limited to true and max_packets_out to a small number. (b) Then later in the round trip the connection is pacing-limited (not cwnd-limited), and packets_out is larger. In such cases the connection would raise max_packets_out to a bigger number but (unexpectedly) flip tp->is_cwnd_limited from true to false. This commit fixes that bug. One straightforward fix would be to separately track (a) the next window after max_packets_out reaches a maximum, and (b) the next window after tp->is_cwnd_limited is set to true. But this would require consuming an extra u32 sequence number. Instead, to save space we track only the most important information. Specifically, we track the strongest available signal of the degree to which the cwnd is fully utilized: (1) If the connection is cwnd-limited then we remember that fact for the current window. (2) If the connection is not cwnd-limited then we track the maximum number of outstanding packets in the current window. In particular, note that the new logic cannot trigger the buggy (a)/(b) sequence above because with the new logic a condition where tp->packets_out > tp->max_packets_out can only trigger an update of tp->is_cwnd_limited if tp->is_cwnd_limited is false. This first showed up in testing of a BBRv2 dev branch, but this buggy behavior highlighted a general issue with the tcp_cwnd_validate() logic that can cause cwnd to fail to increase at the proper rate for any TCP congestion control, including Reno or CUBIC. Fixes: ca8a22634381 ("tcp: make cwnd-limited checks measurement-based, and gentler") Signed-off-by: Neal Cardwell Signed-off-by: Kevin(Yudong) Yang Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..4791fd801945 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -295,7 +295,7 @@ struct tcp_sock { u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u32 max_packets_out; /* max packets_out in last window */ - u32 max_packets_seq; /* right edge of max_packets_out flight */ + u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ -- cgit v1.2.3 From 650ae67bbf7ba5ac193f053969612fbb93247b64 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:38 -0400 Subject: counter: Introduce the Signal polarity component The Signal polarity component represents the active level of a respective Signal. There are two possible states: positive (rising edge) and negative (falling edge); enum counter_signal_polarity represents these states. A convenience macro COUNTER_COMP_POLARITY() is provided for driver authors to declare a Signal polarity component. Cc: Julien Panis Link: https://lore.kernel.org/r/8f47d6e1db71a11bb1e2666f8e2a6e9d256d4131.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/b6e53438badcb6318997d13dd2fc052f97d808ac.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index a81234bc8ea8..b3fb6b68881a 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -31,6 +31,7 @@ enum counter_comp_type { COUNTER_COMP_ENUM, COUNTER_COMP_COUNT_DIRECTION, COUNTER_COMP_COUNT_MODE, + COUNTER_COMP_SIGNAL_POLARITY, }; /** @@ -477,6 +478,15 @@ struct counter_available { #define COUNTER_COMP_FLOOR(_read, _write) \ COUNTER_COMP_COUNT_U64("floor", _read, _write) +#define COUNTER_COMP_POLARITY(_read, _write, _available) \ +{ \ + .type = COUNTER_COMP_SIGNAL_POLARITY, \ + .name = "polarity", \ + .signal_u32_read = (_read), \ + .signal_u32_write = (_write), \ + .priv = &(_available), \ +} + #define COUNTER_COMP_PRESET(_read, _write) \ COUNTER_COMP_COUNT_U64("preset", _read, _write) -- cgit v1.2.3 From 45d2918520b2d8e640e4fb3fbf664dfb823dc520 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:40 -0400 Subject: counter: Introduce the Count capture component Some devices provide a latch function to save historic Count values. This patch standardizes exposure of such functionality as Count capture components. A COUNTER_COMP_CAPTURE macro is provided for driver authors to define a capture component. A new event COUNTER_EVENT_CAPTURE is introduced to represent Count value capture events. 
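A sketch of how a counter driver might expose a capture value with the macro introduced below (the foo_* names are hypothetical):

    /* Sketch: read-only capture extension on a Count; a real driver
     * would read its capture latch register here. */
    static int foo_capture_read(struct counter_device *counter,
                                struct counter_count *count, u64 *val)
    {
            *val = 0; /* placeholder for the latched Count value */
            return 0;
    }

    static struct counter_comp foo_count_ext[] = {
            COUNTER_COMP_CAPTURE(foo_capture_read, NULL),
    };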
Cc: Julien Panis Link: https://lore.kernel.org/r/c239572ab4208d0d6728136e82a88ad464369a7a.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/3cebaa0b807a225eb277d771504fe6dba7269ffd.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index b3fb6b68881a..e160197971dd 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -453,6 +453,9 @@ struct counter_available { .priv = &(_available), \ } +#define COUNTER_COMP_CAPTURE(_read, _write) \ + COUNTER_COMP_COUNT_U64("capture", _read, _write) + #define COUNTER_COMP_CEILING(_read, _write) \ COUNTER_COMP_COUNT_U64("ceiling", _read, _write) -- cgit v1.2.3 From d2011be1e22f7769c7c71d6d7f777ffcc544808d Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:42 -0400 Subject: counter: Introduce the COUNTER_COMP_ARRAY component type The COUNTER_COMP_ARRAY Counter component type is introduced to enable support for Counter array components. With Counter array components, exposure for buffers on counter devices can be defined via new Counter array component macros. This should simplify code for driver authors who would otherwise need to define individual Counter components for each array element. Eight Counter array component macros are introduced:: DEFINE_COUNTER_ARRAY_U64(_name, _length) DEFINE_COUNTER_ARRAY_CAPTURE(_name, _length) DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) COUNTER_COMP_DEVICE_ARRAY_U64(_name, _read, _write, _array) COUNTER_COMP_COUNT_ARRAY_U64(_name, _read, _write, _array) COUNTER_COMP_SIGNAL_ARRAY_U64(_name, _read, _write, _array) COUNTER_COMP_ARRAY_CAPTURE(_read, _write, _array) COUNTER_COMP_ARRAY_POLARITY(_read, _write, _array) Eight Counter array callbacks are introduced as well:: int (*signal_array_u32_read)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u32 *val); int (*signal_array_u32_write)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u32 val); int (*device_array_u64_read)(struct counter_device *counter, size_t idx, u64 *val); int (*count_array_u64_read)(struct counter_device *counter, struct counter_count *count, size_t idx, u64 *val); int (*signal_array_u64_read)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u64 *val); int (*device_array_u64_write)(struct counter_device *counter, size_t idx, u64 val); int (*count_array_u64_write)(struct counter_device *counter, struct counter_count *count, size_t idx, u64 val); int (*signal_array_u64_write)(struct counter_device *counter, struct counter_signal *signal, size_t idx, u64 val); Driver authors can handle reads/writes for an array component by receiving an element index via the `idx` parameter and processing the respective value via the `val` parameter. 
For example, suppose a driver wants to expose a Count's read-only capture buffer of four elements using a callback `foobar_capture_read()`:: DEFINE_COUNTER_ARRAY_CAPTURE(foobar_capture_array, 4); COUNTER_COMP_ARRAY_CAPTURE(foobar_capture_read, NULL, foobar_capture_array) Respective sysfs attributes for each array element would appear for the respective Count: * /sys/bus/counter/devices/counterX/countY/capture0 * /sys/bus/counter/devices/counterX/countY/capture1 * /sys/bus/counter/devices/counterX/countY/capture2 * /sys/bus/counter/devices/counterX/countY/capture3 If a user tries to read _capture2_ for example, `idx` will be `2` when passed to the `foobar_capture_read()` callback, and thus the driver knows which array element to handle. Counter arrays for polarity elements can be defined in a similar manner as u64 elements:: const enum counter_signal_polarity foobar_polarity_states[] = { COUNTER_SIGNAL_POLARITY_POSITIVE, COUNTER_SIGNAL_POLARITY_NEGATIVE, }; DEFINE_COUNTER_ARRAY_POLARITY(foobar_polarity_array, foobar_polarity_states, 4); COUNTER_COMP_ARRAY_POLARITY(foobar_polarity_read, foobar_polarity_write, foobar_polarity_array) Tested-by: Julien Panis Link: https://lore.kernel.org/r/5310c22520aeae65b1b74952419f49ac4c8e1ec1.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/a51fd608704bdfc5a0efa503fc5481df34241e0a.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 134 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index e160197971dd..c41fa602ed28 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -32,6 +32,7 @@ enum counter_comp_type { COUNTER_COMP_COUNT_DIRECTION, COUNTER_COMP_COUNT_MODE, COUNTER_COMP_SIGNAL_POLARITY, + COUNTER_COMP_ARRAY, }; /** @@ -69,6 +70,30 @@ enum counter_comp_type { * @signal_u64_read: Signal u64 component read callback. The read value of * the respective Signal u64 component should be passed * back via the val parameter. + * @signal_array_u32_read: Signal u32 array component read callback. The + * index of the respective Count u32 array + * component element is passed via the idx + * parameter. The read value of the respective + * Count u32 array component element should be + * passed back via the val parameter. + * @device_array_u64_read: Device u64 array component read callback. The + * index of the respective Device u64 array + * component element is passed via the idx + * parameter. The read value of the respective + * Device u64 array component element should be + * passed back via the val parameter. + * @count_array_u64_read: Count u64 array component read callback. The + * index of the respective Count u64 array + * component element is passed via the idx + * parameter. The read value of the respective + * Count u64 array component element should be + * passed back via the val parameter. + * @signal_array_u64_read: Signal u64 array component read callback. The + * index of the respective Count u64 array + * component element is passed via the idx + * parameter. The read value of the respective + * Count u64 array component element should be + * passed back via the val parameter. * @action_write: Synapse action mode write callback. The write value of * the respective Synapse action mode is passed via the * action parameter. 
@@ -99,6 +124,30 @@ enum counter_comp_type { * @signal_u64_write: Signal u64 component write callback. The write value of * the respective Signal u64 component is passed via the * val parameter. + * @signal_array_u32_write: Signal u32 array component write callback. The + * index of the respective Signal u32 array + * component element is passed via the idx + * parameter. The write value of the respective + * Signal u32 array component element is passed via + * the val parameter. + * @device_array_u64_write: Device u64 array component write callback. The + * index of the respective Device u64 array + * component element is passed via the idx + * parameter. The write value of the respective + * Device u64 array component element is passed via + * the val parameter. + * @count_array_u64_write: Count u64 array component write callback. The + * index of the respective Count u64 array + * component element is passed via the idx + * parameter. The write value of the respective + * Count u64 array component element is passed via + * the val parameter. + * @signal_array_u64_write: Signal u64 array component write callback. The + * index of the respective Signal u64 array + * component element is passed via the idx + * parameter. The write value of the respective + * Signal u64 array component element is passed via + * the val parameter. */ struct counter_comp { enum counter_comp_type type; @@ -126,6 +175,17 @@ struct counter_comp { struct counter_count *count, u64 *val); int (*signal_u64_read)(struct counter_device *counter, struct counter_signal *signal, u64 *val); + int (*signal_array_u32_read)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u32 *val); + int (*device_array_u64_read)(struct counter_device *counter, + size_t idx, u64 *val); + int (*count_array_u64_read)(struct counter_device *counter, + struct counter_count *count, + size_t idx, u64 *val); + int (*signal_array_u64_read)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u64 *val); }; union { int (*action_write)(struct counter_device *counter, @@ -149,6 +209,17 @@ struct counter_comp { struct counter_count *count, u64 val); int (*signal_u64_write)(struct counter_device *counter, struct counter_signal *signal, u64 val); + int (*signal_array_u32_write)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u32 val); + int (*device_array_u64_write)(struct counter_device *counter, + size_t idx, u64 val); + int (*count_array_u64_write)(struct counter_device *counter, + struct counter_count *count, + size_t idx, u64 val); + int (*signal_array_u64_write)(struct counter_device *counter, + struct counter_signal *signal, + size_t idx, u64 val); }; }; @@ -453,6 +524,57 @@ struct counter_available { .priv = &(_available), \ } +struct counter_array { + enum counter_comp_type type; + const struct counter_available *avail; + union { + size_t length; + size_t idx; + }; +}; + +#define DEFINE_COUNTER_ARRAY_U64(_name, _length) \ + struct counter_array _name = { \ + .type = COUNTER_COMP_U64, \ + .length = (_length), \ + } + +#define DEFINE_COUNTER_ARRAY_CAPTURE(_name, _length) \ + DEFINE_COUNTER_ARRAY_U64(_name, _length) + +#define DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) \ + DEFINE_COUNTER_AVAILABLE(_name##_available, _enums); \ + struct counter_array _name = { \ + .type = COUNTER_COMP_SIGNAL_POLARITY, \ + .avail = &(_name##_available), \ + .length = (_length), \ + } + +#define COUNTER_COMP_DEVICE_ARRAY_U64(_name, _read, _write, _array) \ +{ \ + .type = 
COUNTER_COMP_ARRAY, \ + .name = (_name), \ + .device_array_u64_read = (_read), \ + .device_array_u64_write = (_write), \ + .priv = &(_array), \ +} +#define COUNTER_COMP_COUNT_ARRAY_U64(_name, _read, _write, _array) \ +{ \ + .type = COUNTER_COMP_ARRAY, \ + .name = (_name), \ + .count_array_u64_read = (_read), \ + .count_array_u64_write = (_write), \ + .priv = &(_array), \ +} +#define COUNTER_COMP_SIGNAL_ARRAY_U64(_name, _read, _write, _array) \ +{ \ + .type = COUNTER_COMP_ARRAY, \ + .name = (_name), \ + .signal_array_u64_read = (_read), \ + .signal_array_u64_write = (_write), \ + .priv = &(_array), \ +} + #define COUNTER_COMP_CAPTURE(_read, _write) \ COUNTER_COMP_COUNT_U64("capture", _read, _write) @@ -496,4 +618,16 @@ struct counter_available { #define COUNTER_COMP_PRESET_ENABLE(_read, _write) \ COUNTER_COMP_COUNT_BOOL("preset_enable", _read, _write) +#define COUNTER_COMP_ARRAY_CAPTURE(_read, _write, _array) \ + COUNTER_COMP_COUNT_ARRAY_U64("capture", _read, _write, _array) + +#define COUNTER_COMP_ARRAY_POLARITY(_read, _write, _array) \ +{ \ + .type = COUNTER_COMP_ARRAY, \ + .name = "polarity", \ + .signal_array_u32_read = (_read), \ + .signal_array_u32_write = (_write), \ + .priv = &(_array), \ +} + #endif /* _COUNTER_H_ */ -- cgit v1.2.3 From de671d6116b5210097cd6fbb877bac92536f265b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 15:19:54 -0600 Subject: block: change request end_io handler to pass back a return value In preparation for allowing the end_io handler to pass ownership of the request back to the block layer rather than retain it, change the end_io handler to return a value. Everything is just converted to returning RQ_END_IO_NONE, so there should be no functional changes with this patch. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 00a15808c137..e6fa49dd6196 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -14,7 +14,12 @@ struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 -typedef void (rq_end_io_fn)(struct request *, blk_status_t); +enum rq_end_io_ret { + RQ_END_IO_NONE, + RQ_END_IO_FREE, +}; + +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); /* * request flags */ -- cgit v1.2.3 From ab3e1d3bbab9e973aeb4dd4603251578658a47ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:24:16 -0600 Subject: block: allow end_io based requests in the completion batch handling With end_io handlers now being able to potentially pass ownership of the request upon completion, we can allow requests with end_io handlers in the batch completion handling.
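For illustration, a minimal sketch of a completion handler under the new rq_end_io_fn signature (the driver names here are hypothetical, not taken from these patches)::

	static enum rq_end_io_ret my_drv_end_io(struct request *rq,
						blk_status_t error)
	{
		struct my_drv_cmd *cmd = rq->end_io_data; /* assumed driver data */

		cmd->status = error;
		complete(&cmd->done); /* wake the submitter */

		/*
		 * The submitter keeps ownership of rq. Returning
		 * RQ_END_IO_FREE instead would hand the request back to
		 * the block layer to be freed.
		 */
		return RQ_END_IO_NONE;
	}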
Reviewed-by: Anuj Gupta Reviewed-by: Keith Busch Co-developed-by: Stefan Roesch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e6fa49dd6196..50811d0fb143 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -853,8 +853,9 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror) + if (!iob || (req->rq_flags & RQF_ELV) || ioerror) return false; + if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) -- cgit v1.2.3 From a9216fac3ed8819cbbda5d39dd5fcaa43dfd35d8 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:38 +0530 Subject: io_uring: add io_uring_cmd_import_fixed This is a new helper that callers can use to obtain a bvec iterator for the previously mapped buffer. This is preparatory work to enable fixed-buffer support for io_uring_cmd. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 58676c0a398f..1dbf51115c30 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -4,6 +4,7 @@ #include #include +#include enum io_uring_cmd_flags { IO_URING_F_COMPLETE_DEFER = 1, @@ -32,6 +33,8 @@ struct io_uring_cmd { }; #if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *)); @@ -59,6 +62,11 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t ret2) { -- cgit v1.2.3 From 9cda70f622cdcf049521a9c2886e5fd8a90a0591 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:39 +0530 Subject: io_uring: introduce fixed buffer support for io_uring_cmd Add IORING_URING_CMD_FIXED flag that is to be used for sending io_uring command with previously registered buffers. User-space passes the buffer index in sqe->buf_index, same as done in read/write variants that use fixed buffers.
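For illustration, a hedged liburing-style userspace snippet showing how the flag and buffer index fit together (MY_DEV_CMD and dev_fd are assumptions, and the buffer at index 0 must have been registered beforehand via io_uring_register_buffers())::

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = dev_fd;			/* device file descriptor */
	sqe->cmd_op = MY_DEV_CMD;		/* device-defined command */
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
	sqe->buf_index = 0;			/* previously registered buffer */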
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-3-anuj20.g@samsung.com [axboe: shuffle valid flags check before acting on it] Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1dbf51115c30..e10c5cc81082 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -28,7 +28,7 @@ struct io_uring_cmd { void *cookie; }; u32 cmd_op; - u32 pad; + u32 flags; u8 pdu[32]; /* available inline for free use */ }; -- cgit v1.2.3 From 557654025df5706785d395558244890dc4b2c875 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:40 +0530 Subject: block: add blk_rq_map_user_io Create a helper blk_rq_map_user_io for mapping of vectored as well as non-vectored requests. This will help in saving duplication of code at a few places in scsi and nvme. Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50811d0fb143..ba18e9bdb799 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -985,6 +985,8 @@ struct rq_map_data { int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); +int blk_rq_map_user_io(struct request *, struct rq_map_data *, + void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); -- cgit v1.2.3 From e5a3cc83d54019a90119ce0cf17b5e21729b79bf Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:42 -0700 Subject: net/mlx5e: Use runtime page_shift for striding RQ This commit allows striding RQ to determine MTT page size at runtime, instead of sticking to the compile-time PAGE_SIZE. This functionality will be used by a following commit that adjusts the MTT page size to the XSK frame size. Stick with PAGE_SIZE for XSK on legacy RQ: frag_stride is not used in the data path; it only helps calculate how pages are partitioned into fragments, and PAGE_SIZE will ensure each fragment starts at the beginning of a new allocation unit (XSK frame). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index be640c749d5e..afac93f9552c 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -162,6 +162,8 @@ enum { MLX5_SEND_WQE_MAX_WQEBBS = 16, }; +#define MLX5_SEND_WQE_MAX_SIZE (MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQE_BB) + enum { MLX5_WQE_FMR_PERM_LOCAL_READ = 1 << 27, MLX5_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, -- cgit v1.2.3 From 6470d2e7e8ed8e9dd560d8dc3e09d1100a17ee26 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:46 -0700 Subject: net/mlx5e: xsk: Use KSM for unaligned XSK UMR MTTs used in striding RQ have certain alignment requirements.
While it's guaranteed to work when UMR pages are aligned to the UMR page size, in practice it works when UMR pages are aligned to 8 bytes. However, it's still not enough flexibility for the unaligned mode of XSK. This patch leverages KSM to map UMR pages without alignment requirements, when unaligned XSK is active. The downside is that KSM entries are twice as big as MTTs, which limits the maximum WQE size, so regular RQs and aligned XSK continue using MTTs. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index afac93f9552c..4657d5c54abe 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -478,6 +478,12 @@ struct mlx5_klm { __be64 va; }; +struct mlx5_ksm { + __be32 reserved; + __be32 key; + __be64 va; +}; + struct mlx5_stride_block_entry { __be16 stride; __be16 bcount; -- cgit v1.2.3 From 82b1ec794d701478381482264f3bfada3a7bf2d9 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 27 Sep 2022 21:17:09 +0530 Subject: thermal: core: Increase maximum number of trip points On one of the Chrome systems, if we define more than 12 trip points, the thermal sensor probe fails with "int3403 thermal: probe of INTC1046:03 failed with error -22" and throws an error as "thermal_sys: Error: Incorrect number of thermal trips". The thermal_zone_device_register() interface needs the maximum number of trip points supported in a zone as an argument. This number can't exceed THERMAL_MAX_TRIPS, which is currently set to 12. To address this issue, the THERMAL_MAX_TRIPS value has to be increased. This interface also has an argument to specify a mask of trips which are writable. This mask is defined as an int. This mask sets the ceiling for increasing the maximum number of supported trips. With the current implementation, the maximum number of trips that can be supported is 31. Also, the THERMAL_MAX_TRIPS macro is used in only one place. So remove the THERMAL_MAX_TRIPS macro and compare num_trips directly against BITS_PER_TYPE(int) - 1. Signed-off-by: Sumeet Pawnikar Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 86c24ddd5985..6f1ec4fb7ef8 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -17,8 +17,6 @@ #include #include -#define THERMAL_MAX_TRIPS 12 - /* invalid cooling state */ #define THERMAL_CSTATE_INVALID -1UL -- cgit v1.2.3 From 88269151be679b9accb2f1e73487eaeb9eae9e39 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 29 Sep 2022 17:32:49 -0700 Subject: of: base: make of_device_compatible_match() accept const device node of_device_is_compatible() accepts a const device node pointer; there is no reason why of_device_compatible_match() can't do the same.
Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/YzY5MaU5N4A2st5R@google.com Signed-off-by: Rob Herring --- include/linux/of.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 766d002bddb9..6b79ef9a6541 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -342,7 +342,7 @@ extern int of_property_read_string_helper(const struct device_node *np, const char **out_strs, size_t sz, int index); extern int of_device_is_compatible(const struct device_node *device, const char *); -extern int of_device_compatible_match(struct device_node *device, +extern int of_device_compatible_match(const struct device_node *device, const char *const *compat); extern bool of_device_is_available(const struct device_node *device); extern bool of_device_is_big_endian(const struct device_node *device); @@ -562,7 +562,7 @@ static inline int of_device_is_compatible(const struct device_node *device, return 0; } -static inline int of_device_compatible_match(struct device_node *device, +static inline int of_device_compatible_match(const struct device_node *device, const char *const *compat) { return 0; -- cgit v1.2.3 From 1c8934b4802d2744c97e7c97d244af967f4bf141 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Jun 2022 14:57:17 +0300 Subject: clk: Remove never used devm_of_clk_del_provider() For the entire history of devm_of_clk_del_provider()'s existence (since 2017), it was never used. Remove it for good. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220623115719.52683-1-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..305e18eb23cf 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1454,7 +1454,7 @@ int devm_of_clk_add_hw_provider(struct device *dev, void *data), void *data); void of_clk_del_provider(struct device_node *np); -void devm_of_clk_del_provider(struct device *dev); + struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec, void *data); struct clk_hw *of_clk_hw_simple_get(struct of_phandle_args *clkspec, @@ -1491,7 +1491,7 @@ static inline int devm_of_clk_add_hw_provider(struct device *dev, return 0; } static inline void of_clk_del_provider(struct device_node *np) {} -static inline void devm_of_clk_del_provider(struct device *dev) {} + static inline struct clk *of_clk_src_simple_get( struct of_phandle_args *clkspec, void *data) { -- cgit v1.2.3 From 07bdf48d3fee12268bf9a179821aee9b80f5239c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Jun 2022 14:57:18 +0300 Subject: clkdev: Remove never used devm_clk_release_clkdev() For the entire history of devm_clk_release_clkdev()'s existence (since 2018), it was never used. Remove it for good.
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220623115719.52683-2-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index 8a8423eb8e9a..45570bc21a43 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -46,6 +46,4 @@ int clk_hw_register_clkdev(struct clk_hw *, const char *, const char *); int devm_clk_hw_register_clkdev(struct device *dev, struct clk_hw *hw, const char *con_id, const char *dev_id); -void devm_clk_release_clkdev(struct device *dev, const char *con_id, - const char *dev_id); #endif -- cgit v1.2.3 From 854701ba4c39afae2362ba19a580c461cb183e4f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:54 -0700 Subject: net: fix cpu_max_bits_warn() usage in netif_attrmask_next{,_and} The functions are required to be passed a cpu index one prior to the first index to search, so the valid input range is [-1, nr_cpu_ids-1). However, the code checks against [-1, nr_cpu_ids). Acked-by: Jakub Kicinski Signed-off-by: Yury Norov --- include/linux/netdevice.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 05d6f3facd5a..4d6d5a2dd82e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3643,9 +3643,8 @@ static inline bool netif_attr_test_online(unsigned long j, static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { - /* -1 is a legal arg here. */ - if (n != -1) - cpu_max_bits_warn(n, nr_bits); + /* n is a prior cpu */ + cpu_max_bits_warn(n + 1, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); @@ -3666,9 +3665,8 @@ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { - /* -1 is a legal arg here. */ - if (n != -1) - cpu_max_bits_warn(n, nr_bits); + /* n is a prior cpu */ + cpu_max_bits_warn(n + 1, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); -- cgit v1.2.3 From 33e67710beda78aed38a2fe10be6088d4aeb1c53 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:55 -0700 Subject: cpumask: switch for_each_cpu{,_not} to use for_each_bit() The difference between for_each_cpu() and for_each_set_bit() is that the former uses cpumask_next() instead of find_next_bit(), and so calls cpumask_check(). This check is useless because the iterator value is not provided by the user. It generates false positives for the very last iteration of for_each_cpu(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 12 +++--------- include/linux/find.h | 5 +++++ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e4f9136a4a63..9c85774d45a4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -246,9 +246,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, * After the loop, cpu is >= nr_cpu_ids. 
*/ #define for_each_cpu(cpu, mask) \ - for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) + for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) /** * for_each_cpu_not - iterate over every cpu in a complemented mask @@ -258,9 +256,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_not(cpu, mask) \ - for ((cpu) = -1; \ (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) + for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) #if NR_CPUS == 1 static inline @@ -313,9 +309,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ - for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) + for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits) /** * cpumask_any_but - return a "random" cpu in a cpumask, but not this one. diff --git a/include/linux/find.h b/include/linux/find.h index b100944daba0..128615a3f93e 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -390,6 +390,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +#define for_each_and_bit(bit, addr1, addr2, size) \ + for ((bit) = find_next_and_bit((addr1), (addr2), (size), 0); \ + (bit) < (size); \ + (bit) = find_next_and_bit((addr1), (addr2), (size), (bit) + 1)) + /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ -- cgit v1.2.3 From 6cc18331a987c4a29d66b9c4fd292587fba4d7bd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:56 -0700 Subject: lib/find_bit: add find_next{,_and}_bit_wrap The helper is better optimized for the worst case: in the case of an empty cpumask, the current code traverses 2 * size: next = cpumask_next_and(prev, src1p, src2p); if (next >= nr_cpu_ids) next = cpumask_first_and(src1p, src2p); At bitmap level we can stop earlier after checking 'size + offset' bits. Signed-off-by: Yury Norov --- include/linux/find.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 128615a3f93e..77c087b7a451 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -290,6 +290,52 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) } #endif +/** + * find_next_and_bit_wrap - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit, or first set bit up to @offset. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_and_bit_wrap(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size, unsigned long offset) +{ + unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); + + if (bit < size) + return bit; + + bit = find_first_and_bit(addr1, addr2, offset); + return bit < offset ? 
bit : size; +} + +/** + * find_next_bit_wrap - find the next set bit in a memory region + * @addr: The address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit, or first set bit up to @offset. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_bit_wrap(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + unsigned long bit = find_next_bit(addr, size, offset); + + if (bit < size) + return bit; + + bit = find_first_bit(addr, offset); + return bit < offset ? bit : size; +} + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump -- cgit v1.2.3 From 4fe49b3b97c2640147c46519c2a6fdb06df34f5f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:57 -0700 Subject: lib/bitmap: introduce for_each_set_bit_wrap() macro Add for_each_set_bit_wrap() macro and use it in for_each_cpu_wrap(). The new macro is based on the __for_each_wrap() iterator, which is simpler and smaller than cpumask_next_wrap(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 6 ++---- include/linux/find.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9c85774d45a4..13b32dd9803b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -289,10 +289,8 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * * After the loop, cpu is >= nr_cpu_ids. */ -#define for_each_cpu_wrap(cpu, mask, start) \ - for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ - (cpu) < nr_cpumask_bits; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) +#define for_each_cpu_wrap(cpu, mask, start) \ + for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks diff --git a/include/linux/find.h b/include/linux/find.h index 77c087b7a451..3b746a183216 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -336,6 +336,32 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, return bit < offset ? bit : size; } +/* + * Helper for for_each_set_bit_wrap(). Make sure you're doing the right thing + * before using it alone. + */ +static inline +unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, + unsigned long start, unsigned long n) +{ + unsigned long bit; + + /* If not wrapped around */ + if (n > start) { + /* and have a bit, just return it. */ + bit = find_next_bit(bitmap, size, n); + if (bit < size) + return bit; + + /* Otherwise, wrap around and ... */ + n = 0; + } + + /* Search the other part. */ + bit = find_next_bit(bitmap, start, n); + return bit < start ? bit : size; +} + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump @@ -514,6 +540,19 @@ unsigned long find_next_bit_le(const void *addr, unsigned (b) = find_next_zero_bit((addr), (size), (e) + 1), \ (e) = find_next_bit((addr), (size), (b) + 1)) +/** + * for_each_set_bit_wrap - iterate over all set bits starting from @start, and + * wrapping around the end of the bitmap. 
+ * @bit: offset for current iteration + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + * @start: Starting bit for bitmap traversing, wrapping around the bitmap end + */ +#define for_each_set_bit_wrap(bit, addr, size, start) \ + for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ + (bit) < (size); \ + (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset -- cgit v1.2.3 From fdae96a3fc7f70eb8ff9619550d7fa604719626a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:58 -0700 Subject: lib/find: optimize for_each() macros Moving the iterator of the macros inside the conditional part of the for-loop helps to generate better code. This was first implemented in commit 7baac8b91f9871ba ("cpumask: make for_each_cpu_mask a bit smaller"). Now that cpumask for-loops are the aliases to bitmap loops, it's worth optimizing them the same way. Bloat-o-meter says: add/remove: 8/12 grow/shrink: 147/592 up/down: 4876/-24416 (-19540) Signed-off-by: Yury Norov --- include/linux/find.h | 56 +++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 3b746a183216..0cdfab9734a6 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -458,31 +458,25 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), 0); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) + for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ - for ((bit) = find_next_and_bit((addr1), (addr2), (size), 0); \ - (bit) < (size); \ - (bit) = find_next_and_bit((addr1), (addr2), (size), (bit) + 1)) + for ((bit) = 0; \ + (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ + (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) + for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), 0); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + for ((bit) = 0; \ + (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) @@ -492,11 +486,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ - for ((b) = find_next_bit((addr), (size), 0), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + for ((b) = 0; \ + (b) = find_next_bit((addr), (size), b), \ + (e) = find_next_zero_bit((addr), 
(size), (b) + 1), \ (b) < (size); \ - (b) = find_next_bit((addr), (size), (e) + 1), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) @@ -506,11 +500,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ - for ((b) = find_next_bit((addr), (size), (b)), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + for (; \ + (b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ - (b) = find_next_bit((addr), (size), (e) + 1), \ - (e) = find_next_zero_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) @@ -520,11 +514,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ - for ((b) = find_next_zero_bit((addr), (size), 0), \ - (e) = find_next_bit((addr), (size), (b) + 1); \ + for ((b) = 0; \ + (b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ - (b) = find_next_zero_bit((addr), (size), (e) + 1), \ - (e) = find_next_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) @@ -534,11 +528,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ - for ((b) = find_next_zero_bit((addr), (size), (b)), \ - (e) = find_next_bit((addr), (size), (b) + 1); \ + for (; \ + (b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ - (b) = find_next_zero_bit((addr), (size), (e) + 1), \ - (e) = find_next_bit((addr), (size), (b) + 1)) + (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and -- cgit v1.2.3 From 78e5a3399421ad79fc024e6d78e2deb7809d26af Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 19 Sep 2022 14:05:53 -0700 Subject: cpumask: fix checking valid cpu range The range of valid CPUs is [0, nr_cpu_ids). Some cpumask functions are passed a shifted CPU index, and for them, the valid range is [-1, nr_cpu_ids-1). Currently for those functions, we check the index against [-1, nr_cpu_ids), which is wrong. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 13b32dd9803b..286804bfe3b7 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -174,9 +174,8 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { - /* -1 is a legal arg here. */ - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); } @@ -189,9 +188,8 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { - /* -1 is a legal arg here. 
*/ - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } @@ -231,9 +229,8 @@ static inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { - /* -1 is a legal arg here. */ - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits, n + 1); } @@ -263,8 +260,8 @@ static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); - if (n != -1) - cpumask_check(n); + /* n is a prior cpu */ + cpumask_check(n + 1); /* * Return the first available CPU when wrapping, or when starting before cpu0, -- cgit v1.2.3 From fd580c9830316edad6f8b1d9f542563730658efe Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 30 Sep 2022 16:21:00 +0200 Subject: net: sfp: augment SFP parsing with phy_interface_t bitmap We currently parse the SFP EEPROM to a bitmap of ethtool link modes, and then attempt to convert the link modes to a PHY interface mode. While this works at present, there are cases where this is sub-optimal. For example, where a module can operate with several different PHY interface modes. To start addressing this, arrange for the SFP EEPROM parsing to also provide a bitmap of the possible PHY interface modes. Signed-off-by: Russell King Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/sfp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 302094b855fb..d1f343853b6c 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -535,7 +535,7 @@ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support); + unsigned long *support, unsigned long *interfaces); phy_interface_t sfp_select_interface(struct sfp_bus *bus, unsigned long *link_modes); @@ -568,7 +568,8 @@ static inline bool sfp_may_have_phy(struct sfp_bus *bus, static inline void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support) + unsigned long *support, + unsigned long *interfaces) { } -- cgit v1.2.3 From eca68a3c7d05b38b4e728cead0c49718f2bc1d5a Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 30 Sep 2022 16:21:03 +0200 Subject: net: phylink: pass supported host PHY interface modes to phylib for SFP's PHYs Pass the supported PHY interface types to phylib if the PHY we are connecting is inside an SFP, so that the PHY driver can select an appropriate host configuration mode for its interface according to the host capabilities. For example, the Marvell 88X3310 PHY inside RollBall SFP modules defaults to 10gbase-r mode on the host's side, and the marvell10g driver currently does not change this setting. But a host may not support 10gbase-r. For example, Turris Omnia only supports sgmii, 1000base-x and 2500base-x modes. The PHY can be configured to use those modes, but in order for the PHY driver to do that, it needs to know which modes are supported.
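To illustrate how a PHY driver might consume this information, a minimal sketch (hypothetical driver code; the actual marvell10g changes are in a separate patch)::

	static int my_phy_config_init(struct phy_device *phydev)
	{
		/* Fall back to 2500base-x when the host cannot do 10gbase-r. */
		if (!test_bit(PHY_INTERFACE_MODE_10GBASER,
			      phydev->host_interfaces) &&
		    test_bit(PHY_INTERFACE_MODE_2500BASEX,
			     phydev->host_interfaces))
			phydev->interface = PHY_INTERFACE_MODE_2500BASEX;

		return 0;
	}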
Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9c66f357f489..d65fc76fe0ae 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -571,6 +571,7 @@ struct macsec_ops; * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @lp_advertising: Current link partner advertised linkmodes + * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode @@ -670,6 +671,9 @@ struct phy_device { /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); + /* Host supported PHY interface types. Should be ignored if empty. */ + DECLARE_PHY_INTERFACE_MASK(host_interfaces); + /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; -- cgit v1.2.3 From e85b1347ace677c3822c12d9332dfaaffe594da6 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 30 Sep 2022 16:21:08 +0200 Subject: net: sfp: create/destroy I2C mdiobus before PHY probe/after PHY release Instead of configuring the I2C mdiobus when the SFP driver is probed, create/destroy the mdiobus before the PHY is probed for/after it is released. This way we can tell the mdio-i2c code which protocol to use for each SFP transceiver. Move the code that determines the MDIO I2C protocol from sfp_sm_probe_for_phy() to sfp_sm_mod_probe(), where most of the SFP ID parsing is done. Don't allocate the I2C bus if no PHY is expected. Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/mdio/mdio-i2c.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h index b1d27f7cd23f..3bde1a555a49 100644 --- a/include/linux/mdio/mdio-i2c.h +++ b/include/linux/mdio/mdio-i2c.h @@ -11,6 +11,12 @@ struct device; struct i2c_adapter; struct mii_bus; +enum mdio_i2c_proto { + MDIO_I2C_NONE, + MDIO_I2C_MARVELL_C22, + MDIO_I2C_C45, +}; + struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); #endif -- cgit v1.2.3 From 09bbedac72d5a9267088c15d1a71c8c3a8fb47e7 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Fri, 30 Sep 2022 16:21:09 +0200 Subject: net: phy: mdio-i2c: support I2C MDIO protocol for RollBall SFP modules Some multigig SFPs from RollBall and Hilink do not expose functional MDIO access to the internal PHY of the SFP via I2C address 0x56 (although there seems to be read-only clause 22 access on this address). Instead, these SFPs' PHY can be accessed via the SFP Enhanced Digital Diagnostic Interface at I2C address 0x51. SFP_PAGE has to be set to 3 and the password must be filled with 0xff bytes for this PHY communication to work. This extends the mdio-i2c driver to support this protocol by adding a special parameter to the mdio_i2c_alloc() function via which the RollBall protocol can be selected. Signed-off-by: Marek Behún Cc: Andrew Lunn Cc: Russell King Signed-off-by: David S.
Miller --- include/linux/mdio/mdio-i2c.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h index 3bde1a555a49..65b550a6fc32 100644 --- a/include/linux/mdio/mdio-i2c.h +++ b/include/linux/mdio/mdio-i2c.h @@ -15,8 +15,10 @@ enum mdio_i2c_proto { MDIO_I2C_NONE, MDIO_I2C_MARVELL_C22, MDIO_I2C_C45, + MDIO_I2C_ROLLBALL, }; -struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); +struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c, + enum mdio_i2c_proto protocol); #endif -- cgit v1.2.3 From 62c07983bef9d3e78e71189441e1a470f0d1e653 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Oct 2022 13:51:02 -0700 Subject: once: add DO_ONCE_SLOW() for sleepable contexts Christophe Leroy reported a ~80ms latency spike happening at first TCP connect() time. This is because __inet_hash_connect() uses get_random_once() to populate a perturbation table which became quite big after commit 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16"). get_random_once() uses DO_ONCE(), which blocks hard irqs for the duration of the operation. This patch adds DO_ONCE_SLOW() which uses a mutex instead of a spinlock for operations where we prefer to stay in process context. Then __inet_hash_connect() can use get_random_slow_once() to populate its perturbation table. Fixes: 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16") Fixes: 190cc82489f4 ("tcp: change source port randomizarion at connect() time") Reported-by: Christophe Leroy Link: https://lore.kernel.org/netdev/CANn89iLAEYBaoYajy0Y9UmGFff5GPxDUoG-ErVB2jDdRNQ5Tug@mail.gmail.com/T/#t Signed-off-by: Eric Dumazet Cc: Willy Tarreau Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/once.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index b14d8b309d52..176ab75b42df 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,10 +5,18 @@ #include #include +/* Helpers used from arbitrary contexts. + * Hard irqs are blocked, be cautious. + */ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod); +/* Variant for process contexts only. */ +bool __do_once_slow_start(bool *done); +void __do_once_slow_done(bool *done, struct static_key_true *once_key, + struct module *mod); + /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only * once, where DO_ONCE() can live in the fast-path. After @func has @@ -52,7 +60,27 @@ void __do_once_done(bool *done, struct static_key_true *once_key, ___ret; \ }) +/* Variant of DO_ONCE() for process/sleepable contexts. */ +#define DO_ONCE_SLOW(func, ...) 
\ + ({ \ + bool ___ret = false; \ + static bool __section(".data.once") ___done = false; \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ + ___ret = __do_once_slow_start(&___done); \ + if (unlikely(___ret)) { \ + func(__VA_ARGS__); \ + __do_once_slow_done(&___done, &___once_key, \ + THIS_MODULE); \ + } \ + } \ + ___ret; \ + }) + #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) +#define get_random_slow_once(buf, nbytes) \ + DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes)) + #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From 79e1119b7e0099c6c9379ca3129ffb7aa2a1c249 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:47 +0800 Subject: ksm: remove redundant declarations in ksm.h Currently, struct stable_node is used neither in the include/linux/ksm.h file nor in the files that include it. struct mem_cgroup is likewise not used in ksm.h. These forward declarations are redundant, so just remove them. Link: https://lkml.kernel.org/r/20220831031951.43152-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/ksm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0b4f17418f64..7e232ba59b86 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -15,9 +15,6 @@ #include #include -struct stable_node; -struct mem_cgroup; - #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); -- cgit v1.2.3 From 379708ffde1b049bc41084e0a0572c44c8a1d2c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:58 +0100 Subject: mm: add the first tail page to struct folio Some of the static checkers get confused by extracting the page from the folio and referring to fields in the first tail page. Adding these fields to struct folio lets us avoid doing that. It has the risk that people will refer to those fields without checking that the folio is actually a large folio, so prefix them with underscores and document the preferred function to use instead. Link: https://lkml.kernel.org/r/20220902194653.1739778-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f30f262431c..5c87d0f292a2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -245,6 +245,13 @@ struct page { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @_flags_1: For large folios, additional page flags. + * @__head: Points to the folio. Do not use. + * @_folio_dtor: Which destructor to use for this folio. + * @_folio_order: Do not use directly, call folio_order(). + * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). + * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * * A folio is a physically, virtually and logically contiguous set * of bytes. 
It is a power-of-two in size, and it is aligned to that @@ -283,9 +290,17 @@ struct folio { }; struct page page; }; + unsigned long _flags_1; + unsigned long __head; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; +#ifdef CONFIG_64BIT + unsigned int _folio_nr_pages; +#endif }; -static_assert(sizeof(struct page) == sizeof(struct folio)); #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); @@ -300,6 +315,19 @@ FOLIO_MATCH(_refcount, _refcount); FOLIO_MATCH(memcg_data, memcg_data); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + sizeof(struct page)) +FOLIO_MATCH(flags, _flags_1); +FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_dtor, _folio_dtor); +FOLIO_MATCH(compound_order, _folio_order); +FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_pincount, _pincount); +#ifdef CONFIG_64BIT +FOLIO_MATCH(compound_nr, _folio_nr_pages); +#endif +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { -- cgit v1.2.3 From c3a15bff46cb5149aeae4c8ae69443d791fa6578 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:59 +0100 Subject: mm: reimplement folio_order() and folio_nr_pages() Instead of calling compound_order() and compound_nr(), use the folio directly. Saves 1905 bytes from mm/filemap.o due to folio_test_large() now being a cheaper check than PageHead(). Link: https://lkml.kernel.org/r/20220902194653.1739778-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e56dd8f7eae1..a37c8a29c49b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,7 +729,9 @@ static inline unsigned int compound_order(struct page *page) */ static inline unsigned int folio_order(struct folio *folio) { - return compound_order(&folio->page); + if (!folio_test_large(folio)) + return 0; + return folio->_folio_order; } #include @@ -1659,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ static inline long folio_nr_pages(struct folio *folio) { - return compound_nr(&folio->page); + if (!folio_test_large(folio)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif } /** -- cgit v1.2.3 From d788f5b374c2ba204fed57e39acf2452acc24812 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:00 +0100 Subject: mm: add split_folio() This wrapper removes the need to use split_huge_page(&folio->page). Convert two callers.
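The conversion at call sites is mechanical; a brief before/after sketch (hypothetical caller)::

	/* Before: the caller had to reach into the folio. */
	if (split_huge_page(&folio->page))
		goto out;

	/* After: using the wrapper added here. */
	if (split_folio(folio))
		goto out;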
Link: https://lkml.kernel.org/r/20220902194653.1739778-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 38265f9f782e..a1341fdcf666 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -444,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio, return split_huge_page_to_list(&folio->page, list); } +static inline int split_folio(struct folio *folio) +{ + return split_folio_to_list(folio, NULL); +} + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to -- cgit v1.2.3 From 681ecf6301786cb06942b57f0ef7103b07ae6813 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:01 +0100 Subject: mm: add folio_add_lru_vma() Convert lru_cache_add_inactive_or_unevictable() to folio_add_lru_vma() and add a compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-6-willy@infradead.org Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6308150b234a..2ede1e3695d9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -379,11 +379,11 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -extern void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_pages); -extern void lru_note_cost_folio(struct folio *); -extern void folio_add_lru(struct folio *); -extern void lru_cache_add(struct page *); +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); +void lru_note_cost_folio(struct folio *); +void folio_add_lru(struct folio *); +void folio_add_lru_vma(struct folio *, struct vm_area_struct *); +void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); -- cgit v1.2.3 From 907ea17eb2b436f07332c935476d77893abae735 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:04 +0100 Subject: shmem: convert shmem_replace_page() to use folios throughout Introduce folio_set_swap_entry() to abstract how both folio->private and swp_entry_t work. Use swap_address_space() directly instead of indirecting through folio_mapping(). Include an assertion that the old folio is not large as we only allocate a single-page folio to replace it. Use folio_put_refs() instead of calling folio_put() twice. 
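A short illustrative pairing of the new setter with the existing folio_swap_entry() getter (sketch only, not a hunk from this patch)::

	swp_entry_t entry = folio_swap_entry(old_folio);

	/* Instead of open-coding new_folio->private = old_folio->private: */
	folio_set_swap_entry(new_folio, entry);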
Link: https://lkml.kernel.org/r/20220902194653.1739778-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ede1e3695d9..61e13d1a4cab 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) +{ + folio->private = (void *)entry.val; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); -- cgit v1.2.3 From bdb0ed54a4768dc3c2613d4c45f94c887d43cd7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:06 +0100 Subject: mm/swapfile: convert try_to_free_swap() to folio_free_swap() Add kernel-doc for folio_free_swap() and make it return bool. Add a try_to_free_swap() compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 61e13d1a4cab..dac6308d878e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -490,6 +490,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); +bool folio_free_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -606,6 +607,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio) return entry; } +static inline bool folio_free_swap(struct folio *folio) +{ + return false; +} + static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) -- cgit v1.2.3 From 4081f7446d95a9d3ced12dc04ff02c187a761e90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:09 +0100 Subject: mm/swap: convert put_swap_page() to put_swap_folio() With all callers now using a folio, we can convert this function. 
Link: https://lkml.kernel.org/r/20220902194653.1739778-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index dac6308d878e..42cbef554de6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -491,7 +491,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); -extern void put_swap_page(struct page *page, swp_entry_t entry); +void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -576,7 +576,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void put_swap_page(struct page *page, swp_entry_t swp) +static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } -- cgit v1.2.3 From 6599591816f522c1cc8ec4eb5cea75738963756a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:12 +0100 Subject: memcg: convert mem_cgroup_swapin_charge_page() to mem_cgroup_swapin_charge_folio() All callers now have a folio, so pass it in here and remove an unnecessary call to page_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60545e4a1c03..ca0df42662ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -688,7 +688,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1254,7 +1254,7 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } -static inline int mem_cgroup_swapin_charge_page(struct page *page, +static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; -- cgit v1.2.3 From 4e1fc793ad9892cec67b40c9f67583160e08f695 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:20 +0100 Subject: shmem: add shmem_get_folio() With no remaining callers of shmem_getpage_gfp(), add shmem_get_folio() and reimplement shmem_getpage() as a call to shmem_get_folio(). 
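One plausible shape for the reimplemented wrapper, assuming the mm/shmem.c side mirrors the header change below (a sketch, not the actual implementation):

	int shmem_getpage(struct inode *inode, pgoff_t index,
			  struct page **pagep, enum sgp_type sgp)
	{
		struct folio *folio = NULL;
		int ret = shmem_get_folio(inode, index, &folio, sgp);

		/* a large-folio-aware caller might use folio_file_page(folio, index) */
		*pagep = folio ? &folio->page : NULL;
		return ret;
	}
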
Link: https://lkml.kernel.org/r/20220902194653.1739778-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff0b990de83d..f4bd50b08a91 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,6 +113,8 @@ enum sgp_type { extern int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) -- cgit v1.2.3 From 923e2f0e7c30db5c1ee5d680050ab781e6c114fb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:29 +0100 Subject: shmem: remove shmem_getpage() With all callers removed, remove this wrapper function. The flags are now mysteriously called SGP, but I think we can live with that. Link: https://lkml.kernel.org/r/20220902194653.1739778-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f4bd50b08a91..f24071e3c826 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,7 +102,7 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); -/* Flag allocation requirements to shmem_getpage */ +/* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ @@ -111,8 +111,6 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -extern int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp); int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); -- cgit v1.2.3 From 9202d527b715f67bcdccbb9b712b65fe053f8109 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). 
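For illustration only, the call-site conversion at a hypothetical caller that already holds a folio:

	bool full;

	/* before: */ full = mem_cgroup_swap_full(&folio->page);
	/* after, no compound_head() round trip: */ full = mem_cgroup_swap_full(folio);
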
Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 42cbef554de6..d8bd6401c3e7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -692,7 +692,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); -extern bool mem_cgroup_swap_full(struct page *page); +extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { @@ -714,7 +714,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return get_nr_swap_pages(); } -static inline bool mem_cgroup_swap_full(struct page *page) +static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } -- cgit v1.2.3 From 3b344157c0c15b8f9588e3021dfb22ee25f4508a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:44 +0100 Subject: mm: remove try_to_free_swap() All callers have now been converted to folio_free_swap() and we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-49-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d8bd6401c3e7..fc8d98660326 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -595,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int try_to_free_swap(struct page *page) -{ - return 0; -} - static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; -- cgit v1.2.3 From 29eea9b5a9c9ecf21164a082a42bfabe06fdcb30 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:50 +0100 Subject: mm: convert page_get_anon_vma() to folio_get_anon_vma() With all callers now passing in a folio, rename the function and convert all callers. Removes a couple of calls to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-55-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 72b2bcc37f73..3d56e3712bb2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -163,7 +163,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *page_get_anon_vma(struct page *page); +struct anon_vma *folio_get_anon_vma(struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. 
*/ typedef int __bitwise rmap_t; -- cgit v1.2.3 From 0c826c0b6a176b9ed5ace7106fd1770bb48f1898 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:51 +0100 Subject: rmap: remove page_unlock_anon_vma_read() This was simply an alias for anon_vma_unlock_read() since 2011. Link: https://lkml.kernel.org/r/20220902194653.1739778-56-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d56e3712bb2..ca3e4ba6c58c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -458,13 +458,8 @@ struct rmap_walk_control { void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); - -/* - * Called by memory-failure.c to kill processes. - */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, struct rmap_walk_control *rwc); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ -- cgit v1.2.3 From 19672a9e4a75252871cba319f4e3b859b8fdf671 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:53 +0100 Subject: mm: convert lock_page_or_retry() to folio_lock_or_retry() Remove a call to compound_head() in each of the two callers. Link: https://lkml.kernel.org/r/20220902194653.1739778-58-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09de43e36a64..32846b6306db 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page) } /* - * lock_page_or_retry - Lock the page, unless this would block and the + * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, - unsigned int flags) +static inline bool folio_lock_or_retry(struct folio *folio, + struct mm_struct *mm, unsigned int flags) { - struct folio *folio; might_sleep(); - - folio = page_folio(page); return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } -- cgit v1.2.3 From f372bde922e2ced8e0b5a928887b4cf587cc4453 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:29 +0200 Subject: kasan: only define kasan_metadata_size for Generic mode KASAN provides a helper for calculating the size of per-object metadata stored in the redzone. As now only the Generic mode uses per-object metadata, only define kasan_metadata_size() for this mode. 
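A hedged sketch of the kind of caller this serves; 'cache->object_size' stands in for the slab allocator's real bookkeeping and is an assumption here:

	/* usable object bytes exclude the per-object KASAN metadata in the redzone */
	size_t usable = cache->object_size - kasan_metadata_size(cache);
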
Link: https://lkml.kernel.org/r/8f81d4938b80446bc72538a08217009f328a3e23.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b092277bf48d..027df7599573 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -150,14 +150,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) __kasan_cache_create_kmalloc(cache); } -size_t __kasan_metadata_size(struct kmem_cache *cache); -static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) -{ - if (kasan_enabled()) - return __kasan_metadata_size(cache); - return 0; -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -282,7 +274,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -333,6 +324,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +size_t kasan_metadata_size(struct kmem_cache *cache); + void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -340,6 +333,12 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return 0; +} + static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -- cgit v1.2.3 From 3b7f8813e9ecf7fe91f2f8dc3b581a111cd374a5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:30 +0200 Subject: kasan: only define kasan_never_merge for Generic mode KASAN prevents merging of slab caches whose objects have per-object metadata stored in redzones. As now only the Generic mode uses per-object metadata, define kasan_never_merge() only for this mode. 
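A hedged sketch of the consumer side; the flag mask shown is illustrative, the real composition lives in mm/slab_common.c:

	/* caches carrying any of these flags must never be merged */
	slab_flags_t never_merge = SLAB_TYPESAFE_BY_RCU | kasan_never_merge();
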
Link: https://lkml.kernel.org/r/81ed01f29ff3443580b7e2fe362a8b47b1e8006d.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 027df7599573..9743d4b3a918 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -103,14 +103,6 @@ struct kasan_cache { bool is_kmalloc; }; -slab_flags_t __kasan_never_merge(void); -static __always_inline slab_flags_t kasan_never_merge(void) -{ - if (kasan_enabled()) - return __kasan_never_merge(); - return 0; -} - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -261,10 +253,6 @@ static __always_inline bool kasan_check_byte(const void *addr) #else /* CONFIG_KASAN */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} @@ -325,6 +313,7 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC size_t kasan_metadata_size(struct kmem_cache *cache); +slab_flags_t kasan_never_merge(void); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -338,6 +327,11 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +/* And thus nothing prevents cache merging. */ +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} -- cgit v1.2.3 From 26f21f3ac76df6cf3b447e8231f8754991165475 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:31 +0200 Subject: kasan: only define metadata offsets for Generic mode Hide the definitions of alloc_meta_offset and free_meta_offset under an ifdef CONFIG_KASAN_GENERIC check, as these fields are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/d4bafa0534facafd1a23c465a94261e64f366493.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9743d4b3a918..a212c2e3f32d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -98,8 +98,10 @@ static inline bool kasan_has_integrated_init(void) #ifdef CONFIG_KASAN struct kasan_cache { +#ifdef CONFIG_KASAN_GENERIC int alloc_meta_offset; int free_meta_offset; +#endif bool is_kmalloc; }; -- cgit v1.2.3 From 682ed08924407b719fa0b1123a26971748d76ace Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:33 +0200 Subject: kasan: only define kasan_cache_create for Generic mode Right now, kasan_cache_create() assigns SLAB_KASAN for all KASAN modes and then sets up metadata-related cache parameters for the Generic mode. SLAB_KASAN is used in two places: 1. 
In slab_ksize() to account for per-object metadata when calculating the size of the accessible memory within the object. 2. In slab_common.c via kasan_never_merge() to prevent merging of caches with per-object metadata. Both cases are only relevant when per-object metadata is present, which is only the case with the Generic mode. Thus, assign SLAB_KASAN and define kasan_cache_create() only for the Generic mode. Also update the SLAB_KASAN-related comment. Link: https://lkml.kernel.org/r/61faa2aa1906e2d02c97d00ddf99ce8911dda095.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ include/linux/slab.h | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a212c2e3f32d..d811b3d7d2a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -128,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page, __kasan_unpoison_pages(page, order, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); -static __always_inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, slab_flags_t *flags) -{ - if (kasan_enabled()) - __kasan_cache_create(cache, size, flags); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache); static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) { @@ -260,9 +251,6 @@ static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_unpoison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, - slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -316,6 +304,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} size_t kasan_metadata_size(struct kmem_cache *cache); slab_flags_t kasan_never_merge(void); +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -334,6 +324,10 @@ static inline slab_flags_t kasan_never_merge(void) { return 0; } +/* And no cache-related metadata initialization is required. 
*/ +static inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/slab.h b/include/linux/slab.h index 352e3f082acc..617a39f7db46 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,7 +106,7 @@ # define SLAB_ACCOUNT 0 #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 -- cgit v1.2.3 From 36001cba4f728e7fa2a58bc69fece22eaeef5cca Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 6 Sep 2022 23:18:47 +0800 Subject: mm/damon/core: iterate the regions list from current point in damon_set_regions() We iterate the whole regions list every time to get the first/last regions intersecting with the specific range in damon_set_regions(), in order to add a new region or resize existing regions to fit in the specific range. Actually, it is unnecessary to iterate the newly added regions and the front regions that have already been checked. Just iterate the regions list from the current point using list_for_each_entry_from() every time to improve performance. The kunit tests passed: [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 Link: https://lkml.kernel.org/r/1662477527-13003-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7b1f4a488230..d54acec048d6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -463,9 +463,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) return list_last_entry(&t->regions_list, struct damon_region, list); } +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + #define damon_for_each_region_safe(r, next, t) \ list_for_each_entry_safe(r, next, &t->regions_list, list) -- cgit v1.2.3 From 4f9bc69ac5ce34071a9a51343bc81ca76cb2e3f1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:42 +0800 Subject: mm: reuse pageblock_start/end_pfn() macro Move pageblock_start_pfn()/pageblock_end_pfn() into pageblock-flags.h so they can be used elsewhere, not only in compaction. Also use ALIGN_DOWN() instead of round_down() to pair with ALIGN(); the two should be the same for pageblock usage.
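As a worked example of the ALIGN_DOWN()/ALIGN() pairing (illustrative only, assuming pageblock_order == 9 so that pageblock_nr_pages == 512):

	pageblock_start_pfn(1000);	/* == 512:  ALIGN_DOWN(1000, 512) */
	pageblock_end_pfn(1000);	/* == 1024: ALIGN(1000 + 1, 512), the first pfn past the block */
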
Link: https://lkml.kernel.org/r/20220907060844.126891-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 83c7248053a1..a09b7fe6bbf8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,8 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) +#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) /* Forward declaration */ struct page; -- cgit v1.2.3 From 5f7fa13fa858c17580ed513bd5e0a4b36d68fdd6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:43 +0800 Subject: mm: add pageblock_align() macro Add pageblock_align() macro and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index a09b7fe6bbf8..293c76630fa8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,7 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) -- cgit v1.2.3 From ee0913c4719610204315a0d8a35122c6233249e0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:44 +0800 Subject: mm: add pageblock_aligned() macro Add pageblock_aligned() and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Cc: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 293c76630fa8..5f1ae07d724b 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,6 +54,7 @@ extern unsigned int pageblock_order; #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) -- cgit v1.2.3 From 410f8e82689e1e66044fea51ef852054a09502b7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:35 +0000 Subject: memcg: extract memcg_vmstats from struct mem_cgroup Patch series "memcg: reduce memory overhead of memory cgroups". Currently a lot of memory is wasted to maintain the vmevents for memory cgroups as we have multiple arrays of size NR_VM_EVENT_ITEMS which can be as large as 110. 
However memcg code uses a small portion of those entries. This patch series eliminates this overhead by removing the unneeded vmevent entries from memory cgroup data structures. This patch (of 3): This is a preparatory patch to reduce the memory overhead of memory cgroup. The struct memcg_vmstats is the largest object embedded into the struct mem_cgroup. This patch extracts struct memcg_vmstats from struct mem_cgroup to ease the following patches in reducing the size of struct memcg_vmstats. Link: https://lkml.kernel.org/r/20220907043537.3457014-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220907043537.3457014-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 37 ++++--------------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ca0df42662ad..dc7d40e575d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,29 +80,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct memcg_vmstats_percpu { - /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct memcg_vmstats { - /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; -}; +struct memcg_vmstats_percpu; +struct memcg_vmstats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; @@ -298,7 +277,7 @@ struct mem_cgroup { CACHELINE_PADDING(_pad1_); /* memory.stat */ - struct memcg_vmstats vmstats; + struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -1001,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) -- cgit v1.2.3 From f5a79d7c0c87c8d88bb5e3f3c898258fdf1b3b05 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 8 Sep 2022 19:14:43 +0000 Subject: mm/damon: introduce struct damos_access_pattern damon_new_scheme() has too many parameters, so introduce struct damos_access_pattern to simplify it. In addition, we can't use a bpf trace kprobe that has more than 5 parameters.
Link: https://lkml.kernel.org/r/20220908191443.129534-1-sj@kernel.org Signed-off-by: Yajun Deng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d54acec048d6..90f20675da22 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -216,13 +216,26 @@ struct damos_stat { }; /** - * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -230,10 +243,8 @@ struct damos_stat { * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the - * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, - * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to - * those. To avoid consuming too much CPU time or IO resources for the - * &action, &quota is used. + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, &quota is used. * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the @@ -248,12 +259,7 @@ struct damos_stat { * &action is applied.
*/ struct damos { - unsigned long min_sz_region; - unsigned long max_sz_region; - unsigned int min_nr_accesses; - unsigned int max_nr_accesses; - unsigned int min_age_region; - unsigned int max_age_region; + struct damos_access_pattern pattern; enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; @@ -509,12 +515,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks); +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -- cgit v1.2.3 From 0d83b2d89dbfad17b62d4e7fb8f0b0525ba1a204 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 9 Sep 2022 21:36:06 +0000 Subject: mm/damon: remove duplicate get_monitoring_region() definitions Both lru_sort.c and reclaim.c define a get_monitoring_region() function; there is no need to define it separately in each file. As 'get_monitoring_region()' is no longer a 'static' function, use a prefix to distinguish it from other functions and rename it to 'damon_find_biggest_system_ram'.
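A hedged usage sketch of the now-shared helper, mirroring what lru_sort.c and reclaim.c previously open-coded (hypothetical caller):

	unsigned long start, end;

	if (!damon_find_biggest_system_ram(&start, &end))
		return -EINVAL;
	/* set the monitoring target to the physical address range [start, end) */
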
Link: https://lkml.kernel.org/r/20220909025711.32012-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Acked-by: Jens Axboe Cc: Bart Van Assche Cc: David Howells Cc: Matthew Wilcox Cc: NeilBrown Cc: Vlastimil Babka Cc: zhanglianjie Signed-off-by: Andrew Morton --- include/linux/writeback.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f045f6d6c4f..06f9291b6fd5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -17,20 +17,12 @@ struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* - * The 1/4 region under the global dirty thresh is for smooth dirty throttling: - * - * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) - * - * Further beyond, all dirtier tasks will enter a loop waiting (possibly long - * time) for the dirty pages to drop, unless written enough pages. - * * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 -#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) struct backing_dev_info; -- cgit v1.2.3 From cbeaa77b044938cfe91818821ece6b0b1511e967 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:32 +0000 Subject: mm/damon/core: use a dedicated struct for monitoring attributes DAMON monitoring attributes are directly defined as fields of 'struct damon_ctx'. This makes 'struct damon_ctx' a little long and complicated. This commit defines and uses a struct, 'struct damon_attrs', which is dedicated for only the monitoring attributes to make the purpose of the five values clearer and simplify 'struct damon_ctx'. Link: https://lkml.kernel.org/r/20220913174449.50645-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 016b6c9c03d6..2ceee8b07726 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -389,13 +389,15 @@ struct damon_callback { }; /** - * struct damon_ctx - Represents a context for each monitoring. This is the - * main interface that allows users to set the attributes and get the results - * of the monitoring. + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to @@ -405,7 +407,21 @@ struct damon_callback { * @ops_update_interval. All time intervals are in micro-seconds. * Please refer to &struct damon_operations and &struct damon_callback for more * detail. 
+ */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @attrs: Monitoring attributes for accuracy/overhead control. * @kdamond: Kernel thread who does the monitoring. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * @@ -427,15 +443,11 @@ struct damon_callback { * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @min_nr_regions: The minimum number of adaptive monitoring regions. - * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { - unsigned long sample_interval; - unsigned long aggr_interval; - unsigned long ops_update_interval; + struct damon_attrs attrs; /* private: internal use only */ struct timespec64 last_aggregation; @@ -448,8 +460,6 @@ struct damon_ctx { struct damon_operations ops; struct damon_callback callback; - unsigned long min_nr_regions; - unsigned long max_nr_regions; struct list_head adaptive_targets; struct list_head schemes; }; -- cgit v1.2.3 From bead3b00088eb8016b32cafa7e0701b3283e68a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:33 +0000 Subject: mm/damon/core: reduce parameters for damon_set_attrs() Number of parameters for 'damon_set_attrs()' is six. As it could be confusing and verbose, this commit reduces the number by receiving a single pointer to a 'struct damon_attrs'. Link: https://lkml.kernel.org/r/20220913174449.50645-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2ceee8b07726..c5dc0c77c772 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -540,9 +540,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); -- cgit v1.2.3 From b958d4d08fbfe938af24ea06ebbf839b48fa18a9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:02 +0800 Subject: mm: hugetlb: simplify per-node sysfs creation and removal Patch series "simplify handling of per-node sysfs creation and removal", v4. This patch (of 2): The following commit offloaded per-node sysfs creation and removal to a kworker but did not say why that was needed. It also said "I don't know that this is absolutely required", so it seems the author was not sure either. Since it only complicates the code, this patch will revert the changes to simplify the code.
39da08cb074c ("hugetlb: offload per node attribute registrations") We could use a memory hotplug notifier to do per-node sysfs creation and removal instead of inserting those operations into node registration and unregistration. Then, it can reduce the code coupling between node.c and hugetlb.c. Also, it can simplify the code. Link: https://lkml.kernel.org/r/20220914072603.60293-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220914072603.60293-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Muchun Song Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- include/linux/node.h | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd607f..427a5975cf40 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) -- cgit v1.2.3 From a4a00b451ef5e1deb959088e25e248f4ee399792 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:03 +0800 Subject: mm: hugetlb: eliminate memory-less nodes handling The memory-notify-based approach aims to handle memory-less nodes, however, it just adds complexity to the code, as pointed out by David in thread [1].
The handling of memory-less nodes is introduced by commit 4faf8d950ec4 ("hugetlb: handle memory hot-plug events"). From its commit message, we cannot find any necessity of handling this case. So, we can simply register/unregister sysfs entries in register_node/unregister_node to simplify the code. BTW, the hotplug callback was added because hugetlb_register_all_nodes() registers sysfs nodes only for N_MEMORY nodes; see commit 9b5e5d0fdc91, which said it was a preparation for handling memory-less nodes via memory hotplug. Since we want to remove the memory hotplug handling, make sure we only register per-node sysfs for online (N_ONLINE) nodes in hugetlb_register_all_nodes(). https://lore.kernel.org/linux-mm/60933ffc-b850-976c-78a0-0ee6e0ea9ef0@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220914072603.60293-3-songmuchun@bytedance.com Suggested-by: David Hildenbrand Signed-off-by: Muchun Song Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 57e72954a482..6d7f39754060 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,6 +16,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -935,6 +936,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1109,6 +1115,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From c195c3215741746b1eb7ab7980b926ddc37a4be3 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Wed, 14 Sep 2022 10:17:38 +0800 Subject: mm/filemap: make folio_put_wait_locked static It's only used in mm/filemap.c, since commit ("mm/migrate.c: rework migration_entry_wait() to not take a pageref"). Make it static. Link: https://lkml.kernel.org/r/20220914021738.3228011-1-sunke@kylinos.cn Signed-off-by: Ke Sun Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 32846b6306db..23125ab87ded 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1039,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page) return folio_wait_locked_killable(page_folio(page)); } -int folio_put_wait_locked(struct folio *folio, int state); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); -- cgit v1.2.3 From 7e1813d48dd30e6c6f235f6661d1bc108fcab528 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:04 -0700 Subject: hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache remove_huge_page removes a hugetlb page from the page cache. Change to hugetlb_delete_from_page_cache as it is a more descriptive name.
huge_add_to_page_cache is global in scope, but only deals with hugetlb pages. For consistency and clarity, rename to hugetlb_add_to_page_cache. Link: https://lkml.kernel.org/r/20220914221810.95771-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d7f39754060..4893d6d07099 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -666,7 +666,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); -- cgit v1.2.3 From 8d9bfb2608145cf3e408428c224099e1585471af Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:07 -0700 Subject: hugetlb: add vma based lock for pmd sharing Allocate a new hugetlb_vma_lock structure and hang it off vm_private_data for synchronization use by vmas that could be involved in pmd sharing. This data structure contains a rw semaphore that is the primary tool used for synchronization. This new structure is ref counted, so that it can exist when NOT attached to a vma. This is only helpful in resolving lock ordering issues where code may need to obtain the vma_lock while there is no guarantee the vma will not go away. By obtaining a ref on the structure, it can be guaranteed that at least the rw semaphore will not go away. Only add infrastructure for the new lock here. Actual use will be added in subsequent patches. [mike.kravetz@oracle.com: fix build issue for missing hugetlb_vma_lock_release] Link: https://lkml.kernel.org/r/YyNUtA1vRASOE4+M@monkey Link: https://lkml.kernel.org/r/20220914221810.95771-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A.
Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4893d6d07099..7b70aa931729 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -115,6 +115,12 @@ struct file_region { #endif }; +struct hugetlb_vma_lock { + struct kref refs; + struct rw_semaphore rw_sema; + struct vm_area_struct *vma; +}; + extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -127,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); -void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, @@ -215,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags); +void hugetlb_vma_lock_read(struct vm_area_struct *vma); +void hugetlb_vma_unlock_read(struct vm_area_struct *vma); +void hugetlb_vma_lock_write(struct vm_area_struct *vma); +void hugetlb_vma_unlock_write(struct vm_area_struct *vma); +int hugetlb_vma_trylock_write(struct vm_area_struct *vma); +void hugetlb_vma_assert_locked(struct vm_area_struct *vma); +void hugetlb_vma_lock_release(struct kref *kref); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -226,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ -static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } @@ -337,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file, return -EINVAL; } +static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + static inline int pmd_huge(pmd_t pmd) { return 0; -- cgit v1.2.3 From 83a4f1ef45a90d740bc6edf6a2533b14a3e5d183 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:36 +0200 Subject: stackdepot: reserve 5 extra bits in depot_stack_handle_t Some users (currently only KMSAN) may want to use spare bits in depot_stack_handle_t. Let them do so by adding @extra_bits to __stack_depot_save() to store arbitrary flags, and providing stack_depot_get_extra_bits() to retrieve those flags. Also adapt KASAN to the new prototype by passing extra_bits=0, as KASAN does not intend to store additional information in the stack handle. 
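A hedged sketch of how a user such as KMSAN might exploit the spare bits; 'tag' is illustrative, not KMSAN's actual encoding:

	unsigned int tag = 3;	/* must fit in STACK_DEPOT_EXTRA_BITS */
	depot_stack_handle_t handle = __stack_depot_save(entries, nr_entries,
							 tag, GFP_NOWAIT, true);
	/* later, recover the tag straight from the handle */
	unsigned int tag_out = stack_depot_get_extra_bits(handle);
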
Link: https://lkml.kernel.org/r/20220915150417.722975-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index bc2797955de9..9ca7798d7a31 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,9 +14,15 @@ #include typedef u32 depot_stack_handle_t; +/* + * Number of bits in the handle that stack depot doesn't use. Users may store + * information in them. + */ +#define STACK_DEPOT_EXTRA_BITS 5 depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); /* @@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); + int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -- cgit v1.2.3 From 33b75c1d884e81ec97525e0a6fdcb187adf273f4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:37 +0200 Subject: instrumented.h: allow instrumenting both sides of copy_from_user() Introduce instrument_copy_from_user_before() and instrument_copy_from_user_after() hooks to be invoked before and after the call to copy_from_user(). KASAN and KCSAN will be only using instrument_copy_from_user_before(), but for KMSAN we'll need to insert code after copy_from_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-4-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 21 +++++++++++++++++++-- include/linux/uaccess.h | 19 ++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 42faebbaa202..ee8f7d17d34f 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -120,7 +120,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) } /** - * instrument_copy_from_user - instrument writes of copy_from_user + * instrument_copy_from_user_before - add instrumentation before copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. @@ -130,10 +130,27 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) * @n number of bytes to copy */ static __always_inline void -instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) +instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } +/** + * instrument_copy_from_user_after - add instrumentation after copy_from_user + * + * Instrument writes to kernel memory, that are due to copy_from_user (and + * variants). The instrumentation should be inserted after the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + * @left number of bytes not copied (as returned by copy_from_user) + */ +static __always_inline void +instrument_copy_from_user_after(const void *to, const void __user *from, + unsigned long n, unsigned long left) +{ +} + #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 47e5d374c7eb..afb18f198843 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -58,20 +58,28 @@ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - instrument_copy_from_user(to, from, n); + unsigned long res; + + instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long res; + might_fault(); + instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; - instrument_copy_from_user(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } /** @@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); -- cgit v1.2.3 From 888f84a6da4d17e453058169fa7b235fff34f5bf Mon Sep 17 00:00:00 2001 
From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:38 +0200 Subject: x86: asm: instrument usercopy in get_user() and put_user() Use hooks from instrumented.h to notify bug detection tools about usercopy events in variations of get_user() and put_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-5-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index ee8f7d17d34f..9f1dba8f717b 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -153,4 +153,32 @@ instrument_copy_from_user_after(const void *to, const void __user *from, { } +/** + * instrument_get_user() - add instrumentation to get_user()-like macros + * + * get_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @to destination variable, may not be address-taken + */ +#define instrument_get_user(to) \ +({ \ +}) + +/** + * instrument_put_user() - add instrumentation to put_user()-like macros + * + * put_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the kernel to the userspace. + * + * @from source address + * @ptr userspace pointer to copy to + * @size number of bytes to copy + */ +#define instrument_put_user(from, ptr, size) \ +({ \ +}) + #endif /* _LINUX_INSTRUMENTED_H */ -- cgit v1.2.3 From 9b448bc25b776daab3215393c3ce6953dd3bb8ad Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:41 +0200 Subject: kmsan: introduce __no_sanitize_memory and __no_kmsan_checks __no_sanitize_memory is a function attribute that instructs KMSAN to skip a function during instrumentation. This is needed to e.g. implement the noinstr functions. __no_kmsan_checks is a function attribute that makes KMSAN ignore the uninitialized values coming from the function's inputs, and initialize the function's outputs. Functions marked with this attribute can't be inlined into functions not marked with it, and vice versa. This behavior is overridden by __always_inline. __SANITIZE_MEMORY__ is a macro that's defined iff the file is instrumented with KMSAN. This is not the same as CONFIG_KMSAN, which is defined for every file.
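A minimal sketch of how the two attributes are meant to be used (the functions below are hypothetical, for illustration only):

	/* KMSAN will not instrument this function at all. */
	static __no_sanitize_memory unsigned long read_raw_reg(unsigned long *r)
	{
		return *r;
	}

	/*
	 * KMSAN instruments this function but suppresses shadow checks in it,
	 * and treats its outputs as initialized.
	 */
	static __no_kmsan_checks int parse_first_byte(const char *buf)
	{
		return buf[0] == 'y';
	}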
Link: https://lkml.kernel.org/r/20220915150417.722975-8-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 23 +++++++++++++++++++++++ include/linux/compiler-gcc.h | 6 ++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445..4fa0cc4cbd2c 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -51,6 +51,29 @@ #define __no_sanitize_undefined #endif +#if __has_feature(memory_sanitizer) +#define __SANITIZE_MEMORY__ +/* + * Unlike other sanitizers, KMSAN still inserts code into functions marked with + * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation + * provides the behavior consistent with other __no_sanitize_ attributes, + * guaranteeing that __no_sanitize_memory functions remain uninstrumented. + */ +#define __no_sanitize_memory __disable_sanitizer_instrumentation + +/* + * The __no_kmsan_checks attribute ensures that a function does not produce + * false positive reports by: + * - initializing all local variables and memory stores in this function; + * - skipping all shadow checks; + * - passing initialized arguments to this function's callees. + */ +#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory"))) +#else +#define __no_sanitize_memory +#define __no_kmsan_checks +#endif + /* * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together * with no_sanitize("coverage"). Prior versions of Clang support coverage diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 9b157b71036f..f55a37efdb97 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -114,6 +114,12 @@ #define __SANITIZE_ADDRESS__ #endif +/* + * GCC does not support KMSAN. + */ +#define __no_sanitize_memory +#define __no_kmsan_checks + /* * Turn individual warnings and errors on and off locally, depending * on version. -- cgit v1.2.3 From 5de0ce85f5a4d2883eae6f48eb015bc5dfbd91e9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:42 +0200 Subject: kmsan: mark noinstr as __no_sanitize_memory noinstr functions should never be instrumented, so make KMSAN skip them by applying the __no_sanitize_memory attribute. 
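For illustration (a sketch; the function below is hypothetical), any function declared noinstr now implicitly carries __no_sanitize_memory:

	/* Runs in a context where calling instrumented code is not allowed. */
	noinstr void arch_early_trap_handler(void)
	{
		/* KMSAN, like KASAN and KCSAN, emits no checks in this body. */
	}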
Link: https://lkml.kernel.org/r/20220915150417.722975-9-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..015207a6e2bf 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -229,7 +229,8 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ noinline notrace __attribute((__section__(".noinstr.text"))) \ - __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage + __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ + __no_sanitize_memory #endif /* __KERNEL__ */ -- cgit v1.2.3 From f80be4571b19b9fd8dd1528cd2a2f123aff51f70 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:45 +0200 Subject: kmsan: add KMSAN runtime core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each memory location KernelMemorySanitizer maintains two types of metadata: 1. The so-called shadow of that location - a byte:byte mapping describing whether individual bits of memory are initialized (shadow is 0) or not (shadow is 1). 2. The origins of that location - a 4-byte:4-byte mapping containing 4-byte IDs of the stack traces where uninitialized values were created. Each struct page now contains pointers to two struct pages holding KMSAN metadata (shadow and origins) for the original struct page. Utility routines in mm/kmsan/core.c and mm/kmsan/shadow.c handle the metadata creation, addressing, copying and checking. mm/kmsan/report.c performs error reporting in cases where an uninitialized value is used in a way that leads to undefined behavior. KMSAN compiler instrumentation is responsible for tracking the metadata along with the kernel memory. mm/kmsan/instrumentation.c provides the implementation for instrumentation hooks that are called from files compiled with -fsanitize=kernel-memory. To aid parameter passing (also done at instrumentation level), each task_struct now contains a struct kmsan_task_state used to track the metadata of function parameters and return values for that task. Finally, this patch provides CONFIG_KMSAN that enables KMSAN, and declares CFLAGS_KMSAN, which are applied to files compiled with KMSAN. The KMSAN_SANITIZE:=n Makefile directive can be used to completely disable KMSAN instrumentation for certain files. Similarly, KMSAN_ENABLE_CHECKS:=n disables KMSAN checks and makes newly created stack memory initialized.
Users can also use functions from include/linux/kmsan-checks.h to mark certain memory regions as uninitialized or initialized (this is called "poisoning" and "unpoisoning") or check that a particular region is initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-12-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan-checks.h | 64 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/kmsan_types.h | 35 ++++++++++++++++++++++++ include/linux/mm_types.h | 12 +++++++++ include/linux/sched.h | 5 ++++ 4 files changed, 116 insertions(+) create mode 100644 include/linux/kmsan-checks.h create mode 100644 include/linux/kmsan_types.h (limited to 'include/linux') diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h new file mode 100644 index 000000000000..a6522a0c28df --- /dev/null +++ b/include/linux/kmsan-checks.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN checks to be used for one-off annotations in subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef _LINUX_KMSAN_CHECKS_H +#define _LINUX_KMSAN_CHECKS_H + +#include + +#ifdef CONFIG_KMSAN + +/** + * kmsan_poison_memory() - Mark the memory range as uninitialized. + * @address: address to start with. + * @size: size of buffer to poison. + * @flags: GFP flags for allocations done by this function. + * + * Until other data is written to this range, KMSAN will treat it as + * uninitialized. Error reports for this memory will reference the call site of + * kmsan_poison_memory() as origin. + */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags); + +/** + * kmsan_unpoison_memory() - Mark the memory range as initialized. + * @address: address to start with. + * @size: size of buffer to unpoison. + * + * Until other data is written to this range, KMSAN will treat it as + * initialized. + */ +void kmsan_unpoison_memory(const void *address, size_t size); + +/** + * kmsan_check_memory() - Check the memory range for being initialized. + * @address: address to start with. + * @size: size of buffer to check. + * + * If any piece of the given range is marked as uninitialized, KMSAN will report + * an error. 
+ */ +void kmsan_check_memory(const void *address, size_t size); + +#else + +static inline void kmsan_poison_memory(const void *address, size_t size, + gfp_t flags) +{ +} +static inline void kmsan_unpoison_memory(const void *address, size_t size) +{ +} +static inline void kmsan_check_memory(const void *address, size_t size) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h new file mode 100644 index 000000000000..8bfa6c98176d --- /dev/null +++ b/include/linux/kmsan_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal header declaring types added by KMSAN to existing kernel structs. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_TYPES_H +#define _LINUX_KMSAN_TYPES_H + +/* These constants are defined in the MSan LLVM instrumentation pass. */ +#define KMSAN_RETVAL_SIZE 800 +#define KMSAN_PARAM_SIZE 800 + +struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + u32 retval_origin_tls; +}; + +#undef KMSAN_PARAM_SIZE +#undef KMSAN_RETVAL_SIZE + +struct kmsan_ctx { + struct kmsan_context_state cstate; + int kmsan_in_runtime; + bool allow_reporting; +}; + +#endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5c87d0f292a2..500e536796ca 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,6 +224,18 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_KMSAN + /* + * KMSAN metadata for this page: + * - shadow page: every bit indicates whether the corresponding + * bit of the original page is initialized (0) or not (1); + * - origin page: every 4 bytes contain an id of the stack trace + * where the uninitialized value was created. 
+ */ + struct page *kmsan_shadow; + struct page *kmsan_origin; +#endif + #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fbac3c19fe35..88a043f7235e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1362,6 +1363,10 @@ struct task_struct { #endif #endif +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif -- cgit v1.2.3 From b073d7f8aee4ebf05d10e3380df377b73120cf16 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:48 +0200 Subject: mm: kmsan: maintain KMSAN metadata for page operations Insert KMSAN hooks that make the necessary bookkeeping changes: - poison page shadow and origins in alloc_pages()/free_page(); - clear page shadow and origins in clear_page(), copy_user_highpage(); - copy page metadata in copy_highpage(), wp_page_copy(); - handle vmap()/vunmap()/iounmap(); Link: https://lkml.kernel.org/r/20220915150417.722975-15-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/highmem.h | 3 + include/linux/kmsan.h | 145 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 include/linux/kmsan.h (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25679035ca28..e9912da5441b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } @@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h new file mode 100644 index 000000000000..b36bf3db835e --- /dev/null +++ b/include/linux/kmsan.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN API for subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include +#include +#include + +struct page; + +#ifdef CONFIG_KMSAN + +/** + * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. + * @page: struct page pointer returned by alloc_pages(). + * @order: order of allocated struct page. 
+ * @flags: GFP flags used by alloc_pages() + * + * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless + * @flags contain __GFP_ZERO. + */ +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags); + +/** + * kmsan_free_page() - Notify KMSAN about a free_pages() call. + * @page: struct page pointer passed to free_pages(). + * @order: order of deallocated struct page. + * + * KMSAN marks freed memory as uninitialized. + */ +void kmsan_free_page(struct page *page, unsigned int order); + +/** + * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages. + * @dst: destination page. + * @src: source page. + * + * KMSAN copies the contents of metadata pages for @src into the metadata pages + * for @dst. If @dst has no associated metadata pages, nothing happens. + * If @src has no associated metadata pages, @dst metadata pages are unpoisoned. + */ +void kmsan_copy_page_meta(struct page *dst, struct page *src); + +/** + * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. + * @start: start of vmapped range. + * @end: end of vmapped range. + * @prot: page protection flags used for vmap. + * @pages: array of pages. + * @page_shift: page_shift passed to vmap_range_noflush(). + * + * KMSAN maps shadow and origin pages of @pages into contiguous ranges in + * vmalloc metadata address range. + */ +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +/** + * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. + * @start: start of vunmapped range. + * @end: end of vunmapped range. + * + * KMSAN unmaps the contiguous metadata ranges created by + * kmsan_map_kernel_range_noflush(). + */ +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); + +/** + * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call. + * @addr: range start. + * @end: range end. + * @phys_addr: physical range start. + * @prot: page protection flags used for ioremap_page_range(). + * @page_shift: page_shift argument passed to vmap_range_noflush(). + * + * KMSAN creates new metadata pages for the physical pages mapped into the + * virtual memory. + */ +void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); + +/** + * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. + * @start: range start. + * @end: range end. + * + * KMSAN unmaps the metadata pages for the given range and, unlike for + * vunmap_page_range(), also deallocates them. 
+ */ +void kmsan_iounmap_page_range(unsigned long start, unsigned long end); + +#else + +static inline int kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) +{ + return 0; +} + +static inline void kmsan_free_page(struct page *page, unsigned int order) +{ +} + +static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ +} + +static inline void kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) +{ +} + +static inline void kmsan_vunmap_range_noflush(unsigned long start, + unsigned long end) +{ +} + +static inline void kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) +{ +} + +static inline void kmsan_iounmap_page_range(unsigned long start, + unsigned long end) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From 68ef169a1dd20df5cfa5a161b7304ad9fdd14c36 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:49 +0200 Subject: mm: kmsan: call KMSAN hooks from SLUB code In order to report uninitialized memory coming from heap allocations KMSAN has to poison them unless they're created with __GFP_ZERO. It's handy that we need KMSAN hooks in the places where init_on_alloc/init_on_free initialization is performed. In addition, we apply __no_kmsan_checks to get_freepointer_safe() to suppress reports when accessing freelist pointers that reside in freed objects. Link: https://lkml.kernel.org/r/20220915150417.722975-16-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index b36bf3db835e..5c4e0079054e 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -14,6 +14,7 @@ #include struct page; +struct kmem_cache; #ifdef CONFIG_KMSAN @@ -48,6 +49,44 @@ void kmsan_free_page(struct page *page, unsigned int order); */ void kmsan_copy_page_meta(struct page *dst, struct page *src); +/** + * kmsan_slab_alloc() - Notify KMSAN about a slab allocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * @flags: GFP flags passed to the allocator. + * + * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the + * newly created object, marking it as initialized or uninitialized. + */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); + +/** + * kmsan_slab_free() - Notify KMSAN about a slab deallocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * + * KMSAN marks the freed object as uninitialized. 
+ */ +void kmsan_slab_free(struct kmem_cache *s, void *object); + +/** + * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation. + * @ptr: object pointer. + * @size: object size. + * @flags: GFP flags passed to the allocator. + * + * Similar to kmsan_slab_alloc(), but for large allocations. + */ +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); + +/** + * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation. + * @ptr: object pointer. + * + * Similar to kmsan_slab_free(), but for large allocations. + */ +void kmsan_kfree_large(const void *ptr); + /** * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. * @start: start of vmapped range. @@ -114,6 +153,24 @@ static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) { } +static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ +} + +static inline void kmsan_slab_free(struct kmem_cache *s, void *object) +{ +} + +static inline void kmsan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) +{ +} + +static inline void kmsan_kfree_large(const void *ptr) +{ +} + static inline void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, -- cgit v1.2.3 From 50b5e49ca694a60f84a2a12d62b6cb6ec8e3649f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:50 +0200 Subject: kmsan: handle task creation and exiting Tell KMSAN that a new task is created, so the tool creates a backing metadata structure for that task. Link: https://lkml.kernel.org/r/20220915150417.722975-17-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 5c4e0079054e..354aee6f7b1a 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -15,9 +15,22 @@ struct page; struct kmem_cache; +struct task_struct; #ifdef CONFIG_KMSAN +/** + * kmsan_task_create() - Initialize KMSAN state for the task. + * @task: task to initialize. + */ +void kmsan_task_create(struct task_struct *task); + +/** + * kmsan_task_exit() - Notify KMSAN that a task has exited. + * @task: task about to finish. + */ +void kmsan_task_exit(struct task_struct *task); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). 
@@ -139,6 +152,14 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_task_create(struct task_struct *task) +{ +} + +static inline void kmsan_task_exit(struct task_struct *task) +{ +} + static inline int kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { -- cgit v1.2.3 From 3c206509826094e85ead0b056f484db96829248d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:51 +0200 Subject: init: kmsan: call KMSAN initialization routines kmsan_init_shadow() scans the mappings created at boot time and creates metadata pages for those mappings. When the memblock allocator returns pages to pagealloc, we reserve 2/3 of those pages and use them as metadata for the remaining 1/3. Once KMSAN starts, every page allocated by pagealloc has its associated shadow and origin pages. kmsan_initialize() initializes the bookkeeping for init_task and enables KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-18-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 354aee6f7b1a..e00de976ee43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -31,6 +31,28 @@ void kmsan_task_create(struct task_struct *task); */ void kmsan_task_exit(struct task_struct *task); +/** + * kmsan_init_shadow() - Initialize KMSAN shadow at boot time. + * + * Allocate and initialize KMSAN metadata for early allocations. + */ +void __init kmsan_init_shadow(void); + +/** + * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN. + */ +void __init kmsan_init_runtime(void); + +/** + * kmsan_memblock_free_pages() - handle freeing of memblock pages. + * @page: struct page to free. + * @order: order of @page. + * + * Freed pages are either returned to buddy allocator or held back to be used + * as metadata pages. + */ +bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -152,6 +174,20 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_init_shadow(void) +{ +} + +static inline void kmsan_init_runtime(void) +{ +} + +static inline bool kmsan_memblock_free_pages(struct page *page, + unsigned int order) +{ + return true; +} + static inline void kmsan_task_create(struct task_struct *task) { } -- cgit v1.2.3 From 75cf0290271bf6dae9dee982aef15242dadf97e4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:52 +0200 Subject: instrumented.h: add KMSAN support To avoid false positives, KMSAN needs to unpoison the data copied from the userspace. 
To detect infoleaks - check the memory buffer passed to copy_to_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-19-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 18 +++++++++++++----- include/linux/kmsan-checks.h | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 9f1dba8f717b..501fa8486749 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -2,7 +2,7 @@ /* * This header provides generic wrappers for memory access instrumentation that - * the compiler cannot emit for: KASAN, KCSAN. + * the compiler cannot emit for: KASAN, KCSAN, KMSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H @@ -10,6 +10,7 @@ #include #include #include +#include #include /** @@ -117,6 +118,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); + kmsan_copy_to_user(to, from, n, 0); } /** @@ -151,6 +153,7 @@ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { + kmsan_unpoison_memory(to, n - left); } /** @@ -162,10 +165,14 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * * @to destination variable, may not be address-taken */ -#define instrument_get_user(to) \ -({ \ +#define instrument_get_user(to) \ +({ \ + u64 __tmp = (u64)(to); \ + kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ + to = __tmp; \ }) + /** * instrument_put_user() - add instrumentation to put_user()-like macros * @@ -177,8 +184,9 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * @ptr userspace pointer to copy to * @size number of bytes to copy */ -#define instrument_put_user(from, ptr, size) \ -({ \ +#define instrument_put_user(from, ptr, size) \ +({ \ + kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h index a6522a0c28df..c4cae333deec 100644 --- a/include/linux/kmsan-checks.h +++ b/include/linux/kmsan-checks.h @@ -46,6 +46,21 @@ void kmsan_unpoison_memory(const void *address, size_t size); */ void kmsan_check_memory(const void *address, size_t size); +/** + * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace. + * @to: destination address in the userspace. + * @from: source address in the kernel. + * @to_copy: number of bytes to copy. + * @left: number of bytes not copied. + * + * If this is a real userspace data transfer, KMSAN checks the bytes that were + * actually copied to ensure there was no information leak. 
If @to belongs to + * the kernel space (which is possible for compat syscalls), KMSAN just copies + * the metadata. + */ +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left); + #else static inline void kmsan_poison_memory(const void *address, size_t size, @@ -58,6 +73,10 @@ static inline void kmsan_unpoison_memory(const void *address, size_t size) static inline void kmsan_check_memory(const void *address, size_t size) { } +static inline void kmsan_copy_to_user(void __user *to, const void *from, + size_t to_copy, size_t left) +{ +} #endif -- cgit v1.2.3 From 7ade4f10779cb46f5c29ced9b7a41f68501cf0ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:55 +0200 Subject: dma: kmsan: unpoison DMA mappings KMSAN doesn't know about DMA memory writes performed by devices. We unpoison such memory when it's mapped to avoid false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-22-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e00de976ee43..dac296da45c5 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -9,6 +9,7 @@ #ifndef _LINUX_KMSAN_H #define _LINUX_KMSAN_H +#include #include #include #include @@ -16,6 +17,7 @@ struct page; struct kmem_cache; struct task_struct; +struct scatterlist; #ifdef CONFIG_KMSAN @@ -172,6 +174,35 @@ void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, */ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); +/** + * kmsan_handle_dma() - Handle a DMA data transfer. + * @page: first page of the buffer. + * @offset: offset of the buffer within the first page. + * @size: buffer size. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffer, if it is copied to device; + * * initializes the buffer, if it is copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir); + +/** + * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist. + * @sg: scatterlist holding DMA buffers. + * @nents: number of scatterlist entries. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffers in the scatterlist, if they are copied to device; + * * initializes the buffers, if they are copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. 
+ */ +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir); + #else static inline void kmsan_init_shadow(void) @@ -254,6 +285,16 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } +static inline void kmsan_handle_dma(struct page *page, size_t offset, + size_t size, enum dma_data_direction dir) +{ +} + +static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From 553a80188a5d7164d2b0688b06bf3fe297023bfe Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:57 +0200 Subject: kmsan: handle memory sent to/from USB Depending on the value of is_out, kmsan_handle_urb() either marks the data copied to the kernel from a USB device as initialized, or checks the data sent to the device for being initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-24-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index dac296da45c5..c473e0e21683 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -18,6 +18,7 @@ struct page; struct kmem_cache; struct task_struct; struct scatterlist; +struct urb; #ifdef CONFIG_KMSAN @@ -203,6 +204,16 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir); +/** + * kmsan_handle_urb() - Handle a USB data transfer. + * @urb: struct urb pointer. + * @is_out: data transfer direction (true means output to hardware). + * + * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise, + * KMSAN initializes the transfer buffer. + */ +void kmsan_handle_urb(const struct urb *urb, bool is_out); + #else static inline void kmsan_init_shadow(void) @@ -295,6 +306,10 @@ static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, { } +static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From ff901d80fff6d65ada6f2a60a1f7d180ee2e0416 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:09 +0200 Subject: x86: kmsan: use __msan_ string functions where possible. Unless stated otherwise (by explicitly calling __memcpy(), __memset() or __memmove()) we want all string functions to call their __msan_ versions (e.g. __msan_memcpy() instead of memcpy()), so that shadow and origin values are updated accordingly. The bootloader must still use the default string functions to avoid crashes.
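To illustrate the resulting behavior (a sketch; the function below is hypothetical):

	static void copy_example(void *dst, const void *src, size_t len)
	{
		/* Instrumented: compiled as __msan_memcpy(), which also copies
		 * the shadow and origin metadata describing the bytes. */
		memcpy(dst, src, len);

		/* Explicit opt-out: a plain byte copy, metadata left untouched. */
		__memcpy(dst, src, len);
	}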
Link: https://lkml.kernel.org/r/20220915150417.722975-36-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/fortify-string.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f374..6c8a1a29d0b6 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -285,8 +285,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. */ +#ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __builtin_object_size(p, 0), __builtin_object_size(p, 1)) +#endif /* * To make sure the compiler can enforce protection against buffer overflows, -- cgit v1.2.3 From 6cae637fa26df867449c6bc20ea8bc693abe49b0 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:14 +0200 Subject: entry: kmsan: introduce kmsan_unpoison_entry_regs() struct pt_regs passed into IRQ entry code is set up by uninstrumented asm functions, therefore KMSAN may not notice the registers are initialized. kmsan_unpoison_entry_regs() unpoisons the contents of struct pt_regs, preventing potential false positives. Unlike kmsan_unpoison_memory(), it can be called under kmsan_in_runtime(), which is often the case in IRQ entry code. Link: https://lkml.kernel.org/r/20220915150417.722975-41-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index c473e0e21683..e38ae3c34618 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -214,6 +214,17 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, */ void kmsan_handle_urb(const struct urb *urb, bool is_out); +/** + * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code. + * @regs: struct pt_regs pointer received from assembly code. + * + * KMSAN unpoisons the contents of the passed pt_regs, preventing potential + * false positive reports. 
Unlike kmsan_unpoison_memory(), + * kmsan_unpoison_entry_regs() can be called from the regions where + * kmsan_in_runtime() returns true, which is the case in early entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs); + #else static inline void kmsan_init_shadow(void) @@ -310,6 +321,10 @@ static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) { } +static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From 16bc1b0f0269b6110f6d25880b52947d354e2980 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 15 Sep 2022 19:33:41 +0800 Subject: mm/damon: use 'struct damon_target *' instead of 'void *' in target_valid() We could use 'struct damon_target *' directly instead of 'void *' in the target_valid() operation to make the code simpler. Link: https://lkml.kernel.org/r/1663241621-13293-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index c5dc0c77c772..1dda8d0068e5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -346,7 +346,7 @@ struct damon_operations { unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - bool (*target_valid)(void *target); + bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From cc713520bdc1b84fc5394f6ac8649b93ad2c5dde Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 16 Sep 2022 23:20:35 +0800 Subject: mm/damon: return void from damon_set_schemes() There is no point in returning an int from damon_set_schemes(). It always returns 0, which is meaningless for the caller, so change it to return void directly. Link: https://lkml.kernel.org/r/1663341635-12675-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1dda8d0068e5..e7808a84675f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -541,7 +541,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); -int damon_set_schemes(struct damon_ctx *ctx, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); -- cgit v1.2.3 From 638a9ae97ab596f1f7b7522dad709e69cb5b4e9d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:44 +0800 Subject: mm: remove obsolete macro NR_PCP_ORDER_MASK and NR_PCP_ORDER_WIDTH Since commit 8b10b465d0e1 ("mm/page_alloc: free pages in a single pass during bulk free"), they're not used anymore. Remove them.
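For context, the encoding that these macros supported looked roughly like this (reconstructed for illustration; the helpers below are hypothetical):

	/* Order lived in the low 8 bits of the value, migratetype above it. */
	static inline int encode_pcp(int migratetype, unsigned int order)
	{
		return (migratetype << NR_PCP_ORDER_WIDTH) | order;
	}

	static inline unsigned int decode_pcp_order(int val)
	{
		return val & NR_PCP_ORDER_MASK;
	}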
Link: https://lkml.kernel.org/r/20220916072257.9639-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c69c08156822..3ff1e757d5aa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -564,13 +564,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -/* - * Shift to encode migratetype and order in the same integer, with order - * in the least significant bits. - */ -#define NR_PCP_ORDER_WIDTH 8 -#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1) - #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) -- cgit v1.2.3 From 5749fcc5f04cef4091dea0c2ba6b5c5f5e05a549 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:46 +0800 Subject: mm/page_alloc: add __init annotations to init_mem_debugging_and_hardening() It's only called by mm_init(). Add __init annotations to it. Link: https://lkml.kernel.org/r/20220916072257.9639-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a37c8a29c49b..8bbcccbc5565 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3092,7 +3092,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -extern void init_mem_debugging_and_hardening(void); +extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); -- cgit v1.2.3 From 30e3b5d7c82f78c63c53197b5d8b99636bb60d56 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:48 +0800 Subject: mm: remove obsolete pgdat_is_empty() There's no caller. Remove it.
Link: https://lkml.kernel.org/r/20220916072257.9639-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ff1e757d5aa..4c8510f26b02 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1241,11 +1241,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) return pgdat->node_start_pfn + pgdat->node_spanned_pages; } -static inline bool pgdat_is_empty(pg_data_t *pgdat) -{ - return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; -} - #include void build_all_zonelists(pg_data_t *pgdat); -- cgit v1.2.3 From f774a6a6fd39e1b5677bdf71f6813b382faddeeb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:51 +0800 Subject: mm, memory_hotplug: remove obsolete generic_free_nodedata() Commit 390511e1476e ("mm, memory_hotplug: drop arch_free_nodedata") drops the last caller of generic_free_nodedata(). Remove it too. Link: https://lkml.kernel.org/r/20220916072257.9639-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51052969dbfe..9fcbf5706595 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -43,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); ({ \ memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) extern pg_data_t *node_data[]; static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) @@ -63,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid) BUG(); return NULL; } -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) { } -- cgit v1.2.3 From def76fd549c513bb90278a8d6d0fe3ef3faa20a7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:56 +0800 Subject: mm/page_alloc: remove obsolete gfpflags_normal_context() Since commit dacb5d8875cc ("tcp: fix page frag corruption on page fault"), there's no caller of gfpflags_normal_context(). Remove it as this helper is strictly tied to the sk page frag usage and there won't be other users in the future.
[linmiaohe@huawei.com: fix htmldocs] Link: https://lkml.kernel.org/r/1bc55727-9b66-0e9e-c306-f10c4716ea89@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/gfp.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ea6cb9399152..ef4aea3b356e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } -/** - * gfpflags_normal_context - is gfp_flags a normal sleepable context? - * @gfp_flags: gfp_flags to test - * - * Test whether @gfp_flags indicates that the allocation is from the - * %current context and allowed to sleep. - * - * An allocation being allowed to block doesn't mean it owns the %current - * context. When direct reclaim path tries to allocate memory, the - * allocation context is nested inside whatever %current was doing at the - * time of the original allocation. The nested allocation may be allowed - * to block but modifying anything %current owns can corrupt the outer - * context's expectations. - * - * %true result from this function indicates that the allocation context - * can sleep and use anything that's associated with %current. - */ -static inline bool gfpflags_normal_context(const gfp_t gfp_flags) -{ - return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == - __GFP_DIRECT_RECLAIM; -} - #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else -- cgit v1.2.3 From 233f0b31bd9503ce2be7be0bde69c67287c8a741 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Sep 2022 16:53:22 +0000 Subject: mm/damon: deduplicate damon_{reclaim,lru_sort}_apply_parameters() The bodies of damon_{reclaim,lru_sort}_apply_parameters() contain duplicates. This commit adds a common function damon_set_region_biggest_system_ram_default() to remove the duplicates. Link: https://lkml.kernel.org/r/6329f00d.a70a0220.9bb29.3678SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Kaixu Xia Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e7808a84675f..ed5470f50bab 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -557,7 +557,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); #endif /* CONFIG_DAMON */ -- cgit v1.2.3 From 2eb989195d9a361d13d66ffb8738847649e080ad Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:33 +0800 Subject: mm: memcontrol: use memcg_kmem_enabled in count_objcg_event Patch series "mm: memcontrol: cleanup and optimize for two accounting params", v2. 
This patch (of 2):

There are currently two helpers for checking if cgroup kmem accounting is enabled:

- mem_cgroup_kmem_disabled
- memcg_kmem_enabled

mem_cgroup_kmem_disabled is a simple helper that returns true if cgroup.memory=nokmem is specified, otherwise returns false.

memcg_kmem_enabled is a bit different: it returns true if cgroup.memory=nokmem is not specified and at least one non-root, memory-control-enabled cgroup has ever been created. This helps improve performance when kmem accounting is not actually activated, and it is optimized with a static branch.

mem_cgroup_kmem_disabled is meant for sub-systems that need to preallocate data for kmem accounting, since they could be initialized before kmem accounting is activated. But count_objcg_event doesn't need that, so using memcg_kmem_enabled is better here.

Link: https://lkml.kernel.org/r/20220919180634.45958-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20220919180634.45958-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc7d40e575d5..ef479e554253 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1778,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (mem_cgroup_kmem_disabled()) + if (!memcg_kmem_enabled()) return; rcu_read_lock(); -- cgit v1.2.3

From 7c6c6cc4d3a213e7303ef06ff40f6193df01839c Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:37 -0700 Subject: mm/shmem: add flag to enforce shmem THP in hugepage_vma_check()

Patch series "mm: add file/shmem support to MADV_COLLAPSE", v4.

This series builds on top of the previous "mm: userspace hugepage collapse" series, which introduced the MADV_COLLAPSE madvise mode and added support for private, anonymous mappings[2], by adding support for file- and shmem-backed memory to CONFIG_READ_ONLY_THP_FOR_FS=y kernels.

File and shmem support have been added with effort to align with existing MADV_COLLAPSE semantics and policy decisions[3]. Collapse of shmem-backed memory ignores kernel-guiding directives and heuristics, including all sysfs settings (transparent_hugepage/shmem_enabled) and tmpfs huge= mount options (shmem always supports large folios). Like anonymous mappings, on successful return of MADV_COLLAPSE on file/shmem memory, the contents of memory mapped by the addresses provided will be synchronously pmd-mapped THPs.

This functionality unlocks two important uses:

(1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system, which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevent page sharing and demand paging, both of which increase steady-state memory footprint. Now, we can have the best of both worlds: peak upfront performance and lower RAM footprints.

(2) userfaultfd-based live migration of virtual machines satisfies UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage).
However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. khugepaged has received a small improvement by association and can now detect and collapse pte-mapped THPs. However, there is still work to be done along the file collapse path. Compound pages of arbitrary order still needs to be supported and THP collapse needs to be converted to using folios in general. Eventually, we'd like to move away from the read-only and executable-mapped constraints currently imposed on eligible files and support any inode claiming huge folio support. That said, I think the series as-is covers enough to claim that MADV_COLLAPSE supports file/shmem memory. Patches 1-3 Implement the guts of the series. Patch 4 Is a tracepoint for debugging. Patches 5-9 Refactor existing khugepaged selftests to work with new memory types + new collapse tests. Patch 10 Adds a userfaultfd selftest mode to mimic a functional test of UFFDIO_REGISTER_MODE_MINOR+MADV_COLLAPSE live migration. (v4 note: "userfaultfd shmem" selftest is failing as of Sep 22 mm-unstable) [1] https://lore.kernel.org/linux-mm/YyiK8YvVcrtZo0z3@google.com/ [2] https://lore.kernel.org/linux-mm/20220706235936.2197195-1-zokeefe@google.com/ [3] https://lore.kernel.org/linux-mm/YtBmhaiPHUTkJml8@google.com/ [4] https://lore.kernel.org/linux-mm/20220922222731.1124481-1-zokeefe@google.com/ [5] https://lore.kernel.org/linux-mm/20220922184651.1016461-1-zokeefe@google.com/ This patch (of 10): Extend 'mm/thp: add flag to enforce sysfs THP in hugepage_vma_check()' to shmem, allowing callers to ignore /sys/kernel/transparent_hugepage/shmem_enabled and tmpfs huge= mount. This is intended to be used by MADV_COLLAPSE, and the rationale is analogous to the anon/file case: MADV_COLLAPSE is not coupled to directives that advise the kernel's decisions on when THPs should be considered eligible. shmem/tmpfs always claims large folio support, regardless of sysfs or mount options. [shy828301@gmail.com: test shmem_huge_force explicitly] Link: https://lore.kernel.org/linux-mm/CAHbLzko3A5-TpS0BgBeKkx5cuOkWgLvWXQH=TdgW-baO4rPtdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922224046.1143204-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-2-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-2-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f24071e3c826..d500ea967dc7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,11 +92,13 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma, + bool shmem_huge_force) { - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, + shmem_huge_force); } extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, -- cgit v1.2.3 From 34488399fa08faaf664743fa54b271eb6f9e1321 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:39 -0700 Subject: mm/madvise: add file and shmem support to MADV_COLLAPSE Add support for MADV_COLLAPSE to collapse shmem-backed and file-backed memory into THPs (requires CONFIG_READ_ONLY_THP_FOR_FS=y). On success, the backing memory will be a hugepage. For the memory range and process provided, the page tables will synchronously have a huge pmd installed, mapping the THP. Other mappings of the file extent mapped by the memory range may be added to a set of entries that khugepaged will later process and attempt update their page tables to map the THP by a pmd. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Since khugepaged is single threaded, this change now introduces possibility of collapse contexts racing in file collapse path. There a important few places to consider: (1) hpage_collapse_scan_file(), when we xas_pause() and drop RCU. We could have the memory collapsed out from under us, but the next xas_for_each() iteration will correctly pick up the hugepage. 
The hugepage might not be up to date (insofar as copying of small page contents might not have completed - the page still may be locked), but regardless what small page index we were iterating over, we'll find the hugepage and identify it as a suitably aligned compound page of order HPAGE_PMD_ORDER. In khugepaged path, we locklessly check the value of the pmd, and only add it to deferred collapse array if we find pmd mapping pte table. This is fine, since other values that could have raced in right afterwards denote failure, or that the memory was successfully collapsed, so we don't need further processing. In madvise path, we'll take mmap_lock() in write to serialize against page table updates and will know what to do based on the true value of the pmd: recheck all ptes if we point to a pte table, directly install the pmd, if the pmd has been cleared, but memory not yet faulted, or nothing at all if we find a huge pmd. It's worth putting emphasis here on how we treat the none pmd here. If khugepaged has processed this mm's page tables already, it will have left the pmd cleared (ready for refault by the process). Depending on the VMA flags and sysfs settings, amount of RAM on the machine, and the current load, could be a relatively common occurrence - and as such is one we'd like to handle successfully in MADV_COLLAPSE. When we see the none pmd in collapse_pte_mapped_thp(), we've locked mmap_lock in write and checked (a) huepaged_vma_check() to see if the backing memory is appropriate still, along with VMA sizing and appropriate hugepage alignment within the file, and (b) we've found a hugepage head of order HPAGE_PMD_ORDER at the offset in the file mapped by our hugepage-aligned virtual address. Even though the common-case is likely race with khugepaged, given these checks (regardless how we got here - we could be operating on a completely different file than originally checked in hpage_collapse_scan_file() for all we know) it should be safe to directly make the pmd a huge pmd pointing to this hugepage. (2) collapse_file() is mostly serialized on the same file extent by lock sequence: | lock hupepage | lock mapping->i_pages | lock 1st page | unlock mapping->i_pages | | lock mapping->i_pages | page_ref_freeze(3) | xas_store(hugepage) | unlock mapping->i_pages | page_ref_unfreeze(1) | unlock 1st page V unlock hugepage Once a context (who already has their fresh hugepage locked) locks mapping->i_pages exclusively, it will hold said lock until it locks the first page, and it will hold that lock until the after the hugepage has been added to the page cache (and will unlock the hugepage after page table update, though that isn't important here). A racing context that loses the race for mapping->i_pages will then lose the race to locking the first page. Here - depending on how far the other racing context has gotten - we might find the new hugepage (in which case we'll exit cleanly when we check PageTransCompound()), or we'll find the "old" 1st small page (in which we'll exit cleanly when we discover unexpected refcount of 2 after isolate_lru_page()). This is assuming we are able to successfully lock the page we find - in shmem path, we could just fail the trylock and exit cleanly anyways. Failure path in collapse_file() is similar: once we hold lock on 1st small page, we are serialized against other collapse contexts. Before the 1st small page is unlocked, we add it back to the pagecache and unfreeze the refcount appropriately. 
Contexts who lost the race to the 1st small page will then find the same 1st small page with the correct refcount and will be able to proceed. [zokeefe@google.com: don't check pmd value twice in collapse_pte_mapped_thp()] Link: https://lkml.kernel.org/r/20220927033854.477018-1-zokeefe@google.com [shy828301@gmail.com: Delete hugepage_vma_revalidate_anon(), remove check for multi-add in khugepaged_add_pte_mapped_thp()] Link: https://lore.kernel.org/linux-mm/CAHbLzkrtpM=ic7cYAHcqkubah5VTR8N5=k5RT8MTvv5rN1Y91w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 384f034ae947..70162d707caf 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM -extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); #else -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } #endif @@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } static inline void khugepaged_min_free_kbytes_update(void) -- cgit v1.2.3 From 6b91e5dfb3c7ef485587e7ab494dcb47bcdadce3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 19:09:35 +0800 Subject: mm: remove unused inline functions from include/linux/mm_inline.h Remove the following unused inline functions from mm_inline.h: 1. All uses of add_page_to_lru_list_tail() have been removed since commit 7a3dbfe8a52b ("mm/swap: convert lru_deactivate_file to a folio_batch"), and it can be replaced by lruvec_add_folio_tail(). 2. All uses of __clear_page_lru_flags() have been removed since commit 188e8caee968 ("mm/swap: convert __page_cache_release() to use a folio"), and it can be replaced by __folio_clear_lru_flags(). They are useless, so remove them. 
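For reference, the folio-based replacements named above are drop-in: a caller that still holds a struct page first resolves the owning folio. A short hedged sketch (the caller is hypothetical, not taken from the tree):

/* Hypothetical caller, for illustration only. */
static void move_page_to_lru_tail(struct lruvec *lruvec, struct page *page)
{
	struct folio *folio = page_folio(page);	/* page -> owning folio */

	/* replaces the removed add_page_to_lru_list_tail(page, lruvec) */
	lruvec_add_folio_tail(lruvec, folio);
}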
Link: https://lkml.kernel.org/r/20220922110935.1495099-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4949eda9a9a2..e8ed225d8f7c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -76,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) __folio_clear_unevictable(folio); } -static __always_inline void __clear_page_lru_flags(struct page *page) -{ - __folio_clear_lru_flags(page_folio(page)); -} - /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. @@ -348,12 +343,6 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) list_add_tail(&folio->lru, &lruvec->lists[lru]); } -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec) -{ - lruvec_add_folio_tail(lruvec, page_folio(page)); -} - static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { -- cgit v1.2.3 From e55b9f96860f6c6026cff97966a740576285e07b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:04 -0400 Subject: mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol Since 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), CONFIG_MEMCG_SWAP hasn't been a user-visible config option anymore, it just means CONFIG_MEMCG && CONFIG_SWAP. Update the sites accordingly and drop the symbol. [ While touching the docs, remove two references to CONFIG_MEMCG_KMEM, which hasn't been a user-visible symbol for over half a decade. ] Link: https://lkml.kernel.org/r/20220926135704.400818-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- include/linux/swap_cgroup.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index fc8d98660326..a18cf4b7c724 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -666,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) cgroup_throttle_swaprate(&folio->page, gfp); } -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index a12dd1c3966c..ae73a87775b3 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); @@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif /* CONFIG_MEMCG_SWAP */ +#endif #endif /* __LINUX_SWAP_CGROUP_H */ -- cgit v1.2.3 From 8f824b4abd31c5ea32ae1d6725c47bdb247d18da Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 10:10:34 +0800 Subject: init.h: fix spelling typo in comment Fix spelling typo in comment. 
Link: https://lkml.kernel.org/r/20220905021034.947701-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010..3254766ebbf2 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -134,7 +134,7 @@ static inline initcall_t initcall_from_entry(initcall_entry_t *entry) extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; -/* Used for contructor calls. */ +/* Used for constructor calls. */ typedef void (*ctor_fn_t)(void); struct file_system_type; -- cgit v1.2.3

From 5ca14835dc429c09fefb290f60343fe266382760 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Sep 2022 13:57:41 -0700 Subject: fs: uninline inode_maybe_inc_iversion()

It has many callsites and is large.

	text	data	bss	dec	hex	filename
	91796	15984	512	108292	1a704	mm/shmem.o-before
	91180	15984	512	107676	1a49c	mm/shmem.o-after

Acked-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/iversion.h | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iversion.h b/include/linux/iversion.h index eb5a15810169..e27bd4f55d84 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -172,51 +172,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) I_VERSION_QUERIED); } -/** - * inode_maybe_inc_iversion - increments i_version - * @inode: inode with the i_version that should be updated - * @force: increment the counter even if it's not necessary? - * - * Every time the inode is modified, the i_version field must be seen to have - * changed by any observer. - * - * If "force" is set or the QUERIED flag is set, then ensure that we increment - * the value, and clear the queried flag. - * - * In the common case where neither is set, then we can return "false" without - * updating i_version. - * - * If this function returns false, and no other metadata has changed, then we - * can avoid logging the metadata. - */ -static inline bool -inode_maybe_inc_iversion(struct inode *inode, bool force) -{ - u64 cur, new; - - /* - * The i_version field is not strictly ordered with any other inode - * information, but the legacy inode_inc_iversion code used a spinlock - * to serialize increments. - * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. - * - * This barrier pairs with the barrier in inode_query_iversion() - */ - smp_mb(); - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is clear then we needn't do anything */ - if (!force && !(cur & I_VERSION_QUERIED)) - return false; - - /* Since lowest bit is flag, add 2 to avoid it */ - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return true; -} - +bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version -- cgit v1.2.3

From 5d0ce3595ab75330a15cec914096efbbb8b41e4a Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:37 +0800 Subject: percpu: add percpu_counter_add_local and percpu_counter_sub_local

Patch series "ipc/msg: mitigate the lock contention in ipc/msg", v6.

Here are two patches to mitigate the lock contention in ipc/msg.
The 1st patch is to add the new interfaces percpu_counter_add_local and percpu_counter_sub_local. The batch size in percpu_counter_add_batch should be very large in the heavy-write, rare-read case. Add the "_local" version; it will mostly do local adds, reducing the global updates and mitigating lock contention on writes.

The 2nd patch is to use percpu_counter instead of atomic updates in ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently updated when the IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Changing them to percpu_counter greatly improves the performance. Since there is one percpu struct per namespace, the additional memory cost is minimal. Reading of the count is done in the msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent.

This patch (of 2):

The batch size in percpu_counter_add_batch should be very large in the heavy-write, rare-read case. Add the "_local" version; it will mostly do local adds, reducing the global updates and mitigating lock contention on writes.

Link: https://lkml.kernel.org/r/20220913192538.3023708-1-jiebin.sun@intel.com Link: https://lkml.kernel.org/r/20220913192538.3023708-2-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79..8ed5fba6d156 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. + */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ -- cgit v1.2.3

From 72d1e611082eda18689106a0c192f2827072713c Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:38 +0800 Subject: ipc/msg: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently updated when the IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Changing them to percpu_counter greatly improves the performance. Since there is one percpu struct per namespace, the additional memory cost is minimal. Reading of the count is done in the msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent.

Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads).

Score gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads)

[akpm@linux-foundation.org: coding-style cleanups] [jiebin.sun@intel.com: avoid negative value by overflow in msginfo] Link: https://lkml.kernel.org/r/20220920150809.4014944-1-jiebin.sun@intel.com [akpm@linux-foundation.org: fix min() warnings] Link: https://lkml.kernel.org/r/20220913192538.3023708-3-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Signed-off-by: Andrew Morton --- include/linux/ipc_namespace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e3e8c8662b49..e8240cf2611a 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include struct user_namespace; @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; -- cgit v1.2.3

From 168723c1f8d6e1e823d4c6ad3cf64478cf58330a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 1 Oct 2022 21:56:22 -0700 Subject: net/mlx5e: xsk: Use umr_mode to calculate striding RQ parameters

Instead of passing the unaligned flag, pass an enum that indicates the UMR mode.
The next commit will add the third mode (KLM for certain configurations of XSK), which will be added to this enum instead of adding another bool flag everywhere. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f8ecb33105d3..285f301a6390 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1288,4 +1288,8 @@ static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) return mlx5_is_roce_on(dev); } +enum { + MLX5_OCTWORD = 16, +}; + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 16ab85e78439bab1201ff26ba430231d1574b4ae Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sat, 1 Oct 2022 21:56:27 -0700 Subject: net/mlx5e: Expose rx_oversize_pkts_buffer counter Add the rx_oversize_pkts_buffer counter to ethtool statistics. This counter exposes the number of dropped received packets due to length which arrived to RQ and exceed software buffer size allocated by the device for incoming traffic. It might imply that the device MTU is larger than the software buffers size. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1ad762e22d86..06574d430ff5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1491,7 +1491,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_120[0xa]; u8 log_max_ra_req_dc[0x6]; - u8 reserved_at_130[0x9]; + u8 reserved_at_130[0x2]; + u8 eth_wqe_too_small[0x1]; + u8 reserved_at_133[0x6]; u8 vnic_env_cq_overrun[0x1]; u8 log_max_ra_res_dc[0x6]; @@ -3537,7 +3539,9 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 cq_overrun[0x20]; - u8 reserved_at_220[0xde0]; + u8 eth_wqe_too_small[0x20]; + + u8 reserved_at_220[0xdc0]; }; struct mlx5_ifc_traffic_counter_bits { -- cgit v1.2.3 From 9b98d395b85dd042fe83fb696b1ac02e6c93a520 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 1 Oct 2022 21:56:28 -0700 Subject: net/mlx5: Start health poll at earlier stage of driver load Start health poll at earlier stage, so if fw fatal issue occurred before or during initialization commands such as init_hca or set_hca_cap the poll health can detect and indicate that the driver is already in error state. 
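A hedged sketch of the resulting load-time ordering; the function body and error handling are invented, and only the health-poll entry points plus the init_hca step are taken from this series:

/* Illustrative ordering only, not the actual mlx5 load path. */
static int hypothetical_mlx5_load(struct mlx5_core_dev *dev)
{
	int err;

	/* Poll for fw fatal conditions before issuing init commands ... */
	mlx5_start_health_poll(dev);

	err = init_hca(dev);	/* a fw fatal error here is now detected */
	if (err)
		goto err_stop_poll;

	/* ... and arm fw-driven reporting once the device is fully up. */
	mlx5_start_health_fw_log_up(dev);
	return 0;

err_stop_poll:
	mlx5_stop_health_poll(dev, false);
	return err;
}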
Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 285f301a6390..a12929bc31b2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); +void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, -- cgit v1.2.3 From 3114b075eb2531dea31a961944309485d6a53040 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:51:57 +0200 Subject: net: add framework to support Ethernet PSE and PDs devices This framework was create with intention to provide support for Ethernet PSE (Power Sourcing Equipment) and PDs (Powered Device). At current step this patch implements generic PSE support for PoDL (Power over Data Lines 802.3bu) specification with reserving name space for PD devices as well. This framework can be extended to support 802.3af and 802.3at "Power via the Media Dependent Interface" (or PoE/Power over Ethernet) Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 include/linux/pse-pd/pse.h (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h new file mode 100644 index 000000000000..3ba787a48b15 --- /dev/null +++ b/include/linux/pse-pd/pse.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* +// Copyright (c) 2022 Pengutronix, Oleksij Rempel + */ +#ifndef _LINUX_PSE_CONTROLLER_H +#define _LINUX_PSE_CONTROLLER_H + +#include +#include +#include + +struct module; +struct device_node; +struct of_phandle_args; +struct pse_control; + +/** + * struct pse_controller_dev - PSE controller entity that might + * provide multiple PSE controls + * @ops: a pointer to device specific struct pse_controller_ops + * @owner: kernel module of the PSE controller driver + * @list: internal list of PSE controller devices + * @pse_control_head: head of internal list of requested PSE controls + * @dev: corresponding driver model device struct + * @of_pse_n_cells: number of cells in PSE line specifiers + * @of_xlate: translation function to translate from specifier as found in the + * device tree to id as given to the PSE control ops + * @nr_lines: number of PSE controls in this controller device + * @lock: Mutex for serialization access to the PSE controller + */ +struct pse_controller_dev { + const struct pse_controller_ops *ops; + struct module *owner; + struct list_head list; + struct list_head pse_control_head; + struct device *dev; + int of_pse_n_cells; + int (*of_xlate)(struct pse_controller_dev *pcdev, + const struct of_phandle_args *pse_spec); + unsigned int nr_lines; + struct mutex lock; +}; + +#if IS_ENABLED(CONFIG_PSE_CONTROLLER) +int pse_controller_register(struct pse_controller_dev *pcdev); +void pse_controller_unregister(struct pse_controller_dev *pcdev); +struct device; +int devm_pse_controller_register(struct device 
*dev, + struct pse_controller_dev *pcdev); + +struct pse_control *of_pse_control_get(struct device_node *node); +void pse_control_put(struct pse_control *psec); + +#else + +static inline struct pse_control *of_pse_control_get(struct device_node *node) +{ + return ERR_PTR(-ENOENT); +} + +static inline void pse_control_put(struct pse_control *psec) +{ +} + +#endif + +#endif -- cgit v1.2.3 From 5e82147de1cbd758bb280908daa39d95ed467538 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:51:59 +0200 Subject: net: mdiobus: search for PSE nodes by parsing PHY nodes. Some PHYs can be linked with PSE (Power Sourcing Equipment), so search for related nodes and attach it to the phydev. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index d65fc76fe0ae..ddf66198f751 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -597,6 +597,7 @@ struct macsec_ops; * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks + * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @shared: Pointer to private data shared by phys in one package @@ -715,6 +716,7 @@ struct phy_device { struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; + struct pse_control *psec; u8 mdix; u8 mdix_ctrl; -- cgit v1.2.3 From 18ff0bcda6d1dd3d53b4ce3f03e61bf1a648f960 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:52:00 +0200 Subject: ethtool: add interface to interact with Ethernet Power Equipment Add interface to support Power Sourcing Equipment. At current step it provides generic way to address all variants of PSE devices as defined in IEEE 802.3-2018 but support only objects specified for IEEE 802.3-2018 104.4 PoDL Power Sourcing Equipment (PSE). Currently supported and mandatory objects are: IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl This is minimal interface needed to control PSE on each separate ethernet port but it provides not all mandatory objects specified in IEEE 802.3-2018. Since "PoDL PSE" and "PSE" have similar names, but some different values I decide to not merge them and keep separate naming schema. This should allow as to be as close to IEEE 802.3 spec as possible and avoid name conflicts in the future. This implementation is connected to PHYs instead of MACs because PSE auto classification can potentially interfere with PHY auto negotiation. So, may be some extra PHY related initialization will be needed. With WIP version of ethtools interaction with PSE capable link looks as following: $ ip l ... 5: t1l1@eth0: .. ... 
$ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: disabled PoDL PSE Power Detection Status: disabled $ ethtool --set-pse t1l1 podl-pse-admin-control enable $ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: enabled PoDL PSE Power Detection Status: delivering power Signed-off-by: kernel test robot Signed-off-by: Oleksij Rempel Reviewed-by: Bagas Sanjaya Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 3ba787a48b15..fd1a916eeeba 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -9,6 +9,47 @@ #include #include +struct phy_device; +struct pse_controller_dev; + +/** + * struct pse_control_config - PSE control/channel configuration. + * + * @admin_cotrol: set PoDL PSE admin control as described in + * IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl + */ +struct pse_control_config { + enum ethtool_podl_pse_admin_state admin_cotrol; +}; + +/** + * struct pse_control_status - PSE control/channel status. + * + * @podl_admin_state: operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @podl_pw_status: power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + */ +struct pse_control_status { + enum ethtool_podl_pse_admin_state podl_admin_state; + enum ethtool_podl_pse_pw_d_status podl_pw_status; +}; + +/** + * struct pse_controller_ops - PSE controller driver callbacks + * + * @ethtool_get_status: get PSE control status for ethtool interface + * @ethtool_set_config: set PSE control configuration over ethtool interface + */ +struct pse_controller_ops { + int (*ethtool_get_status)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + struct pse_control_status *status); + int (*ethtool_set_config)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + const struct pse_control_config *config); +}; + struct module; struct device_node; struct of_phandle_args; @@ -51,6 +92,13 @@ int devm_pse_controller_register(struct device *dev, struct pse_control *of_pse_control_get(struct device_node *node); void pse_control_put(struct pse_control *psec); +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status); +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config); + #else static inline struct pse_control *of_pse_control_get(struct device_node *node) @@ -62,6 +110,20 @@ static inline void pse_control_put(struct pse_control *psec) { } +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) +{ + return -ENOTSUPP; +} + +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) +{ + return -ENOTSUPP; +} + #endif #endif -- cgit v1.2.3 From 2a4187f4406ec3236f8b9d0d5150d2bf8d021b68 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 3 Oct 2022 20:14:13 +0200 Subject: once: rename _SLOW to _SLEEPABLE The _SLOW designation wasn't really descriptive of anything. This is meant to be called from process context when it's possible to sleep. 
So name this more aptly _SLEEPABLE, which better fits its intended use. Fixes: 62c07983bef9 ("once: add DO_ONCE_SLOW() for sleepable contexts") Cc: Christophe Leroy Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221003181413.1221968-1-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- include/linux/once.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index 176ab75b42df..bc714d414448 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -13,9 +13,9 @@ void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod); /* Variant for process contexts only. */ -bool __do_once_slow_start(bool *done); -void __do_once_slow_done(bool *done, struct static_key_true *once_key, - struct module *mod); +bool __do_once_sleepable_start(bool *done); +void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, + struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -61,26 +61,26 @@ void __do_once_slow_done(bool *done, struct static_key_true *once_key, }) /* Variant of DO_ONCE() for process/sleepable contexts. */ -#define DO_ONCE_SLOW(func, ...) \ - ({ \ - bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ - static DEFINE_STATIC_KEY_TRUE(___once_key); \ - if (static_branch_unlikely(&___once_key)) { \ - ___ret = __do_once_slow_start(&___done); \ - if (unlikely(___ret)) { \ - func(__VA_ARGS__); \ - __do_once_slow_done(&___done, &___once_key, \ - THIS_MODULE); \ - } \ - } \ - ___ret; \ +#define DO_ONCE_SLEEPABLE(func, ...) \ + ({ \ + bool ___ret = false; \ + static bool __section(".data.once") ___done = false; \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ + ___ret = __do_once_sleepable_start(&___done); \ + if (unlikely(___ret)) { \ + func(__VA_ARGS__); \ + __do_once_sleepable_done(&___done, &___once_key,\ + THIS_MODULE); \ + } \ + } \ + ___ret; \ }) #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) -#define get_random_slow_once(buf, nbytes) \ - DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes)) +#define get_random_sleepable_once(buf, nbytes) \ + DO_ONCE_SLEEPABLE(get_random_bytes, (buf), (nbytes)) #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From d7915651fe9a6474e06161c34c637e6aeb811e9d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Wed, 14 Sep 2022 16:47:42 +0200 Subject: clk: introduce (devm_)hw_register_mux_parent_data_table API Introduce (devm_)hw_register_mux_parent_data_table new API. We have basic support for clk_register_mux using parent_data but we lack any API to provide a custom parent_map. Add these 2 new API to correctly handle these special configuration instead of using the generic __(devm_)clk_hw_register_mux API. 
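A hedged usage sketch of the new helper; the clock names, register offset, and mux field values below are invented, and the argument order follows the macro added by this patch:

/* Parents described via clk_parent_data instead of global string names. */
static const struct clk_parent_data mux_parents[] = {
	{ .fw_name = "xo",    .name = "xo_board" },
	{ .fw_name = "gpll0", .name = "gpll0" },
};

/* HW field values that do not match the array indices, hence the table. */
static const u32 mux_parent_map[] = { 0x0, 0x3 };

hw = devm_clk_hw_register_mux_parent_data_table(dev, "sys_mux",
		mux_parents, ARRAY_SIZE(mux_parents), 0 /* flags */,
		base + 0x10 /* reg */, 0 /* shift */, 2 /* width */,
		0 /* clk_mux_flags */, mux_parent_map, &mux_lock);
if (IS_ERR(hw))
	return PTR_ERR(hw);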
Signed-off-by: Christian Marangi Link: https://lore.kernel.org/r/20220914144743.17369-1-ansuelsmth@gmail.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1615010aa0ec..65b70f0d62c5 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -974,6 +974,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, __clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, NULL, \ (parent_data), (flags), (reg), (shift), \ BIT((width)) - 1, (clk_mux_flags), NULL, (lock)) +#define clk_hw_register_mux_parent_data_table(dev, name, parent_data, \ + num_parents, flags, reg, shift, \ + width, clk_mux_flags, table, \ + lock) \ + __clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, NULL, \ + (parent_data), (flags), (reg), (shift), \ + BIT((width)) - 1, (clk_mux_flags), table, (lock)) #define devm_clk_hw_register_mux(dev, name, parent_names, num_parents, flags, reg, \ shift, width, clk_mux_flags, lock) \ __devm_clk_hw_register_mux((dev), NULL, (name), (num_parents), \ @@ -987,6 +994,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, (parent_hws), NULL, (flags), (reg), \ (shift), BIT((width)) - 1, \ (clk_mux_flags), NULL, (lock)) +#define devm_clk_hw_register_mux_parent_data_table(dev, name, parent_data, \ + num_parents, flags, reg, shift, \ + width, clk_mux_flags, table, \ + lock) \ + __devm_clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, \ + NULL, (parent_data), (flags), (reg), (shift), \ + BIT((width)) - 1, (clk_mux_flags), table, (lock)) int clk_mux_val_to_index(struct clk_hw *hw, const u32 *table, unsigned int flags, unsigned int val); -- cgit v1.2.3 From 681bf011b9b5989c6e9db6beb64494918aab9a43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Oct 2022 21:03:27 -0700 Subject: eth: pse: add missing static inlines build bot reports missing 'static inline' qualifiers in the header. 
Reported-by: kernel test robot Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Oleksij Rempel Link: https://lore.kernel.org/r/20221004040327.2034878-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index fd1a916eeeba..fb724c65c77b 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -110,16 +110,16 @@ static inline void pse_control_put(struct pse_control *psec) { } -int pse_ethtool_get_status(struct pse_control *psec, - struct netlink_ext_ack *extack, - struct pse_control_status *status) +static inline int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) { return -ENOTSUPP; } -int pse_ethtool_set_config(struct pse_control *psec, - struct netlink_ext_ack *extack, - const struct pse_control_config *config) +static inline int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) { return -ENOTSUPP; } -- cgit v1.2.3 From 5d10f480f77b67332e4835ad565bfe4cb8528159 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 18 Aug 2022 10:23:16 +0200 Subject: thermal/of: Remove the thermal_zone_of_get_sensor_id() function The function thermal_zone_of_get_sensor_id() is no longer used anywhere, remove it. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220818082316.2717095-2-daniel.lezcano@linaro.org --- include/linux/thermal.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6f1ec4fb7ef8..9ecc128944a1 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -308,9 +308,6 @@ void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_dev void thermal_of_zone_unregister(struct thermal_zone_device *tz); -int thermal_zone_of_get_sensor_id(struct device_node *tz_np, - struct device_node *sensor_np, - u32 *id); #else static inline struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data, @@ -334,13 +331,6 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz) { } - -static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np, - struct device_node *sensor_np, - u32 *id) -{ - return -ENOENT; -} #endif #ifdef CONFIG_THERMAL -- cgit v1.2.3 From 0ce38047e82a02017839b6cae837f13a1383a3a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 10:46:58 +0200 Subject: perf: Fix lockdep_assert_event_ctx() I'm a flamin' moron; because even after Mark told me it should be '&&' I still got it wrong in the final commit. 
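The intended invariant is that a caller either runs with hardirqs disabled (e.g. interrupt context) or holds ctx->mutex; the assertion should therefore warn only when hardirqs are enabled and the mutex is not held. A standalone model of the two conditions:

#include <stdbool.h>
#include <stdio.h>

/* true means the WARN_ON_ONCE() would fire */
static bool warn_buggy(bool hardirqs_enabled, bool ctx_mutex_held)
{
	return hardirqs_enabled || !ctx_mutex_held;	/* the '||' version */
}

static bool warn_fixed(bool hardirqs_enabled, bool ctx_mutex_held)
{
	return hardirqs_enabled && !ctx_mutex_held;	/* warn only when the
							   mutex is missing */
}

int main(void)
{
	/* A legitimate caller: process context holding ctx->mutex. */
	printf("buggy=%d fixed=%d\n",
	       warn_buggy(true, true), warn_fixed(true, true)); /* 1 vs 0 */
	return 0;
}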
Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Borislav Petkov Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov Link: https://lkml.kernel.org/r/YvvIWmDBWdIUCMZj@FVFF77S0Q05N --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e9b151cde491..853f64b6c8c2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -644,7 +644,7 @@ struct pmu_event_list { #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ - (this_cpu_read(hardirqs_enabled) || \ + (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) -- cgit v1.2.3 From 4c99256013fa4e0fe9733ca1bab2b5684ccc02a1 Mon Sep 17 00:00:00 2001 From: Raul E Rangel Date: Thu, 29 Sep 2022 10:19:09 -0600 Subject: gpiolib: acpi: Add wake_capable variants of acpi_dev_gpio_irq_get The ACPI spec defines the SharedAndWake and ExclusiveAndWake share type keywords. This is an indication that the GPIO IRQ can also be used as a wake source. This change exposes the wake_capable bit so drivers can correctly enable wake functionality instead of making an assumption. Signed-off-by: Raul E Rangel Reviewed-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6f64b2f3dc54..cd7371a5f283 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1202,7 +1202,8 @@ bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); -int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index); +int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index, + bool *wake_capable); #else static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) @@ -1214,16 +1215,28 @@ static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares, { return false; } -static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, - const char *name, int index) +static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, + int index, bool *wake_capable) { return -ENXIO; } #endif +static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index, + bool *wake_capable) +{ + return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable); +} + +static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, + int index) +{ + return acpi_dev_gpio_irq_wake_get_by(adev, name, index, NULL); +} + static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { - return acpi_dev_gpio_irq_get_by(adev, NULL, index); + return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL); } /* Device properties */ -- cgit v1.2.3 From 5ff811604f93bdd2650beed80b48c2ca16c6fba6 Mon Sep 17 00:00:00 2001 From: Raul E Rangel Date: Thu, 29 Sep 2022 10:19:10 -0600 Subject: ACPI: resources: Add wake_capable parameter to acpi_dev_irq_flags ACPI IRQ/Interrupt resources contain a bit that describes if the interrupt should wake the system. 
This change exposes that bit via a new IORESOURCE_IRQ_WAKECAPABLE flag. Drivers should check this flag before arming an IRQ to wake the system. Signed-off-by: Raul E Rangel Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 +- include/linux/ioport.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index cd7371a5f283..ea2efbdbeee5 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -495,7 +495,7 @@ bool acpi_dev_resource_address_space(struct acpi_resource *ares, struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, struct resource_win *win); -unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); +unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable); unsigned int acpi_dev_get_irq_type(int triggering, int polarity); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 616b683563a9..3baeea4d903b 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -79,7 +79,8 @@ struct resource { #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) #define IORESOURCE_IRQ_SHAREABLE (1<<4) -#define IORESOURCE_IRQ_OPTIONAL (1<<5) +#define IORESOURCE_IRQ_OPTIONAL (1<<5) +#define IORESOURCE_IRQ_WAKECAPABLE (1<<6) /* PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) -- cgit v1.2.3 From e7fd8b68404f3d8bc03f85bc3c078d67096be06f Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 29 Sep 2022 15:05:23 +0800 Subject: kernel/reboot: Add SYS_OFF_MODE_RESTART_PREPARE mode Add SYS_OFF_MODE_RESTART_PREPARE callbacks to be invoked before a system restart. Suggested-by: Dmitry Osipenko Reviewed-by: Dmitry Osipenko Signed-off-by: Kai-Heng Feng [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e5d9ef886179..2b6bb593be5b 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -105,6 +105,14 @@ enum sys_off_mode { */ SYS_OFF_MODE_POWER_OFF, + /** + * @SYS_OFF_MODE_RESTART_PREPARE: + * + * Handlers prepare system to be restarted. Handlers are + * allowed to sleep. + */ + SYS_OFF_MODE_RESTART_PREPARE, + /** * @SYS_OFF_MODE_RESTART: * -- cgit v1.2.3 From 0e0abad2a71bcd7ba0f30e7975f5b4199ade4e60 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Oct 2022 14:39:10 +0200 Subject: io_uring: Add missing inline to io_uring_cmd_import_fixed() dummy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_IO_URING is not set: include/linux/io_uring.h:65:12: error: ‘io_uring_cmd_import_fixed’ defined but not used [-Werror=unused-function] 65 | static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, | ^~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by adding the missing "inline" keyword. 
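The underlying rule: a plain static function defined in a header becomes a separate, file-local definition in every translation unit that includes it, and -Wunused-function fires in each unit that never calls it; static inline functions are exempt from that warning. A minimal reproduction sketch with hypothetical file names:

/* stub.h - mimics the !CONFIG_IO_URING stub */
#ifndef STUB_H
#define STUB_H

static int stub_plain(void)		/* warns: defined but not used */
{
	return -1;
}

static inline int stub_inline(void)	/* no warning */
{
	return -1;
}

#endif

/*
 * user.c - includes the header but calls neither function;
 * "gcc -Wall -c user.c" warns about stub_plain only.
 */
#include "stub.h"

int main(void)
{
	return 0;
}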
Fixes: a9216fac3ed8819c ("io_uring: add io_uring_cmd_import_fixed") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/7404b4a696f64e33e5ef3c5bd3754d4f26d13e50.1664887093.git.geert+renesas@glider.be Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e10c5cc81082..43bc8a2edccf 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -62,7 +62,7 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else -static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) { return -EOPNOTSUPP; -- cgit v1.2.3 From da4ab869e37cf81f93333ba74b16e0ea6d322e15 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 25 May 2022 06:11:00 -0400 Subject: libceph: drop last_piece flag from ceph_msg_data_cursor ceph_msg_data_next is always passed a NULL pointer for this field. Some of the "next" operations look at it in order to determine the length, but we can just take the min of the data on the page or cursor->resid. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e7f2fb2fc207..99c1726be6ee 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -207,7 +207,6 @@ struct ceph_msg_data_cursor { struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ - bool last_piece; /* current is last piece */ bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK @@ -498,8 +497,7 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, struct ceph_msg *msg, size_t length); struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece); + size_t *page_offset, size_t *length); void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, -- cgit v1.2.3 From bdef2b7896df293736330eb6eb0f43947049b828 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:41 +0200 Subject: vfio/mdev: make mdev.h standalone includable Include and so that users of this headers don't need to do that and remove those includes that aren't needed any more. 
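Judging by the types mdev.h itself uses (an embedded struct device and a guid_t), the two includes added below are presumably linux/device.h and linux/uuid.h; that is an inference, not part of the patch text. The general pattern of a standalone header, as a sketch:

/* foo.h - a self-contained header pulls in what its own types need. */
#ifndef FOO_H
#define FOO_H

#include <linux/device.h>	/* for the embedded struct device */
#include <linux/uuid.h>		/* for guid_t */

struct foo_device {
	struct device dev;
	guid_t uuid;
};

#endif /* FOO_H */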
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-4-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 47ad3b104d9e..a5d8ae6132a2 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,6 +10,9 @@ #ifndef MDEV_H #define MDEV_H +#include <linux/device.h> +#include <linux/uuid.h> + struct mdev_type; struct mdev_device { -- cgit v1.2.3 From 89345d5177aa0f6d678251e1e0870b0eeb1ab510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:42 +0200 Subject: vfio/mdev: embed struct mdev_parent in the parent data structure Simplify mdev_{un}register_device by requiring the caller to pass in a structure allocated as part of the parent device structure. This removes the need for a list of parents and the separate mdev_parent refcount, as we can simply rely on the reference to the parent device. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-5-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index a5d8ae6132a2..262512c2a8ff 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,6 +23,16 @@ struct mdev_device { bool active; }; +/* embedded into the struct device that the mdev devices hang off */ +struct mdev_parent { + struct device *dev; + struct mdev_driver *mdev_driver; + struct kset *mdev_types_kset; + struct list_head type_list; + /* Synchronize device creation/removal with parent unregistration */ + struct rw_semaphore unreg_sem; +}; + static inline struct mdev_device *to_mdev_device(struct device *dev) { return container_of(dev, struct mdev_device, dev); @@ -70,8 +80,9 @@ struct mdev_driver { extern struct bus_type mdev_bus_type; -int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver); -void mdev_unregister_device(struct device *dev); +int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver); +void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); -- cgit v1.2.3 From da44c340c4fe9d9653ae84fa6a60f406bafcffce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:43 +0200 Subject: vfio/mdev: simplify mdev_type handling Instead of abusing struct attribute_group to control initialization of struct mdev_type, just define the actual attributes in the mdev_driver, allocate the mdev_type structures in the caller and pass them to mdev_register_parent. This allows the caller to use container_of to get at the containing structure and thus significantly simplify the code.
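As a rough sketch of the resulting pattern (my_device and my_mdev_driver are made-up names, not from the patch), a driver now embeds the parent state and recovers its own structure with container_of():

    struct my_device {
        struct device *dev;
        struct mdev_parent parent;   /* embedded, no separate allocation */
    };

    static struct my_device *to_my_device(struct mdev_parent *p)
    {
        return container_of(p, struct my_device, parent);
    }

    /* registration per the prototype added above */
    err = mdev_register_parent(&mydev->parent, mydev->dev, &my_mdev_driver);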
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-6-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 262512c2a8ff..19bc93c10e8c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,14 +23,27 @@ struct mdev_device { bool active; }; +struct mdev_type { + /* set by the driver before calling mdev_register parent: */ + const char *sysfs_name; + + /* set by the core, can be used drivers */ + struct mdev_parent *parent; + + /* internal only */ + struct kobject kobj; + struct kobject *devices_kobj; +}; + /* embedded into the struct device that the mdev devices hang off */ struct mdev_parent { struct device *dev; struct mdev_driver *mdev_driver; struct kset *mdev_types_kset; - struct list_head type_list; /* Synchronize device creation/removal with parent unregistration */ struct rw_semaphore unreg_sem; + struct mdev_type **types; + unsigned int nr_types; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -38,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -unsigned int mdev_get_type_group_id(struct mdev_device *mdev); -unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); /* interface for exporting mdev supported type attributes */ @@ -66,22 +77,21 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ * struct mdev_driver - Mediated device driver * @probe: called when new device created * @remove: called when device removed - * @supported_type_groups: Attributes to define supported types. It is mandatory - * to provide supported types. + * @types_attrs: attributes to the type kobjects. * @driver: device driver structure - * **/ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); - struct attribute_group **supported_type_groups; + const struct attribute * const *types_attrs; struct device_driver driver; }; extern struct bus_type mdev_bus_type; int mdev_register_parent(struct mdev_parent *parent, struct device *dev, - struct mdev_driver *mdev_driver); + struct mdev_driver *mdev_driver, struct mdev_type **types, + unsigned int nr_types); void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); -- cgit v1.2.3 From cbf3bb28aaeaee425ca7b9c537a3efff1f8c98ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:44 +0200 Subject: vfio/mdev: remove mdev_from_dev Just open code it in the only caller. 
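The removed helper (its body is visible in the diff below) boils down to a bus check that the sole caller can perform directly; conceptually:

    struct mdev_device *mdev = NULL;

    if (dev->bus == &mdev_bus_type)
        mdev = to_mdev_device(dev);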
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-7-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 19bc93c10e8c..4f558de52fd9 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -102,9 +102,5 @@ static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; } -static inline struct mdev_device *mdev_from_dev(struct device *dev) -{ - return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL; -} #endif /* MDEV_H */ -- cgit v1.2.3 From 2815fe149ffa8e1a022b2830ab62999135c00a4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:45 +0200 Subject: vfio/mdev: unexport mdev_bus_type mdev_bus_type is only used in mdev.ko now, so unexport it. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-8-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 4f558de52fd9..6c179d2b8927 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -87,8 +87,6 @@ struct mdev_driver { struct device_driver driver; }; -extern struct bus_type mdev_bus_type; - int mdev_register_parent(struct mdev_parent *parent, struct device *dev, struct mdev_driver *mdev_driver, struct mdev_type **types, unsigned int nr_types); -- cgit v1.2.3 From 062e720cd209d8091c4f3d118d93973f02209aca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:46 +0200 Subject: vfio/mdev: remove mdev_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-9-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 6c179d2b8927..bbedffcb38d4 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -95,7 +95,6 @@ void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); -struct device *mdev_parent_dev(struct mdev_device *mdev); static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; -- cgit v1.2.3 From c7c1f38f6cba7e3249866c06639ea62755f0a24e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:47 +0200 Subject: vfio/mdev: remove mtype_get_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Jason J. 
Herne Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-10-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index bbedffcb38d4..e445f809ceca 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -51,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -struct device *mtype_get_parent_dev(struct mdev_type *mtype); - /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; -- cgit v1.2.3 From 290aac5df88a83e264b3a73ec146e5e5b3c45793 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Sep 2022 11:26:48 +0200 Subject: vfio/mdev: consolidate all the device_api sysfs into the core code Every driver just emits a static string, simply feed it through the ops and provide a standard sysfs show function. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-11-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index e445f809ceca..af1ff0165b8d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -61,11 +61,6 @@ struct mdev_type_attribute { size_t count); }; -#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ -struct mdev_type_attribute mdev_type_attr_##_name = \ - __ATTR(_name, _mode, _show, _store) -#define MDEV_TYPE_ATTR_RW(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name) #define MDEV_TYPE_ATTR_RO(_name) \ struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) #define MDEV_TYPE_ATTR_WO(_name) \ @@ -73,12 +68,14 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ /** * struct mdev_driver - Mediated device driver + * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @types_attrs: attributes to the type kobjects. * @driver: device driver structure **/ struct mdev_driver { + const char *device_api; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); const struct attribute * const *types_attrs; -- cgit v1.2.3 From 0bc79069ccbdbe26492493dd0c4e38b7cadf8ad5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:49 +0200 Subject: vfio/mdev: consolidate all the name sysfs into the core code Every driver just emits a static string, simply add a field to the mdev_type for the driver to fill out or fall back to the sysfs name and provide a standard sysfs show function. 
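A plausible shape of the consolidated core show function, assuming the mdev_type_attribute interface still present at this point in the series (a sketch, not the literal patch):

    static ssize_t name_show(struct mdev_type *mtype,
                             struct mdev_type_attribute *attr, char *buf)
    {
        /* fall back to the sysfs name when no pretty name is set */
        return sysfs_emit(buf, "%s\n",
                          mtype->pretty_name ?: mtype->sysfs_name);
    }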
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-12-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index af1ff0165b8d..4bb8a58b577b 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -26,6 +26,7 @@ struct mdev_device { struct mdev_type { /* set by the driver before calling mdev_register parent: */ const char *sysfs_name; + const char *pretty_name; /* set by the core, can be used drivers */ struct mdev_parent *parent; -- cgit v1.2.3 From f2fbc72e6da4f8e01fe5fe3d6871a791e76271c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:50 +0200 Subject: vfio/mdev: consolidate all the available_instance sysfs into the core code Every driver just prints a number; simply add a method to the mdev_driver to return it and provide a standard sysfs show function.
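A hypothetical driver-side callback for the new method (field names invented for illustration):

    static unsigned int my_mdev_get_available(struct mdev_type *mtype)
    {
        struct my_device *md = to_my_device(mtype->parent);

        return md->total_instances - md->used_instances;
    }

    /* wired up via: .get_available = my_mdev_get_available, */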
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-14-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index d39e08a1824c..33674cb5ed5d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -52,28 +52,13 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -/* interface for exporting mdev supported type attributes */ -struct mdev_type_attribute { - struct attribute attr; - ssize_t (*show)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf); - ssize_t (*store)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, const char *buf, - size_t count); -}; - -#define MDEV_TYPE_ATTR_RO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) -#define MDEV_TYPE_ATTR_WO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) - /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created - * @types_attrs: attributes to the type kobjects. + * @show_description: Print a description of the mtype * @driver: device driver structure **/ struct mdev_driver { @@ -81,7 +66,7 @@ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); - const struct attribute * const *types_attrs; + ssize_t (*show_description)(struct mdev_type *mtype, char *buf); struct device_driver driver; }; -- cgit v1.2.3 From 9c799c224d6ebc5be51065bd3217a2d7eea23b8f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Sep 2022 11:26:52 +0200 Subject: vfio/mdev: add mdev available instance checking to the core Many of the mdev drivers use a simple counter for keeping track of the available instances. Move this code to the core code and store the counter in the mdev_parent. Implement it using correct locking, fixing mdpy. Drivers just provide the value in the mdev_driver at registration time and the core code takes care of maintaining it and exposing the value in sysfs. 
[hch: count instances per-parent instead of per-type, use an atomic_t to avoid taking mdev_list_lock in the show method] Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-15-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 33674cb5ed5d..139d05b26f82 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -45,6 +45,7 @@ struct mdev_parent { struct rw_semaphore unreg_sem; struct mdev_type **types; unsigned int nr_types; + atomic_t available_instances; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -55,6 +56,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs + * @max_instances: maximum number of instances supported (optional) * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created @@ -63,6 +65,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) **/ struct mdev_driver { const char *device_api; + unsigned int max_instances; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); -- cgit v1.2.3 From 34e1ed189fab1b7533befb266b96051104c1deb6 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 8 Aug 2022 19:40:38 +0200 Subject: PM: Improve EXPORT_*_DEV_PM_OPS macros Update the _EXPORT_DEV_PM_OPS() internal macro. It was not used anywhere outside pm.h and pm_runtime.h, so it is safe to update it. Before, this macro would take a few parameters to be used as sleep and runtime callbacks. This made it unsuitable to use with different callbacks, for instance the "noirq" ones. It is now semantically different: instead of creating a conditionally exported dev_pm_ops structure, it only contains part of the definition. This macro should however never be used directly (hence the trailing underscore). Instead, the following four macros are provided: - EXPORT_DEV_PM_OPS(name) - EXPORT_GPL_DEV_PM_OPS(name) - EXPORT_NS_DEV_PM_OPS(name, ns) - EXPORT_NS_GPL_DEV_PM_OPS(name, ns) For instance, it is now possible to conditionally export noirq suspend/resume PM functions like this: EXPORT_GPL_DEV_PM_OPS(foo_pm_ops) = { NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) }; The existing helper macros EXPORT_*_SIMPLE_DEV_PM_OPS() and EXPORT_*_RUNTIME_DEV_PM_OPS() have been updated to use these new macros. Signed-off-by: Paul Cercueil Reviewed-by: Jonathan Cameron Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 37 +++++++++++++++++++++++-------------- include/linux/pm_runtime.h | 20 ++++++++++++-------- 2 files changed, 35 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 871c9c49ec9d..93cd34f00822 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -375,19 +375,20 @@ const struct dev_pm_ops name = { \ } #ifdef CONFIG_PM -#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec, ns) \ - _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn); \ - __EXPORT_SYMBOL(name, sec, ns) +#define _EXPORT_DEV_PM_OPS(name, sec, ns) \ + const struct dev_pm_ops name; \ + __EXPORT_SYMBOL(name, sec, ns); \ + const struct dev_pm_ops name #else -#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec, ns) \ -static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ - resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn) +#define _EXPORT_DEV_PM_OPS(name, sec, ns) \ + static __maybe_unused const struct dev_pm_ops __static_##name #endif +#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "") +#define EXPORT_GPL_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "_gpl", "") +#define EXPORT_NS_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "", #ns) +#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "_gpl", #ns) + /* * Use this if you want to use the same suspend and resume callbacks for suspend * to RAM and hibernation. @@ -399,13 +400,21 @@ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL) #define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", "") + EXPORT_DEV_PM_OPS(name) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } #define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", "") + EXPORT_GPL_DEV_PM_OPS(name) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } #define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", #ns) + EXPORT_NS_DEV_PM_OPS(name, ns) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } #define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", #ns) + EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + } /* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. 
*/ #define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0a41b2dcccad..9a8151a2bdea 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -40,17 +40,21 @@ resume_fn, idle_fn) #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "", "") + EXPORT_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "_gpl", "") + EXPORT_GPL_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "", #ns) + EXPORT_NS_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ - _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ suspend_fn, resume_fn, idle_fn, "_gpl", #ns) + EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; -- cgit v1.2.3 From a9cfee0ef98e99c8b1951dfd1d57a88580354d0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Sep 2022 23:38:53 +0800 Subject: f2fs: support recording stop_checkpoint reason into super_block This patch adds support for recording the stop_checkpoint reason into f2fs_super_block.s_stop_reason[]. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d445150c5350..5dd1e52b8997 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -73,6 +73,20 @@ struct f2fs_device { __le32 total_segments; } __packed; +/* reason of stop_checkpoint */ +enum stop_cp_reason { + STOP_CP_REASON_SHUTDOWN, + STOP_CP_REASON_FAULT_INJECT, + STOP_CP_REASON_META_PAGE, + STOP_CP_REASON_WRITE_FAIL, + STOP_CP_REASON_CORRUPTED_SUMMARY, + STOP_CP_REASON_UPDATE_INODE, + STOP_CP_REASON_FLUSH_FAIL, + STOP_CP_REASON_MAX, +}; + +#define MAX_STOP_REASON 32 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -116,7 +130,8 @@ struct f2fs_super_block { __u8 hot_ext_count; /* # of hot file extension */ __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __u8 reserved[306]; /* valid reserved region */ + __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ + __u8 reserved[274]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; -- cgit v1.2.3 From 95fa90c9e5a7f14c2497d5b032544478c9377c3a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Sep 2022 23:38:54 +0800 Subject: f2fs: support recording errors into superblock This patch adds support for recording the detailed reason of an FSCORRUPTED error into f2fs_super_block.s_errors[].
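A minimal sketch of how such an error might be latched into the on-disk field, assuming a byte-wise bitmap encoding of enum f2fs_error; the actual f2fs helper and persistence path may differ:

    static void my_record_error(struct f2fs_super_block *sb,
                                enum f2fs_error err)
    {
        sb->s_errors[err / 8] |= 1 << (err % 8);
        /* the superblock must then be written back, e.g. at checkpoint */
    }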
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5dd1e52b8997..ee0d75d9a302 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -87,6 +87,28 @@ enum stop_cp_reason { #define MAX_STOP_REASON 32 +/* detail reason for EFSCORRUPTED */ +enum f2fs_error { + ERROR_CORRUPTED_CLUSTER, + ERROR_FAIL_DECOMPRESSION, + ERROR_INVALID_BLKADDR, + ERROR_CORRUPTED_DIRENT, + ERROR_CORRUPTED_INODE, + ERROR_INCONSISTENT_SUMMARY, + ERROR_INCONSISTENT_FOOTER, + ERROR_INCONSISTENT_SUM_TYPE, + ERROR_CORRUPTED_JOURNAL, + ERROR_INCONSISTENT_NODE_COUNT, + ERROR_INCONSISTENT_BLOCK_COUNT, + ERROR_INVALID_CURSEG, + ERROR_INCONSISTENT_SIT, + ERROR_CORRUPTED_VERITY_XATTR, + ERROR_CORRUPTED_XATTR, + ERROR_MAX, +}; + +#define MAX_F2FS_ERRORS 16 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -131,7 +153,8 @@ struct f2fs_super_block { __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ - __u8 reserved[274]; /* valid reserved region */ + __u8 s_errors[MAX_F2FS_ERRORS]; /* reason of image corrupts */ + __u8 reserved[258]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; -- cgit v1.2.3 From f1375ec1df09fb09fecfdf9418d3784e7ba12469 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Wed, 17 Aug 2022 11:46:27 +0800 Subject: cpufreq: amd-pstate: Expose struct amd_cpudata Expose struct amd_cpudata to AMD P-State unit test module. This data struct will be used by the following AMD P-State unit test (amd-pstate-ut) module. The amd-pstate-ut module can get some AMD information from this data struct. For example: highest perf, nominal perf, boost supported, etc. Signed-off-by: Meng Li Acked-by: Huang Rui Acked-by: Shuah Khan Signed-off-by: Shuah Khan --- include/linux/amd-pstate.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/linux/amd-pstate.h (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h new file mode 100644 index 000000000000..1c4b8659f171 --- /dev/null +++ b/include/linux/amd-pstate.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/include/linux/amd-pstate.h + * + * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ * + * Author: Meng Li + */ + +#ifndef _LINUX_AMD_PSTATE_H +#define _LINUX_AMD_PSTATE_H + +#include <linux/pm_qos.h> + +/********************************************************************* + * AMD P-state INTERFACE * + *********************************************************************/ +/** + * struct amd_aperf_mperf + * @aperf: actual performance frequency clock count + * @mperf: maximum performance frequency clock count + * @tsc: time stamp counter + */ +struct amd_aperf_mperf { + u64 aperf; + u64 mperf; + u64 tsc; +}; + +/** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number + * @req: constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor + * @max_freq: the frequency that mapped to highest_perf + * @min_freq: the frequency that mapped to lowest_perf + * @nominal_freq: the frequency that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf + * @cur: Difference of Aperf/Mperf/tsc count between last and current sample + * @prev: Last Aperf/Mperf/tsc count value read from register + * @freq: current cpu frequency value + * @boost_supported: check whether the Processor or SBIOS supports boost mode + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. + */ +struct amd_cpudata { + int cpu; + + struct freq_qos_request req[2]; + u64 cppc_req_cached; + + u32 highest_perf; + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; + + u32 max_freq; + u32 min_freq; + u32 nominal_freq; + u32 lowest_nonlinear_freq; + + struct amd_aperf_mperf cur; + struct amd_aperf_mperf prev; + + u64 freq; + bool boost_supported; +}; + +#endif /* _LINUX_AMD_PSTATE_H */ -- cgit v1.2.3 From 07d2872bf4c864eb83d034263c155746a2fb7a3b Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Wed, 28 Sep 2022 12:57:44 +0300 Subject: mmc: core: Add SD card quirk for broken discard Some SD-cards from Sandisk that are SDA-6.0 compliant report that they support discard, while they actually don't. This might cause mke2fs to fail while trying to format the card and revert it to read-only mode. To fix this problem, let's add a card quirk (MMC_QUIRK_BROKEN_SD_DISCARD) to indicate that we shall fall back to using the legacy erase command instead.
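An illustrative gate in the discard path; SD_ERASE_ARG/SD_DISCARD_ARG and card->erase_arg come from the mmc core, but treat this as a sketch rather than the exact upstream change:

    unsigned int arg = card->erase_arg;

    if ((card->quirks & MMC_QUIRK_BROKEN_SD_DISCARD) &&
        arg == SD_DISCARD_ARG)
        arg = SD_ERASE_ARG;   /* fall back to legacy erase */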
Signed-off-by: Avri Altman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220928095744.16455-1-avri.altman@wdc.com [Ulf: Updated the commit message] Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 8a30de08e913..c726ea781255 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -293,6 +293,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_IRQ_POLLING (1<<11) /* Polling SDIO_CCCR_INTx could create a fake interrupt */ #define MMC_QUIRK_TRIM_BROKEN (1<<12) /* Skip trim */ #define MMC_QUIRK_BROKEN_HPI (1<<13) /* Disable broken HPI support */ +#define MMC_QUIRK_BROKEN_SD_DISCARD (1<<14) /* Disable broken SD discard support */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From 90d482908eedd56f01a707325aa541cf9c40f936 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Oct 2022 16:34:17 +0100 Subject: lib/find_bit: Introduce find_next_andnot_bit() In preparation for introducing for_each_cpu_andnot(), add a variant of find_next_bit() that negates the bits in @addr2 when ANDing them with the bits in @addr1. Signed-off-by: Valentin Schneider --- include/linux/find.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 0cdfab9734a6..2235af19e7e1 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -12,6 +12,8 @@ unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); +unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); @@ -91,6 +93,37 @@ unsigned long find_next_and_bit(const unsigned long *addr1, } #endif +#ifndef find_next_andnot_bit +/** + * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits + * in *addr2 + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_andnot_bit(addr1, addr2, size, offset); +} +#endif + #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region -- cgit v1.2.3 From 5f75ff295c662c1c8fb9e1737e9dc3b9a1e7fb29 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Oct 2022 16:34:18 +0100 Subject: cpumask: Introduce for_each_cpu_andnot() for_each_cpu_and() is very convenient as it saves having to allocate a temporary cpumask to store the result of cpumask_and().
The same issue applies to cpumask_andnot() which doesn't actually need temporary storage for iteration purposes. Following what has been done for for_each_cpu_and(), introduce for_each_cpu_andnot(). Signed-off-by: Valentin Schneider --- include/linux/cpumask.h | 18 ++++++++++++++++++ include/linux/find.h | 5 +++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 286804bfe3b7..2f065ad97541 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -306,6 +306,24 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits) +/** + * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding + * those present in another. + * @cpu: the (optionally unsigned) integer iterator + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer + * + * This saves a temporary CPU mask in many places. It is equivalent to: + * struct cpumask tmp; + * cpumask_andnot(&tmp, &mask1, &mask2); + * for_each_cpu(cpu, &tmp) + * ... + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_andnot(cpu, mask1, mask2) \ + for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits) + /** * cpumask_any_but - return a "random" in a cpumask, but not this one. * @mask: the cpumask to search diff --git a/include/linux/find.h b/include/linux/find.h index 2235af19e7e1..ccaf61a0f5fd 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -498,6 +498,11 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) +#define for_each_andnot_bit(bit, addr1, addr2, size) \ + for ((bit) = 0; \ + (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ + (bit)++) + /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) -- cgit v1.2.3 From 39494194f93bed7926d4b3bd03a6a76ba23e612b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Oct 2022 15:57:35 -0400 Subject: SUNRPC: Fix races with rpc_killall_tasks() Ensure that we immediately call rpc_exit_task() after waking up, and that the tk_rpc_status cannot get clobbered by some other function. 
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index acc62647317c..647247040ef9 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -209,6 +209,7 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); +bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); -- cgit v1.2.3 From f8423909ecca208834a9d704e58409800f8b5f21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Oct 2022 15:57:36 -0400 Subject: SUNRPC: Add a helper to allow pNFS drivers to selectively cancel RPC calls Add the helper rpc_cancel_tasks(), which uses a caller-defined selection function to define a set of in-flight RPC calls to cancel. This is mainly intended for pNFS drivers which are subject to a layout recall, and which may therefore want to cancel all pending I/O using that layout in order to redrive it after the layout recall has been satisfied. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 647247040ef9..cdcf0fe56a6f 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -210,11 +210,16 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); +void rpc_task_try_cancel(struct rpc_task *task, int error); void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); void rpc_release_calldata(const struct rpc_call_ops *, void *); void rpc_killall_tasks(struct rpc_clnt *); +unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, + bool (*fnmatch)(const struct rpc_task *, + const void *), + const void *data); void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); -- cgit v1.2.3 From dc4c4304855a5721d214e2a53e17df5152dd5f34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Oct 2022 15:57:37 -0400 Subject: SUNRPC: Add API to force the client to disconnect Allow the caller to force a disconnection of the RPC client so that we can clear any pending requests that are buffered in the socket. 
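A hypothetical pNFS-style use of the helper introduced above, cancelling every task whose calldata points at a given layout segment (the matching criterion is purely illustrative):

    static bool my_match_lseg(const struct rpc_task *task, const void *data)
    {
        return task->tk_calldata == data;
    }

    rpc_cancel_tasks(clnt, -EAGAIN, my_match_lseg, lseg);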
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 75eea5ebb179..770ef2cb5775 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -246,6 +246,7 @@ void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); +void rpc_clnt_disconnect(struct rpc_clnt *clnt); void rpc_cleanup_clids(void); static inline int rpc_reply_expected(struct rpc_task *task) -- cgit v1.2.3 From a890d1c657ecba73a7b28591c92587aef1be1888 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 12:54:38 +0200 Subject: random: clear new batches when bringing new CPUs online The commit that added the new get_random_{u8,u16}() functions neglected to update the code that clears the batches when bringing up a new CPU. It also forgot a few comments and helper defines, so add those in too. Fixes: 585cd5fe9f73 ("random: add 8-bit and 16-bit batches") Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 2c130f8f18e5..08322f700cdc 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -96,6 +96,8 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) *out = get_random_ ## name(); \ return 0; \ } +declare_get_random_var_wait(u8, u8) +declare_get_random_var_wait(u16, u16) declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) declare_get_random_var_wait(int, unsigned int) -- cgit v1.2.3 From e3e6e1d16a4cf7b63159ec71774e822194071954 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Tue, 27 Sep 2022 07:34:59 +0800 Subject: wifi: wext: use flex array destination for memcpy() Syzkaller reports buffer overflow false positive as follows: ------------[ cut here ]------------ memcpy: detected field-spanning write (size 8) of single field "&compat_event->pointer" at net/wireless/wext-core.c:623 (size 4) WARNING: CPU: 0 PID: 3607 at net/wireless/wext-core.c:623 wireless_send_event+0xab5/0xca0 net/wireless/wext-core.c:623 Modules linked in: CPU: 1 PID: 3607 Comm: syz-executor659 Not tainted 6.0.0-rc6-next-20220921-syzkaller #0 [...] Call Trace: ioctl_standard_call+0x155/0x1f0 net/wireless/wext-core.c:1022 wireless_process_ioctl+0xc8/0x4c0 net/wireless/wext-core.c:955 wext_ioctl_dispatch net/wireless/wext-core.c:988 [inline] wext_ioctl_dispatch net/wireless/wext-core.c:976 [inline] wext_handle_ioctl+0x26b/0x280 net/wireless/wext-core.c:1049 sock_ioctl+0x285/0x640 net/socket.c:1220 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] Wireless events will be sent on the appropriate channels in wireless_send_event(). Different wireless events may have different payload structure and size, so kernel uses **len** and **cmd** field in struct __compat_iw_event as wireless event common LCP part, uses **pointer** as a label to mark the position of remaining different part. 
Yet the problem is that **pointer** is a compat_caddr_t type, which may be smaller than the relative structure at the same position. So when wireless_send_event() tries to parse the wireless event payload, it may trigger the memcpy() run-time destination buffer bounds checking when the relative structure's data is copied to the position marked by **pointer**. This patch solves it by introducing the flexible-array field **ptr_bytes** to mark the position of the wireless event's remaining part next to the LCP part. What's more, this patch also adds a **ptr_len** variable in wireless_send_event() to improve its maintainability. Reported-and-tested-by: syzbot+473754e5af963cf014cf@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/00000000000070db2005e95a5984@google.com/ Suggested-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: Hawkins Jiawei Signed-off-by: Johannes Berg --- include/linux/wireless.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wireless.h b/include/linux/wireless.h index 2d1b54556eff..e6e34d74dda0 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -26,7 +26,15 @@ struct compat_iw_point { struct __compat_iw_event { __u16 len; /* Real length of this stuff */ __u16 cmd; /* Wireless IOCTL */ - compat_caddr_t pointer; + + union { + compat_caddr_t pointer; + + /* we need ptr_bytes to make memcpy() run-time destination + * buffer bounds checking happy, nothing special + */ + DECLARE_FLEX_ARRAY(__u8, ptr_bytes); + }; }; #define IW_EV_COMPAT_LCP_LEN offsetof(struct __compat_iw_event, pointer) #define IW_EV_COMPAT_POINT_OFF offsetof(struct compat_iw_point, length) -- cgit v1.2.3 From cdbd952bb7b5fca36676b3d318796b196b127397 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 15 Aug 2022 18:04:20 -0400 Subject: virtio: drop vp_legacy_set_queue_size There's actually no way to set queue size on legacy virtio pci. Signed-off-by: Michael S. Tsirkin Message-Id: <20220815220447.155860-1-mst@redhat.com> --- include/linux/virtio_pci_legacy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_pci_legacy.h b/include/linux/virtio_pci_legacy.h index e5d665faf00e..a8dc757d0367 100644 --- a/include/linux/virtio_pci_legacy.h +++ b/include/linux/virtio_pci_legacy.h @@ -32,8 +32,6 @@ void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev, u16 index, u32 queue_pfn); bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev, u16 idx); -void vp_legacy_set_queue_size(struct virtio_pci_legacy_device *ldev, - u16 idx, u16 size); u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev, u16 idx); int vp_legacy_probe(struct virtio_pci_legacy_device *ldev); -- cgit v1.2.3 From 90fea5a800c3dd80fb8ad9a02929bcef5fde42b8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 27 Sep 2022 15:48:08 +0800 Subject: vdpa: device feature provisioning This patch allows the device features to be provisioned through netlink. A new attribute is introduced to allow userspace to pass 64-bit device features when adding a device. This provides several advantages: - Allow provisioning a subset of the features to ease cross-vendor live migration. - Better debuggability for the vDPA framework and parent drivers. Reviewed-by: Eli Cohen Signed-off-by: Jason Wang Message-Id: <20220927074810.28627-2-jasowang@redhat.com> Signed-off-by: Michael S.
Tsirkin --- include/linux/vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index d282f464d2f1..6d0f5e4e82c2 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -104,6 +104,7 @@ struct vdpa_iova_range { }; struct vdpa_dev_set_config { + u64 device_features; struct { u8 mac[ETH_ALEN]; u16 mtu; -- cgit v1.2.3 From 4b22ef042d6f54a6e5899555f2db71749133eca8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Oct 2022 11:04:39 -0300 Subject: vfio: Add vfio_file_is_group() This replaces uses of vfio_file_iommu_group() which were only detecting if the file is a VFIO file with no interest in the actual group. The only remaining user of vfio_file_iommu_group() is in KVM for the SPAPR stuff. It passes the iommu_group into the arch code through kvm for some reason. Tested-by: Matthew Rosato Tested-by: Christian Borntraeger Tested-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ee399a768070..e7cebeb875dd 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -199,6 +199,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, * External user API */ struct iommu_group *vfio_file_iommu_group(struct file *file); +bool vfio_file_is_group(struct file *file); bool vfio_file_enforced_coherent(struct file *file); void vfio_file_set_kvm(struct file *file, struct kvm *kvm); bool vfio_file_has_dev(struct file *file, struct vfio_device *device); -- cgit v1.2.3 From 9afd20a5a1b3558950b5e12f3ed057ef93c64a05 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Oct 2022 09:52:37 +0530 Subject: clk: spear: Move prototype to accessible header Fixes the following W=1 kernel build warning(s): drivers/clk/spear/spear6xx_clock.c:116:13: warning: no previous prototype for function 'spear6xx_clk_init' [-Wmissing-prototypes] Reported-by: kernel test robot Signed-off-by: Viresh Kumar Signed-off-by: Arnd Bergmann --- include/linux/clk/spear.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/spear.h b/include/linux/clk/spear.h index a64d034ceddd..eaf95ca656f8 100644 --- a/include/linux/clk/spear.h +++ b/include/linux/clk/spear.h @@ -8,6 +8,20 @@ #ifndef __LINUX_CLK_SPEAR_H #define __LINUX_CLK_SPEAR_H +#ifdef CONFIG_ARCH_SPEAR3XX +void __init spear3xx_clk_init(void __iomem *misc_base, + void __iomem *soc_config_base); +#else +static inline void __init spear3xx_clk_init(void __iomem *misc_base, + void __iomem *soc_config_base) {} +#endif + +#ifdef CONFIG_ARCH_SPEAR6XX +void __init spear6xx_clk_init(void __iomem *misc_base); +#else +static inline void __init spear6xx_clk_init(void __iomem *misc_base) {} +#endif + #ifdef CONFIG_MACH_SPEAR1310 void __init spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base); #else -- cgit v1.2.3 From ca5eebda3e1c1a58a1c5a337da393ed6734593e3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 3 Oct 2022 09:35:34 -0400 Subject: block: avoid sign extend problem with default queue flags mask request_queue->queue_flags is unsigned long, which is 8-bytes on 64-bit architectures. Most queue flag modifications occur through bit field helpers, but default flags can be logically OR'd via the QUEUE_FLAG_MQ_DEFAULT mask.
If this mask happens to include bit 31, the assignment can sign extend the field and set all upper 32 bits. This exact problem has been observed on a downstream kernel that happens to use bit 31 for QUEUE_FLAG_NOWAIT. This is not an immediate problem for current upstream because bit 31 is not included in the default flag assignment (and is not used at all, actually). Regardless, fix up the QUEUE_FLAG_MQ_DEFAULT mask definition to avoid the landmine in the future. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221003133534.1075582-1-bfoster@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49373d002631..e4fb47753177 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -580,9 +580,9 @@ struct request_queue { #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_SAME_COMP) | \ - (1 << QUEUE_FLAG_NOWAIT)) +#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ + (1UL << QUEUE_FLAG_SAME_COMP) | \ + (1UL << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -- cgit v1.2.3 From 81895a65ec63ee1daec3255dc1a06675d2fbe915 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 16:43:38 +0200 Subject: treewide: use prandom_u32_max() when possible, part 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than incurring a division or requesting too many random bytes for the given range, use the prandom_u32_max() function, which only takes the minimum required bytes from the RNG and avoids divisions. This was done mechanically with this coccinelle script: @basic@ expression E; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; typedef u64; @@ ( - ((T)get_random_u32() % (E)) + prandom_u32_max(E) | - ((T)get_random_u32() & ((E) - 1)) + prandom_u32_max(E * XXX_MAKE_SURE_E_IS_POW2) | - ((u64)(E) * get_random_u32() >> 32) + prandom_u32_max(E) | - ((T)get_random_u32() & ~PAGE_MASK) + prandom_u32_max(PAGE_SIZE) ) @multi_line@ identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; identifier RAND; expression E; @@ - RAND = get_random_u32(); ... when != RAND - RAND %= (E); + RAND = prandom_u32_max(E); // Find a potential literal @literal_mask@ expression LITERAL; type T; identifier get_random_u32 =~ "get_random_int|prandom_u32|get_random_u32"; position p; @@ ((T)get_random_u32()@p & (LITERAL)) // Add one to the literal. 
@script:python add_one@ literal << literal_mask.LITERAL; RESULT; @@ value = None if literal.startswith('0x'): value = int(literal, 16) elif literal[0] in '123456789': value = int(literal, 10) if value is None: print("I don't know how to handle %s" % (literal)) cocci.include_match(False) elif value == 2**32 - 1 or value == 2**31 - 1 or value == 2**24 - 1 or value == 2**16 - 1 or value == 2**8 - 1: print("Skipping 0x%x for cleanup elsewhere" % (value)) cocci.include_match(False) elif value & (value + 1) != 0: print("Skipping 0x%x because it's not a power of two minus one" % (value)) cocci.include_match(False) elif literal.startswith('0x'): coccinelle.RESULT = cocci.make_expr("0x%x" % (value + 1)) else: coccinelle.RESULT = cocci.make_expr("%d" % (value + 1)) // Replace the literal mask with the calculated result. @plus_one@ expression literal_mask.LITERAL; position literal_mask.p; expression add_one.RESULT; identifier FUNC; @@ - (FUNC()@p & (LITERAL)) + prandom_u32_max(RESULT) @collapse_ret@ type T; identifier VAR; expression E; @@ { - T VAR; - VAR = (E); - return VAR; + return E; } @drop_var@ type T; identifier VAR; @@ { - T VAR; ... when != VAR } Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kees Cook Reviewed-by: Yury Norov Reviewed-by: KP Singh Reviewed-by: Jan Kara # for ext4 and sbitmap Reviewed-by: Christoph Böhmwalder # for drbd Acked-by: Jakub Kicinski Acked-by: Heiko Carstens # for s390 Acked-by: Ulf Hansson # for mmc Acked-by: Darrick J. Wong # for xfs Signed-off-by: Jason A. Donenfeld --- include/linux/nodemask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 378956c93c94..efef68c9352a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -516,7 +516,7 @@ static inline int node_random(const nodemask_t *maskp) bit = first_node(*maskp); break; default: - bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() % w); + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, prandom_u32_max(w)); break; } return bit; -- cgit v1.2.3 From de492c83cae0af72de370b9404aacda93dafcad5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 5 Oct 2022 17:50:20 +0200 Subject: prandom: remove unused functions With no callers left of prandom_u32() and prandom_bytes(), as well as get_random_int(), remove these deprecated wrappers, in favor of get_random_u32() and get_random_bytes(). Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kees Cook Reviewed-by: Yury Norov Acked-by: Jakub Kicinski Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 12 ------------ include/linux/random.h | 5 ----- 2 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 78db003bc290..e0a0759dd09c 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -12,18 +12,6 @@ #include #include -/* Deprecated: use get_random_u32 instead. */ -static inline u32 prandom_u32(void) -{ - return get_random_u32(); -} - -/* Deprecated: use get_random_bytes instead. 
*/ -static inline void prandom_bytes(void *buf, size_t nbytes) -{ - return get_random_bytes(buf, nbytes); -} - struct rnd_state { __u32 s1, s2, s3, s4; }; diff --git a/include/linux/random.h b/include/linux/random.h index 08322f700cdc..147a5e0d0b8e 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -42,10 +42,6 @@ u8 get_random_u8(void); u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); -static inline unsigned int get_random_int(void) -{ - return get_random_u32(); -} static inline unsigned long get_random_long(void) { #if BITS_PER_LONG == 64 @@ -100,7 +96,6 @@ declare_get_random_var_wait(u8, u8) declare_get_random_var_wait(u16, u16) declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(int, unsigned int) declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var -- cgit v1.2.3 From 6a961bffd1c3505c13b4d33bbb8385fe08239cb8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 2 Sep 2022 11:41:46 +0800 Subject: include/linux/entry-common.h: remove has_signal comment of arch_do_signal_or_restart() prototype The argument has_signal of arch_do_signal_or_restart() has been removed in commit 8ba62d37949e ("task_work: Call tracehook_notify_signal from get_signal on all architectures"), let us remove the related comment. Link: https://lkml.kernel.org/r/1662090106-5545-1-git-send-email-yangtiezhu@loongson.cn Fixes: 8ba62d37949e ("task_work: Call tracehook_notify_signal from get_signal on all architectures") Signed-off-by: Tiezhu Yang Reviewed-by: Kees Cook Signed-off-by: Andrew Morton --- include/linux/entry-common.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 84a466b176cf..d95ab85f96ba 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -253,7 +253,6 @@ static __always_inline void arch_exit_to_user_mode(void) { } /** * arch_do_signal_or_restart - Architecture specific signal delivery function * @regs: Pointer to currents pt_regs - * @has_signal: actual signal to handle * * Invoked from exit_to_user_mode_loop(). */ -- cgit v1.2.3 From fac35ba763ed07ba93154c95ffc0c4a55023707f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 1 Sep 2022 18:41:31 +0800 Subject: mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size specified. So when looking up a CONT-PTE size hugetlb page by follow_page(), it will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size hugetlb in follow_page_pte(). However this pte entry lock is incorrect for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get the correct lock, which is mm->page_table_lock. That means the pte entry of the CONT-PTE size hugetlb under current pte lock is unstable in follow_page_pte(), we can continue to migrate or poison the pte entry of the CONT-PTE size hugetlb, which can cause some potential race issues, even though they are under the 'pte lock'. 
For example, suppose thread A is trying to look up a CONT-PTE size hugetlb page by the move_pages() syscall under the lock, while another thread B migrates the CONT-PTE hugetlb page at the same time, which will cause thread A to get an incorrect page. If thread A then also does page migration, a data inconsistency error occurs. Moreover, we have the same issue for CONT-PMD size hugetlb in follow_huge_pmd().

To fix the above issues, rename follow_huge_pmd() to follow_huge_pmd_pte() to handle both PMD and PTE level size hugetlb, and use huge_pte_lock() to get the correct pte entry lock to make the pte entry stable.

Mike said:

Support for CONT_PMD/_PTE was added with bb9dd3df8ee9 ("arm64: hugetlb: refactor find_num_contig()"). Patch series "Support for contiguous pte hugepages", v4. However, I do not believe these code paths were executed until migration support was added with 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages"). I would go with 5480280d3f2d for the Fixes: target.

Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.1662017562.git.baolin.wang@linux.alibaba.com
Fixes: 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages")
Signed-off-by: Baolin Wang
Suggested-by: Mike Kravetz
Reviewed-by: Mike Kravetz
Cc: David Hildenbrand
Cc: Muchun Song
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3ec981a0d8b3..67c88b82fc32 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -207,8 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int flags);
+struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
+				 int flags);
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
			     pud_t *pud, int flags);
 struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
@@ -312,8 +312,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
	return NULL;
 }

-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
-		unsigned long address, pmd_t *pmd, int flags)
+static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
+		unsigned long address, int flags)
 {
	return NULL;
 }
-- cgit v1.2.3
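The shape of the fix is easiest to see as a sketch (illustrative only; the real follow_huge_pmd_pte() lives in mm/hugetlb.c, and example_lookup() is a hypothetical name):

	/* Minimal sketch, not the upstream implementation: the point is
	 * that the lookup must take huge_pte_lock(), which resolves to
	 * mm->page_table_lock for CONT-PTE/PMD sizes, so the entry cannot
	 * be migrated or poisoned underneath us, instead of the per-PMD
	 * lock returned by pte_offset_map_lock(). */
	static struct page *example_lookup(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
	{
		struct hstate *h = hstate_vma(vma);
		struct page *page = NULL;
		spinlock_t *ptl;
		pte_t pte;

		ptl = huge_pte_lock(h, vma->vm_mm, ptep);	/* correct lock */
		pte = huge_ptep_get(ptep);
		if (pte_present(pte))
			page = pte_page(pte);
		spin_unlock(ptl);
		return page;
	}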
From 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 3 Oct 2022 13:59:47 +0100
Subject: io_uring/af_unix: defer registered files gc to io_uring release

Instead of putting io_uring's registered files in unix_gc(), we want it to be done by io_uring itself. The trick here is to consider io_uring registered files for cycle detection but not actually put them down. Because io_uring can't register other ring instances, this will remove all refs to the ring file, triggering the ->release path and cleaning up with io_ring_ctx_free().

Cc: stable@vger.kernel.org
Fixes: 6b06314c47e1 ("io_uring: add file set registration")
Reported-and-tested-by: David Bouman
Signed-off-by: Pavel Begunkov
Signed-off-by: Thadeu Lima de Souza Cascardo
[axboe: add kerneldoc comment to skb, fold in skb leak fix]
Signed-off-by: Jens Axboe
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9fcf534f2d92..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t;
  * @csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
  * @dst_pending_confirm: need to confirm neighbour
  * @decrypted: Decrypted SKB
  * @slow_gro: state present at GRO time, slower prepare step required
@@ -982,6 +983,7 @@ struct sk_buff {
 #endif
	__u8			slow_gro:1;
	__u8			csum_not_inet:1;
+	__u8			scm_io_uring:1;

 #ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
-- cgit v1.2.3
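The header hunk above only adds the flag; as a sketch of how the two sides use it, based on the commit message (illustrative functions with hypothetical names; the real hunks are in io_uring/rsrc.c and net/unix/garbage.c):

	/* io_uring side (sketch): tag the skb carrying registered files
	 * before queueing it on io_uring's internal AF_UNIX socket. */
	static void example_queue_registered_files(struct sock *sk,
						   struct sk_buff *skb)
	{
		skb->scm_io_uring = 1;
		skb_queue_tail(&sk->sk_receive_queue, skb);
	}

	/* unix_gc() side (sketch): tagged skbs take part in cycle
	 * detection, but instead of being purged with the hitlist they
	 * are handed back to their socket; io_uring's own ->release
	 * path puts the registered files later. */
	static void example_spare_io_uring_skbs(struct sk_buff_head *hitlist)
	{
		struct sk_buff *skb, *next_skb;

		skb_queue_walk_safe(hitlist, skb, next_skb) {
			if (skb->scm_io_uring) {
				__skb_unlink(skb, hitlist);
				skb_queue_tail(&skb->sk->sk_receive_queue, skb);
			}
		}
	}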
From b7a817752efc850603c4c23ed78da2b990a6a34a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 4 Oct 2022 03:19:25 +0100
Subject: io_uring: remove notif leftovers

Notifications were killed, but a couple of fields and struct declarations were left behind; remove them.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/8df8877d677be5a2b43afd936d600e60105ea960.1664849941.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa4d90a53866..f5b687a787a3 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -34,9 +34,6 @@ struct io_file_table {
	unsigned int alloc_hint;
 };

-struct io_notif;
-struct io_notif_slot;
-
 struct io_hash_bucket {
	spinlock_t		lock;
	struct hlist_head	list;
@@ -242,8 +239,6 @@ struct io_ring_ctx {
	unsigned		nr_user_files;
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	**user_bufs;
-	struct io_notif_slot	*notif_slots;
-	unsigned		nr_notif_slots;

	struct io_submit_state	submit_state;
-- cgit v1.2.3

From 7be1c1a3c7b13fb259bb5159662a7b83622013b8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Tue, 11 Oct 2022 20:55:31 +0300
Subject: mm: more vma cache removal

Link: https://lkml.kernel.org/r/Y0WuE3Riv4iy5Jx8@localhost.localdomain
Fixes: 7964cf8caa4d ("mm: remove vmacache")
Signed-off-by: Alexey Dobriyan
Acked-by: Liam Howlett
Signed-off-by: Andrew Morton
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 88a043f7235e..e0bb85cf8bdd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -861,8 +861,6 @@ struct task_struct {
	struct mm_struct		*mm;
	struct mm_struct		*active_mm;

-	/* Per-thread vma caching: */
-
 #ifdef SPLIT_RSS_COUNTING
	struct task_rss_stat		rss_stat;
 #endif
-- cgit v1.2.3

From 652e04464d3944226052c827bdaaf5113b072870 Mon Sep 17 00:00:00 2001
From: Xin Hao
Date: Tue, 27 Sep 2022 08:19:45 +0800
Subject: mm/damon: move sz_damon_region to damon_sz_region

Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because it can be used in many places.

Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com
Signed-off-by: Xin Hao
Suggested-by: SeongJae Park
Reviewed-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ed5470f50bab..620ada094c3b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t)
	return list_first_entry(&t->regions_list, struct damon_region, list);
 }

+static inline unsigned long damon_sz_region(struct damon_region *r)
+{
+	return r->ar.end - r->ar.start;
+}
+
+
 #define damon_for_each_region(r, t) \
	list_for_each_entry(r, &t->regions_list, list)
-- cgit v1.2.3
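A sketch of the kind of caller this helper cleans up (hypothetical function; the real conversions live under mm/damon/):

	/* Hypothetical example: open-coded region sizes collapse into the
	 * new helper. */
	static unsigned long example_total_sz(struct damon_target *t)
	{
		struct damon_region *r;
		unsigned long sz = 0;

		damon_for_each_region(r, t)
			sz += damon_sz_region(r);	/* was: r->ar.end - r->ar.start */
		return sz;
	}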
From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:15 +1000
Subject: mm/memory.c: fix race when faulting a device private page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Fix several device private page reference counting issues", v2

This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed, or from accessing fields within the struct page which are no longer valid because the page has been freed.

During normal usage it is unlikely these will cause any problems. However, without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory.

This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those, so any help there would be appreciated. The changes mimic what is done for both Nouveau and hmm-tests, though, so I doubt they will cause problems.

This patch (of 8):

When the CPU tries to access a device private page, the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages().

Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure, pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not.

[mpe@ellerman.id.au: fix build]
Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au
Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com
Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Acked-by: Felix Kuehling
Cc: Jason Gunthorpe
Cc: John Hubbard
Cc: Ralph Campbell
Cc: Michael Ellerman
Cc: Lyude Paul
Cc: Alex Deucher
Cc: Alex Sierra
Cc: Ben Skeggs
Cc: Christian König
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 include/linux/migrate.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 704a04f5a074..52090d1f9230 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION

 extern void putback_movable_pages(struct list_head *l);
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode, int extra_count);
 int migrate_folio(struct address_space *mapping, struct folio *dst,
		struct folio *src, enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
@@ -197,6 +199,12 @@ struct migrate_vma {
	 */
	void			*pgmap_owner;
	unsigned long		flags;
+
+	/*
+	 * Set to vmf->page if this is being called to migrate a page as part of
+	 * a migrate_to_ram() callback.
+	 */
+	struct page		*fault_page;
 };

 int migrate_vma_setup(struct migrate_vma *args);
-- cgit v1.2.3
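For driver authors, the new field slots into a migrate_to_ram() callback roughly like this (hypothetical sketch; example_migrate_to_ram() and example_drv are invented names, and Nouveau and lib/test_hmm.c carry the real versions):

	/* Sketch of a migrate_to_ram() callback using .fault_page so that
	 * migrate_vma_setup() treats the faulting page's elevated refcount
	 * (the reference taken while holding the ptl) as expected. */
	static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
	{
		unsigned long src_pfn = 0, dst_pfn = 0;
		struct migrate_vma args = {
			.vma		= vmf->vma,
			.start		= vmf->address,
			.end		= vmf->address + PAGE_SIZE,
			.src		= &src_pfn,
			.dst		= &dst_pfn,
			.pgmap_owner	= example_drv,	/* hypothetical owner cookie */
			.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
			.fault_page	= vmf->page,	/* new in this patch */
		};

		if (migrate_vma_setup(&args))
			return VM_FAULT_SIGBUS;
		/* ... allocate a system page, copy the data, fill dst_pfn ... */
		migrate_vma_pages(&args);
		migrate_vma_finalize(&args);
		return 0;
	}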
From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:16 +1000
Subject: mm: free device private pages have zero refcount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount"), device private pages no longer have an extra reference count while the page is in use. However, before handing them back to the owning device driver, we add an extra reference count such that free pages have a reference count of one.

This makes it difficult to tell if a page is free or not, because both free and in-use pages will have a non-zero refcount. Instead we should return pages to the driver's page allocator with a zero reference count. Kernel code can then safely use kernel functions such as get_page_unless_zero().

Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Acked-by: Felix Kuehling
Cc: Jason Gunthorpe
Cc: Michael Ellerman
Cc: Alex Deucher
Cc: Christian König
Cc: Ben Skeggs
Cc: Lyude Paul
Cc: Ralph Campbell
Cc: Alex Sierra
Cc: John Hubbard
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 include/linux/memremap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c3b4cc84877b..7fcaf3180a5b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
 }

 #ifdef CONFIG_ZONE_DEVICE
+void zone_device_page_init(struct page *page);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
-- cgit v1.2.3
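Under the new convention, a driver-side allocator looks roughly like this (hypothetical sketch; struct example_dmem and the free-list layout are invented, while lib/test_hmm.c and the Nouveau DMEM allocator show the real pattern):

	/* Hypothetical sketch: free device private pages now sit at
	 * refcount 0, and zone_device_page_init() takes the first
	 * reference on allocation. */
	static struct page *example_alloc_device_page(struct example_dmem *dmem)
	{
		struct page *page;

		spin_lock(&dmem->lock);
		page = dmem->free_pages;		/* refcount == 0 while free */
		if (page)
			dmem->free_pages = page->zone_device_data;
		spin_unlock(&dmem->lock);
		if (!page)
			return NULL;

		zone_device_page_init(page);		/* refcount 0 -> 1 */
		return page;
	}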
From e778406b40dbb1342a1888cd751ca9d2982a12e2 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:19 +1000
Subject: mm/migrate_device.c: add migrate_device_range()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Device drivers can use the migrate_vma family of functions to migrate existing private anonymous mappings to device private pages. These pages are backed by memory on the device, with drivers being responsible for copying data to and from device memory.

Device private pages are freed via the pgmap->page_free() callback when they are unmapped and their refcount drops to zero. Alternatively they may be freed indirectly via migration back to CPU memory in response to a pgmap->migrate_to_ram() callback, called whenever the CPU accesses an address mapped to a device private page.

In other words, drivers cannot control the lifetime of data allocated on the devices and must wait until these pages are freed from userspace. This causes issues when memory needs to be reclaimed on the device, either because the device is going away due to a ->release() callback or because another user needs to use the memory.

Drivers could use the existing migrate_vma functions to migrate data off the device. However this would require them to track the mappings of each page, which is both complicated and not always possible. Instead drivers need to be able to migrate device pages directly so they can free up device memory.

To allow that, this patch introduces the migrate_device family of functions, which are functionally similar to migrate_vma but which skip the initial lookup based on mapping.

Link: https://lkml.kernel.org/r/868116aab70b0c8ee467d62498bb2cf0ef907295.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Cc: "Huang, Ying"
Cc: Zi Yan
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: David Hildenbrand
Cc: Ralph Campbell
Cc: John Hubbard
Cc: Alex Deucher
Cc: Alex Sierra
Cc: Ben Skeggs
Cc: Christian König
Cc: Dan Williams
Cc: Felix Kuehling
Cc: Jason Gunthorpe
Cc: Lyude Paul
Cc: Michael Ellerman
Signed-off-by: Andrew Morton
---
 include/linux/migrate.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 52090d1f9230..3ef77f52a4f0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -210,6 +210,13 @@ struct migrate_vma {
 int migrate_vma_setup(struct migrate_vma *args);
 void migrate_vma_pages(struct migrate_vma *migrate);
 void migrate_vma_finalize(struct migrate_vma *migrate);
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+			unsigned long npages);
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+			unsigned long npages);
+void migrate_device_finalize(unsigned long *src_pfns,
+			unsigned long *dst_pfns, unsigned long npages);
+
 #endif /* CONFIG_MIGRATION */

 #endif /* _LINUX_MIGRATE_H */
-- cgit v1.2.3
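A sketch of the intended use at device teardown (hypothetical driver code; it assumes the driver knows the first pfn and size of its device-private range, and the data-copy step is elided):

	/* Hypothetical sketch: evict everything in a device-private pfn
	 * range, e.g. from a driver's ->release() path, without any
	 * per-vma mapping lookup. */
	static int example_evict_all(unsigned long base_pfn, unsigned long npages)
	{
		unsigned long *src_pfns, *dst_pfns;
		int ret;

		src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
		dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
		if (!src_pfns || !dst_pfns) {
			ret = -ENOMEM;
			goto out;
		}

		/* Collect and isolate the device pages directly by pfn. */
		ret = migrate_device_range(src_pfns, base_pfn, npages);
		if (ret)
			goto out;

		/* ... allocate system pages into dst_pfns[] and copy ... */

		migrate_device_pages(src_pfns, dst_pfns, npages);
		migrate_device_finalize(src_pfns, dst_pfns, npages);
	out:
		kfree(dst_pfns);
		kfree(src_pfns);
		return ret;
	}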
From 57d849636a04a12713dd3a10a97cb9658ec7edf6 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 12 Oct 2022 11:06:35 +0800
Subject: clk: at91: fix the build with binutils 2.27

There is an issue when building with older versions of binutils (2.27.0):

  arch/arm/mach-at91/pm_suspend.S: Assembler messages:
  arch/arm/mach-at91/pm_suspend.S:1086: Error: garbage following instruction -- `ldr tmp1,=0x00020010UL'

Use the UL() macro to fix the issue in the assembly file.

Fixes: 4fd36e458392 ("ARM: at91: pm: add plla disable/enable support for sam9x60")
Signed-off-by: Kefeng Wang
Link: https://lore.kernel.org/r/20221012030635.13140-1-wangkefeng.wang@huawei.com
Signed-off-by: Stephen Boyd
---
 include/linux/clk/at91_pmc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
index 3484309b59bf..7af499bdbecb 100644
--- a/include/linux/clk/at91_pmc.h
+++ b/include/linux/clk/at91_pmc.h
@@ -12,6 +12,8 @@
 #ifndef AT91_PMC_H
 #define AT91_PMC_H

+#include
+
 #define AT91_PMC_V1		(1)			/* PMC version 1 */
 #define AT91_PMC_V2		(2)			/* PMC version 2 [SAM9X60] */

@@ -45,8 +47,8 @@
 #define AT91_PMC_PCSR		0x18			/* Peripheral Clock Status Register */

 #define AT91_PMC_PLL_ACR	0x18			/* PLL Analog Control Register [for SAM9X60] */
-#define AT91_PMC_PLL_ACR_DEFAULT_UPLL	0x12020010UL	/* Default PLL ACR value for UPLL */
-#define AT91_PMC_PLL_ACR_DEFAULT_PLLA	0x00020010UL	/* Default PLL ACR value for PLLA */
+#define AT91_PMC_PLL_ACR_DEFAULT_UPLL	UL(0x12020010)	/* Default PLL ACR value for UPLL */
+#define AT91_PMC_PLL_ACR_DEFAULT_PLLA	UL(0x00020010)	/* Default PLL ACR value for PLLA */
 #define AT91_PMC_PLL_ACR_UTMIVR		(1 << 12)	/* UPLL Voltage regulator Control */
 #define AT91_PMC_PLL_ACR_UTMIBG		(1 << 13)	/* UPLL Bandgap Control */
-- cgit v1.2.3
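Why UL() helps: it is built on _AC() from the const.h headers, which drops the C suffix when the header is pulled into assembly, so the same literal works in both contexts (abridged and lightly reflowed from include/uapi/linux/const.h and include/linux/const.h):

	/* In assembly the suffix disappears; in C the literal keeps its
	 * unsigned-long type. */
	#ifdef __ASSEMBLY__
	#define _AC(X, Y)	X
	#else
	#define __AC(X, Y)	(X##Y)
	#define _AC(X, Y)	__AC(X, Y)
	#endif
	#define _UL(x)		(_AC(x, UL))

	/* include/linux/const.h then wraps it: */
	#define UL(x)		(_UL(x))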
From e36ce448a08d43de69e7449eb225805a7a8addf8 Mon Sep 17 00:00:00 2001
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Date: Sat, 15 Oct 2022 13:34:29 +0900
Subject: mm/slab: use kmalloc_node() for off slab freelist_idx_t array allocation

After commit d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator"), SLAB passes large (> PAGE_SIZE * 2) requests to the buddy allocator, like SLUB does.

SLAB has been using kmalloc caches to allocate the freelist_idx_t array for off-slab caches. But after the commit, freelist_size can be bigger than KMALLOC_MAX_CACHE_SIZE.

Instead of keeping a pointer to the kmalloc cache, use kmalloc_node() and only check whether the kmalloc cache is off-slab during calculate_slab_order(). If freelist_size > KMALLOC_MAX_CACHE_SIZE, no looping condition can arise, as the freelist_idx_t array is then allocated directly from the buddy allocator.

Link: https://lore.kernel.org/all/20221014205818.GA1428667@roeck-us.net/
Reported-and-tested-by: Guenter Roeck
Fixes: d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator")
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka
---
 include/linux/slab_def.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e24c9aff6fed..f0ffad6a3365 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -33,7 +33,6 @@ struct kmem_cache {

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
-	struct kmem_cache *freelist_cache;
	unsigned int freelist_size;

	/* constructor func */
-- cgit v1.2.3

From 80493877d7d0ae0cbe62921d748682811c58026f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Sun, 16 Oct 2022 00:53:51 +0900
Subject: Revert "cpumask: fix checking valid cpu range".

This reverts commit 78e5a3399421 ("cpumask: fix checking valid cpu range").

syzbot is hitting the WARN_ON_ONCE(cpu >= nr_cpumask_bits) warning at cpu_max_bits_warn() [1], because commit 78e5a3399421 ("cpumask: fix checking valid cpu range") is broken. Obviously that patch hits WARN_ON_ONCE() when e.g. reading /proc/cpuinfo, because passing "cpu + 1" instead of "cpu" will trivially hit the cpu == nr_cpumask_bits condition.

Although syzbot found this problem in linux-next.git on 2022/09/27 [2], this problem was not fixed immediately. As a result, that patch was sent to linux.git before the patch author recognized this problem, and syzbot started failing to test changes in linux.git since 2022/10/10 [3].

Andrew Jones proposed a fix for the x86 and riscv architectures [4]. But [2] and [5] indicate that the affected locations are not limited to arch code. More delay in finding and fixing the affected locations means a less tested kernel (and one that is more difficult to bisect and fix) before release.

We should have inspected and fixed basically all cpumask users before applying that patch. We should not crash kernels in order to ask existing cpumask users to update their code, even if limited to the CONFIG_DEBUG_PER_CPU_MAPS=y case.

Link: https://syzkaller.appspot.com/bug?extid=d0fd2bf0dd6da72496dd [1]
Link: https://syzkaller.appspot.com/bug?extid=21da700f3c9f0bc40150 [2]
Link: https://syzkaller.appspot.com/bug?extid=51a652e2d24d53e75734 [3]
Link: https://lkml.kernel.org/r/20221014155845.1986223-1-ajones@ventanamicro.com [4]
Link: https://syzkaller.appspot.com/bug?extid=4d46c43d81c3bd155060 [5]
Reported-by: Andrew Jones
Reported-by: syzbot+d0fd2bf0dd6da72496dd@syzkaller.appspotmail.com
Signed-off-by: Tetsuo Handa
Cc: Yury Norov
Cc: Borislav Petkov
Signed-off-by: Linus Torvalds
---
 include/linux/cpumask.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2f065ad97541..c2aa0aa26b45 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -174,8 +174,9 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)

 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
 }

@@ -188,8 +189,9 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
  */
 static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 {
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }

@@ -229,8 +231,9 @@ static inline unsigned int cpumask_next_and(int n,
					    const struct cpumask *src1p,
					    const struct cpumask *src2p)
 {
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
		nr_cpumask_bits, n + 1);
 }

@@ -260,8 +263,8 @@ static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
					     int start, bool wrap)
 {
	cpumask_check(start);
-	/* n is a prior cpu */
-	cpumask_check(n + 1);
+	if (n != -1)
+		cpumask_check(n);

	/*
	 * Return the first available CPU when wrapping, or when starting before cpu0,
-- cgit v1.2.3
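To see why the reverted check fired, consider a typical iteration (illustrative snippet; do_something() is a placeholder):

	/* With the reverted patch, cpumask_next() ran cpumask_check(n + 1).
	 * On the final pass n is the last valid cpu, so n + 1 equals
	 * nr_cpumask_bits and cpu_max_bits_warn() triggered, even though
	 * the call is perfectly legal and merely terminates the loop. */
	unsigned int cpu;

	for (cpu = cpumask_first(cpu_online_mask);
	     cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, cpu_online_mask))
		do_something(cpu);	/* hypothetical per-cpu work */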