From e35ecb466eb63c2311783208547633f90742d06d Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Fri, 19 Feb 2021 15:34:41 +0100 Subject: RDMA/iwcm: Allow AFONLY binding for IPv6 addresses Binding IPv6 address/port to AF_INET6 domain only is provided via rdma_set_afonly(), but was not signalled to the provider. Applications like NFS/RDMA bind the same port to both IPv4 and IPv6 addresses simultaneously and thus rely on it working correctly. Link: https://lore.kernel.org/r/20210219143441.1068-1-bmt@zurich.ibm.com Tested-by: Chuck Lever Tested-by: Benjamin Coddington Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- include/rdma/iw_cm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 91975400e1b3..03abd30e6c8c 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -70,6 +70,7 @@ struct iw_cm_id { u8 tos; bool tos_set:1; bool mapped:1; + bool afonly:1; }; struct iw_cm_conn_param { -- cgit v1.2.3 From 312e3f8aefb5dc9c2f052ba0ee35a2fd6baa5bcd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Mar 2021 09:30:54 +0000 Subject: thermal: Fix spelling mistake "disabed" -> "disabled" There is a spelling mistake in a comment, fix it. Signed-off-by: Colin Ian King Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210311093054.5338-1-colin.king@canonical.com --- include/uapi/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index c105054cbb57..9aa2fedfa309 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -60,7 +60,7 @@ enum thermal_genl_event { THERMAL_GENL_EVENT_UNSPEC, THERMAL_GENL_EVENT_TZ_CREATE, /* Thermal zone creation */ THERMAL_GENL_EVENT_TZ_DELETE, /* Thermal zone deletion */ - THERMAL_GENL_EVENT_TZ_DISABLE, /* Thermal zone disabed */ + THERMAL_GENL_EVENT_TZ_DISABLE, /* Thermal zone disabled */ THERMAL_GENL_EVENT_TZ_ENABLE, /* Thermal zone enabled */ THERMAL_GENL_EVENT_TZ_TRIP_UP, /* Trip point crossed the way up */ THERMAL_GENL_EVENT_TZ_TRIP_DOWN, /* Trip point crossed the way down */ -- cgit v1.2.3 From f675ba125bd38acb95d84da04e5fd89aa36cc429 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 11 Mar 2021 17:09:21 +0200 Subject: RDMA/core: Remove unused req_ncomp_notif device operation The request_ncomp_notif device operation and function are unused, remove them. Link: https://lore.kernel.org/r/20210311150921.23726-1-galpress@amazon.com Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ca28fca5736b..21c19b1fcee7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2301,7 +2301,6 @@ struct ib_device_ops { int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); - int (*req_ncomp_notif)(struct ib_cq *cq, int wc_cnt); int (*post_srq_recv)(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); @@ -3915,20 +3914,6 @@ struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); -/** - * ib_req_ncomp_notif - Request completion notification when there are - * at least the specified number of unreaped completions on the CQ. - * @cq: The CQ to generate an event for. - * @wc_cnt: The number of unreaped completions that should be on the - * CQ before an event is generated. - */ -static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) -{ - return cq->device->ops.req_ncomp_notif ? - cq->device->ops.req_ncomp_notif(cq, wc_cnt) : - -ENOSYS; -} - /* * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to * NULL. This causes the ib_dma* helpers to just stash the kernel virtual -- cgit v1.2.3 From 2cfc056ef2c28b4961bff5e2f6deed94afb14024 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:18 +0800 Subject: remoteproc: introduce is_iomem to rproc_mem_entry Introduce is_iomem to indicate this piece memory is iomem or not. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-4-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index f28ee75d1005..a5f6d2d9cde2 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -315,6 +315,7 @@ struct rproc; /** * struct rproc_mem_entry - memory entry descriptor * @va: virtual address + * @is_iomem: io memory * @dma: dma address * @len: length, in bytes * @da: device address @@ -329,6 +330,7 @@ struct rproc; */ struct rproc_mem_entry { void *va; + bool is_iomem; dma_addr_t dma; size_t len; u32 da; -- cgit v1.2.3 From 40df0a91b2a5228ded8e5f75b80d28c96c6831cd Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:19 +0800 Subject: remoteproc: add is_iomem to da_to_va Introduce an extra parameter is_iomem to da_to_va, then the caller could take the memory as normal memory or io mapped memory. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Reported-by: kernel test robot Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-5-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index a5f6d2d9cde2..1b7d56c7a453 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -386,7 +386,7 @@ struct rproc_ops { int (*stop)(struct rproc *rproc); int (*attach)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); - void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len); + void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); int (*parse_fw)(struct rproc *rproc, const struct firmware *fw); int (*handle_rsc)(struct rproc *rproc, u32 rsc_type, void *rsc, int offset, int avail); -- cgit v1.2.3 From 32548870d438aba3c4a13f07efb73a8b86de507d Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Thu, 4 Mar 2021 10:55:58 +0800 Subject: RDMA/hns: Add support for XRC on HIP09 The HIP09 supports XRC transport service, it greatly saves the number of QPs required to connect all processes in a large cluster. Link: https://lore.kernel.org/r/1614826558-35423-1-git-send-email-liweihang@huawei.com Signed-off-by: Wenpeng Liang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/hns-abi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 90b739d05adf..42b177655560 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -86,6 +86,8 @@ struct hns_roce_ib_create_qp_resp { struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 cqe_size; + __u32 srq_tab_size; + __u32 reserved; }; struct hns_roce_ib_alloc_pd_resp { -- cgit v1.2.3 From 7852546f524595245382a919e752468f73421451 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 4 Mar 2021 14:45:15 +0200 Subject: RDMA/mlx5: Fix query RoCE port mlx5_is_roce_enabled returns the devlink RoCE init value, therefore it should be used only when driver is loaded. Instead we just need to read the roce_en field. In addition, rename mlx5_is_roce_enabled to mlx5_is_roce_init_enabled. Fixes: 7a58779edd75 ("IB/mlx5: Improve query port for representor port") Link: https://lore.kernel.org/r/20210304124517.1100608-2-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 53b89631a1d9..ab07f09f2bad 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1226,7 +1226,7 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; -static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev) +static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev) { struct devlink *devlink = priv_to_devlink(dev); union devlink_param_value val; -- cgit v1.2.3 From 2904bb37b35d07be7bfa3fb4a0fc1a3daa6678b3 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 4 Mar 2021 15:05:00 +0200 Subject: IB/core: Split uverbs_get_const/default to consider target type Change uverbs_get_const/uverbs_get_const_default to work properly with both signed/unsigned parameters. Current APIs mix s64 and u64 which leads to incorrect check when u64 value was supplied and its upper bit was set. In that case uverbs_get_const() / uverbs_get_const_default() lower bound check may fail unexpectedly, target is unsigned (lower bound is 0) but value became negative as of the s64 usage. Split to have two different APIs, no change to callers as the required API will be called internally according to the target type. Link: https://lore.kernel.org/r/20210304130501.1102577-3-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_ioctl.h | 80 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 39ef204753ec..3829b6ef4bb6 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -875,9 +875,14 @@ static inline __malloc void *uverbs_kcalloc(struct uverbs_attr_bundle *bundle, return ERR_PTR(-EOVERFLOW); return uverbs_zalloc(bundle, bytes); } -int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, - size_t idx, s64 lower_bound, u64 upper_bound, - s64 *def_val); + +int _uverbs_get_const_signed(s64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val); +int _uverbs_get_const_unsigned(u64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 upper_bound, u64 *def_val); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size); #else @@ -921,27 +926,76 @@ uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, { return -EINVAL; } +static int +_uverbs_get_const_signed(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + return -EINVAL; +} +static int +_uverbs_get_const_unsigned(u64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 upper_bound, u64 *def_val) +{ + return -EINVAL; +} #endif -#define uverbs_get_const(_to, _attrs_bundle, _idx) \ +#define uverbs_get_const_signed(_to, _attrs_bundle, _idx) \ ({ \ s64 _val; \ - int _ret = _uverbs_get_const(&_val, _attrs_bundle, _idx, \ - type_min(typeof(*_to)), \ - type_max(typeof(*_to)), NULL); \ - (*_to) = _val; \ + int _ret = \ + _uverbs_get_const_signed(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*(_to))), \ + type_max(typeof(*(_to))), NULL); \ + (*(_to)) = _val; \ _ret; \ }) -#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \ +#define uverbs_get_const_unsigned(_to, _attrs_bundle, _idx) \ + ({ \ + u64 _val; \ + int _ret = \ + _uverbs_get_const_unsigned(&_val, _attrs_bundle, _idx, \ + type_max(typeof(*(_to))), NULL); \ + (*(_to)) = _val; \ + _ret; \ + }) + +#define uverbs_get_const_default_signed(_to, _attrs_bundle, _idx, _default) \ ({ \ s64 _val; \ s64 _def_val = _default; \ int _ret = \ - _uverbs_get_const(&_val, _attrs_bundle, _idx, \ - type_min(typeof(*_to)), \ - type_max(typeof(*_to)), &_def_val); \ - (*_to) = _val; \ + _uverbs_get_const_signed(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*(_to))), \ + type_max(typeof(*(_to))), &_def_val); \ + (*(_to)) = _val; \ + _ret; \ + }) + +#define uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, _default) \ + ({ \ + u64 _val; \ + u64 _def_val = _default; \ + int _ret = \ + _uverbs_get_const_unsigned(&_val, _attrs_bundle, _idx, \ + type_max(typeof(*(_to))), &_def_val); \ + (*(_to)) = _val; \ _ret; \ }) + +#define uverbs_get_const(_to, _attrs_bundle, _idx) \ + (is_signed_type(typeof(*(_to))) ? \ + uverbs_get_const_signed(_to, _attrs_bundle, _idx) : \ + uverbs_get_const_unsigned(_to, _attrs_bundle, _idx)) \ + +#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \ + (is_signed_type(typeof(*(_to))) ? \ + uverbs_get_const_default_signed(_to, _attrs_bundle, _idx, \ + _default) : \ + uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, \ + _default)) + #endif -- cgit v1.2.3 From 7610ab57de5616631b664ea31c11bad527810391 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 4 Mar 2021 15:05:01 +0200 Subject: RDMA/mlx5: Allow larger pages in DevX umem The umem DMA list calculation was locked at 4k pages due to confusion around how this API works and is used when larger pages are present. The conclusion is: - umem's cannot extend past what is mapped into the process, so creating a lage page size and referring to a sub-range is not allowed - umem's must always have a page offset of zero, except for sub PAGE_SIZE umems - The feature of umem_offset to create multiple objects inside a umem is buggy and isn't used anyplace. Thus we can assume all users of the current API have umem_offset == 0 as well Provide a new page size calculator that limits the DMA list to the VA range and enforces umem_offset == 0. Allow user space to specify the page sizes which it can accept, this bitmap must be derived from the intended use of the umem, based on per-usage HW limitations. Link: https://lore.kernel.org/r/20210304130501.1102577-4-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 3fd9b380a091..3f0bc7597ba7 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -154,6 +154,7 @@ enum mlx5_ib_devx_umem_reg_attrs { MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, + MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP, }; enum mlx5_ib_devx_umem_dereg_attrs { -- cgit v1.2.3 From 4260c4067fbba55a90037fe3ee32eff087749f83 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 24 Feb 2021 13:03:13 -0600 Subject: f2fs: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code according to the use of a flexible-array member in struct f2fs_checkpoint, instead of a one-element arrays. Notice that a temporary pointer to void '*tmp_ptr' was used in order to fix the following errors when using a flexible array instead of a one element array in struct f2fs_checkpoint: CC [M] fs/f2fs/dir.o In file included from fs/f2fs/dir.c:13: fs/f2fs/f2fs.h: In function ‘__bitmap_ptr’: fs/f2fs/f2fs.h:2227:40: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2227:49: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2238:40: error: invalid use of flexible array member 2238 | return &ckpt->sit_nat_version_bitmap + offset; | ^ make[2]: *** [scripts/Makefile.build:287: fs/f2fs/dir.o] Error 1 make[1]: *** [scripts/Makefile.build:530: fs/f2fs] Error 2 make: *** [Makefile:1819: fs] Error 2 [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Build-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/603647e4.DeEFbl4eqljuwAUe%25lkp@intel.com/ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c6cc0a566ef5..5487a80617a3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -168,7 +168,7 @@ struct f2fs_checkpoint { unsigned char alloc_type[MAX_ACTIVE_LOGS]; /* SIT and NAT version bitmap */ - unsigned char sit_nat_version_bitmap[1]; + unsigned char sit_nat_version_bitmap[]; } __packed; #define CP_CHKSUM_OFFSET 4092 /* default chksum offset in checkpoint */ -- cgit v1.2.3 From 1bb73841ea7a88765db7f641a90120490f1f4aee Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 2 Mar 2021 07:21:33 +0100 Subject: PCI: Remove MicroGate SyncLink device IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drivers were removed in a1f714b44e34 (tty: Remove redundant synclink driver) and 3d608a591b2b (tty: Remove redundant synclinkmp driver). Remove the PCI device ID entries as well. Link: https://lore.kernel.org/r/20210302062214.29627-3-jslaby@suse.cz Signed-off-by: Jiri Slaby Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- include/linux/pci_ids.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index a76ccb697bef..8a18517696c1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2065,8 +2065,6 @@ #define PCI_DEVICE_ID_EXAR_XR17V358 0x0358 #define PCI_VENDOR_ID_MICROGATE 0x13c0 -#define PCI_DEVICE_ID_MICROGATE_USC 0x0010 -#define PCI_DEVICE_ID_MICROGATE_SCA 0x0030 #define PCI_VENDOR_ID_3WARE 0x13C1 #define PCI_DEVICE_ID_3WARE_1000 0x1000 -- cgit v1.2.3 From 58483761810087e5ffdf36e84ac1bf26df909097 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 14 Mar 2021 12:13:29 +0100 Subject: thermal/drivers/core: Use a char pointer for the cooling device name We want to have any kind of name for the cooling devices as we do no longer want to rely on auto-numbering. Let's replace the cooling device's fixed array by a char pointer to be allocated dynamically when registering the cooling device, so we don't limit the length of the name. Rework the error path at the same time as we have to rollback the allocations in case of error. Tested with a dummy device having the name: "Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch" A village on the island of Anglesey (Wales), known to have the longest name in Europe. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Tested-by: Ido Schimmel Link: https://lore.kernel.org/r/20210314111333.16551-1-daniel.lezcano@linaro.org --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6ac7bb1d2b1f..169502164364 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -91,7 +91,7 @@ struct thermal_cooling_device_ops { struct thermal_cooling_device { int id; - char type[THERMAL_NAME_LENGTH]; + char *type; struct device device; struct device_node *np; void *devdata; -- cgit v1.2.3 From eedb0b12d091a21909b5e84d9f3e5e649305bd12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:53:22 +0100 Subject: dma-mapping: add a dma_mmap_pages helper Add a helper to map memory allocated using dma_alloc_pages into a user address space, similar to the dma_alloc_attrs function for coherent allocations. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- include/linux/dma-mapping.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2a984cb4d1e0..2b8dce756e1f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -263,6 +263,8 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) -- cgit v1.2.3 From 7d5b5738d1514e9dd8ed452660e2a4d25beb9483 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:54:18 +0100 Subject: dma-mapping: add a dma_alloc_noncontiguous API Add a new API that returns a potentiall virtually non-contigous sg_table and a DMA address. This API is only properly implemented for dma-iommu and will simply return a contigious chunk as a fallback. The intent is that drivers can use this API if either: - no kernel mapping or only temporary kernel mappings are required. That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING - a kernel mapping is required for cached and DMA mapped pages, but the driver also needs the pages to e.g. map them to userspace. In that sense it is a replacement for some aspects of the recently removed and never fully implemented DMA_ATTR_NON_CONSISTENT Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- include/linux/dma-map-ops.h | 19 +++++++++++++++++++ include/linux/dma-mapping.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 51872e736e7b..0d53a96a3d64 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -22,6 +22,11 @@ struct dma_map_ops { gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir); + struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs); + void (*free_noncontiguous)(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -198,6 +203,20 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ +/* + * This is the actual return value from the ->alloc_noncontiguous method. + * The users of the DMA API should only care about the sg_table, but to make + * the DMA-API internal vmaping and freeing easier we stash away the page + * array as well (except for the fallback case). This can go away any time, + * e.g. when a vmap-variant that takes a scatterlist comes along. + */ +struct dma_sgt_handle { + struct sg_table sgt; + struct page **pages; +}; +#define sgt_handle(sgt) \ + container_of((sgt), struct dma_sgt_handle, sgt) + int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2b8dce756e1f..954847f9a3e0 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -144,6 +144,15 @@ u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); +void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt); +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, @@ -257,6 +266,29 @@ static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } +static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, + size_t size, enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs) +{ + return NULL; +} +static inline void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ +} +static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + return NULL; +} +static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ +} +static inline int dma_mmap_noncontiguous(struct device *dev, + struct vm_area_struct *vma, size_t size, struct sg_table *sgt) +{ + return -EINVAL; +} #endif /* CONFIG_HAS_DMA */ struct page *dma_alloc_pages(struct device *dev, size_t size, -- cgit v1.2.3 From c124fd9a969acaa83f6dfa5e160a99a500af9e4b Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:03:58 +0100 Subject: PCI: Add pci_find_vsec_capability() to find a specific VSEC Add pci_find_vsec_capability() to locate a Vendor-Specific Extended Capability with the specified VSEC ID. The Vendor-Specific Extended Capability (VSEC) allows one or more proprietary capabilities defined by the vendor which aren't standard or shared between vendors. Signed-off-by: Gustavo Pimentel Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/d89506834fb11c6fa0bd5d515c0dd55b13ac6958.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..59e731a2d1f8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1077,6 +1077,7 @@ u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap); u16 pci_find_ext_capability(struct pci_dev *dev, int cap); u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap); struct pci_bus *pci_find_next_bus(const struct pci_bus *from); +u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap); u64 pci_get_dsn(struct pci_dev *dev); -- cgit v1.2.3 From 2973073a80b46daebc352c31d09d95d16cf6876e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:24 +0100 Subject: swiotlb: remove the alloc_size parameter to swiotlb_tbl_unmap_single Now that swiotlb remembers the allocation size there is no need to pass it back to swiotlb_tbl_unmap_single. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 5857a937c637..59f421d041ed 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -57,7 +57,6 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, - size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); -- cgit v1.2.3 From 80808d273a3f075196d1a26463f65d4c9d2891c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:26 +0100 Subject: swiotlb: split swiotlb_tbl_sync_single Split swiotlb_tbl_sync_single into two separate funtions for the to device and to cpu synchronization. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 59f421d041ed..0696bdc8072e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -42,14 +42,6 @@ extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern int swiotlb_late_init_with_default_size(size_t default_size); extern void __init swiotlb_update_mem_attributes(void); -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); @@ -60,11 +52,10 @@ extern void swiotlb_tbl_unmap_single(struct device *hwdev, enum dma_data_direction dir, unsigned long attrs); -extern void swiotlb_tbl_sync_single(struct device *hwdev, - phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target); - +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); -- cgit v1.2.3 From a98f565462f0fca9096e8f53933364dc2a74bc90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:32 +0100 Subject: xen-swiotlb: split xen_swiotlb_init Split xen_swiotlb_init into a normal an an early case. That makes both much simpler and more readable, and also allows marking the early code as __init and x86-only. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/swiotlb-xen.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index d5eaf9d682b8..6206b1ec9916 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -9,7 +9,8 @@ void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir); -extern int xen_swiotlb_init(int verbose, bool early); +int xen_swiotlb_init(void); +void __init xen_swiotlb_init_early(void); extern const struct dma_map_ops xen_swiotlb_dma_ops; #endif /* __LINUX_SWIOTLB_XEN_H */ -- cgit v1.2.3 From 3093c3c7c136458af692d5c3d309a66c3c12d9f4 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:09 +0100 Subject: rpmsg: Move RPMSG_ADDR_ANY in user API As the RPMSG_ADDR_ANY is a valid src or dst address that can be set by user applications, migrate its definition in user API. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-3-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- include/linux/rpmsg.h | 3 +-- include/uapi/linux/rpmsg.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index a5db828b2420..d97dcd049f18 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -18,8 +18,7 @@ #include #include #include - -#define RPMSG_ADDR_ANY 0xFFFFFFFF +#include struct rpmsg_device; struct rpmsg_endpoint; diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h index e14c6dab4223..5e00748da319 100644 --- a/include/uapi/linux/rpmsg.h +++ b/include/uapi/linux/rpmsg.h @@ -9,6 +9,8 @@ #include #include +#define RPMSG_ADDR_ANY 0xFFFFFFFF + /** * struct rpmsg_endpoint_info - endpoint info representation * @name: name of service -- cgit v1.2.3 From 809328b40cfb152f75541aa3dcbbe4903098963b Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:10 +0100 Subject: rpmsg: Add short description of the IOCTL defined in UAPI. Add a description of the IOCTLs and provide information on the default value of the source and destination addresses. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-4-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- include/uapi/linux/rpmsg.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h index 5e00748da319..f5ca8740f3fb 100644 --- a/include/uapi/linux/rpmsg.h +++ b/include/uapi/linux/rpmsg.h @@ -14,8 +14,8 @@ /** * struct rpmsg_endpoint_info - endpoint info representation * @name: name of service - * @src: local address - * @dst: destination address + * @src: local address. To set to RPMSG_ADDR_ANY if not used. + * @dst: destination address. To set to RPMSG_ADDR_ANY if not used. */ struct rpmsg_endpoint_info { char name[32]; @@ -23,7 +23,14 @@ struct rpmsg_endpoint_info { __u32 dst; }; +/** + * Instantiate a new rmpsg char device endpoint. + */ #define RPMSG_CREATE_EPT_IOCTL _IOW(0xb5, 0x1, struct rpmsg_endpoint_info) + +/** + * Destroy a rpmsg char device endpoint created by the RPMSG_CREATE_EPT_IOCTL. + */ #define RPMSG_DESTROY_EPT_IOCTL _IO(0xb5, 0x2) #endif -- cgit v1.2.3 From 3542dcb15cef66c0b9e6c3b33168eb657e0d9520 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 5 Mar 2021 16:32:34 +0000 Subject: iommu/dma: Resurrect the "forcedac" option In converting intel-iommu over to the common IOMMU DMA ops, it quietly lost the functionality of its "forcedac" option. Since this is a handy thing both for testing and for performance optimisation on certain platforms, reimplement it under the common IOMMU parameter namespace. For the sake of fixing the inadvertent breakage of the Intel-specific parameter, remove the dmar_forcedac remnants and hook it up as an alias while documenting the transition to the new common parameter. Fixes: c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Signed-off-by: Robin Murphy Acked-by: Lu Baolu Reviewed-by: John Garry Link: https://lore.kernel.org/r/7eece8e0ea7bfbe2cd0e30789e0d46df573af9b0.1614961776.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 706b68d1359b..13d1f4c14d7b 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -40,6 +40,8 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, struct iommu_domain *domain); +extern bool iommu_dma_forcedac; + #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; -- cgit v1.2.3 From 6ca69e5841f01ccbfa45e56577e1b33e14e53504 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 18 Mar 2021 08:53:40 +0800 Subject: iommu/vt-d: Report more information about invalidation errors When the invalidation queue errors are encountered, dump the information logged by the VT-d hardware together with the pending queue invalidation descriptors. Signed-off-by: Ashok Raj Tested-by: Guo Kaijie Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210318005340.187311-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1bc46b88711a..1732298ce888 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -80,6 +81,7 @@ #define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ #define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ +#define DMAR_IQER_REG 0xb0 /* Invalidation queue error record register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ #define DMAR_PQH_REG 0xc0 /* Page request queue head register */ #define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ @@ -126,6 +128,10 @@ #define DMAR_VCMD_REG 0xe10 /* Virtual command register */ #define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) +#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) +#define DMAR_IQER_REG_ICESID(reg) FIELD_GET(GENMASK_ULL(63, 48), reg) + #define OFFSET_STRIDE (9) #define dmar_readq(a) readq(a) -- cgit v1.2.3 From bb0f61533dfd6aa815a2719720c77d13f840b683 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2021 02:13:58 -0800 Subject: iommu/vt-d: Enable write protect propagation from guest Write protect bit, when set, inhibits supervisor writes to the read-only pages. In guest supervisor shared virtual addressing (SVA), write-protect should be honored upon guest bind supervisor PASID request. This patch extends the VT-d portion of the IOMMU UAPI to include WP bit. WPE bit of the supervisor PASID entry will be set to match CPU CR0.WP bit. Signed-off-by: Sanjay Kumar Signed-off-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1614680040-1989-3-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- include/uapi/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index e1d9e75f2c94..59178fc229ca 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd { #define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ #define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ #define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ -#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) +#define IOMMU_SVA_VTD_GPASID_WPE (1 << 6) /* Write protect enable */ +#define IOMMU_SVA_VTD_GPASID_LAST (1 << 7) __u64 flags; __u32 pat; __u32 emt; -- cgit v1.2.3 From 4196d18903f94090f0a223d65de25e3bf50a3d13 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:39 -0700 Subject: remoteproc: Add new RPROC_ATTACHED state Add a new RPROC_ATTACHED state to take into account scenarios where the remoteproc core needs to attach to a remote processor that is booted by another entity. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-4-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 1b7d56c7a453..9193a8fb5b68 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -407,6 +407,8 @@ struct rproc_ops { * @RPROC_RUNNING: device is up and running * @RPROC_CRASHED: device has crashed; need to start recovery * @RPROC_DELETED: device is deleted + * @RPROC_ATTACHED: device has been booted by another entity and the core + * has attached to it * @RPROC_DETACHED: device has been booted by another entity and waiting * for the core to attach to it * @RPROC_LAST: just keep this one at the end @@ -423,8 +425,9 @@ enum rproc_state { RPROC_RUNNING = 2, RPROC_CRASHED = 3, RPROC_DELETED = 4, - RPROC_DETACHED = 5, - RPROC_LAST = 6, + RPROC_ATTACHED = 5, + RPROC_DETACHED = 6, + RPROC_LAST = 7, }; /** -- cgit v1.2.3 From 76f4c87587e2ff41e9b9867ffde2137f27ba39b9 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:40 -0700 Subject: remoteproc: Properly represent the attached state There is a need to know when a remote processor has been attached to rather than booted by the remoteproc core. In order to avoid manipulating two variables, i.e rproc::autonomous and rproc::state, get rid of the former and simply use the newly introduced RPROC_ATTACHED state. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-5-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 9193a8fb5b68..9e42e90cd9da 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -514,7 +514,6 @@ struct rproc_dump_segment { * @table_sz: size of @cached_table * @has_iommu: flag to indicate if remote processor is behind an MMU * @auto_boot: flag to indicate if remote processor should be auto-started - * @autonomous: true if an external entity has booted the remote processor * @dump_segments: list of segments in the firmware * @nb_vdev: number of vdev currently handled by rproc * @char_dev: character device of the rproc @@ -551,7 +550,6 @@ struct rproc { size_t table_sz; bool has_iommu; bool auto_boot; - bool autonomous; struct list_head dump_segments; int nb_vdev; u8 elf_class; -- cgit v1.2.3 From 1a631382be1d22ddab0582dae3498b3d28e2e44a Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:41 -0700 Subject: remoteproc: Add new get_loaded_rsc_table() to rproc_ops Add a new get_loaded_rsc_table() operation in order to support scenarios where the remoteproc core has booted a remote processor and detaches from it. When re-attaching to the remote processor, the core needs to know where the resource table has been placed in memory. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-6-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 9e42e90cd9da..eee338177a3d 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -370,7 +370,9 @@ enum rsc_handling_status { * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a * negative value on error * @load_rsc_table: load resource table from firmware image - * @find_loaded_rsc_table: find the loaded resouce table + * @find_loaded_rsc_table: find the loaded resource table from firmware image + * @get_loaded_rsc_table: get resource table installed in memory + * by external entity * @load: load firmware to memory, where the remote processor * expects to find it * @sanity_check: sanity check the fw image @@ -392,6 +394,8 @@ struct rproc_ops { int offset, int avail); struct resource_table *(*find_loaded_rsc_table)( struct rproc *rproc, const struct firmware *fw); + struct resource_table *(*get_loaded_rsc_table)( + struct rproc *rproc, size_t *size); int (*load)(struct rproc *rproc, const struct firmware *fw); int (*sanity_check)(struct rproc *rproc, const struct firmware *fw); u64 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw); -- cgit v1.2.3 From 7f3bd0c019cb813448d867c17c9b9dad205a13eb Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:44 -0700 Subject: remoteproc: Add new detach() remoteproc operation Add an new detach() operation in order to support scenarios where the remoteproc core is going away but the remote processor is kept operating. This could be the case when the system is rebooted or when the platform driver is removed. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-9-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index eee338177a3d..2f1f0fbc3994 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -363,6 +363,7 @@ enum rsc_handling_status { * @start: power on the device and boot it * @stop: power off the device * @attach: attach to a device that his already powered up + * @detach: detach from a device, leaving it powered up * @kick: kick a virtqueue (virtqueue id given as a parameter) * @da_to_va: optional platform hook to perform address translations * @parse_fw: parse firmware to extract information (e.g. resource table) @@ -387,6 +388,7 @@ struct rproc_ops { int (*start)(struct rproc *rproc); int (*stop)(struct rproc *rproc); int (*attach)(struct rproc *rproc); + int (*detach)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); int (*parse_fw)(struct rproc *rproc, const struct firmware *fw); -- cgit v1.2.3 From d3962a397885518a85d2dc6b0c51e6594f71c30f Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:46 -0700 Subject: remoteproc: Introduce function rproc_detach() Introduce function rproc_detach() to enable the remoteproc core to release the resources associated with a remote processor without stopping its operation. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-11-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 2f1f0fbc3994..fc2cca600423 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -664,6 +664,7 @@ rproc_of_resm_mem_entry_init(struct device *dev, u32 of_resm_idx, size_t len, int rproc_boot(struct rproc *rproc); void rproc_shutdown(struct rproc *rproc); +int rproc_detach(struct rproc *rproc); int rproc_set_firmware(struct rproc *rproc, const char *fw_name); void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type); void rproc_coredump_using_sections(struct rproc *rproc); -- cgit v1.2.3 From 9dc9507f1880fb6225e3e058cb5219b152cbf198 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:47 -0700 Subject: remoteproc: Properly deal with the resource table when detaching If it is possible to detach the remote processor, keep an untouched copy of the resource table. That way we can start from the same resource table without having to worry about original values or what elements the startup code has changed when re-attaching to the remote processor. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-12-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index fc2cca600423..8b795b544f75 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -516,6 +516,8 @@ struct rproc_dump_segment { * @recovery_disabled: flag that state if recovery was disabled * @max_notifyid: largest allocated notify id. * @table_ptr: pointer to the resource table in effect + * @clean_table: copy of the resource table without modifications. Used + * when a remote processor is attached or detached from the core * @cached_table: copy of the resource table * @table_sz: size of @cached_table * @has_iommu: flag to indicate if remote processor is behind an MMU @@ -552,6 +554,7 @@ struct rproc { bool recovery_disabled; int max_notifyid; struct resource_table *table_ptr; + struct resource_table *clean_table; struct resource_table *cached_table; size_t table_sz; bool has_iommu; -- cgit v1.2.3 From b47e330231acbf4506b049643145cc64268a1940 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:03 -0400 Subject: tracing: Pass buffer of event to trigger operations The ring_buffer_event_time_stamp() is going to be updated to extract the time stamp for the event without needing it to be set to have absolute values for all events. But to do so, it needs the buffer that the event is on as the buffer saves information for the event before it is committed to the buffer. If the trace buffer is disabled, a temporary buffer is used, and there's no access to this buffer from the current histogram triggers, even though it is passed to the trace event code. Pass the buffer that the event is on all the way down to the histogram triggers. Link: https://lkml.kernel.org/r/20210316164113.542448131@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 28e7af1406f2..8cba64ce23a4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -640,7 +640,8 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); extern enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event); extern void event_triggers_post_call(struct trace_event_file *file, @@ -664,7 +665,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL); + event_triggers_call(file, NULL, NULL, NULL); if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; if (eflags & EVENT_FILE_FL_PID_FILTER) -- cgit v1.2.3 From efe6196a6bc5bbc84b856316c4687fd24566a95c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:04 -0400 Subject: ring-buffer: Allow ring_buffer_event_time_stamp() to return time stamp of all events Currently, ring_buffer_event_time_stamp() only returns an accurate time stamp of the event if it has an absolute extended time stamp attached to it. To make it more robust, use the event_stamp() in case the event does not have an absolute value attached to it. This will allow ring_buffer_event_time_stamp() to be used in more cases than just histograms, and it will also allow histograms to not require including absolute values all the time. Link: https://lkml.kernel.org/r/20210316164113.704830885@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 136ea0997e6d..057b7ed4fe24 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -61,7 +61,8 @@ enum ring_buffer_type { unsigned ring_buffer_event_length(struct ring_buffer_event *event); void *ring_buffer_event_data(struct ring_buffer_event *event); -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event); +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event); /* * ring_buffer_discard_commit will remove an event that has not -- cgit v1.2.3 From f2616c772c768485de18e7fcb2816bcdcd098339 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 Mar 2021 13:34:35 -0400 Subject: seq_buf: Add seq_buf_terminate() API In the case that the seq_buf buffer needs to be printed directly, add a way to make sure that the buffer is safe to read by forcing a nul terminating character at the end of the string, or the last byte of the buffer if the string has overflowed. Signed-off-by: Steven Rostedt (VMware) --- include/linux/seq_buf.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 9d6c28cc4d8f..5b31c5147969 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -71,6 +71,31 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) return min(s->len, s->size); } +/** + * seq_buf_terminate - Make sure buffer is nul terminated + * @s: the seq_buf descriptor to terminate. + * + * This makes sure that the buffer in @s is nul terminated and + * safe to read as a string. + * + * Note, if this is called when the buffer has overflowed, then + * the last byte of the buffer is zeroed, and the len will still + * point passed it. + * + * After this function is called, s->buffer is safe to use + * in string operations. + */ +static inline void seq_buf_terminate(struct seq_buf *s) +{ + if (WARN_ON(s->size == 0)) + return; + + if (seq_buf_buffer_left(s)) + s->buffer[s->len] = 0; + else + s->buffer[s->size - 1] = 0; +} + /** * seq_buf_get_buf - get buffer to write arbitrary data to * @s: the seq_buf handle -- cgit v1.2.3 From 73f620951b2b594bdc38722c0d647c3b3312af7a Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 18 Mar 2021 17:14:22 +0100 Subject: swiotlb: move global variables into a new io_tlb_mem structure Added a new struct, io_tlb_mem, as the IO TLB memory pool descriptor and moved relevant global variables into that struct. This will be useful later to allow for restricted DMA pool. Signed-off-by: Claire Chang [hch: rebased] Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 0696bdc8072e..5ec5378b17c3 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -6,6 +6,7 @@ #include #include #include +#include struct device; struct page; @@ -61,11 +62,49 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; -extern phys_addr_t io_tlb_start, io_tlb_end; + +/** + * struct io_tlb_mem - IO TLB Memory Pool Descriptor + * + * @start: The start address of the swiotlb memory pool. Used to do a quick + * range check to see if the memory was in fact allocated by this + * API. + * @end: The end address of the swiotlb memory pool. Used to do a quick + * range check to see if the memory was in fact allocated by this + * API. + * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and + * @end. This is command line adjustable via setup_io_tlb_npages. + * @used: The number of used IO TLB block. + * @list: The free list describing the number of free entries available + * from each index. + * @index: The index to start searching in the next round. + * @orig_addr: The original address corresponding to a mapped entry. + * @alloc_size: Size of the allocated buffer. + * @lock: The lock to protect the above data structures in the map and + * unmap calls. + * @debugfs: The dentry to debugfs. + * @late_alloc: %true if allocated using the page allocator + */ +struct io_tlb_mem { + phys_addr_t start; + phys_addr_t end; + unsigned long nslabs; + unsigned long used; + unsigned int *list; + unsigned int index; + phys_addr_t *orig_addr; + size_t *alloc_size; + spinlock_t lock; + struct dentry *debugfs; + bool late_alloc; +}; +extern struct io_tlb_mem io_tlb_default_mem; static inline bool is_swiotlb_buffer(phys_addr_t paddr) { - return paddr >= io_tlb_start && paddr < io_tlb_end; + struct io_tlb_mem *mem = &io_tlb_default_mem; + + return paddr >= mem->start && paddr < mem->end; } void __init swiotlb_exit(void); -- cgit v1.2.3 From 2d29960af0bee8cc6731b9bd3964850c9e7a6840 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Mar 2021 17:14:23 +0100 Subject: swiotlb: dynamically allocate io_tlb_default_mem Instead of allocating ->list and ->orig_addr separately just do one dynamic allocation for the actual io_tlb_mem structure. This simplifies a lot of the initialization code, and also allows to just check io_tlb_default_mem to see if swiotlb is in use. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 5ec5378b17c3..63f7a63f61d0 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -90,28 +90,30 @@ struct io_tlb_mem { phys_addr_t end; unsigned long nslabs; unsigned long used; - unsigned int *list; unsigned int index; - phys_addr_t *orig_addr; - size_t *alloc_size; spinlock_t lock; struct dentry *debugfs; bool late_alloc; + struct io_tlb_slot { + phys_addr_t orig_addr; + size_t alloc_size; + unsigned int list; + } slots[]; }; -extern struct io_tlb_mem io_tlb_default_mem; +extern struct io_tlb_mem *io_tlb_default_mem; static inline bool is_swiotlb_buffer(phys_addr_t paddr) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; - return paddr >= mem->start && paddr < mem->end; + return mem && paddr >= mem->start && paddr < mem->end; } void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(void); -void __init swiotlb_adjust_size(unsigned long new_size); +void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE static inline bool is_swiotlb_buffer(phys_addr_t paddr) @@ -135,7 +137,7 @@ static inline bool is_swiotlb_active(void) return false; } -static inline void swiotlb_adjust_size(unsigned long new_size) +static inline void swiotlb_adjust_size(unsigned long size) { } #endif /* CONFIG_SWIOTLB */ -- cgit v1.2.3 From 2cbc2776efe4faed0e17c48ae076aa03a0fcc61f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Mar 2021 17:14:24 +0100 Subject: swiotlb: remove swiotlb_nr_tbl All callers just use it to check if swiotlb is active at all, for which they can just use is_swiotlb_active. In the longer run drivers need to stop using is_swiotlb_active as well, but let's do the simple step first. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 63f7a63f61d0..216854a5e513 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -37,7 +37,6 @@ enum swiotlb_force { extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); -extern unsigned long swiotlb_nr_tbl(void); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern int swiotlb_late_init_with_default_size(size_t default_size); -- cgit v1.2.3 From 84fcfbdadbfdd86c9a43a52703203e05fe7efd92 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Fri, 12 Mar 2021 10:19:12 +0800 Subject: dma-mapping: remove a pointless empty line in dma_alloc_coherent Signed-off-by: Wang Qing Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 954847f9a3e0..e9d19b974f26 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -435,7 +435,6 @@ static inline void dma_sync_sgtable_for_device(struct device *dev, static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { - return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } -- cgit v1.2.3 From ae4c86a024f634d5523e048a68635ae62765fcc4 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sat, 20 Mar 2021 19:19:58 -0700 Subject: dt-bindings: input: atmel_mxt_ts: Document atmel,wakeup-method and WAKE line GPIO Some Atmel touchscreen controllers have a WAKE line that needs to be asserted low in order to wake up controller from a deep sleep. Document the wakeup methods and the new GPIO properties. Reviewed-by: Rob Herring Reviewed-by: Linus Walleij Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20210302102158.10533-2-digetx@gmail.com Signed-off-by: Dmitry Torokhov --- include/dt-bindings/input/atmel-maxtouch.h | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 include/dt-bindings/input/atmel-maxtouch.h (limited to 'include') diff --git a/include/dt-bindings/input/atmel-maxtouch.h b/include/dt-bindings/input/atmel-maxtouch.h new file mode 100644 index 000000000000..7345ab32224d --- /dev/null +++ b/include/dt-bindings/input/atmel-maxtouch.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _DT_BINDINGS_ATMEL_MAXTOUCH_H +#define _DT_BINDINGS_ATMEL_MAXTOUCH_H + +#define ATMEL_MXT_WAKEUP_NONE 0 +#define ATMEL_MXT_WAKEUP_I2C_SCL 1 +#define ATMEL_MXT_WAKEUP_GPIO 2 + +#endif /* _DT_BINDINGS_ATMEL_MAXTOUCH_H */ -- cgit v1.2.3 From a2bbe66493ee380eb25e080e7fcdd1278a847f7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jul 2019 09:57:53 -0400 Subject: constify dentry argument of dentry_path()/dentry_path_raw() Signed-off-by: Al Viro --- include/linux/dcache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c1e48014106f..4ecde5d8250c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -300,8 +300,8 @@ char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); -extern char *dentry_path_raw(struct dentry *, char *, int); -extern char *dentry_path(struct dentry *, char *, int); +extern char *dentry_path_raw(const struct dentry *, char *, int); +extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ -- cgit v1.2.3 From 9666cec380d60808eb86d3be4caf84faeebe3081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 7 Dec 2020 14:45:56 +0100 Subject: pwm: Drop function pwmchip_add_with_polarity() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwmchip_add() only calls pwmchip_add_with_polarity() and nothing else. All other users of pwmchip_add_with_polarity() are gone. So drop pwmchip_add_with_polarity() and move the code instead to pwmchip_add(). The initial assignment to pwm->state.polarity is dropped. In every correct usage of the PWM API this value is overwritten later anyhow. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index e4d84d4db293..8f4eefd129aa 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -392,8 +392,6 @@ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result, int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); -int pwmchip_add_with_polarity(struct pwm_chip *chip, - enum pwm_polarity polarity); int pwmchip_add(struct pwm_chip *chip); int pwmchip_remove(struct pwm_chip *chip); struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, -- cgit v1.2.3 From 50feda23152ed574c0a197116b23ef6786201bee Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 18 Mar 2021 15:34:53 +0530 Subject: RDMA/include: Mundane typo fixes throughout the file s/proviee/provide/ s/undelying/underlying/ s/quesiton/question/ s/drivr/driver/ Link: https://lore.kernel.org/r/20210318100453.9759-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_vt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 9fd217b24916..0af89cedfbf5 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -92,7 +92,7 @@ struct rvt_ibport { /* * The pkey table is allocated and maintained by the driver. Drivers * need to have access to this before registering with rdmav. However - * rdmavt will need access to it so drivers need to proviee this during + * rdmavt will need access to it so drivers need to provide this during * the attach port API call. */ u16 *pkey_table; @@ -230,7 +230,7 @@ struct rvt_driver_provided { void (*do_send)(struct rvt_qp *qp); /* - * Returns a pointer to the undelying hardware's PCI device. This is + * Returns a pointer to the underlying hardware's PCI device. This is * used to display information as to what hardware is being referenced * in an output message */ @@ -257,7 +257,7 @@ struct rvt_driver_provided { void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp); /* - * Inform the driver the particular qp in quesiton has been reset so + * Inform the driver the particular qp in question has been reset so * that it can clean up anything it needs to. */ void (*notify_qp_reset)(struct rvt_qp *qp); @@ -281,7 +281,7 @@ struct rvt_driver_provided { void (*stop_send_queue)(struct rvt_qp *qp); /* - * Have the drivr drain any in progress operations + * Have the driver drain any in progress operations */ void (*quiesce_qp)(struct rvt_qp *qp); -- cgit v1.2.3 From f2cc020d7876de7583feb52ec939a32419cf9468 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Mar 2021 18:49:35 +0100 Subject: tracing: Fix various typos in comments Fix ~59 single-word typos in the tracing code comments, and fix the grammar in a handful of places. Link: https://lore.kernel.org/r/20210322224546.GA1981273@gmail.com Link: https://lkml.kernel.org/r/20210323174935.GA4176821@gmail.com Reviewed-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- include/linux/trace_events.h | 2 +- include/linux/tracepoint.h | 2 +- include/trace/events/io_uring.h | 2 +- include/trace/events/rcu.h | 2 +- include/trace/events/sched.h | 2 +- include/trace/events/timer.h | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 86e5028bfa20..a69f363b61bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -33,7 +33,7 @@ /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that - * does. Or at least does enough to prevent any unwelcomed side effects. + * does. Or at least does enough to prevent any unwelcome side effects. */ #if !ARCH_SUPPORTS_FTRACE_OPS # define FTRACE_FORCE_LIST_FUNC 1 @@ -389,7 +389,7 @@ DECLARE_PER_CPU(int, disable_stack_tracer); */ static inline void stack_tracer_disable(void) { - /* Preemption or interupts must be disabled */ + /* Preemption or interrupts must be disabled */ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); this_cpu_inc(disable_stack_tracer); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8cba64ce23a4..36e27c1f42e0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -206,7 +206,7 @@ static inline unsigned int tracing_gen_ctx_dec(void) trace_ctx = tracing_gen_ctx(); /* - * Subtract one from the preeption counter if preemption is enabled, + * Subtract one from the preemption counter if preemption is enabled, * see trace_event_buffer_reserve()for details. */ if (IS_ENABLED(CONFIG_PREEMPTION)) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 9cfb099da58f..13f65420f188 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -465,7 +465,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * * * The declared 'local variable' is called '__entry' * * - * * __field(pid_t, prev_prid) is equivalent to a standard declariton: + * * __field(pid_t, prev_prid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 9f0d3b7d56b0..ba78a5602cd1 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -49,7 +49,7 @@ TRACE_EVENT(io_uring_create, ); /** - * io_uring_register - called after a buffer/file/eventfd was succesfully + * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5fc29400e1a2..97177c10bf64 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -48,7 +48,7 @@ TRACE_EVENT(rcu_utilization, * RCU flavor, the grace-period number, and a string identifying the * grace-period-related event as follows: * - * "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL. + * "AccReadyCB": CPU accelerates new callbacks to RCU_NEXT_READY_TAIL. * "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL. * "newreq": Request a new grace period. * "start": Start a grace period. diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cbe3e152d24c..1eca2305ca42 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -174,7 +174,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_ARGS(p)); /* - * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG. + * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 19abb6c3eb73..6ad031c71be7 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -119,7 +119,7 @@ TRACE_EVENT(timer_expire_entry, * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * - * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might + * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, -- cgit v1.2.3 From 1fb7f8973f51ca1a7ffe61a2c779ed15f57f3d82 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 1 Mar 2021 09:04:20 +0200 Subject: RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_cache.h | 18 ++--- include/rdma/ib_mad.h | 2 +- include/rdma/ib_sa.h | 15 ++-- include/rdma/ib_verbs.h | 177 ++++++++++++++++++++++++-------------------- include/rdma/rdma_cm.h | 2 +- include/rdma/rdma_counter.h | 16 ++-- include/rdma/rdma_vt.h | 8 +- include/rdma/rw.h | 18 ++--- 8 files changed, 135 insertions(+), 121 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index bae29f50adff..226ae3702d8a 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -10,7 +10,7 @@ #include -int rdma_query_gid(struct ib_device *device, u8 port_num, int index, +int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr); const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, @@ -20,10 +20,10 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, const struct ib_gid_attr *rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, - u8 port, + u32 port, struct net_device *ndev); const struct ib_gid_attr *rdma_find_gid_by_filter( - struct ib_device *device, const union ib_gid *gid, u8 port_num, + struct ib_device *device, const union ib_gid *gid, u32 port_num, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context); @@ -43,7 +43,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); * the local software cache. */ int ib_get_cached_pkey(struct ib_device *device_handle, - u8 port_num, + u32 port_num, int index, u16 *pkey); @@ -59,7 +59,7 @@ int ib_get_cached_pkey(struct ib_device *device_handle, * the local software cache. */ int ib_find_cached_pkey(struct ib_device *device, - u8 port_num, + u32 port_num, u16 pkey, u16 *index); @@ -75,7 +75,7 @@ int ib_find_cached_pkey(struct ib_device *device, * the local software cache. */ int ib_find_exact_cached_pkey(struct ib_device *device, - u8 port_num, + u32 port_num, u16 pkey, u16 *index); @@ -89,7 +89,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device, * the local software cache. */ int ib_get_cached_lmc(struct ib_device *device, - u8 port_num, + u32 port_num, u8 *lmc); /** @@ -102,12 +102,12 @@ int ib_get_cached_lmc(struct ib_device *device, * the local software cache. */ int ib_get_cached_port_state(struct ib_device *device, - u8 port_num, + u32 port_num, enum ib_port_state *port_active); bool rdma_is_zero_gid(const union ib_gid *gid); const struct ib_gid_attr *rdma_get_gid_attr(struct ib_device *device, - u8 port_num, int index); + u32 port_num, int index); void rdma_put_gid_attr(const struct ib_gid_attr *attr); void rdma_hold_gid_attr(const struct ib_gid_attr *attr); ssize_t rdma_query_gid_table(struct ib_device *device, diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 8dfb1ddf345a..f1d34f06a68b 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -668,7 +668,7 @@ struct ib_mad_reg_req { * @registration_flags: Registration flags to set for this agent */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, - u8 port_num, + u32 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 4c52c2fd22a1..ba3c808a3789 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -423,7 +423,7 @@ struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, - u8 port_num, struct sa_path_rec *rec, + u32 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, @@ -431,7 +431,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, u8 method, + struct ib_device *device, u32 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, @@ -477,7 +477,8 @@ struct ib_sa_multicast { * group, and the user must rejoin the group to continue using it. */ struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, + u32 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, @@ -506,20 +507,20 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast); * @mgid: MGID of multicast group. * @rec: Location to copy SA multicast member record. */ -int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, +int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec); /** * ib_init_ah_from_mcmember - Initialize address handle attributes based on * an SA multicast member record. */ -int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, +int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, struct rdma_ah_attr *ah_attr); -int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, +int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *sgid_attr); @@ -538,7 +539,7 @@ void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec); /* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, unsigned long timeout_ms, gfp_t gfp_mask, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 21c19b1fcee7..bed4cfe50554 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -152,7 +152,7 @@ struct ib_gid_attr { union ib_gid gid; enum ib_gid_type gid_type; u16 index; - u8 port_num; + u32 port_num; }; enum { @@ -736,7 +736,7 @@ struct ib_event { struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; - u8 port_num; + u32 port_num; } element; enum ib_event_type event; }; @@ -919,7 +919,7 @@ struct rdma_ah_attr { struct ib_global_route grh; u8 sl; u8 static_rate; - u8 port_num; + u32 port_num; u8 ah_flags; enum rdma_ah_attr_type type; union { @@ -1006,7 +1006,7 @@ struct ib_wc { u16 pkey_index; u8 sl; u8 dlid_path_bits; - u8 port_num; /* valid only for DR SMPs on switches */ + u32 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; @@ -1161,7 +1161,7 @@ struct ib_qp_init_attr { /* * Only needed for special QP types, or when using the RW API. */ - u8 port_num; + u32 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; u32 source_qpn; }; @@ -1280,11 +1280,11 @@ struct ib_qp_attr { u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; - u8 port_num; + u32 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; - u8 alt_port_num; + u32 alt_port_num; u8 alt_timeout; u32 rate_limit; struct net_device *xmit_slave; @@ -1401,7 +1401,7 @@ struct ib_ud_wr { u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ - u8 port_num; /* valid for DR SMPs on switch only */ + u32 port_num; /* valid for DR SMPs on switch only */ }; static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr) @@ -1708,7 +1708,7 @@ struct ib_qp_security; struct ib_port_pkey { enum port_pkey_state state; u16 pkey_index; - u8 port_num; + u32 port_num; struct list_head qp_list; struct list_head to_error_list; struct ib_qp_security *sec; @@ -1769,7 +1769,7 @@ struct ib_qp { enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; - u8 port; + u32 port; bool integrity_en; /* @@ -2065,7 +2065,7 @@ struct ib_flow_attr { u16 priority; u32 flags; u8 num_of_specs; - u8 port; + u32 port; union ib_flow_spec flows[]; }; @@ -2194,7 +2194,7 @@ enum rdma_netdev_t { struct rdma_netdev { void *clnt_priv; struct ib_device *hca; - u8 port_num; + u32 port_num; int mtu; /* @@ -2223,7 +2223,7 @@ struct rdma_netdev_alloc_params { unsigned int rxqs; void *param; - int (*initialize_rdma_netdev)(struct ib_device *device, u8 port_num, + int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num, struct net_device *netdev, void *param); }; @@ -2305,7 +2305,7 @@ struct ib_device_ops { const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int (*process_mad)(struct ib_device *device, int process_mad_flags, - u8 port_num, const struct ib_wc *in_wc, + u32 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); @@ -2317,9 +2317,9 @@ struct ib_device_ops { void (*get_dev_fw_str)(struct ib_device *device, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); - int (*query_port)(struct ib_device *device, u8 port_num, + int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); - int (*modify_port)(struct ib_device *device, u8 port_num, + int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); /** @@ -2328,10 +2328,10 @@ struct ib_device_ops { * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ - int (*get_port_immutable)(struct ib_device *device, u8 port_num, + int (*get_port_immutable)(struct ib_device *device, u32 port_num, struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, - u8 port_num); + u32 port_num); /** * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such @@ -2340,7 +2340,8 @@ struct ib_device_ops { * that this function returns NULL before the net device has finished * NETDEV_UNREGISTER state. */ - struct net_device *(*get_netdev)(struct ib_device *device, u8 port_num); + struct net_device *(*get_netdev)(struct ib_device *device, + u32 port_num); /** * rdma netdev operation * @@ -2348,11 +2349,11 @@ struct ib_device_ops { * must return -EOPNOTSUPP if it doesn't support the specified type. */ struct net_device *(*alloc_rdma_netdev)( - struct ib_device *device, u8 port_num, enum rdma_netdev_t type, + struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); - int (*rdma_netdev_get_params)(struct ib_device *device, u8 port_num, + int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); /** @@ -2360,7 +2361,7 @@ struct ib_device_ops { * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ - int (*query_gid)(struct ib_device *device, u8 port_num, int index, + int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); /** * When calling add_gid, the HW vendor's driver should add the gid @@ -2385,7 +2386,7 @@ struct ib_device_ops { * This function is only called when roce_gid_table is used. */ int (*del_gid)(const struct ib_gid_attr *attr, void **context); - int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, + int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int (*alloc_ucontext)(struct ib_ucontext *context, struct ib_udata *udata); @@ -2474,16 +2475,16 @@ struct ib_device_ops { struct ib_flow_action *action, const struct ib_flow_action_attrs_esp *attr, struct uverbs_attr_bundle *attrs); - int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, + int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port, int state); - int (*get_vf_config)(struct ib_device *device, int vf, u8 port, + int (*get_vf_config)(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *ivf); - int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, + int (*get_vf_stats)(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); - int (*get_vf_guid)(struct ib_device *device, int vf, u8 port, + int (*get_vf_guid)(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); - int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, + int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid, int type); struct ib_wq *(*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, @@ -2521,7 +2522,7 @@ struct ib_device_ops { * struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, - u8 port_num); + u32 port_num); /** * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or @@ -2535,12 +2536,12 @@ struct ib_device_ops { * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, - struct rdma_hw_stats *stats, u8 port, int index); + struct rdma_hw_stats *stats, u32 port, int index); /* * This function is called once for each port when a ib device is * registered. */ - int (*init_port)(struct ib_device *device, u8 port_num, + int (*init_port)(struct ib_device *device, u32 port_num, struct kobject *port_sysfs); /** * Allows rdma drivers to add their own restrack attributes. @@ -2684,7 +2685,7 @@ struct ib_device { /* CQ adaptive moderation (RDMA DIM) */ u16 use_cq_dim:1; u8 node_type; - u8 phys_port_cnt; + u32 phys_port_cnt; struct ib_device_attr attrs; struct attribute_group *hw_stats_ag; struct rdma_hw_stats *hw_stats; @@ -2750,7 +2751,7 @@ struct ib_client { * netdev. */ struct net_device *(*get_net_dev_by_params)( struct ib_device *dev, - u8 port, + u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, @@ -2931,10 +2932,10 @@ void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(const struct ib_event *event); int ib_query_port(struct ib_device *device, - u8 port_num, struct ib_port_attr *port_attr); + u32 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, - u8 port_num); + u32 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch @@ -2958,7 +2959,7 @@ static inline bool rdma_cap_ib_switch(const struct ib_device *device) * * Return start port number */ -static inline u8 rdma_start_port(const struct ib_device *device) +static inline u32 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } @@ -2969,9 +2970,10 @@ static inline u8 rdma_start_port(const struct ib_device *device) * @iter - The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ - for (iter = rdma_start_port(device + BUILD_BUG_ON_ZERO(!__same_type( \ - unsigned int, iter))); \ - iter <= rdma_end_port(device); (iter)++) + for (iter = rdma_start_port(device + \ + BUILD_BUG_ON_ZERO(!__same_type(u32, \ + iter))); \ + iter <= rdma_end_port(device); iter++) /** * rdma_end_port - Return the last valid port number for the device @@ -2981,7 +2983,7 @@ static inline u8 rdma_start_port(const struct ib_device *device) * * Return last port number */ -static inline u8 rdma_end_port(const struct ib_device *device) +static inline u32 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } @@ -2994,55 +2996,63 @@ static inline int rdma_is_port_valid(const struct ib_device *device, } static inline bool rdma_is_grh_required(const struct ib_device *device, - u8 port_num) + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_IB_GRH_REQUIRED; } -static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_ib(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IB; } -static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_roce(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } -static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } -static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } -static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_iwarp(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } -static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) +static inline bool rdma_ib_or_roce(const struct ib_device *device, + u32 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } -static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_raw_packet(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; } -static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num) +static inline bool rdma_protocol_usnic(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; @@ -3060,7 +3070,7 @@ static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_n * * Return: true if the port supports sending/receiving of MAD packets. */ -static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_MAD; @@ -3085,7 +3095,7 @@ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) * * Return: true if the port supports OPA MAD packet formats. */ -static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) +static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_MAD; @@ -3111,7 +3121,7 @@ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) * * Return: true if the port provides an SMI. */ -static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SMI; @@ -3132,7 +3142,7 @@ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) * Return: true if the port supports an IB CM (this does not guarantee that * a CM is actually running however). */ -static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_CM; @@ -3150,7 +3160,7 @@ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) * Return: true if the port supports an iWARP CM (this does not guarantee that * a CM is actually running however). */ -static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IW_CM; @@ -3171,7 +3181,7 @@ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) * Administration interface. This does not imply that the SA service is * running locally. */ -static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SA; @@ -3194,7 +3204,8 @@ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) * overhead of registering/unregistering with the SM and tracking of the * total number of queue pairs attached to the multicast group. */ -static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_ib_mcast(const struct ib_device *device, + u32 port_num) { return rdma_cap_ib_sa(device, port_num); } @@ -3212,7 +3223,7 @@ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num * Return: true if the port uses a GID address to identify devices on the * network. */ -static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_AF_IB; @@ -3234,7 +3245,7 @@ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) * addition of a Global Route Header built from our Ethernet Address * Handle into our header list for connectionless packets. */ -static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) +static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_ETH_AH; @@ -3249,7 +3260,7 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) * Return: true if we are running on an OPA device which supports * the extended OPA addressing. */ -static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) +static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; @@ -3267,7 +3278,8 @@ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) * Return the max MAD size required by the Port. Will return 0 if the port * does not support MADs */ -static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) +static inline size_t rdma_max_mad_size(const struct ib_device *device, + u32 port_num) { return device->port_data[port_num].immutable.max_mad_size; } @@ -3286,7 +3298,7 @@ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_n * its GIDs. */ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, - u8 port_num) + u32 port_num) { return rdma_protocol_roce(device, port_num) && device->ops.add_gid && device->ops.del_gid; @@ -3327,7 +3339,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device, * Return the MTU size supported by the port as an integer value. Will return * -1 if enum value of mtu is not supported. */ -static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port, +static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, int mtu) { if (rdma_core_cap_opa_port(device, port)) @@ -3344,7 +3356,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port, * * Return the MTU size supported by the port as an integer value. */ -static inline int rdma_mtu_from_attr(struct ib_device *device, u8 port, +static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port, struct ib_port_attr *attr) { if (rdma_core_cap_opa_port(device, port)) @@ -3353,34 +3365,34 @@ static inline int rdma_mtu_from_attr(struct ib_device *device, u8 port, return ib_mtu_enum_to_int(attr->max_mtu); } -int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, +int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state); -int ib_get_vf_config(struct ib_device *device, int vf, u8 port, +int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info); -int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, +int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); -int ib_get_vf_guid(struct ib_device *device, int vf, u8 port, +int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); -int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, +int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type); int ib_query_pkey(struct ib_device *device, - u8 port_num, u16 index, u16 *pkey); + u32 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, - u8 port_num, int port_modify_mask, + u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - u8 *port_num, u16 *index); + u32 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, - u8 port_num, u16 pkey, u16 *index); + u32 port_num, u16 pkey, u16 *index); enum ib_pd_flags { /* @@ -3495,7 +3507,7 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); * attributes which are initialized using ib_init_ah_attr_from_wc(). * */ -int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, +int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr); @@ -3512,7 +3524,7 @@ int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, - const struct ib_grh *grh, u8 port_num); + const struct ib_grh *grh, u32 port_num); /** * rdma_modify_ah - Modifies the address vector associated with an address @@ -4257,12 +4269,12 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id); -struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); -struct net_device *ib_device_netdev(struct ib_device *dev, u8 port); +struct net_device *ib_device_netdev(struct ib_device *dev, u32 port); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); @@ -4296,7 +4308,8 @@ void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); -int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u16 *speed, u8 *width); +int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, + u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { @@ -4364,12 +4377,12 @@ static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) return false; } -static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num) +static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num) { attr->port_num = port_num; } -static inline u8 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) +static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) { return attr->port_num; } @@ -4467,7 +4480,7 @@ void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src); * @port_num: Port number */ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, - u8 port_num) + u32 port_num) { if (rdma_protocol_roce(dev, port_num)) return RDMA_AH_ATTR_TYPE_ROCE; @@ -4539,12 +4552,12 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); -struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, +struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); -int rdma_init_netdev(struct ib_device *device, u8 port_num, +int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 32a67af18415..b2eed6d30543 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -107,7 +107,7 @@ struct rdma_cm_id { struct rdma_route route; enum rdma_ucm_port_space ps; enum ib_qp_type qp_type; - u8 port_num; + u32 port_num; }; struct rdma_cm_id * diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index e75cf9742e04..0295b22cd1cd 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -40,26 +40,26 @@ struct rdma_counter { struct rdma_counter_mode mode; struct mutex lock; struct rdma_hw_stats *stats; - u8 port; + u32 port; }; void rdma_counter_init(struct ib_device *dev); void rdma_counter_release(struct ib_device *dev); -int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, +int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, struct netlink_ext_ack *extack); -int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port); +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port); int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); int rdma_counter_query_stats(struct rdma_counter *counter); -u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index); -int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, +u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index); +int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id); -int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id); -int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, +int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id); -int rdma_counter_get_mode(struct ib_device *dev, u8 port, +int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 0af89cedfbf5..bf97d5d0dbf6 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -309,16 +309,16 @@ struct rvt_driver_provided { /* * Query driver for the state of the port. */ - int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num, + int (*query_port_state)(struct rvt_dev_info *rdi, u32 port_num, struct ib_port_attr *props); /* * Tell driver to shutdown a port */ - int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num); + int (*shut_down_port)(struct rvt_dev_info *rdi, u32 port_num); /* Tell driver to send a trap for changed port capabilities */ - void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num); + void (*cap_mask_chg)(struct rvt_dev_info *rdi, u32 port_num); /* * The following functions can be safely ignored completely. Any use of @@ -338,7 +338,7 @@ struct rvt_driver_provided { /* Let the driver pick the next queue pair number*/ int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, - enum ib_qp_type type, u8 port_num); + enum ib_qp_type type, u32 port_num); /* Determine if its safe or allowed to modify the qp */ int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, diff --git a/include/rdma/rw.h b/include/rdma/rw.h index 6ad9dc836c10..d606cac48233 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -42,29 +42,29 @@ struct rdma_rw_ctx { }; }; -int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir); -void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, - struct scatterlist *sg, u32 sg_cnt, - enum dma_data_direction dir); +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, + enum dma_data_direction dir); int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey, enum dma_data_direction dir); void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, enum dma_data_direction dir); struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr); -int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr); +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr); -unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, +unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages); void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); -- cgit v1.2.3 From 67196fea0fcef92b25608882f62f3985bc59f1fe Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:31 +0200 Subject: irqdomain: Introduce irq_domain_create_simple() API Linus Walleij pointed out that ird_domain_add_simple() gained additional functionality and can't be anymore replaced with a simple conditional. In preparation to upgrade GPIO library to use fwnode, introduce irq_domain_create_simple() API which is functional equivalent to the existing irq_domain_add_simple(), but takes a pointer to the struct fwnode_handle as a parameter. While at it, amend documentation to mention irq_domain_create_*() functions where it makes sense. Signed-off-by: Andy Shevchenko Acked-by: Marc Zyngier Signed-off-by: Bartosz Golaszewski --- include/linux/irqdomain.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 33cacc8af26d..1ad8d5328715 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -256,11 +256,11 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_simple(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data); +struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data); struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, @@ -325,6 +325,15 @@ static inline struct irq_domain *irq_find_host(struct device_node *node) return d; } +static inline struct irq_domain *irq_domain_add_simple(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) +{ + return irq_domain_create_simple(of_node_to_fwnode(of_node), size, first_irq, ops, host_data); +} + /** * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. -- cgit v1.2.3 From 2d93018fe67d42c44d65a898da2a6a5a0209b9ee Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 23 Mar 2021 15:19:05 -0700 Subject: gpiolib: some edits of kernel docs for clarity Fix a few typos and some punctuation. Also, change CONFIG_OF to CONFIG_OF_GPIO in one comment. Signed-off-by: Randy Dunlap Cc: Linus Walleij Cc: Bartosz Golaszewski Cc: linux-gpio@vger.kernel.org Reviewed-by: Linus Walleij [Bartosz: tweaked the commit message] Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 286de0520574..63283de9daeb 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -227,7 +227,7 @@ struct gpio_irq_chip { /** * @valid_mask: * - * If not %NULL holds bitmask of GPIOs which are valid to be included + * If not %NULL, holds bitmask of GPIOs which are valid to be included * in IRQ domain of the chip. */ unsigned long *valid_mask; @@ -346,7 +346,7 @@ struct gpio_irq_chip { * output. * * A gpio_chip can help platforms abstract various sources of GPIOs so - * they can all be accessed through a common programing interface. + * they can all be accessed through a common programming interface. * Example sources would be SOC controllers, FPGAs, multifunction * chips, dedicated GPIO expanders, and so on. * @@ -435,15 +435,15 @@ struct gpio_chip { /** * @valid_mask: * - * If not %NULL holds bitmask of GPIOs which are valid to be used + * If not %NULL, holds bitmask of GPIOs which are valid to be used * from the chip. */ unsigned long *valid_mask; #if defined(CONFIG_OF_GPIO) /* - * If CONFIG_OF is enabled, then all GPIO controllers described in the - * device tree automatically may have an OF translation + * If CONFIG_OF_GPIO is enabled, then all GPIO controllers described in + * the device tree automatically may have an OF translation */ /** @@ -508,7 +508,7 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, * for GPIOs will fail rudely. * * gpiochip_add_data() must only be called after gpiolib initialization, - * ie after core_initcall(). + * i.e. after core_initcall(). * * If gc->base is negative, this requests dynamic assignment of * a range of valid GPIOs. -- cgit v1.2.3 From aa43665aeeb3db66ad732d168b5d6450eb4c60db Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 22 Mar 2021 12:13:22 +0530 Subject: RDMA: Fix a typo s/struture/structure/ Link: https://lore.kernel.org/r/20210322064322.3933985-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_vt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index bf97d5d0dbf6..2dafd7dbe893 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -245,7 +245,7 @@ struct rvt_driver_provided { void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp); /* - * Init a struture allocated with qp_priv_alloc(). This should be + * Init a structure allocated with qp_priv_alloc(). This should be * called after all qp fields have been initialized in rdmavt. */ int (*qp_priv_init)(struct rvt_dev_info *rdi, struct rvt_qp *qp, -- cgit v1.2.3 From 8b638081bd4520f63db1defc660666ec5f65bc15 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 12 Mar 2021 09:07:30 -0500 Subject: dm ioctl: return UUID in DM_LIST_DEVICES_CMD result When LVM needs to find a device with a particular UUID it needs to ask for UUID for each device. This patch returns UUID directly in the list of devices, so that LVM doesn't have to query all the devices with an ioctl. The UUID is returned if the flag DM_UUID_FLAG is set in the parameters. Returning UUID is done in backward-compatible way. There's one unused 32-bit word value after the event number. This patch sets the bit DM_NAME_LIST_FLAG_HAS_UUID if UUID is present and DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID if it isn't (if none of these bits is set, then we have an old kernel that doesn't support returning UUIDs). The UUID is stored after this word. The 'next' value is updated to point after the UUID, so that old version of libdevmapper will skip the UUID without attempting to interpret it. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- include/uapi/linux/dm-ioctl.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index fcff6669137b..e5c6e458bdf7 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -193,8 +193,22 @@ struct dm_name_list { __u32 next; /* offset to the next record from the _start_ of this */ char name[0]; + + /* + * The following members can be accessed by taking a pointer that + * points immediately after the terminating zero character in "name" + * and aligning this pointer to next 8-byte boundary. + * Uuid is present if the flag DM_NAME_LIST_FLAG_HAS_UUID is set. + * + * __u32 event_nr; + * __u32 flags; + * char uuid[0]; + */ }; +#define DM_NAME_LIST_FLAG_HAS_UUID 1 +#define DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID 2 + /* * Used to retrieve the target versions */ @@ -272,9 +286,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 44 +#define DM_VERSION_MINOR 45 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2021-02-01)" +#define DM_VERSION_EXTRA "-ioctl (2021-03-22)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 9c7d24693d864f90b27aad5d15fbfe226c02898b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 24 Mar 2021 09:19:02 +0100 Subject: gpio: guard gpiochip_irqchip_add_domain() with GPIOLIB_IRQCHIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current code doesn't check if GPIOLIB_IRQCHIP is enabled, which results in a compilation error when trying to build gpio-regmap if CONFIG_GPIOLIB_IRQCHIP isn't enabled. Fixes: 6a45b0e2589f ("gpiolib: Introduce gpiochip_irqchip_add_domain()") Suggested-by: Michael Walle Signed-off-by: Álvaro Fernández Rojas Reviewed-by: Linus Walleij Reviewed-by: Michael Walle Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20210324081923.20379-2-noltari@gmail.com Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 286de0520574..ecf0032a0995 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -624,8 +624,17 @@ void gpiochip_irq_domain_deactivate(struct irq_domain *domain, bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, unsigned int offset); +#ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); +#else +static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc, + struct irq_domain *domain) +{ + WARN_ON(1); + return -EINVAL; +} +#endif int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset); void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset); -- cgit v1.2.3 From d46bf9ec4596654f36245e3b14765bcb422be6ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 24 Mar 2021 09:19:03 +0100 Subject: gpio: regmap: set gpio_chip of_node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is needed for properly registering GPIO regmap as a child of a regmap pin controller. Signed-off-by: Álvaro Fernández Rojas Reviewed-by: Michael Walle Reviewed-by: Andy Shevchenko Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20210324081923.20379-3-noltari@gmail.com Signed-off-by: Linus Walleij --- include/linux/gpio/regmap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index ad76f3d0a6ba..334dd928042b 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -4,6 +4,7 @@ #define _LINUX_GPIO_REGMAP_H struct device; +struct fwnode_handle; struct gpio_regmap; struct irq_domain; struct regmap; @@ -16,6 +17,8 @@ struct regmap; * @parent: The parent device * @regmap: The regmap used to access the registers * given, the name of the device is used + * @fwnode: (Optional) The firmware node. + * If not given, the fwnode of the parent is used. * @label: (Optional) Descriptive name for GPIO controller. * If not given, the name of the device is used. * @ngpio: Number of GPIOs @@ -57,6 +60,7 @@ struct regmap; struct gpio_regmap_config { struct device *parent; struct regmap *regmap; + struct fwnode_handle *fwnode; const char *label; int ngpio; -- cgit v1.2.3 From 6e085e0ac9cf16298b5fefe0b1893f98ef765812 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2020 14:09:24 +0800 Subject: arm/arm64: Probe for the presence of KVM hypervisor Although the SMCCC specification provides some limited functionality for describing the presence of hypervisor and firmware services, this is generally applicable only to functions designated as "Arm Architecture Service Functions" and no portable discovery mechanism is provided for standard hypervisor services, despite having a designated range of function identifiers reserved by the specification. In an attempt to avoid the need for additional firmware changes every time a new function is added, introduce a UID to identify the service provider as being compatible with KVM. Once this has been established, additional services can be discovered via a feature bitmap. Reviewed-by: Steven Price Signed-off-by: Will Deacon Signed-off-by: Jianyong Wu [maz: move code to its own file, plug it into PSCI] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-2-jianyong.wu@arm.com --- include/linux/arm-smccc.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 62c54234576c..1a27bd9493fe 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -55,6 +55,8 @@ #define ARM_SMCCC_OWNER_TRUSTED_OS 50 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63 +#define ARM_SMCCC_FUNC_QUERY_CALL_UID 0xff01 + #define ARM_SMCCC_QUIRK_NONE 0 #define ARM_SMCCC_QUIRK_QCOM_A6 1 /* Save/restore register a6 */ @@ -87,6 +89,29 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_FUNC_QUERY_CALL_UID) + +/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */ +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU + +/* KVM "vendor specific" services */ +#define ARM_SMCCC_KVM_FUNC_FEATURES 0 +#define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 +#define ARM_SMCCC_KVM_NUM_FUNCS 128 + +#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_FEATURES) + #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 /* Paravirtualised time calls (defined by ARM DEN0057A) */ -- cgit v1.2.3 From 1fd3dde5e270ad08f1406f921c9a2cda154fcea9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 30 Mar 2021 12:43:16 -0500 Subject: PCI: Add pci_disable_parity() Add pci_disable_parity() to disable reporting of parity errors for a device by clearing PCI_COMMAND_PARITY. The device will still set PCI_STATUS_DETECTED_PARITY when it detects a parity error or receives a Poisoned TLP, but it will not set PCI_STATUS_PARITY, which means it will not assert PERR# (conventional PCI) or report Poisoned TLPs (PCIe). Based-on: https://lore.kernel.org/linux-arm-kernel/d375987c-ea4f-dd98-4ef8-99b2fbfe7c33@gmail.com/ Based-on-patch-by: Heiner Kallweit Link: https://lore.kernel.org/r/20210330174318.1289680-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..4eaa773115da 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1201,6 +1201,7 @@ int __must_check pci_set_mwi(struct pci_dev *dev); int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); +void pci_disable_parity(struct pci_dev *dev); void pci_intx(struct pci_dev *dev, int enable); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); -- cgit v1.2.3 From 26dbc7e299c7ebbb6a95e2c620b21b5280b37c57 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Thu, 18 Mar 2021 14:33:08 +0000 Subject: bug: Factor out a getter for a bug's file line There is some non-trivial config-based logic to get the file name and line number associated with a bug. Factor this out to a getter that can be resused. Signed-off-by: Andrew Scull Cc: Peter Zijlstra Cc: "Steven Rostedt (VMware)" Reviewed-by: Steven Rostedt (VMware) Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210318143311.839894-3-ascull@google.com --- include/linux/bug.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index f639bd0122f3..e3841bee4c8d 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -36,6 +36,9 @@ static inline int is_warning_bug(const struct bug_entry *bug) return bug->flags & BUGFLAG_WARNING; } +void bug_get_file_line(struct bug_entry *bug, const char **file, + unsigned int *line); + struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); -- cgit v1.2.3 From 1decdb335c366fc0a1bae0db55c138c613cc9a1f Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 30 Mar 2021 11:40:55 +0800 Subject: tracing: Remove duplicate struct declaration in trace_events.h struct trace_array is declared twice. One has been declared at forward declaration. Remove the duplicate. Link: https://lkml.kernel.org/r/20210330034056.2266969-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 36e27c1f42e0..ad413b382a3c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -404,7 +404,6 @@ trace_get_fields(struct trace_event_call *event_call) return event_call->class->get_fields(event_call); } -struct trace_array; struct trace_subsystem_dir; enum { -- cgit v1.2.3 From f3ef7202ef7c705d640d1aeec3b286a641ac9186 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Mon, 29 Mar 2021 16:03:31 +0300 Subject: tracing: Remove unused argument from "ring_buffer_time_stamp() The "cpu" parameter is not being used by the function. Link: https://lkml.kernel.org/r/20210329130331.199402-1-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 057b7ed4fe24..dac53fd3afea 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -181,7 +181,7 @@ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cp unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu); unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu); +u64 ring_buffer_time_stamp(struct trace_buffer *buffer); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts); void ring_buffer_set_clock(struct trace_buffer *buffer, -- cgit v1.2.3 From dbb3e9db8267dd8979b39bb15d70887ad0699e2c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 1 Apr 2021 10:10:28 +0800 Subject: RDMA/uverbs: Fix -Wunused-function warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit make W=1 warns this: In file included from drivers/infiniband/sw/rdmavt/mmap.c:51:0: ./include/rdma/uverbs_ioctl.h:937:1: warning: ‘_uverbs_get_const_unsigned’ defined but not used [-Wunused-function] _uverbs_get_const_unsigned(u64 *to, ^~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/rdma/uverbs_ioctl.h:930:1: warning: ‘_uverbs_get_const_signed’ defined but not used [-Wunused-function] _uverbs_get_const_signed(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, ^~~~~~~~~~~~~~~~~~~~~~~~ Make these functions inline to fix this warnings. Fixes: 2904bb37b35d ("IB/core: Split uverbs_get_const/default to consider target type") Link: https://lore.kernel.org/r/20210401021028.25720-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_ioctl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 3829b6ef4bb6..23bb404aba12 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -926,14 +926,15 @@ uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, { return -EINVAL; } -static int -_uverbs_get_const_signed(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, +static inline int +_uverbs_get_const_signed(s64 *to, + const struct uverbs_attr_bundle *attrs_bundle, size_t idx, s64 lower_bound, u64 upper_bound, s64 *def_val) { return -EINVAL; } -static int +static inline int _uverbs_get_const_unsigned(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, u64 upper_bound, u64 *def_val) -- cgit v1.2.3 From a7f3d3d3600c8ed119eb0d2483de0062ce2e3707 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 26 Mar 2021 22:03:05 +0100 Subject: dma-mapping: add unlikely hint to error path in dma_mapping_error Zillions of drivers use the unlikely() hint when checking the result of dma_mapping_error(). This is an inline function anyway, so we can move the hint into the function and remove it from drivers over time. Signed-off-by: Heiner Kallweit Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index e9d19b974f26..183e7103a66d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -95,7 +95,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); - if (dma_addr == DMA_MAPPING_ERROR) + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; } -- cgit v1.2.3 From d699ae4fc27496d01e8bc5ab2106bd79d1e7be92 Mon Sep 17 00:00:00 2001 From: Alexander Lochmann Date: Thu, 11 Feb 2021 10:51:55 +0100 Subject: ext4: updated locking documentation for journal_t Some members of transaction_t are allowed to be read without any lock being held if consistency doesn't matter. Based on LockDoc's findings, we extended the locking documentation of those members. Each one of them is marked with a short comment: "no lock for quick racy checks". Signed-off-by: Alexander Lochmann Signed-off-by: Horst Schirmeier Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/ad82c7a9-a624-4ed5-5ada-a6410c44c0b3@tu-dortmund.de Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 99d3cd051ac3..74710bb231a6 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -768,7 +768,8 @@ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; struct journal_s { /** - * @j_flags: General journaling state flags [j_state_lock] + * @j_flags: General journaling state flags [j_state_lock, + * no lock for quick racy checks] */ unsigned long j_flags; @@ -808,7 +809,8 @@ struct journal_s /** * @j_barrier_count: * - * Number of processes waiting to create a barrier lock [j_state_lock] + * Number of processes waiting to create a barrier lock [j_state_lock, + * no lock for quick racy checks] */ int j_barrier_count; @@ -821,7 +823,8 @@ struct journal_s * @j_running_transaction: * * Transactions: The current running transaction... - * [j_state_lock] [caller holding open handle] + * [j_state_lock, no lock for quick racy checks] [caller holding + * open handle] */ transaction_t *j_running_transaction; @@ -1033,7 +1036,7 @@ struct journal_s * @j_commit_sequence: * * Sequence number of the most recently committed transaction - * [j_state_lock]. + * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; @@ -1041,7 +1044,7 @@ struct journal_s * @j_commit_request: * * Sequence number of the most recent transaction wanting commit - * [j_state_lock] + * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; -- cgit v1.2.3 From 3042b1b45c4106feff063932d4fd481c5009dbe1 Mon Sep 17 00:00:00 2001 From: Alexander Lochmann Date: Thu, 11 Feb 2021 18:14:10 +0100 Subject: Updated locking documentation for transaction_t Some members of transaction_t are allowed to be read without any lock being held if accessed from the correct context. We used LockDoc's findings to determine those members. Each member of them is marked with a short comment: "no lock needed for jbd2 thread". Signed-off-by: Alexander Lochmann Signed-off-by: Horst Schirmeier Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210211171410.17984-1-alexander.lochmann@tu-dortmund.de Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 74710bb231a6..b9aa85081a40 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -594,18 +594,22 @@ struct transaction_s */ unsigned long t_log_start; - /* Number of buffers on the t_buffers list [j_list_lock] */ + /* + * Number of buffers on the t_buffers list [j_list_lock, no locks + * needed for jbd2 thread] + */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet - * modified by this transaction [j_list_lock] + * modified by this transaction [j_list_lock, no locks needed fo + * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this - * transaction [j_list_lock] + * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; @@ -629,9 +633,11 @@ struct transaction_s struct journal_head *t_checkpoint_io_list; /* - * Doubly-linked circular list of metadata buffers being shadowed by log - * IO. The IO buffers on the iobuf list and the shadow buffers on this - * list match each other one for one at all times. [j_list_lock] + * Doubly-linked circular list of metadata buffers being + * shadowed by log IO. The IO buffers on the iobuf list and + * the shadow buffers on this list match each other one for + * one at all times. [j_list_lock, no locks needed for jbd2 + * thread] */ struct journal_head *t_shadow_list; -- cgit v1.2.3 From 6b3caab4ba9b2d290162e610810a946a33c65117 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Tue, 16 Feb 2021 14:16:34 -0500 Subject: ext4: delete some unused tracepoint definitions A number of tracepoint instances have been removed from ext4 by past patches but the definitions of those tracepoints have not. All instances of ext4_ext_in_cache and ext4_ext_put_in_cache were removed by commit 69eb33dc24dc ("ext4: remove single extent cache"). ext4_get_reserved_cluster_alloc was removed by commit b6bf9171ef5c ("ext4: reduce reserved cluster count by number of allocated clusters"). ext4_find_delalloc_range was removed by commit 7d1b1fbc95eb ("ext4: reimplement ext4_find_delay_alloc_range on extent status tree"). All instances of ext4_direct_IO_enter and ext4_direct_IO_exit were removed by commit 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure"). Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20210216191634.20957-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 176 -------------------------------------------- 1 file changed, 176 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 70ae5497b73a..0ea36b2b0662 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1358,64 +1358,6 @@ TRACE_EVENT(ext4_read_block_bitmap_load, __entry->group, __entry->prefetch) ); -TRACE_EVENT(ext4_direct_IO_enter, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), - - TP_ARGS(inode, offset, len, rw), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, pos ) - __field( unsigned long, len ) - __field( int, rw ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = offset; - __entry->len = len; - __entry->rw = rw; - ), - - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->pos, __entry->len, __entry->rw) -); - -TRACE_EVENT(ext4_direct_IO_exit, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, - int rw, int ret), - - TP_ARGS(inode, offset, len, rw, ret), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, pos ) - __field( unsigned long, len ) - __field( int, rw ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = offset; - __entry->len = len; - __entry->rw = rw; - __entry->ret = ret; - ), - - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->pos, __entry->len, - __entry->rw, __entry->ret) -); - DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), @@ -1962,124 +1904,6 @@ TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, __entry->len, show_mflags(__entry->flags), __entry->ret) ); -TRACE_EVENT(ext4_ext_put_in_cache, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, - ext4_fsblk_t start), - - TP_ARGS(inode, lblk, len, start), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( unsigned int, len ) - __field( ext4_fsblk_t, start ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->len = len; - __entry->start = start; - ), - - TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->lblk, - __entry->len, - (unsigned long long) __entry->start) -); - -TRACE_EVENT(ext4_ext_in_cache, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), - - TP_ARGS(inode, lblk, ret), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->ret = ret; - ), - - TP_printk("dev %d,%d ino %lu lblk %u ret %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->lblk, - __entry->ret) - -); - -TRACE_EVENT(ext4_find_delalloc_range, - TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, - int reverse, int found, ext4_lblk_t found_blk), - - TP_ARGS(inode, from, to, reverse, found, found_blk), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, from ) - __field( ext4_lblk_t, to ) - __field( int, reverse ) - __field( int, found ) - __field( ext4_lblk_t, found_blk ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->from = from; - __entry->to = to; - __entry->reverse = reverse; - __entry->found = found; - __entry->found_blk = found_blk; - ), - - TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " - "(blk = %u)", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->from, (unsigned) __entry->to, - __entry->reverse, __entry->found, - (unsigned) __entry->found_blk) -); - -TRACE_EVENT(ext4_get_reserved_cluster_alloc, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), - - TP_ARGS(inode, lblk, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( unsigned int, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu lblk %u len %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->lblk, - __entry->len) -); - TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), -- cgit v1.2.3 From d737e5d418706abf32f6de68c3e09958516d422f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Feb 2021 16:04:15 -0500 Subject: SUNRPC: Set TCP_CORK until the transmit queue is empty When we have multiple RPC requests queued up, it makes sense to set the TCP_CORK option while the transmit queue is non-empty. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d2e97ee802af..d81fe8b364d0 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -247,6 +247,7 @@ struct rpc_xprt { struct rpc_task * snd_task; /* Task blocked in send */ struct list_head xmit_queue; /* Send queue */ + atomic_long_t xmit_queuelen; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) -- cgit v1.2.3 From 547b60988e631f74ed025cf1ec50cfc17f49fd13 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 5 Apr 2021 17:42:48 +0100 Subject: perf: aux: Add flags for the buffer format Allocate a byte for advertising the PMU specific format type of the given AUX record. A PMU could end up providing hardware trace data in multiple format in a single session. e.g, The format of hardware buffer produced by CoreSight ETM PMU depends on the type of the "sink" device used for collection for an event (Traditional TMC-ETR/Bs with formatting or TRBEs without any formatting). # Boring story of why this is needed. Goto The_End_of_Story for skipping. CoreSight ETM trace allows instruction level tracing of Arm CPUs. The ETM generates the CPU excecution trace and pumps it into CoreSight AMBA Trace Bus and is collected by a different CoreSight component (traditionally CoreSight TMC-ETR /ETB/ETF), called "sink". Important to note that there is no guarantee that every CPU has a dedicated sink. Thus multiple ETMs could pump the trace data into the same "sink" and thus they apply additional formatting of the trace data for the user to decode it properly and attribute the trace data to the corresponding ETM. However, with the introduction of Arm Trace buffer Extensions (TRBE), we now have a dedicated per-CPU architected sink for collecting the trace. Since the TRBE is always per-CPU, it doesn't apply any formatting of the trace. The support for this driver is under review [1]. Now a system could have a per-cpu TRBE and one or more shared TMC-ETRs on the system. A user could choose a "specific" sink for a perf session (e.g, a TMC-ETR) or the driver could automatically select the nearest sink for a given ETM. It is possible that some ETMs could end up using TMC-ETR (e.g, if the TRBE is not usable on the CPU) while the others using TRBE in a single perf session. Thus we now have "formatted" trace collected from TMC-ETR and "unformatted" trace collected from TRBE. However, we don't get into a situation where a single event could end up using TMC-ETR & TRBE. i.e, any AUX buffer is guaranteed to be either RAW or FORMATTED, but not a mix of both. As for perf decoding, we need to know the type of the data in the individual AUX buffers, so that it can set up the "OpenCSD" (library for decoding CoreSight trace) decoder instance appropriately. Thus the perf.data file must conatin the hints for the tool to decode the data correctly. Since this is a runtime variable, and perf tool doesn't have a control on what sink gets used (in case of automatic sink selection), we need this information made available from the PMU driver for each AUX record. # The_End_of_Story Cc: Peter Ziljstra Cc: alexander.shishkin@linux.intel.com Cc: mingo@redhat.com Cc: will@kernel.org Cc: mark.rutland@arm.com Cc: mike.leach@linaro.org Cc: acme@kernel.org Cc: jolsa@redhat.com Cc: Mathieu Poirier Reviewed by: Mike Leach Acked-by: Peter Ziljstra Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20210405164307.1720226-2-suzuki.poulose@arm.com Signed-off-by: Mathieu Poirier --- include/uapi/linux/perf_event.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ad15e40d7f5d..f006eeab6f0e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1156,10 +1156,11 @@ enum perf_callchain_context { /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) -- cgit v1.2.3 From 7dde51767ca5339ed33109056d92fdca05d56d8d Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 5 Apr 2021 17:42:49 +0100 Subject: perf: aux: Add CoreSight PMU buffer formats CoreSight PMU supports aux-buffer for the ETM tracing. The trace generated by the ETM (associated with individual CPUs, like Intel PT) is captured by a separate IP (CoreSight TMC-ETR/ETF until now). The TMC-ETR applies formatting of the raw ETM trace data, as it can collect traces from multiple ETMs, with the TraceID to indicate the source of a given trace packet. Arm Trace Buffer Extension is new "sink" IP, attached to individual CPUs and thus do not provide additional formatting, like TMC-ETR. Additionally, a system could have both TRBE *and* TMC-ETR for the trace collection. e.g, TMC-ETR could be used as a single trace buffer to collect data from multiple ETMs to correlate the traces from different CPUs. It is possible to have a perf session where some events end up collecting the trace in TMC-ETR while the others in TRBE. Thus we need a way to identify the type of the trace for each AUX record. Define the trace formats exported by the CoreSight PMU. We don't define the flags following the "ETM" as this information is available to the user when issuing the session. What is missing is the additional formatting applied by the "sink" which is decided at the runtime and the user may not have a control on. So we define : - CORESIGHT format (indicates the Frame format) - RAW format (indicates the format of the source) The default value is CORESIGHT format for all the records (i,e == 0). Add the RAW format for others that use raw format. Cc: Peter Zijlstra Cc: Mike Leach Cc: Mathieu Poirier Cc: Leo Yan Cc: Anshuman Khandual Reviewed-by: Mike Leach Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20210405164307.1720226-3-suzuki.poulose@arm.com Signed-off-by: Mathieu Poirier --- include/uapi/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index f006eeab6f0e..63971eaef127 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1162,6 +1162,10 @@ enum perf_callchain_context { #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -- cgit v1.2.3 From aca01415e076aa96cca0f801f4420ee5c10c660d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Wed, 31 Mar 2021 19:19:20 +0000 Subject: i2c: Add I2C_AQ_NO_REP_START adapter quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This quirk signifies that the adapter cannot do a repeated START, it always issues a STOP condition after transfers. Suggested-by: Wolfram Sang Signed-off-by: Bence Csókás Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 56622658b215..a670ae129f4b 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -687,6 +687,8 @@ struct i2c_adapter_quirks { #define I2C_AQ_NO_ZERO_LEN_READ BIT(5) #define I2C_AQ_NO_ZERO_LEN_WRITE BIT(6) #define I2C_AQ_NO_ZERO_LEN (I2C_AQ_NO_ZERO_LEN_READ | I2C_AQ_NO_ZERO_LEN_WRITE) +/* adapter cannot do repeated START */ +#define I2C_AQ_NO_REP_START BIT(7) /* * i2c_adapter is the structure used to identify a physical i2c bus along -- cgit v1.2.3 From d556435156b7970b8ce61b355df558a5168927cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Mar 2021 11:21:38 +0100 Subject: jbd2: avoid -Wempty-body warnings Building with 'make W=1' shows a harmless -Wempty-body warning: fs/jbd2/recovery.c: In function 'fc_do_one_pass': fs/jbd2/recovery.c:267:75: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 267 | jbd_debug(3, "Fast commit replay failed, err = %d\n", err); | ^ Change the empty dprintk() macros to no_printk(), which avoids this warning and adds format string checking. Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210322102152.95684-1-arnd@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b9aa85081a40..db0e1920cb12 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -61,7 +61,7 @@ void __jbd2_debug(int level, const char *file, const char *func, #define jbd_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else -#define jbd_debug(n, fmt, a...) /**/ +#define jbd_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); -- cgit v1.2.3 From 28e9d4bce3be9b8fec6c854f87923db99c8fb874 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 5 Apr 2021 18:39:40 +0200 Subject: KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace Commit 23bde34771f1 ("KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace") temporarily fixed a bug identified when attempting to access the GICR_TYPER register before the redistributor region setting, but dropped the support of the LAST bit. Emulating the GICR_TYPER.Last bit still makes sense for architecture compliance though. This patch restores its support (if the redistributor region was set) while keeping the code safe. We introduce a new helper, vgic_mmio_vcpu_rdist_is_last() which computes whether a redistributor is the highest one of a series of redistributor contributor pages. With this new implementation we do not need to have a uaccess read accessor anymore. Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210405163941.510258-9-eric.auger@redhat.com --- include/kvm/arm_vgic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 3d74f1060bd1..ec621180ef09 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -322,6 +322,7 @@ struct vgic_cpu { */ struct vgic_io_device rd_iodev; struct vgic_redist_region *rdreg; + u32 rdreg_index; /* Contains the attributes and gpa of the LPI pending tables. */ u64 pendbaser; -- cgit v1.2.3 From 2cd87a7b293dedbbaea3b6739f95d428a2d9890d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 5 Apr 2021 17:43:03 +0100 Subject: coresight: core: Add support for dedicated percpu sinks Add support for dedicated sinks that are bound to individual CPUs. (e.g, TRBE). To allow quicker access to the sink for a given CPU bound source, keep a percpu array of the sink devices. Also, add support for building a path to the CPU local sink from the ETM. This adds a new percpu sink type CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM. This new sink type is exclusively available and can only work with percpu source type device CORESIGHT_DEV_SUBTYPE_SOURCE_PROC. This defines a percpu structure that accommodates a single coresight_device which can be used to store an initialized instance from a sink driver. As these sinks are exclusively linked and dependent on corresponding percpu sources devices, they should also be the default sink device during a perf session. Outwards device connections are scanned while establishing paths between a source and a sink device. But such connections are not present for certain percpu source and sink devices which are exclusively linked and dependent. Build the path directly and skip connection scanning for such devices. Cc: Mathieu Poirier Cc: Mike Leach Cc: Suzuki K Poulose Tested-by: Suzuki K Poulose Reviewed-by: Mike Leach Signed-off-by: Anshuman Khandual [Moved the set/get percpu sink APIs from TRBE patch to here Fixed build break on arm32] Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20210405164307.1720226-17-suzuki.poulose@arm.com Signed-off-by: Mathieu Poirier --- include/linux/coresight.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 976ec2697610..85008a65e21f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -50,6 +50,7 @@ enum coresight_dev_subtype_sink { CORESIGHT_DEV_SUBTYPE_SINK_PORT, CORESIGHT_DEV_SUBTYPE_SINK_BUFFER, CORESIGHT_DEV_SUBTYPE_SINK_SYSMEM, + CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM, }; enum coresight_dev_subtype_link { @@ -455,6 +456,18 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o } #endif /* CONFIG_64BIT */ +static inline bool coresight_is_percpu_source(struct coresight_device *csdev) +{ + return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) && + (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC); +} + +static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) +{ + return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SINK) && + (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM); +} + extern struct coresight_device * coresight_register(struct coresight_desc *desc); extern void coresight_unregister(struct coresight_device *csdev); -- cgit v1.2.3 From 117bfa8d5d4cb50556a59381d0f10fe762c1cd28 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:05:56 +0800 Subject: iommu/vt-d: Remove unused dma map/unmap trace events With commit c588072bba6b5 ("iommu/vt-d: Convert intel iommu driver to the iommu ops"), the trace events for dma map/unmap have no users any more. Cleanup them to make the code neat. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/trace/events/intel_iommu.h | 120 ------------------------------------- 1 file changed, 120 deletions(-) (limited to 'include') diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index e801f4910522..d233f2916584 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -15,126 +15,6 @@ #include #include -DECLARE_EVENT_CLASS(dma_map, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, - size_t size), - - TP_ARGS(dev, dev_addr, phys_addr, size), - - TP_STRUCT__entry( - __string(dev_name, dev_name(dev)) - __field(dma_addr_t, dev_addr) - __field(phys_addr_t, phys_addr) - __field(size_t, size) - ), - - TP_fast_assign( - __assign_str(dev_name, dev_name(dev)); - __entry->dev_addr = dev_addr; - __entry->phys_addr = phys_addr; - __entry->size = size; - ), - - TP_printk("dev=%s dev_addr=0x%llx phys_addr=0x%llx size=%zu", - __get_str(dev_name), - (unsigned long long)__entry->dev_addr, - (unsigned long long)__entry->phys_addr, - __entry->size) -); - -DEFINE_EVENT(dma_map, map_single, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, - size_t size), - TP_ARGS(dev, dev_addr, phys_addr, size) -); - -DEFINE_EVENT(dma_map, bounce_map_single, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, - size_t size), - TP_ARGS(dev, dev_addr, phys_addr, size) -); - -DECLARE_EVENT_CLASS(dma_unmap, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size), - - TP_ARGS(dev, dev_addr, size), - - TP_STRUCT__entry( - __string(dev_name, dev_name(dev)) - __field(dma_addr_t, dev_addr) - __field(size_t, size) - ), - - TP_fast_assign( - __assign_str(dev_name, dev_name(dev)); - __entry->dev_addr = dev_addr; - __entry->size = size; - ), - - TP_printk("dev=%s dev_addr=0x%llx size=%zu", - __get_str(dev_name), - (unsigned long long)__entry->dev_addr, - __entry->size) -); - -DEFINE_EVENT(dma_unmap, unmap_single, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size), - TP_ARGS(dev, dev_addr, size) -); - -DEFINE_EVENT(dma_unmap, unmap_sg, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size), - TP_ARGS(dev, dev_addr, size) -); - -DEFINE_EVENT(dma_unmap, bounce_unmap_single, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size), - TP_ARGS(dev, dev_addr, size) -); - -DECLARE_EVENT_CLASS(dma_map_sg, - TP_PROTO(struct device *dev, int index, int total, - struct scatterlist *sg), - - TP_ARGS(dev, index, total, sg), - - TP_STRUCT__entry( - __string(dev_name, dev_name(dev)) - __field(dma_addr_t, dev_addr) - __field(phys_addr_t, phys_addr) - __field(size_t, size) - __field(int, index) - __field(int, total) - ), - - TP_fast_assign( - __assign_str(dev_name, dev_name(dev)); - __entry->dev_addr = sg->dma_address; - __entry->phys_addr = sg_phys(sg); - __entry->size = sg->dma_length; - __entry->index = index; - __entry->total = total; - ), - - TP_printk("dev=%s [%d/%d] dev_addr=0x%llx phys_addr=0x%llx size=%zu", - __get_str(dev_name), __entry->index, __entry->total, - (unsigned long long)__entry->dev_addr, - (unsigned long long)__entry->phys_addr, - __entry->size) -); - -DEFINE_EVENT(dma_map_sg, map_sg, - TP_PROTO(struct device *dev, int index, int total, - struct scatterlist *sg), - TP_ARGS(dev, index, total, sg) -); - -DEFINE_EVENT(dma_map_sg, bounce_map_sg, - TP_PROTO(struct device *dev, int index, int total, - struct scatterlist *sg), - TP_ARGS(dev, index, total, sg) -); - TRACE_EVENT(qi_submit, TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3), -- cgit v1.2.3 From 2e1a44c1c4acf209c0dd7bc04421d101b9e80d11 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:05:57 +0800 Subject: iommu/vt-d: Remove svm_dev_ops The svm_dev_ops has never been referenced in the tree, and there's no plan to have anything to use it. Remove it to make the code neat. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 3 --- include/linux/intel-svm.h | 7 ------- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1732298ce888..e0f8c2ade3e8 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -769,14 +769,11 @@ u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); -struct svm_dev_ops; - struct intel_svm_dev { struct list_head list; struct rcu_head rcu; struct device *dev; struct intel_iommu *iommu; - struct svm_dev_ops *ops; struct iommu_sva sva; u32 pasid; int users; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 39d368a810b8..6c9d10c0fb1e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -8,13 +8,6 @@ #ifndef __INTEL_SVM_H__ #define __INTEL_SVM_H__ -struct device; - -struct svm_dev_ops { - void (*fault_cb)(struct device *dev, u32 pasid, u64 address, - void *private, int rwxp, int response); -}; - /* Values for rxwp in fault_cb callback */ #define SVM_REQ_READ (1<<3) #define SVM_REQ_WRITE (1<<2) -- cgit v1.2.3 From 06905ea8319731036695cf1a4c53c12b0f9373cb Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Mar 2021 09:05:58 +0800 Subject: iommu/vt-d: Remove SVM_FLAG_PRIVATE_PASID The SVM_FLAG_PRIVATE_PASID has never been referenced in the tree, and there's no plan to have anything to use it. So cleanup it. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323010600.678627-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 6c9d10c0fb1e..10fa80eef13a 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -14,16 +14,6 @@ #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) -/* - * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" - * PASID for the current process. Even if a PASID already exists, a new one - * will be allocated. And the PASID allocated with SVM_FLAG_PRIVATE_PASID - * will not be given to subsequent callers. This facility allows a driver to - * disambiguate between multiple device contexts which access the same MM, - * if there is no other way to do so. It should be used sparingly, if at all. - */ -#define SVM_FLAG_PRIVATE_PASID (1<<0) - /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only * for access to kernel addresses. No IOTLB flushes are automatically done @@ -35,18 +25,18 @@ * It is unlikely that we will ever hook into flush_tlb_kernel_range() to * do such IOTLB flushes automatically. */ -#define SVM_FLAG_SUPERVISOR_MODE (1<<1) +#define SVM_FLAG_SUPERVISOR_MODE BIT(0) /* * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest * processes. Compared to the host bind, the primary differences are: * 1. mm life cycle management * 2. fault reporting */ -#define SVM_FLAG_GUEST_MODE (1<<2) +#define SVM_FLAG_GUEST_MODE BIT(1) /* * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, * which requires guest and host PASID translation at both directions. */ -#define SVM_FLAG_GUEST_PASID (1<<3) +#define SVM_FLAG_GUEST_PASID BIT(2) #endif /* __INTEL_SVM_H__ */ -- cgit v1.2.3 From 3431c3f660a39f6ced954548a59dba6541ce3eb1 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 25 Mar 2021 11:38:24 +0800 Subject: iommu: Fix a boundary issue to avoid performance drop After the change of patch ("iommu: Switch gather->end to the inclusive end"), the performace drops from 1600+K IOPS to 1200K in our kunpeng ARM64 platform. We find that the range [start1, end1) actually is joint from the range [end1, end2), but it is considered as disjoint after the change, so it needs more times of TLB sync, and spends more time on it. So fix the boundary issue to avoid performance drop. Fixes: 862c3715de8f ("iommu: Switch gather->end to the inclusive end") Signed-off-by: Xiang Chen Acked-by: Will Deacon Link: https://lore.kernel.org/r/1616643504-120688-1-git-send-email-chenxiang66@hisilicon.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5e7fe519430a..9ca6e6b8084d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -547,7 +547,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, * structure can be rewritten. */ if (gather->pgsize != size || - end < gather->start || start > gather->end) { + end + 1 < gather->start || start > gather->end + 1) { if (gather->pgsize) iommu_iotlb_sync(domain, gather); gather->pgsize = size; -- cgit v1.2.3 From f598a497bc7dfbec60270bca8b8408db3d23ac07 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 25 Mar 2021 20:29:58 +0800 Subject: iova: Add CPU hotplug handler to flush rcaches Like the Intel IOMMU driver already does, flush the per-IOVA domain CPU rcache when a CPU goes offline - there's no point in keeping it. Reviewed-by: Robin Murphy Signed-off-by: John Garry Link: https://lore.kernel.org/r/1616675401-151997-2-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- include/linux/cpuhotplug.h | 1 + include/linux/iova.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f14adb882338..cedac9986557 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -58,6 +58,7 @@ enum cpuhp_state { CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, + CPUHP_IOMMU_IOVA_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, diff --git a/include/linux/iova.h b/include/linux/iova.h index c834c01c0a5b..4be6c0ab4997 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -95,6 +95,7 @@ struct iova_domain { flush-queues */ atomic_t fq_timer_on; /* 1 when timer is active, 0 when not */ + struct hlist_node cpuhp_dead; }; static inline unsigned long iova_size(struct iova *iova) -- cgit v1.2.3 From 363f266eeff6e22a09483dc922dccd7cd0b9fe9c Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 25 Mar 2021 20:29:59 +0800 Subject: iommu/vt-d: Remove IOVA domain rcache flushing for CPU offlining Now that the core code handles flushing per-IOVA domain CPU rcaches, remove the handling here. Reviewed-by: Lu Baolu Signed-off-by: John Garry Link: https://lore.kernel.org/r/1616675401-151997-3-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index cedac9986557..85996494bec1 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -57,7 +57,6 @@ enum cpuhp_state { CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, - CPUHP_IOMMU_INTEL_DEAD, CPUHP_IOMMU_IOVA_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, -- cgit v1.2.3 From 149448b353e2517ecc6eced7d9f46e9f3e08b89e Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 25 Mar 2021 20:30:00 +0800 Subject: iommu: Delete iommu_dma_free_cpu_cached_iovas() Function iommu_dma_free_cpu_cached_iovas() no longer has any caller, so delete it. With that, function free_cpu_cached_iovas() may be made static. Signed-off-by: John Garry Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/1616675401-151997-4-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 5 ----- include/linux/iova.h | 5 ----- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 13d1f4c14d7b..6e75a2d689b4 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -83,10 +83,5 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } -static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, - struct iommu_domain *domain) -{ -} - #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git a/include/linux/iova.h b/include/linux/iova.h index 4be6c0ab4997..71d8a2de6635 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -157,7 +157,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); #else static inline int iova_cache_get(void) { @@ -234,10 +233,6 @@ static inline void put_iova_domain(struct iova_domain *iovad) { } -static inline void free_cpu_cached_iovas(unsigned int cpu, - struct iova_domain *iovad) -{ -} #endif #endif -- cgit v1.2.3 From 0d35309ab5e080095190965aa7cfc3ca8fb88af9 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:10 +0200 Subject: iommu: Fix comment for struct iommu_fwspec Commit 986d5ecc5699 ("iommu: Move fwspec->iommu_priv to struct dev_iommu") removed iommu_priv from fwspec and commit 5702ee24182f ("ACPI/IORT: Check ATS capability in root complex nodes") added @flags. Update the struct doc. Acked-by: Jonathan Cameron Acked-by: Will Deacon Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5e7fe519430a..1d422bf722a1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -571,7 +571,7 @@ struct iommu_group *fsl_mc_device_group(struct device *dev); * struct iommu_fwspec - per-device IOMMU instance data * @ops: ops for this device's IOMMU * @iommu_fwnode: firmware handle for this device's IOMMU - * @iommu_priv: IOMMU driver private data for this device + * @flags: IOMMU_FWSPEC_* flags * @num_pasid_bits: number of PASID bits supported by this device * @num_ids: number of associated device IDs * @ids: IDs which this device may present to the IOMMU -- cgit v1.2.3 From 434b73e61cc65cdd26618af6fa4736c2ba1eb29b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:11 +0200 Subject: iommu/arm-smmu-v3: Use device properties for pasid-num-bits The pasid-num-bits property shouldn't need a dedicated fwspec field, it's a job for device properties. Add properties for IORT, and access the number of PASID bits using device_property_read_u32(). Suggested-by: Robin Murphy Acked-by: Jonathan Cameron Acked-by: Will Deacon Reviewed-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Acked-by: Hanjun Guo Link: https://lore.kernel.org/r/20210401154718.307519-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1d422bf722a1..16ce75693d83 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -572,7 +572,6 @@ struct iommu_group *fsl_mc_device_group(struct device *dev); * @ops: ops for this device's IOMMU * @iommu_fwnode: firmware handle for this device's IOMMU * @flags: IOMMU_FWSPEC_* flags - * @num_pasid_bits: number of PASID bits supported by this device * @num_ids: number of associated device IDs * @ids: IDs which this device may present to the IOMMU */ @@ -580,7 +579,6 @@ struct iommu_fwspec { const struct iommu_ops *ops; struct fwnode_handle *iommu_fwnode; u32 flags; - u32 num_pasid_bits; unsigned int num_ids; u32 ids[]; }; -- cgit v1.2.3 From 34b48c704d194738eef0893aa06e412bdc8a972f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:12 +0200 Subject: iommu: Separate IOMMU_DEV_FEAT_IOPF from IOMMU_DEV_FEAT_SVA Some devices manage I/O Page Faults (IOPF) themselves instead of relying on PCIe PRI or Arm SMMU stall. Allow their drivers to enable SVA without mandating IOMMU-managed IOPF. The other device drivers now need to first enable IOMMU_DEV_FEAT_IOPF before enabling IOMMU_DEV_FEAT_SVA. Enabling IOMMU_DEV_FEAT_IOPF on its own doesn't have any effect visible to the device driver, it is used in combination with other features. Reviewed-by: Eric Auger Reviewed-by: Lu Baolu Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-4-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 16ce75693d83..45c4eb372f56 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -156,10 +156,24 @@ struct iommu_resv_region { enum iommu_resv_type type; }; -/* Per device IOMMU features */ +/** + * enum iommu_dev_features - Per device IOMMU features + * @IOMMU_DEV_FEAT_AUX: Auxiliary domain feature + * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses + * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. Generally + * enabling %IOMMU_DEV_FEAT_SVA requires + * %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page + * Faults themselves instead of relying on the IOMMU. When + * supported, this feature must be enabled before and + * disabled after %IOMMU_DEV_FEAT_SVA. + * + * Device drivers query whether a feature is supported using + * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature(). + */ enum iommu_dev_features { - IOMMU_DEV_FEAT_AUX, /* Aux-domain feature */ - IOMMU_DEV_FEAT_SVA, /* Shared Virtual Addresses */ + IOMMU_DEV_FEAT_AUX, + IOMMU_DEV_FEAT_SVA, + IOMMU_DEV_FEAT_IOPF, }; #define IOMMU_PASID_INVALID (-1U) -- cgit v1.2.3 From fc36479db74e957c4696b605a32c4afaa15fa6cb Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 1 Apr 2021 17:47:15 +0200 Subject: iommu: Add a page fault handler Some systems allow devices to handle I/O Page Faults in the core mm. For example systems implementing the PCIe PRI extension or Arm SMMU stall model. Infrastructure for reporting these recoverable page faults was added to the IOMMU core by commit 0c830e6b3282 ("iommu: Introduce device fault report API"). Add a page fault handler for host SVA. IOMMU driver can now instantiate several fault workqueues and link them to IOPF-capable devices. Drivers can choose between a single global workqueue, one per IOMMU device, one per low-level fault queue, one per domain, etc. When it receives a fault event, most commonly in an IRQ handler, the IOMMU driver reports the fault using iommu_report_device_fault(), which calls the registered handler. The page fault handler then calls the mm fault handler, and reports either success or failure with iommu_page_response(). After the handler succeeds, the hardware retries the access. The iopf_param pointer could be embedded into iommu_fault_param. But putting iopf_param into the iommu_param structure allows us not to care about ordering between calls to iopf_queue_add_device() and iommu_register_device_fault_handler(). Tested-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Jacob Pan Reviewed-by: Jonathan Cameron Reviewed-by: Lu Baolu Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20210401154718.307519-7-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 45c4eb372f56..86d688c4418f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -367,6 +367,7 @@ struct iommu_fault_param { * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data + * @iopf_param: I/O Page Fault queue and data * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data @@ -377,6 +378,7 @@ struct iommu_fault_param { struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; + struct iopf_device_param *iopf_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; -- cgit v1.2.3 From 47685cb202d1aff6f70a2bb91e8271392fefea84 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:37 +0200 Subject: iommu: remove the unused domain_window_disable method domain_window_disable is wired up by fsl_pamu, but never actually called. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-2-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 86d688c4418f..565b8810354d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,7 +223,6 @@ struct iommu_iotlb_gather { * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges * @domain_window_enable: Configure and enable a particular window for a domain - * @domain_window_disable: Disable a particular window for a domain * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) @@ -284,7 +283,6 @@ struct iommu_ops { /* Window handling functions */ int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); - void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); int (*of_xlate)(struct device *dev, struct of_phandle_args *args); bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev); -- cgit v1.2.3 From 392825e0c76cf9aca33e5a3bf981cde2a2c87251 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:38 +0200 Subject: iommu/fsl_pamu: remove fsl_pamu_get_domain_attr None of the values returned by this function are ever queried. Also remove the DOMAIN_ATTR_FSL_PAMUV1 enum value that is not otherwise used. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-3-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 565b8810354d..95ce205347a9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -104,9 +104,6 @@ enum iommu_cap { * -the actual size of the mapped region of a window must be power * of 2 starting with 4KB and physical address must be naturally * aligned. - * DOMAIN_ATTR_FSL_PAMUV1 corresponds to the above mentioned contraints. - * The caller can invoke iommu_domain_get_attr to check if the underlying - * iommu implementation supports these constraints. */ enum iommu_attr { @@ -115,7 +112,6 @@ enum iommu_attr { DOMAIN_ATTR_WINDOWS, DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, - DOMAIN_ATTR_FSL_PAMUV1, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_IO_PGTABLE_CFG, -- cgit v1.2.3 From ba58d1216e2b2d2320b50591b767f50b13c623a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:41 +0200 Subject: iommu/fsl_pamu: remove support for multiple windows The only domains allocated forces use of a single window. Remove all the code related to multiple window support, as well as the need for qman_portal to force a single window. Remove the now unused DOMAIN_ATTR_WINDOWS iommu_attr. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-6-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 95ce205347a9..761aa8b69936 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -109,7 +109,6 @@ enum iommu_cap { enum iommu_attr { DOMAIN_ATTR_GEOMETRY, DOMAIN_ATTR_PAGING, - DOMAIN_ATTR_WINDOWS, DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_NESTING, /* two stages of translation */ -- cgit v1.2.3 From 376dfd2a2ff41596a6efc8ea56f8b0de172b04a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:42 +0200 Subject: iommu/fsl_pamu: remove ->domain_window_enable The only thing that fsl_pamu_window_enable does for the current caller is to fill in the prot value in the only dma_window structure, and to propagate a few values from the iommu_domain_geometry struture into the dma_window. Remove the dma_window entirely, hardcode the prot value and otherwise use the iommu_domain_geometry structure instead. Remove the now unused ->domain_window_enable iommu method. Signed-off-by: Christoph Hellwig Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-7-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 761aa8b69936..69537e83aae4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -217,7 +217,6 @@ struct iommu_iotlb_gather { * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges - * @domain_window_enable: Configure and enable a particular window for a domain * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) @@ -275,10 +274,6 @@ struct iommu_ops { struct iommu_domain *domain, struct iommu_resv_region *region); - /* Window handling functions */ - int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, - phys_addr_t paddr, u64 size, int prot); - int (*of_xlate)(struct device *dev, struct of_phandle_args *args); bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev); @@ -521,11 +516,6 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); -/* Window handling function prototypes */ -extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, - phys_addr_t offset, u64 size, - int prot); - extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); @@ -749,13 +739,6 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, { } -static inline int iommu_domain_window_enable(struct iommu_domain *domain, - u32 wnd_nr, phys_addr_t paddr, - u64 size, int prot) -{ - return -ENODEV; -} - static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { return 0; -- cgit v1.2.3 From 4eeb96f6efac10e66fd10e718d2adeece3879121 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:43 +0200 Subject: iommu/fsl_pamu: replace DOMAIN_ATTR_FSL_PAMU_STASH with a direct call Add a fsl_pamu_configure_l1_stash API that qman_portal can call directly instead of indirecting through the iommu attr API. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-8-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 69537e83aae4..a3968122aa69 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -109,7 +109,6 @@ enum iommu_cap { enum iommu_attr { DOMAIN_ATTR_GEOMETRY, DOMAIN_ATTR_PAGING, - DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, -- cgit v1.2.3 From 7d61cb6ff0122a017ae907aed62478a4db9c5991 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:46 +0200 Subject: iommu/fsl_pamu: enable the liodn when attaching a device Instead of a separate call to enable all devices from the list, just enable the liodn once the device is attached to the iommu domain. This also remove the DOMAIN_ATTR_FSL_PAMU_ENABLE iommu_attr. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-11-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a3968122aa69..f5caaa8d39be 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -109,7 +109,6 @@ enum iommu_cap { enum iommu_attr { DOMAIN_ATTR_GEOMETRY, DOMAIN_ATTR_PAGING, - DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_IO_PGTABLE_CFG, -- cgit v1.2.3 From 9fb5fad562fa0a41c84691714d99c23f54168a9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:50 +0200 Subject: iommu: remove DOMAIN_ATTR_PAGING DOMAIN_ATTR_PAGING is never used. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-15-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f5caaa8d39be..85b51084b820 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -108,7 +108,6 @@ enum iommu_cap { enum iommu_attr { DOMAIN_ATTR_GEOMETRY, - DOMAIN_ATTR_PAGING, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_IO_PGTABLE_CFG, -- cgit v1.2.3 From bc9a05eef113e75cfa792fdf24dae011bc3d5294 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:51 +0200 Subject: iommu: remove DOMAIN_ATTR_GEOMETRY The geometry information can be trivially queried from the iommu_domain struture. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-16-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 85b51084b820..207446cc6d3f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -107,7 +107,6 @@ enum iommu_cap { */ enum iommu_attr { - DOMAIN_ATTR_GEOMETRY, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_IO_PGTABLE_CFG, -- cgit v1.2.3 From 7e147547783a9035df816864b6a45ffbb254d700 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:52 +0200 Subject: iommu: remove DOMAIN_ATTR_NESTING Use an explicit enable_nesting method instead. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-17-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 207446cc6d3f..d7d76b5e1192 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -107,7 +107,6 @@ enum iommu_cap { */ enum iommu_attr { - DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_IO_PGTABLE_CFG, DOMAIN_ATTR_MAX, @@ -210,6 +209,7 @@ struct iommu_iotlb_gather { * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes + * @enable_nesting: Enable nesting * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges @@ -262,6 +262,7 @@ struct iommu_ops { enum iommu_attr attr, void *data); int (*domain_set_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); + int (*enable_nesting)(struct iommu_domain *domain); /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); @@ -511,6 +512,7 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, void *data); extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); +int iommu_enable_nesting(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- cgit v1.2.3 From a250c23f15c21c556becd4986f453255e545807c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 1 Apr 2021 17:52:54 +0200 Subject: iommu: remove DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE Instead make the global iommu_dma_strict paramete in iommu.c canonical by exporting helpers to get and set it and use those directly in the drivers. This make sure that the iommu.strict parameter also works for the AMD and Intel IOMMU drivers on x86. As those default to lazy flushing a new IOMMU_CMD_LINE_STRICT is used to turn the value into a tristate to represent the default if not overriden by an explicit parameter. [ported on top of the other iommu_attr changes and added a few small missing bits] Signed-off-by: Robin Murphy . Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210401155256.298656-19-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d7d76b5e1192..9349bdd62e91 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -107,7 +107,6 @@ enum iommu_cap { */ enum iommu_attr { - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, DOMAIN_ATTR_IO_PGTABLE_CFG, DOMAIN_ATTR_MAX, }; @@ -514,6 +513,9 @@ extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); int iommu_enable_nesting(struct iommu_domain *domain); +void iommu_set_dma_strict(bool val); +bool iommu_get_dma_strict(struct iommu_domain *domain); + extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- cgit v1.2.3 From 4fc52b81e87be583efb834df5b58245cb9ddd3e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:55 +0200 Subject: iommu: remove DOMAIN_ATTR_IO_PGTABLE_CFG Use an explicit set_pgtable_quirks method instead that just passes the actual quirk bitmask instead. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Acked-by: Li Yang Link: https://lore.kernel.org/r/20210401155256.298656-20-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ---- include/linux/iommu.h | 12 +++++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index a4c9ca2c31f1..4d40dfa75b55 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -204,10 +204,6 @@ struct io_pgtable { #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops) -struct io_pgtable_domain_attr { - unsigned long quirks; -}; - static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop) { if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_all) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9349bdd62e91..fbac49fe0880 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -107,7 +107,6 @@ enum iommu_cap { */ enum iommu_attr { - DOMAIN_ATTR_IO_PGTABLE_CFG, DOMAIN_ATTR_MAX, }; @@ -209,6 +208,7 @@ struct iommu_iotlb_gather { * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes * @enable_nesting: Enable nesting + * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges @@ -262,6 +262,8 @@ struct iommu_ops { int (*domain_set_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); int (*enable_nesting)(struct iommu_domain *domain); + int (*set_pgtable_quirks)(struct iommu_domain *domain, + unsigned long quirks); /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); @@ -512,6 +514,8 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); int iommu_enable_nesting(struct iommu_domain *domain); +int iommu_set_pgtable_quirks(struct iommu_domain *domain, + unsigned long quirks); void iommu_set_dma_strict(bool val); bool iommu_get_dma_strict(struct iommu_domain *domain); @@ -891,6 +895,12 @@ static inline int iommu_domain_set_attr(struct iommu_domain *domain, return -EINVAL; } +static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain, + unsigned long quirks) +{ + return 0; +} + static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; -- cgit v1.2.3 From 7876a83ffe8c23c7049a63c747a7b96cafaf10a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Apr 2021 17:52:56 +0200 Subject: iommu: remove iommu_domain_{get,set}_attr Remove the now unused iommu attr infrastructure. Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210401155256.298656-21-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fbac49fe0880..a5b3af54fbb8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -96,20 +96,6 @@ enum iommu_cap { IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ }; -/* - * Following constraints are specifc to FSL_PAMUV1: - * -aperture must be power of 2, and naturally aligned - * -number of windows must be power of 2, and address space size - * of each window is determined by aperture size / # of windows - * -the actual size of the mapped region of a window must be power - * of 2 starting with 4KB and physical address must be naturally - * aligned. - */ - -enum iommu_attr { - DOMAIN_ATTR_MAX, -}; - /* These are the possible reserved region types */ enum iommu_resv_type { /* Memory regions which must be mapped 1:1 at all times */ @@ -205,8 +191,6 @@ struct iommu_iotlb_gather { * @probe_finalize: Do final setup work after the device is added to an IOMMU * group and attached to the groups domain * @device_group: find iommu group for a particular device - * @domain_get_attr: Query domain attributes - * @domain_set_attr: Change domain attributes * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @get_resv_regions: Request list of reserved regions for a device @@ -257,10 +241,6 @@ struct iommu_ops { void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); - int (*domain_get_attr)(struct iommu_domain *domain, - enum iommu_attr attr, void *data); - int (*domain_set_attr)(struct iommu_domain *domain, - enum iommu_attr attr, void *data); int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); @@ -509,10 +489,6 @@ extern int iommu_page_response(struct device *dev, extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, - void *data); -extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, - void *data); int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); @@ -883,18 +859,6 @@ static inline int iommu_group_id(struct iommu_group *group) return -ENODEV; } -static inline int iommu_domain_get_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - return -EINVAL; -} - -static inline int iommu_domain_set_attr(struct iommu_domain *domain, - enum iommu_attr attr, void *data) -{ - return -EINVAL; -} - static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { -- cgit v1.2.3 From d151c85c52a314c6ecb91ab35b3f696a6778b509 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 16:33:09 +0200 Subject: iommu/amd: Remove the unused device errata code The device errata mechism is entirely unused, so remove it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210402143312.372386-2-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/amd-iommu.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 450717299928..474065ed88a4 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -32,24 +32,6 @@ struct pci_dev; extern int amd_iommu_detect(void); extern int amd_iommu_init_hardware(void); -/** - * amd_iommu_enable_device_erratum() - Enable erratum workaround for device - * in the IOMMUv2 driver - * @pdev: The PCI device the workaround is necessary for - * @erratum: The erratum workaround to enable - * - * The function needs to be called before amd_iommu_init_device(). - * Possible values for the erratum number are for now: - * - AMD_PRI_DEV_ERRATUM_ENABLE_RESET - Reset PRI capability when PRI - * is enabled - * - AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE - Limit number of outstanding PRI - * requests to one - */ -#define AMD_PRI_DEV_ERRATUM_ENABLE_RESET 0 -#define AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE 1 - -extern void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum); - /** * amd_iommu_init_device() - Init device for use with IOMMUv2 driver * @pdev: The PCI device to initialize -- cgit v1.2.3 From fc1b6620501f1a4b88f583549c63666180bea177 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 16:33:12 +0200 Subject: iommu/amd: Move a few prototypes to include/linux/amd-iommu.h A few functions that were intentended for the perf events support are currently declared in arch/x86/events/amd/iommu.h, which mens they are not in scope for the actual function definition. Also amdkfd has started using a few of them using externs in a .c file. End that misery by moving the prototypes to the proper header. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210402143312.372386-5-hch@lst.de Signed-off-by: Joerg Roedel --- include/linux/amd-iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 474065ed88a4..58e6c3806c09 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -10,6 +10,8 @@ #include +struct amd_iommu; + /* * This is mainly used to communicate information back-and-forth * between SVM and IOMMU for setting up and tearing down posted @@ -194,4 +196,14 @@ static inline int amd_iommu_deactivate_guest_mode(void *data) } #endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ +int amd_iommu_get_num_iommus(void); +bool amd_iommu_pc_supported(void); +u8 amd_iommu_pc_get_max_banks(unsigned int idx); +u8 amd_iommu_pc_get_max_counters(unsigned int idx); +int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, + u64 *value); +int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, + u64 *value); +struct amd_iommu *get_amd_iommu(unsigned int idx); + #endif /* _ASM_X86_AMD_IOMMU_H */ -- cgit v1.2.3 From c0474a606ecb9326227b4d68059942f9db88a897 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 20 Mar 2021 10:54:13 +0800 Subject: iommu/vt-d: Invalidate PASID cache when root/context entry changed When the Intel IOMMU is operating in the scalable mode, some information from the root and context table may be used to tag entries in the PASID cache. Software should invalidate the PASID-cache when changing root or context table entries. Suggested-by: Ashok Raj Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210320025415.641201-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e0f8c2ade3e8..03faf20a6817 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -378,6 +378,7 @@ enum { /* PASID cache invalidation granu */ #define QI_PC_ALL_PASIDS 0 #define QI_PC_PASID_SEL 1 +#define QI_PC_GLOBAL 3 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -- cgit v1.2.3 From a8cf291bdac5d415eadb55e79df1fca8c3f0dfef Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:26 +0800 Subject: ptp: Reorganize ptp_kvm.c to make it arch-independent Currently, the ptp_kvm module contains a lot of x86-specific code. Let's move this code into a new arch-specific file in the same directory, and rename the arch-independent file to ptp_kvm_common.c. Acked-by: Richard Cochran Reviewed-by: Andre Przywara Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-4-jianyong.wu@arm.com --- include/linux/ptp_kvm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/ptp_kvm.h (limited to 'include') diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h new file mode 100644 index 000000000000..f960a719f0d5 --- /dev/null +++ b/include/linux/ptp_kvm.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. + */ + +#ifndef _PTP_KVM_H_ +#define _PTP_KVM_H_ + +struct timespec64; +struct clocksource; + +int kvm_arch_ptp_init(void); +int kvm_arch_ptp_get_clock(struct timespec64 *ts); +int kvm_arch_ptp_get_crosststamp(u64 *cycle, + struct timespec64 *tspec, struct clocksource **cs); + +#endif /* _PTP_KVM_H_ */ -- cgit v1.2.3 From b2c67cbe9f447312f5cdd7c6641b463f2349aec0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2020 14:09:27 +0800 Subject: time: Add mechanism to recognize clocksource in time_get_snapshot System time snapshots are not conveying information about the current clocksource which was used, but callers like the PTP KVM guest implementation have the requirement to evaluate the clocksource type to select the appropriate mechanism. Introduce a clocksource id field in struct clocksource which is by default set to CSID_GENERIC (0). Clocksource implementations can set that field to a value which allows to identify the clocksource. Store the clocksource id of the current clocksource in the system_time_snapshot so callers can evaluate which clocksource was used to take the snapshot and act accordingly. Signed-off-by: Thomas Gleixner Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-5-jianyong.wu@arm.com --- include/linux/clocksource.h | 6 ++++++ include/linux/clocksource_ids.h | 11 +++++++++++ include/linux/timekeeping.h | 12 +++++++----- 3 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 include/linux/clocksource_ids.h (limited to 'include') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 86d143db6523..1290d0dce840 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,10 @@ struct module; * 400-499: Perfect * The ideal clocksource. A must-use where * available. + * @id: Defaults to CSID_GENERIC. The id value is captured + * in certain snapshot functions to allow callers to + * validate the clocksource from which the snapshot was + * taken. * @flags: Flags describing special properties * @enable: Optional function to enable the clocksource * @disable: Optional function to disable the clocksource @@ -100,6 +105,7 @@ struct clocksource { const char *name; struct list_head list; int rating; + enum clocksource_ids id; enum vdso_clock_mode vdso_clock_mode; unsigned long flags; diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h new file mode 100644 index 000000000000..4d8e19e05328 --- /dev/null +++ b/include/linux/clocksource_ids.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOCKSOURCE_IDS_H +#define _LINUX_CLOCKSOURCE_IDS_H + +/* Enum to give clocksources a unique identifier */ +enum clocksource_ids { + CSID_GENERIC = 0, + CSID_MAX, +}; + +#endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index c6792cf01bc7..78a98bdff76d 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -3,6 +3,7 @@ #define _LINUX_TIMEKEEPING_H #include +#include /* Included from linux/ktime.h */ @@ -243,11 +244,12 @@ struct ktime_timestamps { * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { - u64 cycles; - ktime_t real; - ktime_t raw; - unsigned int clock_was_set_seq; - u8 cs_was_changed_seq; + u64 cycles; + ktime_t real; + ktime_t raw; + enum clocksource_ids cs_id; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /** -- cgit v1.2.3 From 100148d0fc7dcf8672fe0ac83f44dc5749b4da5c Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:28 +0800 Subject: clocksource: Add clocksource id for arm arch counter Add clocksource id to the ARM generic counter so that it can be easily identified from callers such as ptp_kvm. Cc: Mark Rutland Reviewed-by: Andre Przywara Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-6-jianyong.wu@arm.com --- include/linux/clocksource_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index 4d8e19e05328..16775d7d8f8d 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -5,6 +5,7 @@ /* Enum to give clocksources a unique identifier */ enum clocksource_ids { CSID_GENERIC = 0, + CSID_ARM_ARCH_COUNTER, CSID_MAX, }; -- cgit v1.2.3 From 3bf725699bf62494b3e179f1795f08c7d749f061 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 9 Dec 2020 14:09:29 +0800 Subject: KVM: arm64: Add support for the KVM PTP service Implement the hypervisor side of the KVM PTP interface. The service offers wall time and cycle count from host to guest. The caller must specify whether they want the host's view of either the virtual or physical counter. Signed-off-by: Jianyong Wu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201209060932.212364-7-jianyong.wu@arm.com --- include/linux/arm-smccc.h | 16 ++++++++++++++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 1a27bd9493fe..6861489a1890 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -103,6 +103,7 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 +#define ARM_SMCCC_KVM_FUNC_PTP 1 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -114,6 +115,21 @@ #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 +/* + * ptp_kvm is a feature used for time sync between vm and host. + * ptp_kvm module in guest kernel will get service from host using + * this hypercall ID. + */ +#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_PTP) + +/* ptp_kvm counter type ID */ +#define KVM_PTP_VIRT_COUNTER 0 +#define KVM_PTP_PHYS_COUNTER 1 + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6afee209620..0e0f70c0d0dc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_PTP_KVM 195 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 042a00f93aad5874937e00f36e68301f7e3a0af1 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 29 Mar 2021 09:54:08 -0400 Subject: IB/{ipoib,hfi1}: Add a timeout handler for rdma_netdev The current rdma_netdev handling in ipoib hooks the tx_timeout handler, but prints out a totally useless message that prevents effective debugging especially when multiple transmit queues are being used. Add a tx_timeout rdma_netdev hook and implement the callback in the hfi1 to print additional information. The existing non-helpful message is avoided when the driver has presented a callback. Link: https://lore.kernel.org/r/1617026056-50483-3-git-send-email-dennis.dalessandro@cornelisnetworks.com Reviewed-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bed4cfe50554..c596882893ae 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2215,6 +2215,8 @@ struct rdma_netdev { int set_qkey, u32 qkey); int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid); + /* timeout */ + void (*tx_timeout)(struct net_device *dev, unsigned int txqueue); }; struct rdma_netdev_alloc_params { -- cgit v1.2.3 From 714638e02d94fa28c9e030d13d03e663fe24925e Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 29 Mar 2021 13:50:36 +0300 Subject: i2c: Add support for software nodes This makes it possible for the drivers to assign complete software fwnodes to the devices instead of only the device properties in those nodes. Signed-off-by: Heikki Krogerus Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 56622658b215..cb1f882a3e88 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -391,7 +391,8 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } * @platform_data: stored in i2c_client.dev.platform_data * @of_node: pointer to OpenFirmware device node * @fwnode: device node supplied by the platform firmware - * @properties: additional device properties for the device + * @properties: Deprecated - use swnode instead + * @swnode: software node for the device * @resources: resources associated with the device * @num_resources: number of resources in the @resources array * @irq: stored in i2c_client.irq @@ -416,6 +417,7 @@ struct i2c_board_info { struct device_node *of_node; struct fwnode_handle *fwnode; const struct property_entry *properties; + const struct software_node *swnode; const struct resource *resources; unsigned int num_resources; int irq; -- cgit v1.2.3 From 1a7a6e8072ea0e4582de2da63a9088841fde798e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 6 Apr 2021 09:30:36 +0200 Subject: pwm: Clarify which state pwm_get_state() returns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given that lowlevel drivers usually cannot implement exactly what a consumer requests with pwm_apply_state() there is some rounding involved. pwm_get_state() returns the setting that was requested most recently by the consumer (opposed to what was actually implemented in hardware in reply to the last request). Clarify this in the function kerneldoc. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8f4eefd129aa..5bb90af4997e 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -91,6 +91,11 @@ struct pwm_device { * pwm_get_state() - retrieve the current PWM state * @pwm: PWM device * @state: state to fill with the current PWM state + * + * The returned PWM state represents the state that was applied by a previous call to + * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to + * hardware. If pwm_apply_state() was never called, this returns either the current hardware + * state (if supported) or the default settings. */ static inline void pwm_get_state(const struct pwm_device *pwm, struct pwm_state *state) -- cgit v1.2.3 From 6cbdfb3d91bab122033bd2ecae8c259cb6e4f7d0 Mon Sep 17 00:00:00 2001 From: Nayna Jain Date: Fri, 9 Apr 2021 10:35:07 -0400 Subject: ima: enable loading of build time generated key on .ima keyring The kernel currently only loads the kernel module signing key onto the builtin trusted keyring. Load the module signing key onto the IMA keyring as well. Signed-off-by: Nayna Jain Acked-by: Stefan Berger Signed-off-by: Mimi Zohar --- include/keys/system_keyring.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index fb8b07daa9d1..f954276c616a 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -16,9 +16,16 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); +extern __init int load_module_cert(struct key *keyring); #else #define restrict_link_by_builtin_trusted restrict_link_reject + +static inline __init int load_module_cert(struct key *keyring) +{ + return 0; +} + #endif #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING -- cgit v1.2.3 From 4b2b4cc50ba6d607d1611ea6b2046a58d16e45eb Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 29 Mar 2021 13:50:47 +0300 Subject: i2c: Remove support for dangling device properties From now on only accepting complete software nodes. Signed-off-by: Heikki Krogerus Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index cb1f882a3e88..54b3ccc71e37 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -391,7 +391,6 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } * @platform_data: stored in i2c_client.dev.platform_data * @of_node: pointer to OpenFirmware device node * @fwnode: device node supplied by the platform firmware - * @properties: Deprecated - use swnode instead * @swnode: software node for the device * @resources: resources associated with the device * @num_resources: number of resources in the @resources array @@ -416,7 +415,6 @@ struct i2c_board_info { void *platform_data; struct device_node *of_node; struct fwnode_handle *fwnode; - const struct property_entry *properties; const struct software_node *swnode; const struct resource *resources; unsigned int num_resources; -- cgit v1.2.3 From 07740c92ae57ca21204f1e0c6f59272cdf3190cc Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 8 Apr 2021 19:17:17 +0800 Subject: i2c: core: add managed function for adding i2c adapters Some I2C controller drivers will only unregister the I2C adapter in their .remove() callback, which can be done by simply using a managed variant to add the I2C adapter. So add the managed functions for adding the I2C adapter. Reviewed-by: Andy Shevchenko Reviewed-by: Dmitry Osipenko Signed-off-by: Yicong Yang Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 59b892c5bb05..ce473ddc0378 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -846,6 +846,7 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) */ #if IS_ENABLED(CONFIG_I2C) int i2c_add_adapter(struct i2c_adapter *adap); +int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter); void i2c_del_adapter(struct i2c_adapter *adap); int i2c_add_numbered_adapter(struct i2c_adapter *adap); -- cgit v1.2.3 From 3b4c747cd32078172dd238929e38a43cfed83580 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 8 Apr 2021 19:17:18 +0800 Subject: i2c: core: add api to provide frequency mode strings Some I2C drivers like Designware and HiSilicon will print the bus frequency mode information, so add a public one that everyone can make use of. Tested-by: Jarkko Nikula Reviewed-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Yicong Yang Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index ce473ddc0378..e5d8b9dad6bf 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -51,6 +51,9 @@ struct module; struct property_entry; #if IS_ENABLED(CONFIG_I2C) +/* Return the Frequency mode string based on the bus frequency */ +const char *i2c_freq_mode_string(u32 bus_freq_hz); + /* * The master routines are the ones normally used to transmit data to devices * on a bus (or read from them). Apart from two basic transfer functions to -- cgit v1.2.3 From 5b5475826c5265cead7ce4ca6d34ec0c566c70aa Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 21 Mar 2021 18:38:32 -0700 Subject: i2c: ensure timely release of driver-allocated resources More and more drivers rely on devres to manage their resources, however if bus' probe() and release() methods are not trivial and control some of resources as well (for example enable or disable clocks, or attach device to a power domain), we need to make sure that driver-allocated resources are released immediately after driver's remove() method returns, and not postponed until driver core gets around to releasing resources. To fix that we open a new devres group before calling driver's probe() and explicitly release it when we return from driver's remove(). Tested-by: Jeff LaBundy Signed-off-by: Dmitry Torokhov Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e5d8b9dad6bf..e8f2ac8c9c3d 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -309,6 +309,8 @@ struct i2c_driver { * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. + * @devres_group_id: id of the devres group that will be created for resources + * acquired when probing this device. * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver @@ -337,6 +339,7 @@ struct i2c_client { #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif + void *devres_group_id; /* ID of probe devres group */ }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) -- cgit v1.2.3 From 332d1a0373be32a3a3c152756bca45ff4f4e11b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 18:15:36 -0400 Subject: NFS: nfs4_bitmask_adjust() must not change the server global bitmasks As currently set, the calls to nfs4_bitmask_adjust() will end up overwriting the contents of the nfs_server cache_consistency_bitmask field. The intention here should be to modify a private copy of that mask in the close/delegreturn/write arguments. Fixes: 76bd5c016ef4 ("NFSv4: make cache consistency bitmask dynamic") Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3327239fa2f9..cc29dee508f7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -15,6 +15,8 @@ #define NFS_DEF_FILE_IO_SIZE (4096U) #define NFS_MIN_FILE_IO_SIZE (1024U) +#define NFS_BITMASK_SZ 3 + struct nfs4_string { unsigned int len; char *data; @@ -525,7 +527,8 @@ struct nfs_closeargs { struct nfs_seqid * seqid; fmode_t fmode; u32 share_access; - u32 * bitmask; + const u32 * bitmask; + u32 bitmask_store[NFS_BITMASK_SZ]; struct nfs4_layoutreturn_args *lr_args; }; @@ -608,7 +611,8 @@ struct nfs4_delegreturnargs { struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; const nfs4_stateid *stateid; - u32 * bitmask; + const u32 *bitmask; + u32 bitmask_store[NFS_BITMASK_SZ]; struct nfs4_layoutreturn_args *lr_args; }; @@ -648,7 +652,8 @@ struct nfs_pgio_args { union { unsigned int replen; /* used by read */ struct { - u32 * bitmask; /* used by write */ + const u32 * bitmask; /* used by write */ + u32 bitmask_store[NFS_BITMASK_SZ]; /* used by write */ enum nfs3_stable_how stable; /* used by write */ }; }; -- cgit v1.2.3 From 3aeffc46afde05140551abb49efaa4563adba38c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Wed, 31 Mar 2021 20:43:13 +0200 Subject: IB/cma: Introduce rdma_set_min_rnr_timer() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the ability for kernel ULPs to adjust the minimum RNR Retry timer. The INIT -> RTR transition executed by RDMA CM will be used for this adjustment. This avoids an additional ib_modify_qp() call. rdma_set_min_rnr_timer() must be called before the call to rdma_connect() on the active side and before the call to rdma_accept() on the passive side. The default value of RNR Retry timer is zero, which translates to 655 ms. When the receiver is not ready to accept a send messages, it encodes the RNR Retry timer value in the NAK. The requestor will then wait at least the specified time value before retrying the send. The 5-bit value to be supplied to the rdma_set_min_rnr_timer() is documented in IBTA Table 45: "Encoding for RNR NAK Timer Field". Link: https://lore.kernel.org/r/1617216194-12890-2-git-send-email-haakon.bugge@oracle.com Signed-off-by: Håkon Bugge Acked-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_cm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index b2eed6d30543..d989f030fae0 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -331,6 +331,8 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); int rdma_set_afonly(struct rdma_cm_id *id, int afonly); int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout); + +int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer); /** * rdma_get_service_id - Return the IB service ID for a specified address. * @id: Communication identifier associated with the address. -- cgit v1.2.3 From 1f3208b2d6975f31b9c7c6bf174b84fe9c97492f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 11:04:34 -0400 Subject: NFS: Add a cache validity flag argument to nfs_revalidate_inode() Add an argument to nfs_revalidate_inode() to allow callers to specify which attributes they need to check for validity. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index eadaabd18dc7..624ffd47a9d4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -386,7 +386,7 @@ extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct user_namespace *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); -extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); +extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); -- cgit v1.2.3 From fabf2b341502e894001d70f91309dd6f3785e2dc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 13:14:42 -0400 Subject: NFS: Separate tracking of file nlinks cache validity from the mode/uid/gid Rename can cause us to revalidate the access cache, so lets track the nlinks separately from the mode/uid/gid. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 624ffd47a9d4..41165b988dfb 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -246,11 +246,13 @@ struct nfs4_copy_state { BIT(13) /* Deferred cache invalidation */ #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ +#define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_SIZE \ + | NFS_INO_INVALID_NLINK \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* -- cgit v1.2.3 From 720869eb19f3161980d6d4631d3df7e8c5355993 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 13 Apr 2021 09:41:16 -0400 Subject: NFS: Separate tracking of file mode cache validity from the uid/gid chown()/chgrp() and chmod() are separate operations, and in addition, there are mode operations that are performed automatically by the server. So let's track mode validity separately from the file ownership validity. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 41165b988dfb..ffba254d2098 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -247,12 +247,14 @@ struct nfs4_copy_state { #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ #define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ +#define NFS_INO_INVALID_MODE BIT(17) /* cached mode is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_SIZE \ | NFS_INO_INVALID_NLINK \ + | NFS_INO_INVALID_MODE \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* -- cgit v1.2.3 From 7f08a3359a3c1e39c2a118fbbe583d8c8db14ace Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Mar 2021 09:50:19 -0400 Subject: NFSv4: Add support for the NFSv4.2 "change_attr_type" attribute The change_attr_type allows the server to provide a description of how the change attribute will behave. This again will allow the client to optimise its behaviour w.r.t. attribute revalidation. Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 9 +++++++++ include/linux/nfs_fs_sb.h | 3 +++ include/linux/nfs_xdr.h | 2 ++ 3 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5b4c67c91f56..15004c469807 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -452,6 +452,7 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) +#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) #define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) @@ -709,6 +710,14 @@ struct nl4_server { } u; }; +enum nfs4_change_attr_type { + NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2, + NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3, + NFS4_CHANGE_TYPE_IS_UNDEFINED = 4, +}; + /* * Options for setxattr. These match the flags for setxattr(2). */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 6f76b32a0238..fbcdfd9f7a7f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -180,6 +180,9 @@ struct nfs_server { #define NFS_OPTION_FSCACHE 0x00000001 /* - local caching enabled */ #define NFS_OPTION_MIGRATION 0x00000002 /* - NFSv4 migration enabled */ + enum nfs4_change_attr_type + change_attr_type;/* Description of change attribute */ + struct nfs_fsid fsid; __u64 maxfilesize; /* maximum file size */ struct timespec64 time_delta; /* smallest time granularity */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index cc29dee508f7..717ecc87c9e7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -152,6 +152,8 @@ struct nfs_fsinfo { __u32 layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */ __u32 blksize; /* preferred pnfs io block size */ __u32 clone_blksize; /* granularity of a CLONE operation */ + enum nfs4_change_attr_type + change_attr_type; /* Info about change attr */ __u32 xattr_support; /* User xattrs supported */ }; -- cgit v1.2.3 From 63f9c44bca5e10fb1fd86aee7e38039ed98f95cc Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 11 Apr 2021 15:29:18 +0300 Subject: net/mlx5: Add MEMIC operations related bits Add the MEMIC operations bits and structures to the mlx5_ifc file. Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 432290b58a0b..47241ebfcf7d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -133,6 +133,7 @@ enum { MLX5_CMD_OP_PAGE_FAULT_RESUME = 0x204, MLX5_CMD_OP_ALLOC_MEMIC = 0x205, MLX5_CMD_OP_DEALLOC_MEMIC = 0x206, + MLX5_CMD_OP_MODIFY_MEMIC = 0x207, MLX5_CMD_OP_CREATE_EQ = 0x301, MLX5_CMD_OP_DESTROY_EQ = 0x302, MLX5_CMD_OP_QUERY_EQ = 0x303, @@ -1017,7 +1018,11 @@ struct mlx5_ifc_device_mem_cap_bits { u8 header_modify_sw_icm_start_address[0x40]; - u8 reserved_at_180[0x680]; + u8 reserved_at_180[0x80]; + + u8 memic_operations[0x20]; + + u8 reserved_at_220[0x5e0]; }; struct mlx5_ifc_device_event_cap_bits { @@ -10417,6 +10422,41 @@ struct mlx5_ifc_destroy_vport_lag_in_bits { u8 reserved_at_40[0x40]; }; +enum { + MLX5_MODIFY_MEMIC_OP_MOD_ALLOC, + MLX5_MODIFY_MEMIC_OP_MOD_DEALLOC, +}; + +struct mlx5_ifc_modify_memic_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x18]; + u8 memic_operation_type[0x8]; + + u8 memic_start_addr[0x40]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_modify_memic_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 memic_operation_addr[0x40]; + + u8 reserved_at_c0[0x140]; +}; + struct mlx5_ifc_alloc_memic_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; -- cgit v1.2.3 From 7ca2b8a378ca0d4ce52edc63d6b160467d8a10c1 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 11 Apr 2021 15:29:19 +0300 Subject: RDMA/uverbs: Make UVERBS_OBJECT_METHODS to consider line number In order to support multiple methods declaration in the same file we should use the line number as part of the name. Link: https://lore.kernel.org/r/20210411122924.60230-3-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_named_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/uverbs_named_ioctl.h b/include/rdma/uverbs_named_ioctl.h index f04f5126f61e..ee7873f872c3 100644 --- a/include/rdma/uverbs_named_ioctl.h +++ b/include/rdma/uverbs_named_ioctl.h @@ -20,7 +20,7 @@ /* These are static so they do not need to be qualified */ #define UVERBS_METHOD_ATTRS(method_id) _method_attrs_##method_id -#define UVERBS_OBJECT_METHODS(object_id) _object_methods_##object_id +#define UVERBS_OBJECT_METHODS(object_id) _UVERBS_NAME(_object_methods_##object_id, __LINE__) #define DECLARE_UVERBS_NAMED_METHOD(_method_id, ...) \ static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS( \ -- cgit v1.2.3 From cea85fa5dbc2e0206b58095c0c12ff035b11d129 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 11 Apr 2021 15:29:23 +0300 Subject: RDMA/mlx5: Add support in MEMIC operations MEMIC buffer, in addition to regular read and write operations, can support atomic operations from the host. Introduce and implement new UAPI to allocate address space for MEMIC operations such as atomic. This includes: 1. Expose new IOCTL for request mapping of MEMIC operation. 2. Hold the operations address in a list, so same operation to same DM will be allocated only once. 3. Manage refcount on the mlx5_ib_dm object, so it would be keep valid until all addresses were unmapped. Link: https://lore.kernel.org/r/20210411122924.60230-7-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 3fd9b380a091..422e51118445 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -41,6 +41,17 @@ enum mlx5_ib_create_flow_action_attrs { MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS = (1U << UVERBS_ID_NS_SHIFT), }; +enum mlx5_ib_dm_methods { + MLX5_IB_METHOD_DM_MAP_OP_ADDR = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_dm_map_op_addr_attrs { + MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP, + MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET, + MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX, +}; + enum mlx5_ib_alloc_dm_attrs { MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, -- cgit v1.2.3 From 18731642d4e1f73e446710389d3b01233d6fbc78 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 11 Apr 2021 15:29:24 +0300 Subject: RDMA/mlx5: Expose UAPI to query DM Expose UAPI to query MEMIC DM, this will let user space application that didn't allocate the DM but has access to by owning the matching command FD to retrieve its information. Link: https://lore.kernel.org/r/20210411122924.60230-8-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 422e51118445..42a0163bee82 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -43,6 +43,7 @@ enum mlx5_ib_create_flow_action_attrs { enum mlx5_ib_dm_methods { MLX5_IB_METHOD_DM_MAP_OP_ADDR = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_DM_QUERY, }; enum mlx5_ib_dm_map_op_addr_attrs { @@ -52,6 +53,13 @@ enum mlx5_ib_dm_map_op_addr_attrs { MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX, }; +enum mlx5_ib_query_dm_attrs { + MLX5_IB_ATTR_QUERY_DM_REQ_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_QUERY_DM_RESP_START_OFFSET, + MLX5_IB_ATTR_QUERY_DM_RESP_PAGE_INDEX, + MLX5_IB_ATTR_QUERY_DM_RESP_LENGTH, +}; + enum mlx5_ib_alloc_dm_attrs { MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, -- cgit v1.2.3 From 52a4c95f4d24b8bcb50745732f7b9f8513c49c5f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 25 Mar 2021 11:18:22 -0400 Subject: fuse: extend FUSE_SETXATTR request Fuse client needs to send additional information to file server when it calls SETXATTR(system.posix_acl_access), so add extra flags field to the structure. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 54442612c48b..390f7b4c889d 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -179,6 +179,7 @@ * 7.33 * - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID * - add FUSE_OPEN_KILL_SUIDGID + * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT */ #ifndef _LINUX_FUSE_H @@ -330,6 +331,7 @@ struct fuse_file_lock { * does not have CAP_FSETID. Additionally upon * write/truncate sgid is killed only if file has group * execute permission. (Same as Linux VFS behavior). + * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -360,6 +362,7 @@ struct fuse_file_lock { #define FUSE_MAP_ALIGNMENT (1 << 26) #define FUSE_SUBMOUNTS (1 << 27) #define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) +#define FUSE_SETXATTR_EXT (1 << 29) /** * CUSE INIT request/reply flags @@ -681,9 +684,13 @@ struct fuse_fsync_in { uint32_t padding; }; +#define FUSE_COMPAT_SETXATTR_IN_SIZE 8 + struct fuse_setxattr_in { uint32_t size; uint32_t flags; + uint32_t setxattr_flags; + uint32_t padding; }; struct fuse_getxattr_in { -- cgit v1.2.3 From 550a7d3bc0c4049ef8d36ff4d9ed7082ee8cb5ec Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 25 Mar 2021 11:18:23 -0400 Subject: fuse: add a flag FUSE_SETXATTR_ACL_KILL_SGID to kill SGID When posix access ACL is set, it can have an effect on file mode and it can also need to clear SGID if. - None of caller's group/supplementary groups match file owner group. AND - Caller is not priviliged (No CAP_FSETID). As of now fuser server is responsible for changing the file mode as well. But it does not know whether to clear SGID or not. So add a flag FUSE_SETXATTR_ACL_KILL_SGID and send this info with SETXATTR to let file server know that sgid needs to be cleared as well. Reported-by: Luis Henriques Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 390f7b4c889d..271ae90a9bb7 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -180,6 +180,7 @@ * - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID * - add FUSE_OPEN_KILL_SUIDGID * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT + * - add FUSE_SETXATTR_ACL_KILL_SGID */ #ifndef _LINUX_FUSE_H @@ -454,6 +455,12 @@ struct fuse_file_lock { */ #define FUSE_OPEN_KILL_SUIDGID (1 << 0) +/** + * setxattr flags + * FUSE_SETXATTR_ACL_KILL_SGID: Clear SGID when system.posix_acl_access is set + */ +#define FUSE_SETXATTR_ACL_KILL_SGID (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ -- cgit v1.2.3 From ce62b114bbad9346641d16853c528ba01513e1b0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Mar 2018 15:01:18 -0500 Subject: NFS: Split attribute support out from the server capabilities There are lots of attributes, and they are crowding out the bit space. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index fbcdfd9f7a7f..d28d7a62864f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -191,6 +191,8 @@ struct nfs_server { dev_t s_dev; /* superblock dev numbers */ struct nfs_auth_info auth_info; /* parsed auth flavors */ + __u64 fattr_valid; /* Valid attributes */ + #ifdef CONFIG_NFS_FSCACHE struct nfs_fscache_key *fscache_key; /* unique key for superblock */ struct fscache_cookie *fscache; /* superblock cookie */ @@ -267,16 +269,7 @@ struct nfs_server { #define NFS_CAP_SYMLINKS (1U << 2) #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) -/* #define NFS_CAP_CHANGE_ATTR (1U << 5) */ #define NFS_CAP_LGOPEN (1U << 5) -#define NFS_CAP_FILEID (1U << 6) -#define NFS_CAP_MODE (1U << 7) -#define NFS_CAP_NLINK (1U << 8) -#define NFS_CAP_OWNER (1U << 9) -#define NFS_CAP_OWNER_GROUP (1U << 10) -#define NFS_CAP_ATIME (1U << 11) -#define NFS_CAP_CTIME (1U << 12) -#define NFS_CAP_MTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) -- cgit v1.2.3 From e936a5970ef596ff48fca72aa8200955753c543f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 Mar 2021 13:22:27 -0400 Subject: SUNRPC: Add tracepoint that fires when an RPC is retransmitted A separate tracepoint can be left enabled all the time to capture rare but important retransmission events. So for example: kworker/u26:3-568 [009] 156.967933: xprt_retransmit: task:44093@5 xid=0xa25dbc79 nfsv3 WRITE ntrans=2 Or, for example, enable all nfs and nfs4 tracepoints, and set up a trigger to disable tracing when xprt_retransmit fires to capture everything that leads up to it. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 036eb1f5c133..430b42f2351f 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1079,6 +1079,46 @@ TRACE_EVENT(xprt_transmit, __entry->seqno, __entry->status) ); +TRACE_EVENT(xprt_retransmit, + TP_PROTO( + const struct rpc_rqst *rqst + ), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(int, ntrans) + __field(int, version) + __string(progname, + rqst->rq_task->tk_client->cl_program->name) + __string(procedure, + rqst->rq_task->tk_msg.rpc_proc->p_name) + ), + + TP_fast_assign( + struct rpc_task *task = rqst->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->ntrans = rqst->rq_ntrans; + __assign_str(progname, + task->tk_client->cl_program->name) + __entry->version = task->tk_client->cl_vers; + __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + ), + + TP_printk( + "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d", + __entry->task_id, __entry->client_id, __entry->xid, + __get_str(progname), __entry->version, __get_str(procedure), + __entry->ntrans) +); + TRACE_EVENT(xprt_ping, TP_PROTO(const struct rpc_xprt *xprt, int status), -- cgit v1.2.3 From 6cf23783f750634e10daeede48b0f5f5d64ebf3a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 Mar 2021 16:03:08 -0400 Subject: SUNRPC: Remove trace_xprt_transmit_queued This tracepoint can crash when dereferencing snd_task because when some transports connect, they put a cookie in that field instead of a pointer to an rpc_task. BUG: KASAN: use-after-free in trace_event_raw_event_xprt_writelock_event+0x141/0x18e [sunrpc] Read of size 2 at addr ffff8881a83bd3a0 by task git/331872 CPU: 11 PID: 331872 Comm: git Tainted: G S 5.12.0-rc2-00007-g3ab6e585a7f9 #1453 Hardware name: Supermicro SYS-6028R-T/X10DRi, BIOS 1.1a 10/16/2015 Call Trace: dump_stack+0x9c/0xcf print_address_description.constprop.0+0x18/0x239 kasan_report+0x174/0x1b0 trace_event_raw_event_xprt_writelock_event+0x141/0x18e [sunrpc] xprt_prepare_transmit+0x8e/0xc1 [sunrpc] call_transmit+0x4d/0xc6 [sunrpc] Fixes: 9ce07ae5eb1d ("SUNRPC: Replace dprintk() call site in xprt_prepare_transmit") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 430b42f2351f..3c3c4c843c62 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1181,7 +1181,6 @@ DECLARE_EVENT_CLASS(xprt_writelock_event, DEFINE_WRITELOCK_EVENT(reserve_xprt); DEFINE_WRITELOCK_EVENT(release_xprt); -DEFINE_WRITELOCK_EVENT(transmit_queued); DECLARE_EVENT_CLASS(xprt_cong_event, TP_PROTO( -- cgit v1.2.3 From d3debfcc4e3f65f1370ad4ca2ab61e7f0ff683cd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Apr 2021 17:41:14 +0100 Subject: bug: Provide dummy version of bug_get_file_line() when !GENERIC_BUG Provide the missing dummy bug_get_file_line() implementation when GENENERIC_BUG isn't selected. Reported-by: kernel test robot Fixes: 26dbc7e299c7 ("bug: Factor out a getter for a bug's file line") Cc: Andrew Scull Signed-off-by: Marc Zyngier --- include/linux/bug.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index e3841bee4c8d..348acf2558f3 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -61,6 +61,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, return BUG_TRAP_TYPE_BUG; } +struct bug_entry; +static inline void bug_get_file_line(struct bug_entry *bug, const char **file, + unsigned int *line) +{ + *file = NULL; + *line = 0; +} static inline void generic_bug_clear_once(void) {} -- cgit v1.2.3 From 80e5d1ff5d5f1ed5167a69b7c2fe86071b615f6b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Apr 2021 19:46:50 -0400 Subject: useful constants: struct qstr for ".." Signed-off-by: Al Viro --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4ecde5d8250c..9e23d33bb6f1 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -59,6 +59,7 @@ struct qstr { extern const struct qstr empty_name; extern const struct qstr slash_name; +extern const struct qstr dotdot_name; struct dentry_stat_t { long nr_dentry; -- cgit v1.2.3 From 42eb0d54c08a0331d6d295420f602237968d792b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Mar 2021 09:22:09 +0100 Subject: fs: split receive_fd_replace from __receive_fd receive_fd_replace shares almost no code with the general case, so split it out. Also remove the "Bump the sock usage counts" comment from both copies, as that is now what __receive_sock actually does. [AV: ... and make the only user of receive_fd_replace() choose between it and receive_fd() according to what userland had passed to it in flags] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/file.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index 225982792fa2..2de2e4613d7b 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -92,23 +92,20 @@ extern void put_unused_fd(unsigned int fd); extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(int fd, struct file *file, int __user *ufd, +extern int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { if (ufd == NULL) return -EFAULT; - return __receive_fd(-1, file, ufd, o_flags); + return __receive_fd(file, ufd, o_flags); } static inline int receive_fd(struct file *file, unsigned int o_flags) { - return __receive_fd(-1, file, NULL, o_flags); -} -static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags) -{ - return __receive_fd(fd, file, NULL, o_flags); + return __receive_fd(file, NULL, o_flags); } +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); -- cgit v1.2.3 From c0aec6680b6c82fe893a546e322e1130cd5cf21e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 1 Apr 2021 14:56:25 +0100 Subject: iommu: Statically set module owner It happens that the 3 drivers which first supported being modular are also ones which play games with their pgsize_bitmap, so have non-const iommu_ops where dynamically setting the owner manages to work out OK. However, it's less than ideal to force that upon all drivers which want to be modular - like the new sprd-iommu driver which now has a potential bug in that regard - so let's just statically set the module owner and let ops remain const wherever possible. Reviewed-by: Christoph Hellwig Acked-by: Will Deacon Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/31423b99ff609c3d4b291c701a7a7a810d9ce8dc.1617285386.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9223a8266b08..ce904cf4e774 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -361,19 +361,12 @@ int iommu_device_link(struct iommu_device *iommu, struct device *link); void iommu_device_unlink(struct iommu_device *iommu, struct device *link); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain); -static inline void __iommu_device_set_ops(struct iommu_device *iommu, +static inline void iommu_device_set_ops(struct iommu_device *iommu, const struct iommu_ops *ops) { iommu->ops = ops; } -#define iommu_device_set_ops(iommu, ops) \ -do { \ - struct iommu_ops *__ops = (struct iommu_ops *)(ops); \ - __ops->owner = THIS_MODULE; \ - __iommu_device_set_ops(iommu, __ops); \ -} while (0) - static inline void iommu_device_set_fwnode(struct iommu_device *iommu, struct fwnode_handle *fwnode) { -- cgit v1.2.3 From 2d471b20c55e13c98d1dba413bf2de618e89cdac Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 1 Apr 2021 14:56:26 +0100 Subject: iommu: Streamline registration interface Rather than have separate opaque setter functions that are easy to overlook and lead to repetitive boilerplate in drivers, let's pass the relevant initialisation parameters directly to iommu_device_register(). Acked-by: Will Deacon Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/ab001b87c533b6f4db71eb90db6f888953986c36.1617285386.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ce904cf4e774..32d448050bf7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -350,7 +350,9 @@ struct dev_iommu { void *priv; }; -int iommu_device_register(struct iommu_device *iommu); +int iommu_device_register(struct iommu_device *iommu, + const struct iommu_ops *ops, + struct device *hwdev); void iommu_device_unregister(struct iommu_device *iommu); int iommu_device_sysfs_add(struct iommu_device *iommu, struct device *parent, @@ -361,18 +363,6 @@ int iommu_device_link(struct iommu_device *iommu, struct device *link); void iommu_device_unlink(struct iommu_device *iommu, struct device *link); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain); -static inline void iommu_device_set_ops(struct iommu_device *iommu, - const struct iommu_ops *ops) -{ - iommu->ops = ops; -} - -static inline void iommu_device_set_fwnode(struct iommu_device *iommu, - struct fwnode_handle *fwnode) -{ - iommu->fwnode = fwnode; -} - static inline struct iommu_device *dev_to_iommu_device(struct device *dev) { return (struct iommu_device *)dev_get_drvdata(dev); @@ -858,21 +848,13 @@ static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain, return 0; } -static inline int iommu_device_register(struct iommu_device *iommu) +static inline int iommu_device_register(struct iommu_device *iommu, + const struct iommu_ops *ops, + struct device *hwdev) { return -ENODEV; } -static inline void iommu_device_set_ops(struct iommu_device *iommu, - const struct iommu_ops *ops) -{ -} - -static inline void iommu_device_set_fwnode(struct iommu_device *iommu, - struct fwnode_handle *fwnode) -{ -} - static inline struct iommu_device *dev_to_iommu_device(struct device *dev) { return NULL; -- cgit v1.2.3 From 7f100744749e4fe547dece3bb6557fae5f0a7252 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Fri, 16 Apr 2021 19:15:37 +0530 Subject: PCI: tegra: Add Tegra194 MCFG quirks for ECAM errata The PCIe controller in Tegra194 SoC is not ECAM-compliant. With the current hardware design, ECAM can be enabled only for one controller (the C5 controller) with bus numbers starting from 160 instead of 0. A different approach is taken to avoid this abnormal way of enabling ECAM for just one controller but to enable configuration space access for all the other controllers. In this approach, ops are added through MCFG quirk mechanism which access the configuration spaces by dynamically programming iATU (internal AddressTranslation Unit) to generate respective configuration accesses just like the way it is done in DesignWare core sub-system. This issue is specific to Tegra194 and it would be fixed in the future generations of Tegra SoCs. Link: https://lore.kernel.org/r/20210416134537.19474-1-vidyas@nvidia.com Signed-off-by: Vidya Sagar Signed-off-by: Bjorn Helgaas --- include/linux/pci-ecam.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 65d3d83015c3..fbdadd4d8377 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -85,6 +85,7 @@ extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ +extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */ #endif #if IS_ENABLED(CONFIG_PCI_HOST_COMMON) -- cgit v1.2.3 From 5f7c292b8975c9146063abbb91c0b9cdc1a5e9c5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:47 -0700 Subject: KVM: Move prototypes for MMU notifier callbacks to generic code Move the prototypes for the MMU notifier callbacks out of arch code and into common code. There is no benefit to having each arch replicate the prototypes since any deviation from the invocation in common code will explode. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1b65e7204344..e6d77353025c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -218,6 +218,14 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end, unsigned flags); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#endif + enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, -- cgit v1.2.3 From 501b918525efec2e701e806f04d474d7da350962 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:48 -0700 Subject: KVM: Move arm64's MMU notifier trace events to generic code Move arm64's MMU notifier trace events into common code in preparation for doing the hva->gfn lookup in common code. The alternative would be to trace the gfn instead of hva, but that's not obviously better and could also be done in common code. Tracing the notifiers is also quite handy for debug regardless of architecture. Remove a completely redundant tracepoint from PPC e500. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/trace/events/kvm.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include') diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 49d7d0fe29f6..200eb7465530 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -462,6 +462,72 @@ TRACE_EVENT(kvm_dirty_ring_exit, TP_printk("vcpu %d", __entry->vcpu_id) ); +TRACE_EVENT(kvm_unmap_hva_range, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_set_spte_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) +); + +TRACE_EVENT(kvm_age_hva, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mmu notifier age hva: %#016lx -- %#016lx", + __entry->start, __entry->end) +); + +TRACE_EVENT(kvm_test_age_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 6dfbd6b5d5de19bad36f44710359200f21191134 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 19:19:57 -0700 Subject: KVM: x86/mmu: Drop trace_kvm_age_page() tracepoint Remove x86's trace_kvm_age_page() tracepoint. It's mostly redundant with the common trace_kvm_age_hva() tracepoint, and if there is a need for the extra details, e.g. gfn, referenced, etc... those details should be added to the common tracepoint so that all architectures and MMUs benefit from the info. Signed-off-by: Sean Christopherson Message-Id: <20210326021957.1424875-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/trace/events/kvm.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include') diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 200eb7465530..37e1e1a2d67d 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -255,30 +255,6 @@ TRACE_EVENT(kvm_fpu, TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) ); -TRACE_EVENT(kvm_age_page, - TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref), - TP_ARGS(gfn, level, slot, ref), - - TP_STRUCT__entry( - __field( u64, hva ) - __field( u64, gfn ) - __field( u8, level ) - __field( u8, referenced ) - ), - - TP_fast_assign( - __entry->gfn = gfn; - __entry->level = level; - __entry->hva = ((gfn - slot->base_gfn) << - PAGE_SHIFT) + slot->userspace_addr; - __entry->referenced = ref; - ), - - TP_printk("hva %llx gfn %llx level %u %s", - __entry->hva, __entry->gfn, __entry->level, - __entry->referenced ? "YOUNG" : "OLD") -); - #ifdef CONFIG_KVM_ASYNC_PF DECLARE_EVENT_CLASS(kvm_async_get_page_class, -- cgit v1.2.3 From 8b13c36493d8cb56fc3b386507873c5412b7108d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Apr 2021 10:24:07 -0400 Subject: KVM: introduce KVM_CAP_SET_GUEST_DEBUG2 This capability will allow the user to know which KVM_GUESTDBG_* bits are supported. Signed-off-by: Maxim Levitsky Message-Id: <20210401135451.1004564-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6afee209620..727010788eff 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_SET_GUEST_DEBUG2 195 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 6c9dd6d26216ad9733e57f382e1669c142494aab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Apr 2021 17:53:09 +0200 Subject: KVM: constify kvm_arch_flush_remote_tlbs_memslot memslots are stored in RCU and there should be no need to change them. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e6d77353025c..34a974ffc882 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -894,7 +894,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); #else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, -- cgit v1.2.3 From 3039bcc744980afe87c612122e47a27306483bc2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:50 -0700 Subject: KVM: Move x86's MMU notifier memslot walkers to generic code Move the hva->gfn lookup for MMU notifiers into common code. Every arch does a similar lookup, and some arch code is all but identical across multiple architectures. In addition to consolidating code, this will allow introducing optimizations that will benefit all architectures without incurring multiple walks of the memslots, e.g. by taking mmu_lock if and only if a relevant range exists in the memslots. The use of __always_inline to avoid indirect call retpolines, as done by x86, may also benefit other architectures. Consolidating the lookups also fixes a wart in x86, where the legacy MMU and TDP MMU each do their own memslot walks. Lastly, future enhancements to the memslot implementation, e.g. to add an interval tree to track host address, will need to touch far less arch specific code. MIPS, PPC, and arm64 will be converted one at a time in future patches. Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34a974ffc882..0228436bc446 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -219,11 +219,25 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS +struct kvm_gfn_range { + struct kvm_memory_slot *slot; + gfn_t start; + gfn_t end; + pte_t pte; + bool may_block; +}; +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +#else int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags); int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ #endif enum { -- cgit v1.2.3 From b4c5936c47f86295cc76672e8dbeeca8b2379ba6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Apr 2021 17:56:54 -0700 Subject: KVM: Kill off the old hva-based MMU notifier callbacks Yank out the hva-based MMU notifier APIs now that all architectures that use the notifiers have moved to the gfn-based APIs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210402005658.3024832-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0228436bc446..6b4dd9500d70 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -219,7 +219,6 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef KVM_ARCH_WANT_MMU_NOTIFIER -#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; @@ -231,13 +230,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -#else -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */ #endif enum { -- cgit v1.2.3 From 8ca6f063b73d3754213d009efb3df486c8fe52d2 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Thu, 1 Apr 2021 16:37:24 -0700 Subject: KVM: x86/mmu: Re-add const qualifier in kvm_tdp_mmu_zap_collapsible_sptes kvm_tdp_mmu_zap_collapsible_sptes unnecessarily removes the const qualifier from its memlsot argument, leading to a compiler warning. Add the const annotation and pass it to subsequent functions. Signed-off-by: Ben Gardon Message-Id: <20210401233736.638171-2-bgardon@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b4dd9500d70..d17e3ff1138d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1130,7 +1130,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) } static inline unsigned long -__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; } -- cgit v1.2.3 From 7a35693adcd38664b852ad10e3742782b3e87987 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 7 Apr 2021 14:25:22 +0100 Subject: dm: replace dm_vcalloc() Use kvcalloc or kvmalloc_array instead (depending whether zeroing is useful). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5c641f930caf..ff700fb6ce1d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -574,11 +574,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, */ void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm); -/* - * A wrapper around vmalloc. - */ -void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); - /*----------------------------------------------------------------- * Macros. *---------------------------------------------------------------*/ -- cgit v1.2.3 From 5d3c4c79384af06e3c8e25b7770b6247496b4417 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 15:20:49 -0700 Subject: KVM: Stop looking for coalesced MMIO zones if the bus is destroyed Abort the walk of coalesced MMIO zones if kvm_io_bus_unregister_dev() fails to allocate memory for the new instance of the bus. If it can't instantiate a new bus, unregister_dev() destroys all devices _except_ the target device. But, it doesn't tell the caller that it obliterated the bus and invoked the destructor for all devices that were on the bus. In the coalesced MMIO case, this can result in a deleted list entry dereference due to attempting to continue iterating on coalesced_zones after future entries (in the walk) have been deleted. Opportunistically add curly braces to the for-loop, which encompasses many lines but sneaks by without braces due to the guts being a single if statement. Fixes: f65886606c2d ("KVM: fix memory leak in kvm_io_bus_unregister_dev()") Cc: stable@vger.kernel.org Reported-by: Hao Sun Signed-off-by: Sean Christopherson Message-Id: <20210412222050.876100-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d17e3ff1138d..82b066db37cb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -192,8 +192,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); -void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); -- cgit v1.2.3 From fe7e948837f312d87853b3fce743795d1ae3715a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Apr 2021 16:21:43 +1200 Subject: KVM: x86: Add capability to grant VM access to privileged SGX attribute Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace to grant a VM access to a priveleged attribute, with args[0] holding a file handle to a valid SGX attribute file. The SGX subsystem restricts access to a subset of enclave attributes to provide additional security for an uncompromised kernel, e.g. to prevent malware from using the PROVISIONKEY to ensure its nodes are running inside a geniune SGX enclave and/or to obtain a stable fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM restricts guest access to privileged attributes by default. Cc: Andy Lutomirski Signed-off-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <0b099d65e933e068e3ea934b0523bab070cb8cea.1618196135.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 727010788eff..162b7e044c8b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1079,6 +1079,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 #define KVM_CAP_SET_GUEST_DEBUG2 195 +#define KVM_CAP_SGX_ATTRIBUTE 196 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 3a05d08f6cc75b74079290c33d6127b2857226fa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:38 +0100 Subject: PCI/MSI: Drop use of msi_controller from core code As there is no driver using msi_controller, we can now safely remove its use from the PCI probe code. Link: https://lore.kernel.org/r/20210330151145.997953-8-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..ebf557e59d87 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -540,7 +540,6 @@ struct pci_host_bridge { int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); void *release_data; - struct msi_controller *msi; unsigned int ignore_reset_delay:1; /* For entire hierarchy */ unsigned int no_ext_tags:1; /* No Extended Tags */ unsigned int native_aer:1; /* OS may use PCIe AER */ @@ -621,7 +620,6 @@ struct pci_bus { struct resource busn_res; /* Bus numbers routed to this bus */ struct pci_ops *ops; /* Configuration access functions */ - struct msi_controller *msi; /* MSI controller */ void *sysdata; /* Hook for sys-specific extension */ struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */ -- cgit v1.2.3 From b227be0d7314d0869d4e28c199ac1fc7075cf06e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:39 +0100 Subject: PCI/MSI: Kill msi_controller structure msi_controller had a good, long life as the abstraction for a driver providing MSIs to PCI devices. But it has been replaced in all drivers by the more expressive generic MSI framework. Farewell, struct msi_controller. Link: https://lore.kernel.org/r/20210330151145.997953-9-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/msi.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/msi.h b/include/linux/msi.h index aef35fd1cf11..3f21e77b57b7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -240,8 +240,7 @@ void pci_msi_unmask_irq(struct irq_data *data); /* * The arch hooks to setup up msi irqs. Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific - * code if needed. These hooks must be enabled by the architecture or by - * drivers which depend on them via msi_controller based MSI handling. + * code if needed. These hooks can only be enabled by the architecture. * * If CONFIG_PCI_MSI_ARCH_FALLBACKS is not selected they are replaced by * stubs with warnings. @@ -272,19 +271,6 @@ static inline void arch_teardown_msi_irqs(struct pci_dev *dev) void arch_restore_msi_irqs(struct pci_dev *dev); void default_restore_msi_irqs(struct pci_dev *dev); -struct msi_controller { - struct module *owner; - struct device *dev; - struct device_node *of_node; - struct list_head list; - - int (*setup_irq)(struct msi_controller *chip, struct pci_dev *dev, - struct msi_desc *desc); - int (*setup_irqs)(struct msi_controller *chip, struct pci_dev *dev, - int nvec, int type); - void (*teardown_irq)(struct msi_controller *chip, unsigned int irq); -}; - #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN #include -- cgit v1.2.3 From f8bcf249d9cf292c6ceb3d9f5bd90815090f5286 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:40 +0100 Subject: PCI/MSI: Kill default_teardown_msi_irqs() It doesn't have any caller left. Link: https://lore.kernel.org/r/20210330151145.997953-10-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/msi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/msi.h b/include/linux/msi.h index 3f21e77b57b7..6aff469e511d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -250,7 +250,6 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -void default_teardown_msi_irqs(struct pci_dev *dev); #else static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { -- cgit v1.2.3 From 94e89b145371b68fa0ea294855adebcd03e0522e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:41 +0100 Subject: PCI/MSI: Let PCI host bridges declare their reliance on MSI domains There is a whole class of host bridges that cannot know whether MSIs will be provided or not, as they rely on other blocks to provide the MSI functionnality, using MSI domains. This is the case for example on systems that use the ARM GIC architecture. Introduce a new attribute ('msi_domain') indicating that implicit dependency, and use this property to set the NO_MSI flag when no MSI domain is found at probe time. Link: https://lore.kernel.org/r/20210330151145.997953-11-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index ebf557e59d87..ede0aef2cfd4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -550,6 +550,7 @@ struct pci_host_bridge { unsigned int native_dpc:1; /* OS may use PCIe DPC */ unsigned int preserve_config:1; /* Preserve FW resource setup */ unsigned int size_windows:1; /* Enable root bus sizing */ + unsigned int msi_domain:1; /* Bridge wants MSI domain */ /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, -- cgit v1.2.3 From 67880f1bc342ed4c94e72cad7f8ca76e5121aae3 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 20 Apr 2021 10:16:11 -0700 Subject: platform/chrome: cros_ec: Add Type C hard reset Update the EC command header to include the new event bit. This bit is included in the latest version of the Chrome EC headers[1]. [1] https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/main/include/ec_commands.h Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210420171617.3830902-1-pmalani@chromium.org --- include/linux/platform_data/cros_ec_commands.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 5ff8597ceabd..9156078c6fc6 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5678,6 +5678,7 @@ enum tcpc_cc_polarity { #define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) +#define PD_STATUS_EVENT_HARD_RESET BIT(2) struct ec_params_typec_status { uint8_t port; -- cgit v1.2.3 From 9a89d3ad6d39cbdc9de47f776fc7f7e4b1145c70 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Sun, 18 Apr 2021 13:59:21 +0300 Subject: RDMA/mlx5: Expose private query port Expose a non standard query port via IOCTL that will be used to expose port attributes that are specific to mlx5 devices. The new interface receives a port number to query and returns a structure that contains the available attributes for that port. This will be used to fill the gap between pure DEVX use cases and use cases where a kernel needs to inform userspace about various kernel driver configurations that userspace must use in order to work correctly. Flags is used to indicate which fields are valid on return. MLX5_IB_UAPI_QUERY_PORT_VPORT: The vport number of the queered port. MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID: The VHCA ID of the vport of the queered port. MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_RX: The vport's RX ICM address used for sw steering. MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_TX: The vport's TX ICM address used for sw steering. MLX5_IB_UAPI_QUERY_PORT_VPORT_REG_C0: The metadata used to tag egress packets of the vport. MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID: The E-Switch owner vhca id of the vport. Link: https://lore.kernel.org/r/6e2ef13e5a266a6c037eb0105eb1564c7bb52f23.1618743394.git.leonro@nvidia.com Reviewed-by: Maor Gottlieb Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 +++++++++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 25 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 3798cbcb9021..ca2372864b70 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -320,4 +320,13 @@ enum mlx5_ib_pd_methods { }; +enum mlx5_ib_device_methods { + MLX5_IB_METHOD_QUERY_PORT = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_query_port_attrs { + MLX5_IB_ATTR_QUERY_PORT_PORT_NUM = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_QUERY_PORT, +}; + #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 56b26eaea083..a21ca8ece8db 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -83,5 +83,30 @@ enum mlx5_ib_uapi_uar_alloc_type { MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC = 0x1, }; +enum mlx5_ib_uapi_query_port_flags { + MLX5_IB_UAPI_QUERY_PORT_VPORT = 1 << 0, + MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID = 1 << 1, + MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_RX = 1 << 2, + MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_TX = 1 << 3, + MLX5_IB_UAPI_QUERY_PORT_VPORT_REG_C0 = 1 << 4, + MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID = 1 << 5, +}; + +struct mlx5_ib_uapi_reg { + __u32 value; + __u32 mask; +}; + +struct mlx5_ib_uapi_query_port { + __aligned_u64 flags; + __u16 vport; + __u16 vport_vhca_id; + __u16 esw_owner_vhca_id; + __u16 rsvd0; + __aligned_u64 vport_steering_icm_rx; + __aligned_u64 vport_steering_icm_tx; + struct mlx5_ib_uapi_reg reg_c0; +}; + #endif -- cgit v1.2.3 From d99f2487e1de23a2e902d1a359a85a48bfd21fe7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Apr 2021 07:48:43 -0400 Subject: NFS: The 'fattr_valid' field in struct nfs_server should be unsigned int Fix up a static compiler warning: "fs/nfs/nfs4proc.c:3882 _nfs4_server_capabilities() warn: was expecting a 64 bit value instead of '(1 << 11)'" The fix is to convert the fattr_valid field to match the type of the 'valid' field in struct nfs_fattr. Reported-by: Dan Carpenter Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d28d7a62864f..70057b2e606e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -156,6 +156,7 @@ struct nfs_server { #define NFS_MOUNT_WRITE_EAGER 0x01000000 #define NFS_MOUNT_WRITE_WAIT 0x02000000 + unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ @@ -191,8 +192,6 @@ struct nfs_server { dev_t s_dev; /* superblock dev numbers */ struct nfs_auth_info auth_info; /* parsed auth flavors */ - __u64 fattr_valid; /* Valid attributes */ - #ifdef CONFIG_NFS_FSCACHE struct nfs_fscache_key *fscache_key; /* unique key for superblock */ struct fscache_cookie *fscache; /* superblock cookie */ -- cgit v1.2.3 From 54526d1fd59338fd6a381dbd806b7ccbae3aa4aa Mon Sep 17 00:00:00 2001 From: Nathan Tempelman Date: Thu, 8 Apr 2021 22:32:14 +0000 Subject: KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 82b066db37cb..c306b4c3674e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -654,6 +654,7 @@ void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); +bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 162b7e044c8b..37f0a329da6a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1080,6 +1080,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_DAWR1 194 #define KVM_CAP_SET_GUEST_DEBUG2 195 #define KVM_CAP_SGX_ATTRIBUTE 196 +#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 52acd22faa1af8a0514ccd075a6978ac97986425 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 16 Apr 2021 11:08:10 +0800 Subject: KVM: Boost vCPU candidate in user mode which is delivering interrupt Both lock holder vCPU and IPI receiver that has halted are condidate for boost. However, the PLE handler was originally designed to deal with the lock holder preemption problem. The Intel PLE occurs when the spinlock waiter is in kernel mode. This assumption doesn't hold for IPI receiver, they can be in either kernel or user mode. the vCPU candidate in user mode will not be boosted even if they should respond to IPIs. Some benchmarks like pbzip2, swaptions etc do the TLB shootdown in kernel mode and most of the time they are running in user mode. It can lead to a large number of continuous PLE events because the IPI sender causes PLE events repeatedly until the receiver is scheduled while the receiver is not candidate for a boost. This patch boosts the vCPU candidiate in user mode which is delivery interrupt. We can observe the speed of pbzip2 improves 10% in 96 vCPUs VM in over-subscribe scenario (The host machine is 2 socket, 48 cores, 96 HTs Intel CLX box). There is no performance regression for other benchmarks like Unixbench spawn (most of the time contend read/write lock in kernel mode), ebizzy (most of the time contend read/write sem and TLB shoodtdown in kernel mode). Signed-off-by: Wanpeng Li Message-Id: <1618542490-14756-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c306b4c3674e..8895b95b6a22 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -960,6 +960,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); -- cgit v1.2.3 From 4cfdd47d6d95aca4fb8d6cfbe73392472d353f82 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 15 Apr 2021 15:53:14 +0000 Subject: KVM: SVM: Add KVM_SEV SEND_START command The command is used to create an outgoing SEV guest encryption context. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Steve Rutherford Reviewed-by: Venu Busireddy Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-Id: <2f1686d0164e0f1b3d6a41d620408393e0a48376.1618498113.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/psp-sev.h | 8 ++++---- include/uapi/linux/kvm.h | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index b801ead1e2bb..73da511b9423 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -326,11 +326,11 @@ struct sev_data_send_start { u64 pdh_cert_address; /* In */ u32 pdh_cert_len; /* In */ u32 reserved1; - u64 plat_cert_address; /* In */ - u32 plat_cert_len; /* In */ + u64 plat_certs_address; /* In */ + u32 plat_certs_len; /* In */ u32 reserved2; - u64 amd_cert_address; /* In */ - u32 amd_cert_len; /* In */ + u64 amd_certs_address; /* In */ + u32 amd_certs_len; /* In */ u32 reserved3; u64 session_address; /* In */ u32 session_len; /* In/Out */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 37f0a329da6a..11b3e528d35d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1732,6 +1732,18 @@ struct kvm_sev_attestation_report { __u32 len; }; +struct kvm_sev_send_start { + __u32 policy; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u64 session_uaddr; + __u32 session_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- cgit v1.2.3 From d3d1af85e2c75bb57da51535a6e182c7c45eceb0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 15 Apr 2021 15:53:55 +0000 Subject: KVM: SVM: Add KVM_SEND_UPDATE_DATA command The command is used for encrypting the guest memory region using the encryption context created with KVM_SEV_SEND_START. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by : Steve Rutherford Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-Id: Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 11b3e528d35d..d07be69f402f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1744,6 +1744,15 @@ struct kvm_sev_send_start { __u32 session_len; }; +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- cgit v1.2.3 From 5569e2e7a650dfffd4df7635662b2f92162d6501 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Tue, 20 Apr 2021 05:01:20 -0400 Subject: KVM: SVM: Add support for KVM_SEV_SEND_CANCEL command After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the SEND_CANCEL command to stop a migration. This is necessary so that a cancelled migration can restart with a new target later. Reviewed-by: Nathan Tempelman Reviewed-by: Brijesh Singh Signed-off-by: Steve Rutherford Message-Id: <20210412194408.2458827-1-srutherford@google.com> Signed-off-by: Paolo Bonzini --- include/linux/psp-sev.h | 10 ++++++++++ include/uapi/linux/kvm.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 73da511b9423..d48a7192e881 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -73,6 +73,7 @@ enum sev_cmd { SEV_CMD_SEND_UPDATE_DATA = 0x041, SEV_CMD_SEND_UPDATE_VMSA = 0x042, SEV_CMD_SEND_FINISH = 0x043, + SEV_CMD_SEND_CANCEL = 0x044, /* Guest migration commands (incoming) */ SEV_CMD_RECEIVE_START = 0x050, @@ -392,6 +393,15 @@ struct sev_data_send_finish { u32 handle; /* In */ } __packed; +/** + * struct sev_data_send_cancel - SEND_CANCEL command parameters + * + * @handle: handle of the VM to process + */ +struct sev_data_send_cancel { + u32 handle; /* In */ +} __packed; + /** * struct sev_data_receive_start - RECEIVE_START command parameters * diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d07be69f402f..5d1adeb13739 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1674,6 +1674,8 @@ enum sev_cmd_id { KVM_SEV_CERT_EXPORT, /* Attestation report */ KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, KVM_SEV_NR_MAX, }; -- cgit v1.2.3 From af43cbbf954b50ca97d5e7bb56c2edc6ffd209ef Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 15 Apr 2021 15:54:50 +0000 Subject: KVM: SVM: Add support for KVM_SEV_RECEIVE_START command The command is used to create the encryption context for an incoming SEV guest. The encryption context can be later used by the hypervisor to import the incoming data into the SEV guest memory space. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Steve Rutherford Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-Id: Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5d1adeb13739..248d1d266bc3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1755,6 +1755,15 @@ struct kvm_sev_send_update_data { __u32 trans_len; }; +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u64 session_uaddr; + __u32 session_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- cgit v1.2.3 From 15fb7de1a7f5af0d5910ca4352b26f887543e26e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 15 Apr 2021 15:55:17 +0000 Subject: KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command The command is used for copying the incoming buffer into the SEV guest memory space. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Steve Rutherford Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Message-Id: Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 248d1d266bc3..d76533498543 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1764,6 +1764,15 @@ struct kvm_sev_receive_start { __u32 session_len; }; +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- cgit v1.2.3 From 7f7663899d9429476db74d8aceb042fe4a3756b7 Mon Sep 17 00:00:00 2001 From: Zhiyong Tao Date: Tue, 13 Apr 2021 13:56:59 +0800 Subject: dt-bindings: pinctrl: mt8195: add pinctrl file and binding document 1. This patch adds pinctrl file for mt8195. 2. This patch adds mt8195 compatible node in binding document. Signed-off-by: Zhiyong Tao Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210413055702.27535-2-zhiyong.tao@mediatek.com Signed-off-by: Linus Walleij --- include/dt-bindings/pinctrl/mt8195-pinfunc.h | 962 +++++++++++++++++++++++++++ 1 file changed, 962 insertions(+) create mode 100644 include/dt-bindings/pinctrl/mt8195-pinfunc.h (limited to 'include') diff --git a/include/dt-bindings/pinctrl/mt8195-pinfunc.h b/include/dt-bindings/pinctrl/mt8195-pinfunc.h new file mode 100644 index 000000000000..666331bb9b40 --- /dev/null +++ b/include/dt-bindings/pinctrl/mt8195-pinfunc.h @@ -0,0 +1,962 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 MediaTek Inc. + * Author: Zhiyong Tao + */ + +#ifndef __MT8195_PINFUNC_H +#define __MT8195_PINFUNC_H + +#include "mt65xx.h" + +#define PINMUX_GPIO0__FUNC_GPIO0 (MTK_PIN_NO(0) | 0) +#define PINMUX_GPIO0__FUNC_TP_GPIO0_AO (MTK_PIN_NO(0) | 1) +#define PINMUX_GPIO0__FUNC_MSDC2_CMD (MTK_PIN_NO(0) | 2) +#define PINMUX_GPIO0__FUNC_TDMIN_MCK (MTK_PIN_NO(0) | 3) +#define PINMUX_GPIO0__FUNC_CLKM0 (MTK_PIN_NO(0) | 4) +#define PINMUX_GPIO0__FUNC_PERSTN_1 (MTK_PIN_NO(0) | 5) +#define PINMUX_GPIO0__FUNC_IDDIG_1P (MTK_PIN_NO(0) | 6) +#define PINMUX_GPIO0__FUNC_DMIC4_CLK (MTK_PIN_NO(0) | 7) + +#define PINMUX_GPIO1__FUNC_GPIO1 (MTK_PIN_NO(1) | 0) +#define PINMUX_GPIO1__FUNC_TP_GPIO1_AO (MTK_PIN_NO(1) | 1) +#define PINMUX_GPIO1__FUNC_MSDC2_CLK (MTK_PIN_NO(1) | 2) +#define PINMUX_GPIO1__FUNC_TDMIN_DI (MTK_PIN_NO(1) | 3) +#define PINMUX_GPIO1__FUNC_CLKM1 (MTK_PIN_NO(1) | 4) +#define PINMUX_GPIO1__FUNC_CLKREQN_1 (MTK_PIN_NO(1) | 5) +#define PINMUX_GPIO1__FUNC_USB_DRVVBUS_1P (MTK_PIN_NO(1) | 6) +#define PINMUX_GPIO1__FUNC_DMIC4_DAT (MTK_PIN_NO(1) | 7) + +#define PINMUX_GPIO2__FUNC_GPIO2 (MTK_PIN_NO(2) | 0) +#define PINMUX_GPIO2__FUNC_TP_GPIO2_AO (MTK_PIN_NO(2) | 1) +#define PINMUX_GPIO2__FUNC_MSDC2_DAT3 (MTK_PIN_NO(2) | 2) +#define PINMUX_GPIO2__FUNC_TDMIN_LRCK (MTK_PIN_NO(2) | 3) +#define PINMUX_GPIO2__FUNC_CLKM2 (MTK_PIN_NO(2) | 4) +#define PINMUX_GPIO2__FUNC_WAKEN_1 (MTK_PIN_NO(2) | 5) +#define PINMUX_GPIO2__FUNC_DMIC2_CLK (MTK_PIN_NO(2) | 7) + +#define PINMUX_GPIO3__FUNC_GPIO3 (MTK_PIN_NO(3) | 0) +#define PINMUX_GPIO3__FUNC_TP_GPIO3_AO (MTK_PIN_NO(3) | 1) +#define PINMUX_GPIO3__FUNC_MSDC2_DAT0 (MTK_PIN_NO(3) | 2) +#define PINMUX_GPIO3__FUNC_TDMIN_BCK (MTK_PIN_NO(3) | 3) +#define PINMUX_GPIO3__FUNC_CLKM3 (MTK_PIN_NO(3) | 4) +#define PINMUX_GPIO3__FUNC_DMIC2_DAT (MTK_PIN_NO(3) | 7) + +#define PINMUX_GPIO4__FUNC_GPIO4 (MTK_PIN_NO(4) | 0) +#define PINMUX_GPIO4__FUNC_TP_GPIO4_AO (MTK_PIN_NO(4) | 1) +#define PINMUX_GPIO4__FUNC_MSDC2_DAT2 (MTK_PIN_NO(4) | 2) +#define PINMUX_GPIO4__FUNC_SPDIF_IN1 (MTK_PIN_NO(4) | 3) +#define PINMUX_GPIO4__FUNC_UTXD3 (MTK_PIN_NO(4) | 4) +#define PINMUX_GPIO4__FUNC_SDA2 (MTK_PIN_NO(4) | 5) +#define PINMUX_GPIO4__FUNC_IDDIG_2P (MTK_PIN_NO(4) | 7) + +#define PINMUX_GPIO5__FUNC_GPIO5 (MTK_PIN_NO(5) | 0) +#define PINMUX_GPIO5__FUNC_TP_GPIO5_AO (MTK_PIN_NO(5) | 1) +#define PINMUX_GPIO5__FUNC_MSDC2_DAT1 (MTK_PIN_NO(5) | 2) +#define PINMUX_GPIO5__FUNC_SPDIF_IN0 (MTK_PIN_NO(5) | 3) +#define PINMUX_GPIO5__FUNC_URXD3 (MTK_PIN_NO(5) | 4) +#define PINMUX_GPIO5__FUNC_SCL2 (MTK_PIN_NO(5) | 5) +#define PINMUX_GPIO5__FUNC_USB_DRVVBUS_2P (MTK_PIN_NO(5) | 7) + +#define PINMUX_GPIO6__FUNC_GPIO6 (MTK_PIN_NO(6) | 0) +#define PINMUX_GPIO6__FUNC_TP_GPIO6_AO (MTK_PIN_NO(6) | 1) +#define PINMUX_GPIO6__FUNC_DP_TX_HPD (MTK_PIN_NO(6) | 2) +#define PINMUX_GPIO6__FUNC_I2SO1_D4 (MTK_PIN_NO(6) | 3) +#define PINMUX_GPIO6__FUNC_UTXD4 (MTK_PIN_NO(6) | 4) +#define PINMUX_GPIO6__FUNC_CMVREF3 (MTK_PIN_NO(6) | 5) +#define PINMUX_GPIO6__FUNC_DMIC3_CLK (MTK_PIN_NO(6) | 7) + +#define PINMUX_GPIO7__FUNC_GPIO7 (MTK_PIN_NO(7) | 0) +#define PINMUX_GPIO7__FUNC_TP_GPIO7_AO (MTK_PIN_NO(7) | 1) +#define PINMUX_GPIO7__FUNC_EDP_TX_HPD (MTK_PIN_NO(7) | 2) +#define PINMUX_GPIO7__FUNC_I2SO1_D5 (MTK_PIN_NO(7) | 3) +#define PINMUX_GPIO7__FUNC_URXD4 (MTK_PIN_NO(7) | 4) +#define PINMUX_GPIO7__FUNC_CMVREF4 (MTK_PIN_NO(7) | 5) +#define PINMUX_GPIO7__FUNC_DMIC3_DAT (MTK_PIN_NO(7) | 7) + +#define PINMUX_GPIO8__FUNC_GPIO8 (MTK_PIN_NO(8) | 0) +#define PINMUX_GPIO8__FUNC_SDA0 (MTK_PIN_NO(8) | 1) +#define PINMUX_GPIO8__FUNC_PWM_0 (MTK_PIN_NO(8) | 2) +#define PINMUX_GPIO8__FUNC_SPDIF_OUT (MTK_PIN_NO(8) | 4) +#define PINMUX_GPIO8__FUNC_LVTS_FOUT (MTK_PIN_NO(8) | 6) +#define PINMUX_GPIO8__FUNC_DBG_MON_A0 (MTK_PIN_NO(8) | 7) + +#define PINMUX_GPIO9__FUNC_GPIO9 (MTK_PIN_NO(9) | 0) +#define PINMUX_GPIO9__FUNC_SCL0 (MTK_PIN_NO(9) | 1) +#define PINMUX_GPIO9__FUNC_PWM_1 (MTK_PIN_NO(9) | 2) +#define PINMUX_GPIO9__FUNC_IR_IN (MTK_PIN_NO(9) | 4) +#define PINMUX_GPIO9__FUNC_LVTS_SDO (MTK_PIN_NO(9) | 6) +#define PINMUX_GPIO9__FUNC_DBG_MON_A1 (MTK_PIN_NO(9) | 7) + +#define PINMUX_GPIO10__FUNC_GPIO10 (MTK_PIN_NO(10) | 0) +#define PINMUX_GPIO10__FUNC_SDA1 (MTK_PIN_NO(10) | 1) +#define PINMUX_GPIO10__FUNC_PWM_2 (MTK_PIN_NO(10) | 2) +#define PINMUX_GPIO10__FUNC_ADSP_URXD0 (MTK_PIN_NO(10) | 3) +#define PINMUX_GPIO10__FUNC_SPDIF_IN1 (MTK_PIN_NO(10) | 4) +#define PINMUX_GPIO10__FUNC_LVTS_SCF (MTK_PIN_NO(10) | 6) +#define PINMUX_GPIO10__FUNC_DBG_MON_A2 (MTK_PIN_NO(10) | 7) + +#define PINMUX_GPIO11__FUNC_GPIO11 (MTK_PIN_NO(11) | 0) +#define PINMUX_GPIO11__FUNC_SCL1 (MTK_PIN_NO(11) | 1) +#define PINMUX_GPIO11__FUNC_PWM_3 (MTK_PIN_NO(11) | 2) +#define PINMUX_GPIO11__FUNC_ADSP_UTXD0 (MTK_PIN_NO(11) | 3) +#define PINMUX_GPIO11__FUNC_SPDIF_IN0 (MTK_PIN_NO(11) | 4) +#define PINMUX_GPIO11__FUNC_LVTS_SCK (MTK_PIN_NO(11) | 6) +#define PINMUX_GPIO11__FUNC_DBG_MON_A3 (MTK_PIN_NO(11) | 7) + +#define PINMUX_GPIO12__FUNC_GPIO12 (MTK_PIN_NO(12) | 0) +#define PINMUX_GPIO12__FUNC_SDA2 (MTK_PIN_NO(12) | 1) +#define PINMUX_GPIO12__FUNC_DMIC3_DAT_R (MTK_PIN_NO(12) | 2) +#define PINMUX_GPIO12__FUNC_I2SO1_D6 (MTK_PIN_NO(12) | 3) +#define PINMUX_GPIO12__FUNC_LVTS_SDI (MTK_PIN_NO(12) | 6) +#define PINMUX_GPIO12__FUNC_DBG_MON_A4 (MTK_PIN_NO(12) | 7) + +#define PINMUX_GPIO13__FUNC_GPIO13 (MTK_PIN_NO(13) | 0) +#define PINMUX_GPIO13__FUNC_SCL2 (MTK_PIN_NO(13) | 1) +#define PINMUX_GPIO13__FUNC_DMIC4_DAT_R (MTK_PIN_NO(13) | 2) +#define PINMUX_GPIO13__FUNC_I2SO1_D7 (MTK_PIN_NO(13) | 3) +#define PINMUX_GPIO13__FUNC_DBG_MON_A5 (MTK_PIN_NO(13) | 7) + +#define PINMUX_GPIO14__FUNC_GPIO14 (MTK_PIN_NO(14) | 0) +#define PINMUX_GPIO14__FUNC_SDA3 (MTK_PIN_NO(14) | 1) +#define PINMUX_GPIO14__FUNC_DMIC3_DAT (MTK_PIN_NO(14) | 2) +#define PINMUX_GPIO14__FUNC_TDMIN_MCK (MTK_PIN_NO(14) | 3) +#define PINMUX_GPIO14__FUNC_DBG_MON_A6 (MTK_PIN_NO(14) | 7) + +#define PINMUX_GPIO15__FUNC_GPIO15 (MTK_PIN_NO(15) | 0) +#define PINMUX_GPIO15__FUNC_SCL3 (MTK_PIN_NO(15) | 1) +#define PINMUX_GPIO15__FUNC_DMIC3_CLK (MTK_PIN_NO(15) | 2) +#define PINMUX_GPIO15__FUNC_TDMIN_DI (MTK_PIN_NO(15) | 3) +#define PINMUX_GPIO15__FUNC_DBG_MON_A7 (MTK_PIN_NO(15) | 7) + +#define PINMUX_GPIO16__FUNC_GPIO16 (MTK_PIN_NO(16) | 0) +#define PINMUX_GPIO16__FUNC_SDA4 (MTK_PIN_NO(16) | 1) +#define PINMUX_GPIO16__FUNC_DMIC4_DAT (MTK_PIN_NO(16) | 2) +#define PINMUX_GPIO16__FUNC_TDMIN_LRCK (MTK_PIN_NO(16) | 3) +#define PINMUX_GPIO16__FUNC_DBG_MON_A8 (MTK_PIN_NO(16) | 7) + +#define PINMUX_GPIO17__FUNC_GPIO17 (MTK_PIN_NO(17) | 0) +#define PINMUX_GPIO17__FUNC_SCL4 (MTK_PIN_NO(17) | 1) +#define PINMUX_GPIO17__FUNC_DMIC4_CLK (MTK_PIN_NO(17) | 2) +#define PINMUX_GPIO17__FUNC_TDMIN_BCK (MTK_PIN_NO(17) | 3) +#define PINMUX_GPIO17__FUNC_DBG_MON_A9 (MTK_PIN_NO(17) | 7) + +#define PINMUX_GPIO18__FUNC_GPIO18 (MTK_PIN_NO(18) | 0) +#define PINMUX_GPIO18__FUNC_DP_TX_HPD (MTK_PIN_NO(18) | 1) + +#define PINMUX_GPIO19__FUNC_GPIO19 (MTK_PIN_NO(19) | 0) +#define PINMUX_GPIO19__FUNC_WAKEN (MTK_PIN_NO(19) | 1) +#define PINMUX_GPIO19__FUNC_SCP_SDA1 (MTK_PIN_NO(19) | 2) +#define PINMUX_GPIO19__FUNC_MD32_0_JTAG_TCK (MTK_PIN_NO(19) | 3) +#define PINMUX_GPIO19__FUNC_ADSP_JTAG0_TCK (MTK_PIN_NO(19) | 4) +#define PINMUX_GPIO19__FUNC_SDA6 (MTK_PIN_NO(19) | 5) + +#define PINMUX_GPIO20__FUNC_GPIO20 (MTK_PIN_NO(20) | 0) +#define PINMUX_GPIO20__FUNC_PERSTN (MTK_PIN_NO(20) | 1) +#define PINMUX_GPIO20__FUNC_SCP_SCL1 (MTK_PIN_NO(20) | 2) +#define PINMUX_GPIO20__FUNC_MD32_0_JTAG_TMS (MTK_PIN_NO(20) | 3) +#define PINMUX_GPIO20__FUNC_ADSP_JTAG0_TMS (MTK_PIN_NO(20) | 4) +#define PINMUX_GPIO20__FUNC_SCL6 (MTK_PIN_NO(20) | 5) + +#define PINMUX_GPIO21__FUNC_GPIO21 (MTK_PIN_NO(21) | 0) +#define PINMUX_GPIO21__FUNC_CLKREQN (MTK_PIN_NO(21) | 1) +#define PINMUX_GPIO21__FUNC_MD32_0_JTAG_TDI (MTK_PIN_NO(21) | 3) +#define PINMUX_GPIO21__FUNC_ADSP_JTAG0_TDI (MTK_PIN_NO(21) | 4) +#define PINMUX_GPIO21__FUNC_SCP_SDA1 (MTK_PIN_NO(21) | 5) + +#define PINMUX_GPIO22__FUNC_GPIO22 (MTK_PIN_NO(22) | 0) +#define PINMUX_GPIO22__FUNC_CMMCLK0 (MTK_PIN_NO(22) | 1) +#define PINMUX_GPIO22__FUNC_PERSTN_1 (MTK_PIN_NO(22) | 2) +#define PINMUX_GPIO22__FUNC_SCP_SCL1 (MTK_PIN_NO(22) | 5) +#define PINMUX_GPIO22__FUNC_MD32_0_GPIO0 (MTK_PIN_NO(22) | 7) + +#define PINMUX_GPIO23__FUNC_GPIO23 (MTK_PIN_NO(23) | 0) +#define PINMUX_GPIO23__FUNC_CMMCLK1 (MTK_PIN_NO(23) | 1) +#define PINMUX_GPIO23__FUNC_CLKREQN_1 (MTK_PIN_NO(23) | 2) +#define PINMUX_GPIO23__FUNC_SDA4 (MTK_PIN_NO(23) | 3) +#define PINMUX_GPIO23__FUNC_DMIC1_CLK (MTK_PIN_NO(23) | 4) +#define PINMUX_GPIO23__FUNC_SCP_SDA0 (MTK_PIN_NO(23) | 5) +#define PINMUX_GPIO23__FUNC_MD32_0_GPIO1 (MTK_PIN_NO(23) | 7) + +#define PINMUX_GPIO24__FUNC_GPIO24 (MTK_PIN_NO(24) | 0) +#define PINMUX_GPIO24__FUNC_CMMCLK2 (MTK_PIN_NO(24) | 1) +#define PINMUX_GPIO24__FUNC_WAKEN_1 (MTK_PIN_NO(24) | 2) +#define PINMUX_GPIO24__FUNC_SCL4 (MTK_PIN_NO(24) | 3) +#define PINMUX_GPIO24__FUNC_DMIC1_DAT (MTK_PIN_NO(24) | 4) +#define PINMUX_GPIO24__FUNC_SCP_SCL0 (MTK_PIN_NO(24) | 5) +#define PINMUX_GPIO24__FUNC_LVTS_26M (MTK_PIN_NO(24) | 6) +#define PINMUX_GPIO24__FUNC_MD32_0_GPIO2 (MTK_PIN_NO(24) | 7) + +#define PINMUX_GPIO25__FUNC_GPIO25 (MTK_PIN_NO(25) | 0) +#define PINMUX_GPIO25__FUNC_CMMRST (MTK_PIN_NO(25) | 1) +#define PINMUX_GPIO25__FUNC_CMMCLK3 (MTK_PIN_NO(25) | 2) +#define PINMUX_GPIO25__FUNC_SPDIF_OUT (MTK_PIN_NO(25) | 3) +#define PINMUX_GPIO25__FUNC_SDA6 (MTK_PIN_NO(25) | 4) +#define PINMUX_GPIO25__FUNC_ADSP_JTAG0_TRSTN (MTK_PIN_NO(25) | 5) +#define PINMUX_GPIO25__FUNC_MD32_0_JTAG_TRST (MTK_PIN_NO(25) | 6) + +#define PINMUX_GPIO26__FUNC_GPIO26 (MTK_PIN_NO(26) | 0) +#define PINMUX_GPIO26__FUNC_CMMPDN (MTK_PIN_NO(26) | 1) +#define PINMUX_GPIO26__FUNC_CMMCLK4 (MTK_PIN_NO(26) | 2) +#define PINMUX_GPIO26__FUNC_IR_IN (MTK_PIN_NO(26) | 3) +#define PINMUX_GPIO26__FUNC_SCL6 (MTK_PIN_NO(26) | 4) +#define PINMUX_GPIO26__FUNC_ADSP_JTAG0_TDO (MTK_PIN_NO(26) | 5) +#define PINMUX_GPIO26__FUNC_MD32_0_JTAG_TDO (MTK_PIN_NO(26) | 6) + +#define PINMUX_GPIO27__FUNC_GPIO27 (MTK_PIN_NO(27) | 0) +#define PINMUX_GPIO27__FUNC_HDMIRX20_HTPLG (MTK_PIN_NO(27) | 1) +#define PINMUX_GPIO27__FUNC_CMFLASH0 (MTK_PIN_NO(27) | 2) +#define PINMUX_GPIO27__FUNC_MD32_0_TXD (MTK_PIN_NO(27) | 3) +#define PINMUX_GPIO27__FUNC_TP_UTXD2_AO (MTK_PIN_NO(27) | 4) +#define PINMUX_GPIO27__FUNC_SCL7 (MTK_PIN_NO(27) | 5) +#define PINMUX_GPIO27__FUNC_UCTS2 (MTK_PIN_NO(27) | 6) +#define PINMUX_GPIO27__FUNC_DBG_MON_A18 (MTK_PIN_NO(27) | 7) + +#define PINMUX_GPIO28__FUNC_GPIO28 (MTK_PIN_NO(28) | 0) +#define PINMUX_GPIO28__FUNC_HDMIRX20_PWR5V (MTK_PIN_NO(28) | 1) +#define PINMUX_GPIO28__FUNC_CMFLASH1 (MTK_PIN_NO(28) | 2) +#define PINMUX_GPIO28__FUNC_MD32_0_RXD (MTK_PIN_NO(28) | 3) +#define PINMUX_GPIO28__FUNC_TP_URXD2_AO (MTK_PIN_NO(28) | 4) +#define PINMUX_GPIO28__FUNC_SDA7 (MTK_PIN_NO(28) | 5) +#define PINMUX_GPIO28__FUNC_URTS2 (MTK_PIN_NO(28) | 6) +#define PINMUX_GPIO28__FUNC_DBG_MON_A19 (MTK_PIN_NO(28) | 7) + +#define PINMUX_GPIO29__FUNC_GPIO29 (MTK_PIN_NO(29) | 0) +#define PINMUX_GPIO29__FUNC_HDMIRX20_SCL (MTK_PIN_NO(29) | 1) +#define PINMUX_GPIO29__FUNC_CMFLASH2 (MTK_PIN_NO(29) | 2) +#define PINMUX_GPIO29__FUNC_SCL5 (MTK_PIN_NO(29) | 3) +#define PINMUX_GPIO29__FUNC_TP_URTS2_AO (MTK_PIN_NO(29) | 4) +#define PINMUX_GPIO29__FUNC_UTXD2 (MTK_PIN_NO(29) | 6) +#define PINMUX_GPIO29__FUNC_DBG_MON_A20 (MTK_PIN_NO(29) | 7) + +#define PINMUX_GPIO30__FUNC_GPIO30 (MTK_PIN_NO(30) | 0) +#define PINMUX_GPIO30__FUNC_HDMIRX20_SDA (MTK_PIN_NO(30) | 1) +#define PINMUX_GPIO30__FUNC_CMFLASH3 (MTK_PIN_NO(30) | 2) +#define PINMUX_GPIO30__FUNC_SDA5 (MTK_PIN_NO(30) | 3) +#define PINMUX_GPIO30__FUNC_TP_UCTS2_AO (MTK_PIN_NO(30) | 4) +#define PINMUX_GPIO30__FUNC_URXD2 (MTK_PIN_NO(30) | 6) +#define PINMUX_GPIO30__FUNC_DBG_MON_A21 (MTK_PIN_NO(30) | 7) + +#define PINMUX_GPIO31__FUNC_GPIO31 (MTK_PIN_NO(31) | 0) +#define PINMUX_GPIO31__FUNC_HDMITX20_PWR5V (MTK_PIN_NO(31) | 1) +#define PINMUX_GPIO31__FUNC_DMIC1_DAT_R (MTK_PIN_NO(31) | 2) +#define PINMUX_GPIO31__FUNC_PERSTN (MTK_PIN_NO(31) | 3) +#define PINMUX_GPIO31__FUNC_DBG_MON_A22 (MTK_PIN_NO(31) | 7) + +#define PINMUX_GPIO32__FUNC_GPIO32 (MTK_PIN_NO(32) | 0) +#define PINMUX_GPIO32__FUNC_HDMITX20_HTPLG (MTK_PIN_NO(32) | 1) +#define PINMUX_GPIO32__FUNC_CLKREQN (MTK_PIN_NO(32) | 3) +#define PINMUX_GPIO32__FUNC_DBG_MON_A23 (MTK_PIN_NO(32) | 7) + +#define PINMUX_GPIO33__FUNC_GPIO33 (MTK_PIN_NO(33) | 0) +#define PINMUX_GPIO33__FUNC_HDMITX20_CEC (MTK_PIN_NO(33) | 1) +#define PINMUX_GPIO33__FUNC_CMVREF0 (MTK_PIN_NO(33) | 2) +#define PINMUX_GPIO33__FUNC_WAKEN (MTK_PIN_NO(33) | 3) + +#define PINMUX_GPIO34__FUNC_GPIO34 (MTK_PIN_NO(34) | 0) +#define PINMUX_GPIO34__FUNC_HDMITX20_SCL (MTK_PIN_NO(34) | 1) +#define PINMUX_GPIO34__FUNC_CMVREF1 (MTK_PIN_NO(34) | 2) +#define PINMUX_GPIO34__FUNC_SCL7 (MTK_PIN_NO(34) | 3) +#define PINMUX_GPIO34__FUNC_SCL6 (MTK_PIN_NO(34) | 4) +#define PINMUX_GPIO34__FUNC_DBG_MON_A24 (MTK_PIN_NO(34) | 7) + +#define PINMUX_GPIO35__FUNC_GPIO35 (MTK_PIN_NO(35) | 0) +#define PINMUX_GPIO35__FUNC_HDMITX20_SDA (MTK_PIN_NO(35) | 1) +#define PINMUX_GPIO35__FUNC_CMVREF2 (MTK_PIN_NO(35) | 2) +#define PINMUX_GPIO35__FUNC_SDA7 (MTK_PIN_NO(35) | 3) +#define PINMUX_GPIO35__FUNC_SDA6 (MTK_PIN_NO(35) | 4) +#define PINMUX_GPIO35__FUNC_DBG_MON_A25 (MTK_PIN_NO(35) | 7) + +#define PINMUX_GPIO36__FUNC_GPIO36 (MTK_PIN_NO(36) | 0) +#define PINMUX_GPIO36__FUNC_RTC32K_CK (MTK_PIN_NO(36) | 1) +#define PINMUX_GPIO36__FUNC_DBG_MON_A27 (MTK_PIN_NO(36) | 7) + +#define PINMUX_GPIO37__FUNC_GPIO37 (MTK_PIN_NO(37) | 0) +#define PINMUX_GPIO37__FUNC_WATCHDOG (MTK_PIN_NO(37) | 1) +#define PINMUX_GPIO37__FUNC_DBG_MON_A28 (MTK_PIN_NO(37) | 7) + +#define PINMUX_GPIO38__FUNC_GPIO38 (MTK_PIN_NO(38) | 0) +#define PINMUX_GPIO38__FUNC_SRCLKENA0 (MTK_PIN_NO(38) | 1) +#define PINMUX_GPIO38__FUNC_DBG_MON_A29 (MTK_PIN_NO(38) | 7) + +#define PINMUX_GPIO39__FUNC_GPIO39 (MTK_PIN_NO(39) | 0) +#define PINMUX_GPIO39__FUNC_SRCLKENA1 (MTK_PIN_NO(39) | 1) +#define PINMUX_GPIO39__FUNC_DMIC2_DAT_R (MTK_PIN_NO(39) | 2) +#define PINMUX_GPIO39__FUNC_DBG_MON_A30 (MTK_PIN_NO(39) | 7) + +#define PINMUX_GPIO40__FUNC_GPIO40 (MTK_PIN_NO(40) | 0) +#define PINMUX_GPIO40__FUNC_PWRAP_SPI0_CSN (MTK_PIN_NO(40) | 1) +#define PINMUX_GPIO40__FUNC_SPIM3_CSB (MTK_PIN_NO(40) | 3) +#define PINMUX_GPIO40__FUNC_DBG_MON_A31 (MTK_PIN_NO(40) | 7) + +#define PINMUX_GPIO41__FUNC_GPIO41 (MTK_PIN_NO(41) | 0) +#define PINMUX_GPIO41__FUNC_PWRAP_SPI0_CK (MTK_PIN_NO(41) | 1) +#define PINMUX_GPIO41__FUNC_SPIM3_CLK (MTK_PIN_NO(41) | 3) +#define PINMUX_GPIO41__FUNC_DBG_MON_A32 (MTK_PIN_NO(41) | 7) + +#define PINMUX_GPIO42__FUNC_GPIO42 (MTK_PIN_NO(42) | 0) +#define PINMUX_GPIO42__FUNC_PWRAP_SPI0_MO (MTK_PIN_NO(42) | 1) +#define PINMUX_GPIO42__FUNC_PWRAP_SPI0_MI (MTK_PIN_NO(42) | 2) +#define PINMUX_GPIO42__FUNC_SPIM3_MO (MTK_PIN_NO(42) | 3) +#define PINMUX_GPIO42__FUNC_DBG_MON_B0 (MTK_PIN_NO(42) | 7) + +#define PINMUX_GPIO43__FUNC_GPIO43 (MTK_PIN_NO(43) | 0) +#define PINMUX_GPIO43__FUNC_PWRAP_SPI0_MI (MTK_PIN_NO(43) | 1) +#define PINMUX_GPIO43__FUNC_PWRAP_SPI0_MO (MTK_PIN_NO(43) | 2) +#define PINMUX_GPIO43__FUNC_SPIM3_MI (MTK_PIN_NO(43) | 3) +#define PINMUX_GPIO43__FUNC_DBG_MON_B1 (MTK_PIN_NO(43) | 7) + +#define PINMUX_GPIO44__FUNC_GPIO44 (MTK_PIN_NO(44) | 0) +#define PINMUX_GPIO44__FUNC_SPMI_M_SCL (MTK_PIN_NO(44) | 1) +#define PINMUX_GPIO44__FUNC_I2SI00_DATA1 (MTK_PIN_NO(44) | 2) +#define PINMUX_GPIO44__FUNC_SCL5 (MTK_PIN_NO(44) | 3) +#define PINMUX_GPIO44__FUNC_UTXD5 (MTK_PIN_NO(44) | 4) +#define PINMUX_GPIO44__FUNC_DBG_MON_B2 (MTK_PIN_NO(44) | 7) + +#define PINMUX_GPIO45__FUNC_GPIO45 (MTK_PIN_NO(45) | 0) +#define PINMUX_GPIO45__FUNC_SPMI_M_SDA (MTK_PIN_NO(45) | 1) +#define PINMUX_GPIO45__FUNC_I2SI00_DATA2 (MTK_PIN_NO(45) | 2) +#define PINMUX_GPIO45__FUNC_SDA5 (MTK_PIN_NO(45) | 3) +#define PINMUX_GPIO45__FUNC_URXD5 (MTK_PIN_NO(45) | 4) +#define PINMUX_GPIO45__FUNC_DBG_MON_B3 (MTK_PIN_NO(45) | 7) + +#define PINMUX_GPIO46__FUNC_GPIO46 (MTK_PIN_NO(46) | 0) +#define PINMUX_GPIO46__FUNC_I2SIN_MCK (MTK_PIN_NO(46) | 1) +#define PINMUX_GPIO46__FUNC_I2SI00_DATA3 (MTK_PIN_NO(46) | 2) +#define PINMUX_GPIO46__FUNC_SPLIN_MCK (MTK_PIN_NO(46) | 3) +#define PINMUX_GPIO46__FUNC_DBG_MON_B4 (MTK_PIN_NO(46) | 7) + +#define PINMUX_GPIO47__FUNC_GPIO47 (MTK_PIN_NO(47) | 0) +#define PINMUX_GPIO47__FUNC_I2SIN_BCK (MTK_PIN_NO(47) | 1) +#define PINMUX_GPIO47__FUNC_I2SIN0_BCK (MTK_PIN_NO(47) | 2) +#define PINMUX_GPIO47__FUNC_SPLIN_LRCK (MTK_PIN_NO(47) | 3) +#define PINMUX_GPIO47__FUNC_DBG_MON_B5 (MTK_PIN_NO(47) | 7) + +#define PINMUX_GPIO48__FUNC_GPIO48 (MTK_PIN_NO(48) | 0) +#define PINMUX_GPIO48__FUNC_I2SIN_WS (MTK_PIN_NO(48) | 1) +#define PINMUX_GPIO48__FUNC_I2SIN0_LRCK (MTK_PIN_NO(48) | 2) +#define PINMUX_GPIO48__FUNC_SPLIN_BCK (MTK_PIN_NO(48) | 3) +#define PINMUX_GPIO48__FUNC_DBG_MON_B6 (MTK_PIN_NO(48) | 7) + +#define PINMUX_GPIO49__FUNC_GPIO49 (MTK_PIN_NO(49) | 0) +#define PINMUX_GPIO49__FUNC_I2SIN_D0 (MTK_PIN_NO(49) | 1) +#define PINMUX_GPIO49__FUNC_I2SI00_DATA0 (MTK_PIN_NO(49) | 2) +#define PINMUX_GPIO49__FUNC_SPLIN_D0 (MTK_PIN_NO(49) | 3) +#define PINMUX_GPIO49__FUNC_DBG_MON_B7 (MTK_PIN_NO(49) | 7) + +#define PINMUX_GPIO50__FUNC_GPIO50 (MTK_PIN_NO(50) | 0) +#define PINMUX_GPIO50__FUNC_I2SO1_MCK (MTK_PIN_NO(50) | 1) +#define PINMUX_GPIO50__FUNC_I2SI5_D0 (MTK_PIN_NO(50) | 2) +#define PINMUX_GPIO50__FUNC_I2SO4_MCK (MTK_PIN_NO(50) | 4) +#define PINMUX_GPIO50__FUNC_DBG_MON_B8 (MTK_PIN_NO(50) | 7) + +#define PINMUX_GPIO51__FUNC_GPIO51 (MTK_PIN_NO(51) | 0) +#define PINMUX_GPIO51__FUNC_I2SO1_BCK (MTK_PIN_NO(51) | 1) +#define PINMUX_GPIO51__FUNC_I2SI5_BCK (MTK_PIN_NO(51) | 2) +#define PINMUX_GPIO51__FUNC_DBG_MON_B9 (MTK_PIN_NO(51) | 7) + +#define PINMUX_GPIO52__FUNC_GPIO52 (MTK_PIN_NO(52) | 0) +#define PINMUX_GPIO52__FUNC_I2SO1_WS (MTK_PIN_NO(52) | 1) +#define PINMUX_GPIO52__FUNC_I2SI5_WS (MTK_PIN_NO(52) | 2) +#define PINMUX_GPIO52__FUNC_DBG_MON_B10 (MTK_PIN_NO(52) | 7) + +#define PINMUX_GPIO53__FUNC_GPIO53 (MTK_PIN_NO(53) | 0) +#define PINMUX_GPIO53__FUNC_I2SO1_D0 (MTK_PIN_NO(53) | 1) +#define PINMUX_GPIO53__FUNC_I2SI5_MCK (MTK_PIN_NO(53) | 2) +#define PINMUX_GPIO53__FUNC_DBG_MON_B11 (MTK_PIN_NO(53) | 7) + +#define PINMUX_GPIO54__FUNC_GPIO54 (MTK_PIN_NO(54) | 0) +#define PINMUX_GPIO54__FUNC_I2SO1_D1 (MTK_PIN_NO(54) | 1) +#define PINMUX_GPIO54__FUNC_I2SI01_DATA1 (MTK_PIN_NO(54) | 2) +#define PINMUX_GPIO54__FUNC_SPLIN_D1 (MTK_PIN_NO(54) | 3) +#define PINMUX_GPIO54__FUNC_I2SO4_BCK (MTK_PIN_NO(54) | 4) +#define PINMUX_GPIO54__FUNC_DBG_MON_B12 (MTK_PIN_NO(54) | 7) + +#define PINMUX_GPIO55__FUNC_GPIO55 (MTK_PIN_NO(55) | 0) +#define PINMUX_GPIO55__FUNC_I2SO1_D2 (MTK_PIN_NO(55) | 1) +#define PINMUX_GPIO55__FUNC_I2SI01_DATA2 (MTK_PIN_NO(55) | 2) +#define PINMUX_GPIO55__FUNC_SPLIN_D2 (MTK_PIN_NO(55) | 3) +#define PINMUX_GPIO55__FUNC_I2SO4_WS (MTK_PIN_NO(55) | 4) +#define PINMUX_GPIO55__FUNC_DBG_MON_B13 (MTK_PIN_NO(55) | 7) + +#define PINMUX_GPIO56__FUNC_GPIO56 (MTK_PIN_NO(56) | 0) +#define PINMUX_GPIO56__FUNC_I2SO1_D3 (MTK_PIN_NO(56) | 1) +#define PINMUX_GPIO56__FUNC_I2SI01_DATA3 (MTK_PIN_NO(56) | 2) +#define PINMUX_GPIO56__FUNC_SPLIN_D3 (MTK_PIN_NO(56) | 3) +#define PINMUX_GPIO56__FUNC_I2SO4_D0 (MTK_PIN_NO(56) | 4) +#define PINMUX_GPIO56__FUNC_DBG_MON_B14 (MTK_PIN_NO(56) | 7) + +#define PINMUX_GPIO57__FUNC_GPIO57 (MTK_PIN_NO(57) | 0) +#define PINMUX_GPIO57__FUNC_I2SO2_MCK (MTK_PIN_NO(57) | 1) +#define PINMUX_GPIO57__FUNC_I2SO1_D12 (MTK_PIN_NO(57) | 2) +#define PINMUX_GPIO57__FUNC_LCM1_RST (MTK_PIN_NO(57) | 3) +#define PINMUX_GPIO57__FUNC_DBG_MON_B15 (MTK_PIN_NO(57) | 7) + +#define PINMUX_GPIO58__FUNC_GPIO58 (MTK_PIN_NO(58) | 0) +#define PINMUX_GPIO58__FUNC_I2SO2_BCK (MTK_PIN_NO(58) | 1) +#define PINMUX_GPIO58__FUNC_I2SO1_D13 (MTK_PIN_NO(58) | 2) +#define PINMUX_GPIO58__FUNC_I2SIN1_BCK (MTK_PIN_NO(58) | 3) +#define PINMUX_GPIO58__FUNC_DBG_MON_B16 (MTK_PIN_NO(58) | 7) + +#define PINMUX_GPIO59__FUNC_GPIO59 (MTK_PIN_NO(59) | 0) +#define PINMUX_GPIO59__FUNC_I2SO2_WS (MTK_PIN_NO(59) | 1) +#define PINMUX_GPIO59__FUNC_I2SO1_D14 (MTK_PIN_NO(59) | 2) +#define PINMUX_GPIO59__FUNC_I2SIN1_LRCK (MTK_PIN_NO(59) | 3) +#define PINMUX_GPIO59__FUNC_DBG_MON_B17 (MTK_PIN_NO(59) | 7) + +#define PINMUX_GPIO60__FUNC_GPIO60 (MTK_PIN_NO(60) | 0) +#define PINMUX_GPIO60__FUNC_I2SO2_D0 (MTK_PIN_NO(60) | 1) +#define PINMUX_GPIO60__FUNC_I2SO1_D15 (MTK_PIN_NO(60) | 2) +#define PINMUX_GPIO60__FUNC_I2SI01_DATA0 (MTK_PIN_NO(60) | 3) +#define PINMUX_GPIO60__FUNC_DBG_MON_B18 (MTK_PIN_NO(60) | 7) + +#define PINMUX_GPIO61__FUNC_GPIO61 (MTK_PIN_NO(61) | 0) +#define PINMUX_GPIO61__FUNC_DMIC1_CLK (MTK_PIN_NO(61) | 1) +#define PINMUX_GPIO61__FUNC_I2SO2_BCK (MTK_PIN_NO(61) | 2) +#define PINMUX_GPIO61__FUNC_SCP_SPI2_CK (MTK_PIN_NO(61) | 3) +#define PINMUX_GPIO61__FUNC_DBG_MON_B19 (MTK_PIN_NO(61) | 7) + +#define PINMUX_GPIO62__FUNC_GPIO62 (MTK_PIN_NO(62) | 0) +#define PINMUX_GPIO62__FUNC_DMIC1_DAT (MTK_PIN_NO(62) | 1) +#define PINMUX_GPIO62__FUNC_I2SO2_WS (MTK_PIN_NO(62) | 2) +#define PINMUX_GPIO62__FUNC_SCP_SPI2_MI (MTK_PIN_NO(62) | 3) +#define PINMUX_GPIO62__FUNC_DBG_MON_B20 (MTK_PIN_NO(62) | 7) + +#define PINMUX_GPIO63__FUNC_GPIO63 (MTK_PIN_NO(63) | 0) +#define PINMUX_GPIO63__FUNC_DMIC2_CLK (MTK_PIN_NO(63) | 1) +#define PINMUX_GPIO63__FUNC_VBUSVALID (MTK_PIN_NO(63) | 2) +#define PINMUX_GPIO63__FUNC_SCP_SPI2_MO (MTK_PIN_NO(63) | 3) +#define PINMUX_GPIO63__FUNC_SCP_SCL2 (MTK_PIN_NO(63) | 4) +#define PINMUX_GPIO63__FUNC_SCP_JTAG1_TDO (MTK_PIN_NO(63) | 5) +#define PINMUX_GPIO63__FUNC_JTDO_SEL1 (MTK_PIN_NO(63) | 6) +#define PINMUX_GPIO63__FUNC_DBG_MON_B21 (MTK_PIN_NO(63) | 7) + +#define PINMUX_GPIO64__FUNC_GPIO64 (MTK_PIN_NO(64) | 0) +#define PINMUX_GPIO64__FUNC_DMIC2_DAT (MTK_PIN_NO(64) | 1) +#define PINMUX_GPIO64__FUNC_VBUSVALID_1P (MTK_PIN_NO(64) | 2) +#define PINMUX_GPIO64__FUNC_SCP_SPI2_CS (MTK_PIN_NO(64) | 3) +#define PINMUX_GPIO64__FUNC_SCP_SDA2 (MTK_PIN_NO(64) | 4) +#define PINMUX_GPIO64__FUNC_DBG_MON_B22 (MTK_PIN_NO(64) | 7) + +#define PINMUX_GPIO65__FUNC_GPIO65 (MTK_PIN_NO(65) | 0) +#define PINMUX_GPIO65__FUNC_PCM_DO (MTK_PIN_NO(65) | 1) +#define PINMUX_GPIO65__FUNC_AUXIF_ST0 (MTK_PIN_NO(65) | 2) +#define PINMUX_GPIO65__FUNC_UCTS2 (MTK_PIN_NO(65) | 3) +#define PINMUX_GPIO65__FUNC_SCP_JTAG1_TMS (MTK_PIN_NO(65) | 5) +#define PINMUX_GPIO65__FUNC_JTMS_SEL1 (MTK_PIN_NO(65) | 6) +#define PINMUX_GPIO65__FUNC_DBG_MON_B23 (MTK_PIN_NO(65) | 7) + +#define PINMUX_GPIO66__FUNC_GPIO66 (MTK_PIN_NO(66) | 0) +#define PINMUX_GPIO66__FUNC_PCM_CLK (MTK_PIN_NO(66) | 1) +#define PINMUX_GPIO66__FUNC_AUXIF_CLK0 (MTK_PIN_NO(66) | 2) +#define PINMUX_GPIO66__FUNC_URTS2 (MTK_PIN_NO(66) | 3) +#define PINMUX_GPIO66__FUNC_SCP_JTAG1_TCK (MTK_PIN_NO(66) | 5) +#define PINMUX_GPIO66__FUNC_JTCK_SEL1 (MTK_PIN_NO(66) | 6) +#define PINMUX_GPIO66__FUNC_DBG_MON_B24 (MTK_PIN_NO(66) | 7) + +#define PINMUX_GPIO67__FUNC_GPIO67 (MTK_PIN_NO(67) | 0) +#define PINMUX_GPIO67__FUNC_PCM_DI (MTK_PIN_NO(67) | 1) +#define PINMUX_GPIO67__FUNC_AUXIF_ST1 (MTK_PIN_NO(67) | 2) +#define PINMUX_GPIO67__FUNC_UTXD2 (MTK_PIN_NO(67) | 3) +#define PINMUX_GPIO67__FUNC_SCP_JTAG1_TRSTN (MTK_PIN_NO(67) | 5) +#define PINMUX_GPIO67__FUNC_JTRSTn_SEL1 (MTK_PIN_NO(67) | 6) +#define PINMUX_GPIO67__FUNC_DBG_MON_B25 (MTK_PIN_NO(67) | 7) + +#define PINMUX_GPIO68__FUNC_GPIO68 (MTK_PIN_NO(68) | 0) +#define PINMUX_GPIO68__FUNC_PCM_SYNC (MTK_PIN_NO(68) | 1) +#define PINMUX_GPIO68__FUNC_AUXIF_CLK1 (MTK_PIN_NO(68) | 2) +#define PINMUX_GPIO68__FUNC_URXD2 (MTK_PIN_NO(68) | 3) +#define PINMUX_GPIO68__FUNC_SCP_JTAG1_TDI (MTK_PIN_NO(68) | 5) +#define PINMUX_GPIO68__FUNC_JTDI_SEL1 (MTK_PIN_NO(68) | 6) +#define PINMUX_GPIO68__FUNC_DBG_MON_B26 (MTK_PIN_NO(68) | 7) + +#define PINMUX_GPIO69__FUNC_GPIO69 (MTK_PIN_NO(69) | 0) +#define PINMUX_GPIO69__FUNC_AUD_CLK_MOSI (MTK_PIN_NO(69) | 1) +#define PINMUX_GPIO69__FUNC_I2SIN2_BCK (MTK_PIN_NO(69) | 2) +#define PINMUX_GPIO69__FUNC_PWM_0 (MTK_PIN_NO(69) | 3) +#define PINMUX_GPIO69__FUNC_WAKEN (MTK_PIN_NO(69) | 4) +#define PINMUX_GPIO69__FUNC_DBG_MON_B27 (MTK_PIN_NO(69) | 7) + +#define PINMUX_GPIO70__FUNC_GPIO70 (MTK_PIN_NO(70) | 0) +#define PINMUX_GPIO70__FUNC_AUD_SYNC_MOSI (MTK_PIN_NO(70) | 1) +#define PINMUX_GPIO70__FUNC_I2SIN2_LRCK (MTK_PIN_NO(70) | 2) +#define PINMUX_GPIO70__FUNC_PWM_1 (MTK_PIN_NO(70) | 3) +#define PINMUX_GPIO70__FUNC_PERSTN (MTK_PIN_NO(70) | 4) +#define PINMUX_GPIO70__FUNC_DBG_MON_B28 (MTK_PIN_NO(70) | 7) + +#define PINMUX_GPIO71__FUNC_GPIO71 (MTK_PIN_NO(71) | 0) +#define PINMUX_GPIO71__FUNC_AUD_DAT_MOSI0 (MTK_PIN_NO(71) | 1) +#define PINMUX_GPIO71__FUNC_IDDIG_2P (MTK_PIN_NO(71) | 2) +#define PINMUX_GPIO71__FUNC_PWM_2 (MTK_PIN_NO(71) | 3) +#define PINMUX_GPIO71__FUNC_CLKREQN (MTK_PIN_NO(71) | 4) +#define PINMUX_GPIO71__FUNC_DBG_MON_B29 (MTK_PIN_NO(71) | 7) + +#define PINMUX_GPIO72__FUNC_GPIO72 (MTK_PIN_NO(72) | 0) +#define PINMUX_GPIO72__FUNC_AUD_DAT_MOSI1 (MTK_PIN_NO(72) | 1) +#define PINMUX_GPIO72__FUNC_USB_DRVVBUS_2P (MTK_PIN_NO(72) | 2) +#define PINMUX_GPIO72__FUNC_PWM_3 (MTK_PIN_NO(72) | 3) +#define PINMUX_GPIO72__FUNC_PERSTN_1 (MTK_PIN_NO(72) | 4) +#define PINMUX_GPIO72__FUNC_DBG_MON_B30 (MTK_PIN_NO(72) | 7) + +#define PINMUX_GPIO73__FUNC_GPIO73 (MTK_PIN_NO(73) | 0) +#define PINMUX_GPIO73__FUNC_AUD_DAT_MISO0 (MTK_PIN_NO(73) | 1) +#define PINMUX_GPIO73__FUNC_I2SI02_DATA0 (MTK_PIN_NO(73) | 2) +#define PINMUX_GPIO73__FUNC_CLKREQN_1 (MTK_PIN_NO(73) | 4) +#define PINMUX_GPIO73__FUNC_VOW_DAT_MISO (MTK_PIN_NO(73) | 5) +#define PINMUX_GPIO73__FUNC_DBG_MON_B31 (MTK_PIN_NO(73) | 7) + +#define PINMUX_GPIO74__FUNC_GPIO74 (MTK_PIN_NO(74) | 0) +#define PINMUX_GPIO74__FUNC_AUD_DAT_MISO1 (MTK_PIN_NO(74) | 1) +#define PINMUX_GPIO74__FUNC_I2SI02_DATA1 (MTK_PIN_NO(74) | 2) +#define PINMUX_GPIO74__FUNC_WAKEN_1 (MTK_PIN_NO(74) | 4) +#define PINMUX_GPIO74__FUNC_VOW_CLK_MISO (MTK_PIN_NO(74) | 5) +#define PINMUX_GPIO74__FUNC_DBG_MON_B32 (MTK_PIN_NO(74) | 7) + +#define PINMUX_GPIO75__FUNC_GPIO75 (MTK_PIN_NO(75) | 0) +#define PINMUX_GPIO75__FUNC_AUD_DAT_MISO2 (MTK_PIN_NO(75) | 1) +#define PINMUX_GPIO75__FUNC_I2SI02_DATA2 (MTK_PIN_NO(75) | 2) + +#define PINMUX_GPIO76__FUNC_GPIO76 (MTK_PIN_NO(76) | 0) +#define PINMUX_GPIO76__FUNC_SCP_VREQ_VAO (MTK_PIN_NO(76) | 1) +#define PINMUX_GPIO76__FUNC_I2SI02_DATA3 (MTK_PIN_NO(76) | 2) +#define PINMUX_GPIO76__FUNC_DBG_MON_A26 (MTK_PIN_NO(76) | 7) + +#define PINMUX_GPIO77__FUNC_GPIO77 (MTK_PIN_NO(77) | 0) +#define PINMUX_GPIO77__FUNC_DGI_D0 (MTK_PIN_NO(77) | 1) +#define PINMUX_GPIO77__FUNC_DPI_D0 (MTK_PIN_NO(77) | 2) +#define PINMUX_GPIO77__FUNC_I2SI4_MCK (MTK_PIN_NO(77) | 3) +#define PINMUX_GPIO77__FUNC_SPIM4_CLK (MTK_PIN_NO(77) | 4) +#define PINMUX_GPIO77__FUNC_GBE_TXD3 (MTK_PIN_NO(77) | 5) +#define PINMUX_GPIO77__FUNC_SPM_JTAG_TCK (MTK_PIN_NO(77) | 6) + +#define PINMUX_GPIO78__FUNC_GPIO78 (MTK_PIN_NO(78) | 0) +#define PINMUX_GPIO78__FUNC_DGI_D1 (MTK_PIN_NO(78) | 1) +#define PINMUX_GPIO78__FUNC_DPI_D1 (MTK_PIN_NO(78) | 2) +#define PINMUX_GPIO78__FUNC_I2SI4_BCK (MTK_PIN_NO(78) | 3) +#define PINMUX_GPIO78__FUNC_SPIM4_MO (MTK_PIN_NO(78) | 4) +#define PINMUX_GPIO78__FUNC_GBE_TXD2 (MTK_PIN_NO(78) | 5) +#define PINMUX_GPIO78__FUNC_SPM_JTAG_TMS (MTK_PIN_NO(78) | 6) + +#define PINMUX_GPIO79__FUNC_GPIO79 (MTK_PIN_NO(79) | 0) +#define PINMUX_GPIO79__FUNC_DGI_D2 (MTK_PIN_NO(79) | 1) +#define PINMUX_GPIO79__FUNC_DPI_D2 (MTK_PIN_NO(79) | 2) +#define PINMUX_GPIO79__FUNC_I2SI4_WS (MTK_PIN_NO(79) | 3) +#define PINMUX_GPIO79__FUNC_SPIM4_CSB (MTK_PIN_NO(79) | 4) +#define PINMUX_GPIO79__FUNC_GBE_TXD1 (MTK_PIN_NO(79) | 5) +#define PINMUX_GPIO79__FUNC_SPM_JTAG_TDI (MTK_PIN_NO(79) | 6) + +#define PINMUX_GPIO80__FUNC_GPIO80 (MTK_PIN_NO(80) | 0) +#define PINMUX_GPIO80__FUNC_DGI_D3 (MTK_PIN_NO(80) | 1) +#define PINMUX_GPIO80__FUNC_DPI_D3 (MTK_PIN_NO(80) | 2) +#define PINMUX_GPIO80__FUNC_I2SI4_D0 (MTK_PIN_NO(80) | 3) +#define PINMUX_GPIO80__FUNC_SPIM4_MI (MTK_PIN_NO(80) | 4) +#define PINMUX_GPIO80__FUNC_GBE_TXD0 (MTK_PIN_NO(80) | 5) +#define PINMUX_GPIO80__FUNC_SPM_JTAG_TDO (MTK_PIN_NO(80) | 6) + +#define PINMUX_GPIO81__FUNC_GPIO81 (MTK_PIN_NO(81) | 0) +#define PINMUX_GPIO81__FUNC_DGI_D4 (MTK_PIN_NO(81) | 1) +#define PINMUX_GPIO81__FUNC_DPI_D4 (MTK_PIN_NO(81) | 2) +#define PINMUX_GPIO81__FUNC_I2SI5_MCK (MTK_PIN_NO(81) | 3) +#define PINMUX_GPIO81__FUNC_SPIM5_CLK (MTK_PIN_NO(81) | 4) +#define PINMUX_GPIO81__FUNC_GBE_RXD3 (MTK_PIN_NO(81) | 5) +#define PINMUX_GPIO81__FUNC_SPM_JTAG_TRSTN (MTK_PIN_NO(81) | 6) + +#define PINMUX_GPIO82__FUNC_GPIO82 (MTK_PIN_NO(82) | 0) +#define PINMUX_GPIO82__FUNC_DGI_D5 (MTK_PIN_NO(82) | 1) +#define PINMUX_GPIO82__FUNC_DPI_D5 (MTK_PIN_NO(82) | 2) +#define PINMUX_GPIO82__FUNC_I2SI5_BCK (MTK_PIN_NO(82) | 3) +#define PINMUX_GPIO82__FUNC_SPIM5_MO (MTK_PIN_NO(82) | 4) +#define PINMUX_GPIO82__FUNC_GBE_RXD2 (MTK_PIN_NO(82) | 5) +#define PINMUX_GPIO82__FUNC_MCUPM_JTAG_TDO (MTK_PIN_NO(82) | 6) + +#define PINMUX_GPIO83__FUNC_GPIO83 (MTK_PIN_NO(83) | 0) +#define PINMUX_GPIO83__FUNC_DGI_D6 (MTK_PIN_NO(83) | 1) +#define PINMUX_GPIO83__FUNC_DPI_D6 (MTK_PIN_NO(83) | 2) +#define PINMUX_GPIO83__FUNC_I2SI5_WS (MTK_PIN_NO(83) | 3) +#define PINMUX_GPIO83__FUNC_SPIM5_CSB (MTK_PIN_NO(83) | 4) +#define PINMUX_GPIO83__FUNC_GBE_RXD1 (MTK_PIN_NO(83) | 5) +#define PINMUX_GPIO83__FUNC_MCUPM_JTAG_TMS (MTK_PIN_NO(83) | 6) + +#define PINMUX_GPIO84__FUNC_GPIO84 (MTK_PIN_NO(84) | 0) +#define PINMUX_GPIO84__FUNC_DGI_D7 (MTK_PIN_NO(84) | 1) +#define PINMUX_GPIO84__FUNC_DPI_D7 (MTK_PIN_NO(84) | 2) +#define PINMUX_GPIO84__FUNC_I2SI5_D0 (MTK_PIN_NO(84) | 3) +#define PINMUX_GPIO84__FUNC_SPIM5_MI (MTK_PIN_NO(84) | 4) +#define PINMUX_GPIO84__FUNC_GBE_RXD0 (MTK_PIN_NO(84) | 5) +#define PINMUX_GPIO84__FUNC_MCUPM_JTAG_TCK (MTK_PIN_NO(84) | 6) + +#define PINMUX_GPIO85__FUNC_GPIO85 (MTK_PIN_NO(85) | 0) +#define PINMUX_GPIO85__FUNC_DGI_D8 (MTK_PIN_NO(85) | 1) +#define PINMUX_GPIO85__FUNC_DPI_D8 (MTK_PIN_NO(85) | 2) +#define PINMUX_GPIO85__FUNC_I2SO4_MCK (MTK_PIN_NO(85) | 3) +#define PINMUX_GPIO85__FUNC_SCP_SPI1_B_CK (MTK_PIN_NO(85) | 4) +#define PINMUX_GPIO85__FUNC_GBE_TXC (MTK_PIN_NO(85) | 5) +#define PINMUX_GPIO85__FUNC_MCUPM_JTAG_TDI (MTK_PIN_NO(85) | 6) + +#define PINMUX_GPIO86__FUNC_GPIO86 (MTK_PIN_NO(86) | 0) +#define PINMUX_GPIO86__FUNC_DGI_D9 (MTK_PIN_NO(86) | 1) +#define PINMUX_GPIO86__FUNC_DPI_D9 (MTK_PIN_NO(86) | 2) +#define PINMUX_GPIO86__FUNC_I2SO4_BCK (MTK_PIN_NO(86) | 3) +#define PINMUX_GPIO86__FUNC_SCP_SPI1_B_MI (MTK_PIN_NO(86) | 4) +#define PINMUX_GPIO86__FUNC_GBE_RXC (MTK_PIN_NO(86) | 5) +#define PINMUX_GPIO86__FUNC_MCUPM_JTAG_TRSTN (MTK_PIN_NO(86) | 6) + +#define PINMUX_GPIO87__FUNC_GPIO87 (MTK_PIN_NO(87) | 0) +#define PINMUX_GPIO87__FUNC_DGI_D10 (MTK_PIN_NO(87) | 1) +#define PINMUX_GPIO87__FUNC_DPI_D10 (MTK_PIN_NO(87) | 2) +#define PINMUX_GPIO87__FUNC_I2SO4_WS (MTK_PIN_NO(87) | 3) +#define PINMUX_GPIO87__FUNC_SCP_SPI1_B_CS (MTK_PIN_NO(87) | 4) +#define PINMUX_GPIO87__FUNC_GBE_RXDV (MTK_PIN_NO(87) | 5) +#define PINMUX_GPIO87__FUNC_SSPM_JTAG_TDO (MTK_PIN_NO(87) | 6) + +#define PINMUX_GPIO88__FUNC_GPIO88 (MTK_PIN_NO(88) | 0) +#define PINMUX_GPIO88__FUNC_DGI_D11 (MTK_PIN_NO(88) | 1) +#define PINMUX_GPIO88__FUNC_DPI_D11 (MTK_PIN_NO(88) | 2) +#define PINMUX_GPIO88__FUNC_I2SO4_D0 (MTK_PIN_NO(88) | 3) +#define PINMUX_GPIO88__FUNC_SCP_SPI1_B_MO (MTK_PIN_NO(88) | 4) +#define PINMUX_GPIO88__FUNC_GBE_TXEN (MTK_PIN_NO(88) | 5) +#define PINMUX_GPIO88__FUNC_SSPM_JTAG_TMS (MTK_PIN_NO(88) | 6) + +#define PINMUX_GPIO89__FUNC_GPIO89 (MTK_PIN_NO(89) | 0) +#define PINMUX_GPIO89__FUNC_DGI_D12 (MTK_PIN_NO(89) | 1) +#define PINMUX_GPIO89__FUNC_DPI_D12 (MTK_PIN_NO(89) | 2) +#define PINMUX_GPIO89__FUNC_MSDC2_CMD_A (MTK_PIN_NO(89) | 3) +#define PINMUX_GPIO89__FUNC_I2SO5_BCK (MTK_PIN_NO(89) | 4) +#define PINMUX_GPIO89__FUNC_GBE_MDC (MTK_PIN_NO(89) | 5) +#define PINMUX_GPIO89__FUNC_SSPM_JTAG_TCK (MTK_PIN_NO(89) | 6) + +#define PINMUX_GPIO90__FUNC_GPIO90 (MTK_PIN_NO(90) | 0) +#define PINMUX_GPIO90__FUNC_DGI_D13 (MTK_PIN_NO(90) | 1) +#define PINMUX_GPIO90__FUNC_DPI_D13 (MTK_PIN_NO(90) | 2) +#define PINMUX_GPIO90__FUNC_MSDC2_CLK_A (MTK_PIN_NO(90) | 3) +#define PINMUX_GPIO90__FUNC_I2SO5_WS (MTK_PIN_NO(90) | 4) +#define PINMUX_GPIO90__FUNC_GBE_MDIO (MTK_PIN_NO(90) | 5) +#define PINMUX_GPIO90__FUNC_SSPM_JTAG_TDI (MTK_PIN_NO(90) | 6) + +#define PINMUX_GPIO91__FUNC_GPIO91 (MTK_PIN_NO(91) | 0) +#define PINMUX_GPIO91__FUNC_DGI_D14 (MTK_PIN_NO(91) | 1) +#define PINMUX_GPIO91__FUNC_DPI_D14 (MTK_PIN_NO(91) | 2) +#define PINMUX_GPIO91__FUNC_MSDC2_DAT3_A (MTK_PIN_NO(91) | 3) +#define PINMUX_GPIO91__FUNC_I2SO5_D0 (MTK_PIN_NO(91) | 4) +#define PINMUX_GPIO91__FUNC_GBE_TXER (MTK_PIN_NO(91) | 5) +#define PINMUX_GPIO91__FUNC_SSPM_JTAG_TRSTN (MTK_PIN_NO(91) | 6) + +#define PINMUX_GPIO92__FUNC_GPIO92 (MTK_PIN_NO(92) | 0) +#define PINMUX_GPIO92__FUNC_DGI_D15 (MTK_PIN_NO(92) | 1) +#define PINMUX_GPIO92__FUNC_DPI_D15 (MTK_PIN_NO(92) | 2) +#define PINMUX_GPIO92__FUNC_MSDC2_DAT0_A (MTK_PIN_NO(92) | 3) +#define PINMUX_GPIO92__FUNC_I2SO2_D1 (MTK_PIN_NO(92) | 4) +#define PINMUX_GPIO92__FUNC_GBE_RXER (MTK_PIN_NO(92) | 5) +#define PINMUX_GPIO92__FUNC_CCU0_JTAG_TDO (MTK_PIN_NO(92) | 6) + +#define PINMUX_GPIO93__FUNC_GPIO93 (MTK_PIN_NO(93) | 0) +#define PINMUX_GPIO93__FUNC_DGI_HSYNC (MTK_PIN_NO(93) | 1) +#define PINMUX_GPIO93__FUNC_DPI_HSYNC (MTK_PIN_NO(93) | 2) +#define PINMUX_GPIO93__FUNC_MSDC2_DAT2_A (MTK_PIN_NO(93) | 3) +#define PINMUX_GPIO93__FUNC_I2SO2_D2 (MTK_PIN_NO(93) | 4) +#define PINMUX_GPIO93__FUNC_GBE_COL (MTK_PIN_NO(93) | 5) +#define PINMUX_GPIO93__FUNC_CCU0_JTAG_TMS (MTK_PIN_NO(93) | 6) + +#define PINMUX_GPIO94__FUNC_GPIO94 (MTK_PIN_NO(94) | 0) +#define PINMUX_GPIO94__FUNC_DGI_VSYNC (MTK_PIN_NO(94) | 1) +#define PINMUX_GPIO94__FUNC_DPI_VSYNC (MTK_PIN_NO(94) | 2) +#define PINMUX_GPIO94__FUNC_MSDC2_DAT1_A (MTK_PIN_NO(94) | 3) +#define PINMUX_GPIO94__FUNC_I2SO2_D3 (MTK_PIN_NO(94) | 4) +#define PINMUX_GPIO94__FUNC_GBE_INTR (MTK_PIN_NO(94) | 5) +#define PINMUX_GPIO94__FUNC_CCU0_JTAG_TDI (MTK_PIN_NO(94) | 6) + +#define PINMUX_GPIO95__FUNC_GPIO95 (MTK_PIN_NO(95) | 0) +#define PINMUX_GPIO95__FUNC_DGI_DE (MTK_PIN_NO(95) | 1) +#define PINMUX_GPIO95__FUNC_DPI_DE (MTK_PIN_NO(95) | 2) +#define PINMUX_GPIO95__FUNC_UTXD2 (MTK_PIN_NO(95) | 3) +#define PINMUX_GPIO95__FUNC_I2SIN_D1 (MTK_PIN_NO(95) | 5) +#define PINMUX_GPIO95__FUNC_CCU0_JTAG_TCK (MTK_PIN_NO(95) | 6) + +#define PINMUX_GPIO96__FUNC_GPIO96 (MTK_PIN_NO(96) | 0) +#define PINMUX_GPIO96__FUNC_DGI_CK (MTK_PIN_NO(96) | 1) +#define PINMUX_GPIO96__FUNC_DPI_CK (MTK_PIN_NO(96) | 2) +#define PINMUX_GPIO96__FUNC_URXD2 (MTK_PIN_NO(96) | 3) +#define PINMUX_GPIO96__FUNC_I2SO5_MCK (MTK_PIN_NO(96) | 4) +#define PINMUX_GPIO96__FUNC_I2SIN_D2 (MTK_PIN_NO(96) | 5) +#define PINMUX_GPIO96__FUNC_CCU0_JTAG_TRST (MTK_PIN_NO(96) | 6) + +#define PINMUX_GPIO97__FUNC_GPIO97 (MTK_PIN_NO(97) | 0) +#define PINMUX_GPIO97__FUNC_DISP_PWM0 (MTK_PIN_NO(97) | 1) +#define PINMUX_GPIO97__FUNC_DVFSRC_EXT_REQ (MTK_PIN_NO(97) | 2) + +#define PINMUX_GPIO98__FUNC_GPIO98 (MTK_PIN_NO(98) | 0) +#define PINMUX_GPIO98__FUNC_UTXD0 (MTK_PIN_NO(98) | 1) + +#define PINMUX_GPIO99__FUNC_GPIO99 (MTK_PIN_NO(99) | 0) +#define PINMUX_GPIO99__FUNC_URXD0 (MTK_PIN_NO(99) | 1) + +#define PINMUX_GPIO100__FUNC_GPIO100 (MTK_PIN_NO(100) | 0) +#define PINMUX_GPIO100__FUNC_URTS1 (MTK_PIN_NO(100) | 1) +#define PINMUX_GPIO100__FUNC_DSI_TE (MTK_PIN_NO(100) | 2) +#define PINMUX_GPIO100__FUNC_I2SO1_D8 (MTK_PIN_NO(100) | 3) +#define PINMUX_GPIO100__FUNC_KPROW2 (MTK_PIN_NO(100) | 4) +#define PINMUX_GPIO100__FUNC_PWM_0 (MTK_PIN_NO(100) | 5) +#define PINMUX_GPIO100__FUNC_TP_URTS1_AO (MTK_PIN_NO(100) | 6) +#define PINMUX_GPIO100__FUNC_I2SIN_D0 (MTK_PIN_NO(100) | 7) + +#define PINMUX_GPIO101__FUNC_GPIO101 (MTK_PIN_NO(101) | 0) +#define PINMUX_GPIO101__FUNC_UCTS1 (MTK_PIN_NO(101) | 1) +#define PINMUX_GPIO101__FUNC_DSI1_TE (MTK_PIN_NO(101) | 2) +#define PINMUX_GPIO101__FUNC_I2SO1_D9 (MTK_PIN_NO(101) | 3) +#define PINMUX_GPIO101__FUNC_KPCOL2 (MTK_PIN_NO(101) | 4) +#define PINMUX_GPIO101__FUNC_PWM_1 (MTK_PIN_NO(101) | 5) +#define PINMUX_GPIO101__FUNC_TP_UCTS1_AO (MTK_PIN_NO(101) | 6) +#define PINMUX_GPIO101__FUNC_I2SIN_D1 (MTK_PIN_NO(101) | 7) + +#define PINMUX_GPIO102__FUNC_GPIO102 (MTK_PIN_NO(102) | 0) +#define PINMUX_GPIO102__FUNC_UTXD1 (MTK_PIN_NO(102) | 1) +#define PINMUX_GPIO102__FUNC_VBUSVALID_2P (MTK_PIN_NO(102) | 2) +#define PINMUX_GPIO102__FUNC_I2SO1_D10 (MTK_PIN_NO(102) | 3) +#define PINMUX_GPIO102__FUNC_SSPM_UTXD_AO (MTK_PIN_NO(102) | 4) +#define PINMUX_GPIO102__FUNC_TP_UTXD1_AO (MTK_PIN_NO(102) | 5) +#define PINMUX_GPIO102__FUNC_MD32_1_TXD (MTK_PIN_NO(102) | 6) +#define PINMUX_GPIO102__FUNC_I2SIN_D2 (MTK_PIN_NO(102) | 7) + +#define PINMUX_GPIO103__FUNC_GPIO103 (MTK_PIN_NO(103) | 0) +#define PINMUX_GPIO103__FUNC_URXD1 (MTK_PIN_NO(103) | 1) +#define PINMUX_GPIO103__FUNC_VBUSVALID_3P (MTK_PIN_NO(103) | 2) +#define PINMUX_GPIO103__FUNC_I2SO1_D11 (MTK_PIN_NO(103) | 3) +#define PINMUX_GPIO103__FUNC_SSPM_URXD_AO (MTK_PIN_NO(103) | 4) +#define PINMUX_GPIO103__FUNC_TP_URXD1_AO (MTK_PIN_NO(103) | 5) +#define PINMUX_GPIO103__FUNC_MD32_1_RXD (MTK_PIN_NO(103) | 6) +#define PINMUX_GPIO103__FUNC_I2SIN_D3 (MTK_PIN_NO(103) | 7) + +#define PINMUX_GPIO104__FUNC_GPIO104 (MTK_PIN_NO(104) | 0) +#define PINMUX_GPIO104__FUNC_KPROW0 (MTK_PIN_NO(104) | 1) +#define PINMUX_GPIO104__FUNC_DISP_PWM1 (MTK_PIN_NO(104) | 2) + +#define PINMUX_GPIO105__FUNC_GPIO105 (MTK_PIN_NO(105) | 0) +#define PINMUX_GPIO105__FUNC_KPROW1 (MTK_PIN_NO(105) | 1) +#define PINMUX_GPIO105__FUNC_EDP_TX_HPD (MTK_PIN_NO(105) | 2) +#define PINMUX_GPIO105__FUNC_PWM_2 (MTK_PIN_NO(105) | 3) + +#define PINMUX_GPIO106__FUNC_GPIO106 (MTK_PIN_NO(106) | 0) +#define PINMUX_GPIO106__FUNC_KPCOL0 (MTK_PIN_NO(106) | 1) + +#define PINMUX_GPIO107__FUNC_GPIO107 (MTK_PIN_NO(107) | 0) +#define PINMUX_GPIO107__FUNC_KPCOL1 (MTK_PIN_NO(107) | 1) +#define PINMUX_GPIO107__FUNC_DSI1_TE (MTK_PIN_NO(107) | 2) +#define PINMUX_GPIO107__FUNC_PWM_3 (MTK_PIN_NO(107) | 3) +#define PINMUX_GPIO107__FUNC_SCP_SCL3 (MTK_PIN_NO(107) | 4) +#define PINMUX_GPIO107__FUNC_I2SIN_MCK (MTK_PIN_NO(107) | 5) + +#define PINMUX_GPIO108__FUNC_GPIO108 (MTK_PIN_NO(108) | 0) +#define PINMUX_GPIO108__FUNC_LCM_RST (MTK_PIN_NO(108) | 1) +#define PINMUX_GPIO108__FUNC_KPCOL1 (MTK_PIN_NO(108) | 2) +#define PINMUX_GPIO108__FUNC_SCP_SDA3 (MTK_PIN_NO(108) | 4) +#define PINMUX_GPIO108__FUNC_I2SIN_BCK (MTK_PIN_NO(108) | 5) + +#define PINMUX_GPIO109__FUNC_GPIO109 (MTK_PIN_NO(109) | 0) +#define PINMUX_GPIO109__FUNC_DSI_TE (MTK_PIN_NO(109) | 1) +#define PINMUX_GPIO109__FUNC_I2SIN_D3 (MTK_PIN_NO(109) | 2) +#define PINMUX_GPIO109__FUNC_I2SIN_WS (MTK_PIN_NO(109) | 5) + +#define PINMUX_GPIO110__FUNC_GPIO110 (MTK_PIN_NO(110) | 0) +#define PINMUX_GPIO110__FUNC_MSDC1_CMD (MTK_PIN_NO(110) | 1) +#define PINMUX_GPIO110__FUNC_JTMS_SEL3 (MTK_PIN_NO(110) | 2) +#define PINMUX_GPIO110__FUNC_UDI_TMS (MTK_PIN_NO(110) | 3) +#define PINMUX_GPIO110__FUNC_CCU1_JTAG_TMS (MTK_PIN_NO(110) | 5) +#define PINMUX_GPIO110__FUNC_IPU_JTAG_TMS (MTK_PIN_NO(110) | 6) + +#define PINMUX_GPIO111__FUNC_GPIO111 (MTK_PIN_NO(111) | 0) +#define PINMUX_GPIO111__FUNC_MSDC1_CLK (MTK_PIN_NO(111) | 1) +#define PINMUX_GPIO111__FUNC_JTCK_SEL3 (MTK_PIN_NO(111) | 2) +#define PINMUX_GPIO111__FUNC_UDI_TCK (MTK_PIN_NO(111) | 3) +#define PINMUX_GPIO111__FUNC_CCU1_JTAG_TCK (MTK_PIN_NO(111) | 5) +#define PINMUX_GPIO111__FUNC_IPU_JTAG_TCK (MTK_PIN_NO(111) | 6) + +#define PINMUX_GPIO112__FUNC_GPIO112 (MTK_PIN_NO(112) | 0) +#define PINMUX_GPIO112__FUNC_MSDC1_DAT0 (MTK_PIN_NO(112) | 1) +#define PINMUX_GPIO112__FUNC_JTDI_SEL3 (MTK_PIN_NO(112) | 2) +#define PINMUX_GPIO112__FUNC_UDI_TDI (MTK_PIN_NO(112) | 3) +#define PINMUX_GPIO112__FUNC_I2SO2_D0 (MTK_PIN_NO(112) | 4) +#define PINMUX_GPIO112__FUNC_CCU1_JTAG_TDI (MTK_PIN_NO(112) | 5) +#define PINMUX_GPIO112__FUNC_IPU_JTAG_TDI (MTK_PIN_NO(112) | 6) + +#define PINMUX_GPIO113__FUNC_GPIO113 (MTK_PIN_NO(113) | 0) +#define PINMUX_GPIO113__FUNC_MSDC1_DAT1 (MTK_PIN_NO(113) | 1) +#define PINMUX_GPIO113__FUNC_JTDO_SEL3 (MTK_PIN_NO(113) | 2) +#define PINMUX_GPIO113__FUNC_UDI_TDO (MTK_PIN_NO(113) | 3) +#define PINMUX_GPIO113__FUNC_I2SO2_D1 (MTK_PIN_NO(113) | 4) +#define PINMUX_GPIO113__FUNC_CCU1_JTAG_TDO (MTK_PIN_NO(113) | 5) +#define PINMUX_GPIO113__FUNC_IPU_JTAG_TDO (MTK_PIN_NO(113) | 6) + +#define PINMUX_GPIO114__FUNC_GPIO114 (MTK_PIN_NO(114) | 0) +#define PINMUX_GPIO114__FUNC_MSDC1_DAT2 (MTK_PIN_NO(114) | 1) +#define PINMUX_GPIO114__FUNC_JTRSTn_SEL3 (MTK_PIN_NO(114) | 2) +#define PINMUX_GPIO114__FUNC_UDI_NTRST (MTK_PIN_NO(114) | 3) +#define PINMUX_GPIO114__FUNC_I2SO2_D2 (MTK_PIN_NO(114) | 4) +#define PINMUX_GPIO114__FUNC_CCU1_JTAG_TRST (MTK_PIN_NO(114) | 5) +#define PINMUX_GPIO114__FUNC_IPU_JTAG_TRST (MTK_PIN_NO(114) | 6) + +#define PINMUX_GPIO115__FUNC_GPIO115 (MTK_PIN_NO(115) | 0) +#define PINMUX_GPIO115__FUNC_MSDC1_DAT3 (MTK_PIN_NO(115) | 1) +#define PINMUX_GPIO115__FUNC_I2SO2_D3 (MTK_PIN_NO(115) | 4) +#define PINMUX_GPIO115__FUNC_MD32_1_GPIO2 (MTK_PIN_NO(115) | 6) + +#define PINMUX_GPIO116__FUNC_GPIO116 (MTK_PIN_NO(116) | 0) +#define PINMUX_GPIO116__FUNC_MSDC0_DAT7 (MTK_PIN_NO(116) | 1) + +#define PINMUX_GPIO117__FUNC_GPIO117 (MTK_PIN_NO(117) | 0) +#define PINMUX_GPIO117__FUNC_MSDC0_DAT6 (MTK_PIN_NO(117) | 1) + +#define PINMUX_GPIO118__FUNC_GPIO118 (MTK_PIN_NO(118) | 0) +#define PINMUX_GPIO118__FUNC_MSDC0_DAT5 (MTK_PIN_NO(118) | 1) + +#define PINMUX_GPIO119__FUNC_GPIO119 (MTK_PIN_NO(119) | 0) +#define PINMUX_GPIO119__FUNC_MSDC0_DAT4 (MTK_PIN_NO(119) | 1) + +#define PINMUX_GPIO120__FUNC_GPIO120 (MTK_PIN_NO(120) | 0) +#define PINMUX_GPIO120__FUNC_MSDC0_RSTB (MTK_PIN_NO(120) | 1) + +#define PINMUX_GPIO121__FUNC_GPIO121 (MTK_PIN_NO(121) | 0) +#define PINMUX_GPIO121__FUNC_MSDC0_CMD (MTK_PIN_NO(121) | 1) + +#define PINMUX_GPIO122__FUNC_GPIO122 (MTK_PIN_NO(122) | 0) +#define PINMUX_GPIO122__FUNC_MSDC0_CLK (MTK_PIN_NO(122) | 1) + +#define PINMUX_GPIO123__FUNC_GPIO123 (MTK_PIN_NO(123) | 0) +#define PINMUX_GPIO123__FUNC_MSDC0_DAT3 (MTK_PIN_NO(123) | 1) + +#define PINMUX_GPIO124__FUNC_GPIO124 (MTK_PIN_NO(124) | 0) +#define PINMUX_GPIO124__FUNC_MSDC0_DAT2 (MTK_PIN_NO(124) | 1) + +#define PINMUX_GPIO125__FUNC_GPIO125 (MTK_PIN_NO(125) | 0) +#define PINMUX_GPIO125__FUNC_MSDC0_DAT1 (MTK_PIN_NO(125) | 1) + +#define PINMUX_GPIO126__FUNC_GPIO126 (MTK_PIN_NO(126) | 0) +#define PINMUX_GPIO126__FUNC_MSDC0_DAT0 (MTK_PIN_NO(126) | 1) + +#define PINMUX_GPIO127__FUNC_GPIO127 (MTK_PIN_NO(127) | 0) +#define PINMUX_GPIO127__FUNC_MSDC0_DSL (MTK_PIN_NO(127) | 1) + +#define PINMUX_GPIO128__FUNC_GPIO128 (MTK_PIN_NO(128) | 0) +#define PINMUX_GPIO128__FUNC_IDDIG (MTK_PIN_NO(128) | 1) +#define PINMUX_GPIO128__FUNC_UCTS2 (MTK_PIN_NO(128) | 2) +#define PINMUX_GPIO128__FUNC_UTXD5 (MTK_PIN_NO(128) | 3) +#define PINMUX_GPIO128__FUNC_UFS_MPHY_SCL (MTK_PIN_NO(128) | 4) +#define PINMUX_GPIO128__FUNC_mbistreaden_trigger (MTK_PIN_NO(128) | 5) +#define PINMUX_GPIO128__FUNC_MD32_1_GPIO0 (MTK_PIN_NO(128) | 6) +#define PINMUX_GPIO128__FUNC_SCP_SCL2 (MTK_PIN_NO(128) | 7) + +#define PINMUX_GPIO129__FUNC_GPIO129 (MTK_PIN_NO(129) | 0) +#define PINMUX_GPIO129__FUNC_USB_DRVVBUS (MTK_PIN_NO(129) | 1) +#define PINMUX_GPIO129__FUNC_URTS2 (MTK_PIN_NO(129) | 2) +#define PINMUX_GPIO129__FUNC_URXD5 (MTK_PIN_NO(129) | 3) +#define PINMUX_GPIO129__FUNC_UFS_MPHY_SDA (MTK_PIN_NO(129) | 4) +#define PINMUX_GPIO129__FUNC_mbistwriteen_trigger (MTK_PIN_NO(129) | 5) +#define PINMUX_GPIO129__FUNC_MD32_1_GPIO1 (MTK_PIN_NO(129) | 6) +#define PINMUX_GPIO129__FUNC_SCP_SDA2 (MTK_PIN_NO(129) | 7) + +#define PINMUX_GPIO130__FUNC_GPIO130 (MTK_PIN_NO(130) | 0) +#define PINMUX_GPIO130__FUNC_IDDIG_1P (MTK_PIN_NO(130) | 1) +#define PINMUX_GPIO130__FUNC_SPINOR_IO2 (MTK_PIN_NO(130) | 2) +#define PINMUX_GPIO130__FUNC_SNFI_WP (MTK_PIN_NO(130) | 3) +#define PINMUX_GPIO130__FUNC_VPU_UDI_NTRST (MTK_PIN_NO(130) | 4) + +#define PINMUX_GPIO131__FUNC_GPIO131 (MTK_PIN_NO(131) | 0) +#define PINMUX_GPIO131__FUNC_USB_DRVVBUS_1P (MTK_PIN_NO(131) | 1) +#define PINMUX_GPIO131__FUNC_SPINOR_IO3 (MTK_PIN_NO(131) | 2) +#define PINMUX_GPIO131__FUNC_SNFI_HOLD (MTK_PIN_NO(131) | 3) +#define PINMUX_GPIO131__FUNC_MD32_1_JTAG_TRST (MTK_PIN_NO(131) | 4) +#define PINMUX_GPIO131__FUNC_SCP_JTAG0_TRSTN (MTK_PIN_NO(131) | 5) +#define PINMUX_GPIO131__FUNC_APU_JTAG_TRST (MTK_PIN_NO(131) | 6) + +#define PINMUX_GPIO132__FUNC_GPIO132 (MTK_PIN_NO(132) | 0) +#define PINMUX_GPIO132__FUNC_SPIM0_CSB (MTK_PIN_NO(132) | 1) +#define PINMUX_GPIO132__FUNC_SCP_SPI0_CS (MTK_PIN_NO(132) | 2) +#define PINMUX_GPIO132__FUNC_SPIS0_CSB (MTK_PIN_NO(132) | 3) +#define PINMUX_GPIO132__FUNC_VPU_UDI_TMS (MTK_PIN_NO(132) | 4) +#define PINMUX_GPIO132__FUNC_I2SO5_D0 (MTK_PIN_NO(132) | 6) + +#define PINMUX_GPIO133__FUNC_GPIO133 (MTK_PIN_NO(133) | 0) +#define PINMUX_GPIO133__FUNC_SPIM0_CLK (MTK_PIN_NO(133) | 1) +#define PINMUX_GPIO133__FUNC_SCP_SPI0_CK (MTK_PIN_NO(133) | 2) +#define PINMUX_GPIO133__FUNC_SPIS0_CLK (MTK_PIN_NO(133) | 3) +#define PINMUX_GPIO133__FUNC_VPU_UDI_TCK (MTK_PIN_NO(133) | 4) +#define PINMUX_GPIO133__FUNC_I2SO5_BCK (MTK_PIN_NO(133) | 6) + +#define PINMUX_GPIO134__FUNC_GPIO134 (MTK_PIN_NO(134) | 0) +#define PINMUX_GPIO134__FUNC_SPIM0_MO (MTK_PIN_NO(134) | 1) +#define PINMUX_GPIO134__FUNC_SCP_SPI0_MO (MTK_PIN_NO(134) | 2) +#define PINMUX_GPIO134__FUNC_SPIS0_SI (MTK_PIN_NO(134) | 3) +#define PINMUX_GPIO134__FUNC_VPU_UDI_TDO (MTK_PIN_NO(134) | 4) +#define PINMUX_GPIO134__FUNC_I2SO5_WS (MTK_PIN_NO(134) | 6) + +#define PINMUX_GPIO135__FUNC_GPIO135 (MTK_PIN_NO(135) | 0) +#define PINMUX_GPIO135__FUNC_SPIM0_MI (MTK_PIN_NO(135) | 1) +#define PINMUX_GPIO135__FUNC_SCP_SPI0_MI (MTK_PIN_NO(135) | 2) +#define PINMUX_GPIO135__FUNC_SPIS0_SO (MTK_PIN_NO(135) | 3) +#define PINMUX_GPIO135__FUNC_VPU_UDI_TDI (MTK_PIN_NO(135) | 4) +#define PINMUX_GPIO135__FUNC_I2SO5_MCK (MTK_PIN_NO(135) | 6) + +#define PINMUX_GPIO136__FUNC_GPIO136 (MTK_PIN_NO(136) | 0) +#define PINMUX_GPIO136__FUNC_SPIM1_CSB (MTK_PIN_NO(136) | 1) +#define PINMUX_GPIO136__FUNC_SCP_SPI1_A_CS (MTK_PIN_NO(136) | 2) +#define PINMUX_GPIO136__FUNC_SPIS1_CSB (MTK_PIN_NO(136) | 3) +#define PINMUX_GPIO136__FUNC_MD32_1_JTAG_TMS (MTK_PIN_NO(136) | 4) +#define PINMUX_GPIO136__FUNC_SCP_JTAG0_TMS (MTK_PIN_NO(136) | 5) +#define PINMUX_GPIO136__FUNC_APU_JTAG_TMS (MTK_PIN_NO(136) | 6) +#define PINMUX_GPIO136__FUNC_DBG_MON_A15 (MTK_PIN_NO(136) | 7) + +#define PINMUX_GPIO137__FUNC_GPIO137 (MTK_PIN_NO(137) | 0) +#define PINMUX_GPIO137__FUNC_SPIM1_CLK (MTK_PIN_NO(137) | 1) +#define PINMUX_GPIO137__FUNC_SCP_SPI1_A_CK (MTK_PIN_NO(137) | 2) +#define PINMUX_GPIO137__FUNC_SPIS1_CLK (MTK_PIN_NO(137) | 3) +#define PINMUX_GPIO137__FUNC_MD32_1_JTAG_TCK (MTK_PIN_NO(137) | 4) +#define PINMUX_GPIO137__FUNC_SCP_JTAG0_TCK (MTK_PIN_NO(137) | 5) +#define PINMUX_GPIO137__FUNC_APU_JTAG_TCK (MTK_PIN_NO(137) | 6) +#define PINMUX_GPIO137__FUNC_DBG_MON_A14 (MTK_PIN_NO(137) | 7) + +#define PINMUX_GPIO138__FUNC_GPIO138 (MTK_PIN_NO(138) | 0) +#define PINMUX_GPIO138__FUNC_SPIM1_MO (MTK_PIN_NO(138) | 1) +#define PINMUX_GPIO138__FUNC_SCP_SPI1_A_MO (MTK_PIN_NO(138) | 2) +#define PINMUX_GPIO138__FUNC_SPIS1_SI (MTK_PIN_NO(138) | 3) +#define PINMUX_GPIO138__FUNC_MD32_1_JTAG_TDO (MTK_PIN_NO(138) | 4) +#define PINMUX_GPIO138__FUNC_SCP_JTAG0_TDO (MTK_PIN_NO(138) | 5) +#define PINMUX_GPIO138__FUNC_APU_JTAG_TDO (MTK_PIN_NO(138) | 6) +#define PINMUX_GPIO138__FUNC_DBG_MON_A16 (MTK_PIN_NO(138) | 7) + +#define PINMUX_GPIO139__FUNC_GPIO139 (MTK_PIN_NO(139) | 0) +#define PINMUX_GPIO139__FUNC_SPIM1_MI (MTK_PIN_NO(139) | 1) +#define PINMUX_GPIO139__FUNC_SCP_SPI1_A_MI (MTK_PIN_NO(139) | 2) +#define PINMUX_GPIO139__FUNC_SPIS1_SO (MTK_PIN_NO(139) | 3) +#define PINMUX_GPIO139__FUNC_MD32_1_JTAG_TDI (MTK_PIN_NO(139) | 4) +#define PINMUX_GPIO139__FUNC_SCP_JTAG0_TDI (MTK_PIN_NO(139) | 5) +#define PINMUX_GPIO139__FUNC_APU_JTAG_TDI (MTK_PIN_NO(139) | 6) +#define PINMUX_GPIO139__FUNC_DBG_MON_A17 (MTK_PIN_NO(139) | 7) + +#define PINMUX_GPIO140__FUNC_GPIO140 (MTK_PIN_NO(140) | 0) +#define PINMUX_GPIO140__FUNC_SPIM2_CSB (MTK_PIN_NO(140) | 1) +#define PINMUX_GPIO140__FUNC_SPINOR_CS (MTK_PIN_NO(140) | 2) +#define PINMUX_GPIO140__FUNC_SNFI_CS (MTK_PIN_NO(140) | 3) +#define PINMUX_GPIO140__FUNC_DMIC3_DAT (MTK_PIN_NO(140) | 4) +#define PINMUX_GPIO140__FUNC_DBG_MON_A11 (MTK_PIN_NO(140) | 7) + +#define PINMUX_GPIO141__FUNC_GPIO141 (MTK_PIN_NO(141) | 0) +#define PINMUX_GPIO141__FUNC_SPIM2_CLK (MTK_PIN_NO(141) | 1) +#define PINMUX_GPIO141__FUNC_SPINOR_CK (MTK_PIN_NO(141) | 2) +#define PINMUX_GPIO141__FUNC_SNFI_CLK (MTK_PIN_NO(141) | 3) +#define PINMUX_GPIO141__FUNC_DMIC3_CLK (MTK_PIN_NO(141) | 4) +#define PINMUX_GPIO141__FUNC_DBG_MON_A10 (MTK_PIN_NO(141) | 7) + +#define PINMUX_GPIO142__FUNC_GPIO142 (MTK_PIN_NO(142) | 0) +#define PINMUX_GPIO142__FUNC_SPIM2_MO (MTK_PIN_NO(142) | 1) +#define PINMUX_GPIO142__FUNC_SPINOR_IO0 (MTK_PIN_NO(142) | 2) +#define PINMUX_GPIO142__FUNC_SNFI_MOSI (MTK_PIN_NO(142) | 3) +#define PINMUX_GPIO142__FUNC_DMIC4_DAT (MTK_PIN_NO(142) | 4) +#define PINMUX_GPIO142__FUNC_DBG_MON_A12 (MTK_PIN_NO(142) | 7) + +#define PINMUX_GPIO143__FUNC_GPIO143 (MTK_PIN_NO(143) | 0) +#define PINMUX_GPIO143__FUNC_SPIM2_MI (MTK_PIN_NO(143) | 1) +#define PINMUX_GPIO143__FUNC_SPINOR_IO1 (MTK_PIN_NO(143) | 2) +#define PINMUX_GPIO143__FUNC_SNFI_MISO (MTK_PIN_NO(143) | 3) +#define PINMUX_GPIO143__FUNC_DMIC4_CLK (MTK_PIN_NO(143) | 4) +#define PINMUX_GPIO143__FUNC_DBG_MON_A13 (MTK_PIN_NO(143) | 7) + +#endif /* __MT8195-PINFUNC_H */ -- cgit v1.2.3 From 1dccb5ec01231156b06420633e3eedef95b17eac Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Thu, 15 Apr 2021 16:43:12 +0530 Subject: dt-bindings: pinctrl: Add binding for ZynqMP pinctrl driver Adding documentation and dt-bindings file which contains MIO pin configuration defines for Xilinx ZynqMP pinctrl driver. Signed-off-by: Sai Krishna Potthuri Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/1618485193-5403-3-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com Signed-off-by: Linus Walleij --- include/dt-bindings/pinctrl/pinctrl-zynqmp.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/dt-bindings/pinctrl/pinctrl-zynqmp.h (limited to 'include') diff --git a/include/dt-bindings/pinctrl/pinctrl-zynqmp.h b/include/dt-bindings/pinctrl/pinctrl-zynqmp.h new file mode 100644 index 000000000000..cdb215734bdf --- /dev/null +++ b/include/dt-bindings/pinctrl/pinctrl-zynqmp.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * MIO pin configuration defines for Xilinx ZynqMP + * + * Copyright (C) 2020 Xilinx, Inc. + */ + +#ifndef _DT_BINDINGS_PINCTRL_ZYNQMP_H +#define _DT_BINDINGS_PINCTRL_ZYNQMP_H + +/* Bit value for different voltage levels */ +#define IO_STANDARD_LVCMOS33 0 +#define IO_STANDARD_LVCMOS18 1 + +/* Bit values for Slew Rates */ +#define SLEW_RATE_FAST 0 +#define SLEW_RATE_SLOW 1 + +#endif /* _DT_BINDINGS_PINCTRL_ZYNQMP_H */ -- cgit v1.2.3 From 1de15e99a242a66ef4f803fe1ad357f86b3a75f8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Apr 2021 17:07:39 +0300 Subject: pinctrl: Keep enum pin_config_param ordered by name It seems the ordering is by name. Keep it that way. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210412140741.39946-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 6aeb711f7cd1..188db8d84162 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -90,6 +90,7 @@ struct pinctrl_map; * value on the line. Use argument 1 to indicate high level, argument 0 to * indicate low level. (Please see Documentation/driver-api/pinctl.rst, * section "GPIO mode pitfalls" for a discussion around this parameter.) + * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power * supplies, the argument to this parameter (on a custom format) tells * the driver which alternative power source to use. @@ -101,7 +102,6 @@ struct pinctrl_map; * or latch delay (on outputs) this parameter (in a custom format) * specifies the clock skew or latch delay. It typically controls how * many double inverters are put in front of the line. - * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset * @PIN_CONFIG_END: this is the last enumerator for pin configurations, if * you need to pass in custom configurations to the pin controller, use * PIN_CONFIG_END+1 as the base offset. @@ -127,11 +127,11 @@ enum pin_config_param { PIN_CONFIG_LOW_POWER_MODE, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT, + PIN_CONFIG_PERSIST_STATE, PIN_CONFIG_POWER_SOURCE, PIN_CONFIG_SLEEP_HARDWARE_STATE, PIN_CONFIG_SLEW_RATE, PIN_CONFIG_SKEW_DELAY, - PIN_CONFIG_PERSIST_STATE, PIN_CONFIG_END = 0x7F, PIN_CONFIG_MAX = 0xFF, }; -- cgit v1.2.3 From 31f9a421a1d01538776db37ec9c5419a3a49d650 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Apr 2021 17:07:40 +0300 Subject: pinctrl: Introduce MODE group in enum pin_config_param Better to have a MODE group of settings to keep them together when ordered alphabetically. Hence, rename PIN_CONFIG_LOW_POWER_MODE to PIN_CONFIG_MODE_LOW_POWER. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210412140741.39946-2-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 188db8d84162..189e701832ea 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -76,7 +76,7 @@ struct pinctrl_map; * @PIN_CONFIG_INPUT_SCHMITT_ENABLE: control schmitt-trigger mode on the pin. * If the argument != 0, schmitt-trigger mode is enabled. If it's 0, * schmitt-trigger mode is disabled. - * @PIN_CONFIG_LOW_POWER_MODE: this will configure the pin for low power + * @PIN_CONFIG_MODE_LOW_POWER: this will configure the pin for low power * operation, if several modes of operation are supported these can be * passed in the argument on a custom form, else just use argument 1 * to indicate low power mode, argument 0 turns low power mode off. @@ -124,7 +124,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_ENABLE, PIN_CONFIG_INPUT_SCHMITT, PIN_CONFIG_INPUT_SCHMITT_ENABLE, - PIN_CONFIG_LOW_POWER_MODE, + PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT, PIN_CONFIG_PERSIST_STATE, -- cgit v1.2.3 From 09e11caaa4cffac681963688b774e1aa3063b3a9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Apr 2021 17:07:41 +0300 Subject: pinctrl: Add PIN_CONFIG_MODE_PWM to enum pin_config_param It seems that we will have more and more pin controllers that support PWM function on the (selected) pins. Due to it being a part of pin controller IP the idea is to have some code that will switch the mode and attach the corresponding driver, for example, via using it as a library. Meanwhile, put a corresponding item to the pin_config_param enumerator. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210412140741.39946-3-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 189e701832ea..e18ab3d5908f 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -80,6 +80,7 @@ struct pinctrl_map; * operation, if several modes of operation are supported these can be * passed in the argument on a custom form, else just use argument 1 * to indicate low power mode, argument 0 turns low power mode off. + * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode * without driving a value there. For most platforms this reduces to * enable the output buffers and then let the pin controller current @@ -125,6 +126,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_SCHMITT, PIN_CONFIG_INPUT_SCHMITT_ENABLE, PIN_CONFIG_MODE_LOW_POWER, + PIN_CONFIG_MODE_PWM, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT, PIN_CONFIG_PERSIST_STATE, -- cgit v1.2.3 From 85367040511f8402d7e4054d8c17b053c75e33ff Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 21 Apr 2021 23:45:26 +0800 Subject: scsi: blk-mq: Fix build warning when making htmldocs Fixes the following warning when running 'make htmldocs': include/linux/blk-mq.h:395: warning: Function parameter or member 'set_rq_budget_token' not described in 'blk_mq_ops' include/linux/blk-mq.h:395: warning: Function parameter or member 'get_rq_budget_token' not described in 'blk_mq_ops' [mkp: added warning messages] Link: https://lore.kernel.org/r/20210421154526.1954174-1-ming.lei@redhat.com Fixes: d022d18c045f ("scsi: blk-mq: Add callbacks for storing & retrieving budget token") Reported-by: Stephen Rothwell Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen --- include/linux/blk-mq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3bd3ee651143..359486940fa0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -313,12 +313,12 @@ struct blk_mq_ops { */ void (*put_budget)(struct request_queue *, int); - /* - * @set_rq_budget_toekn: store rq's budget token + /** + * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); - /* - * @get_rq_budget_toekn: retrieve rq's budget token + /** + * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); -- cgit v1.2.3 From fa989ae7c7b38efbc6c3370571fb8a6f7350029a Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Thu, 22 Apr 2021 14:00:00 +0530 Subject: firmware: xilinx: Add pinctrl support Adding pinctrl support to query platform specific information (pins) from firmware. Signed-off-by: Sai Krishna Potthuri Acked-by: Michal Simek Link: https://lore.kernel.org/r/1619080202-31924-2-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com Signed-off-by: Linus Walleij --- include/linux/firmware/xlnx-zynqmp.h | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 71177b17eee5..8285a4bcfc2d 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -72,6 +72,12 @@ enum pm_api_id { PM_FPGA_LOAD = 22, PM_FPGA_GET_STATUS = 23, PM_GET_CHIPID = 24, + PM_PINCTRL_REQUEST = 28, + PM_PINCTRL_RELEASE = 29, + PM_PINCTRL_GET_FUNCTION = 30, + PM_PINCTRL_SET_FUNCTION = 31, + PM_PINCTRL_CONFIG_PARAM_GET = 32, + PM_PINCTRL_CONFIG_PARAM_SET = 33, PM_IOCTL = 34, PM_QUERY_DATA = 35, PM_CLOCK_ENABLE = 36, @@ -122,6 +128,12 @@ enum pm_query_id { PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS = 3, PM_QID_CLOCK_GET_PARENTS = 4, PM_QID_CLOCK_GET_ATTRIBUTES = 5, + PM_QID_PINCTRL_GET_NUM_PINS = 6, + PM_QID_PINCTRL_GET_NUM_FUNCTIONS = 7, + PM_QID_PINCTRL_GET_NUM_FUNCTION_GROUPS = 8, + PM_QID_PINCTRL_GET_FUNCTION_NAME = 9, + PM_QID_PINCTRL_GET_FUNCTION_GROUPS = 10, + PM_QID_PINCTRL_GET_PIN_GROUPS = 11, PM_QID_CLOCK_GET_NUM_CLOCKS = 12, PM_QID_CLOCK_GET_MAX_DIVISOR = 13, }; @@ -285,6 +297,44 @@ enum dll_reset_type { PM_DLL_RESET_PULSE = 2, }; +enum pm_pinctrl_config_param { + PM_PINCTRL_CONFIG_SLEW_RATE = 0, + PM_PINCTRL_CONFIG_BIAS_STATUS = 1, + PM_PINCTRL_CONFIG_PULL_CTRL = 2, + PM_PINCTRL_CONFIG_SCHMITT_CMOS = 3, + PM_PINCTRL_CONFIG_DRIVE_STRENGTH = 4, + PM_PINCTRL_CONFIG_VOLTAGE_STATUS = 5, + PM_PINCTRL_CONFIG_TRI_STATE = 6, + PM_PINCTRL_CONFIG_MAX = 7, +}; + +enum pm_pinctrl_slew_rate { + PM_PINCTRL_SLEW_RATE_FAST = 0, + PM_PINCTRL_SLEW_RATE_SLOW = 1, +}; + +enum pm_pinctrl_bias_status { + PM_PINCTRL_BIAS_DISABLE = 0, + PM_PINCTRL_BIAS_ENABLE = 1, +}; + +enum pm_pinctrl_pull_ctrl { + PM_PINCTRL_BIAS_PULL_DOWN = 0, + PM_PINCTRL_BIAS_PULL_UP = 1, +}; + +enum pm_pinctrl_schmitt_cmos { + PM_PINCTRL_INPUT_TYPE_CMOS = 0, + PM_PINCTRL_INPUT_TYPE_SCHMITT = 1, +}; + +enum pm_pinctrl_drive_strength { + PM_PINCTRL_DRIVE_STRENGTH_2MA = 0, + PM_PINCTRL_DRIVE_STRENGTH_4MA = 1, + PM_PINCTRL_DRIVE_STRENGTH_8MA = 2, + PM_PINCTRL_DRIVE_STRENGTH_12MA = 3, +}; + enum zynqmp_pm_shutdown_type { ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN = 0, ZYNQMP_PM_SHUTDOWN_TYPE_RESET = 1, @@ -353,6 +403,14 @@ int zynqmp_pm_write_pggs(u32 index, u32 value); int zynqmp_pm_read_pggs(u32 index, u32 *value); int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype); int zynqmp_pm_set_boot_health_status(u32 value); +int zynqmp_pm_pinctrl_request(const u32 pin); +int zynqmp_pm_pinctrl_release(const u32 pin); +int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id); +int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id); +int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param, + u32 *value); +int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param, + u32 value); #else static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { @@ -537,6 +595,38 @@ static inline int zynqmp_pm_set_boot_health_status(u32 value) { return -ENODEV; } + +static inline int zynqmp_pm_pinctrl_request(const u32 pin) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_pinctrl_release(const u32 pin) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param, + u32 *value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param, + u32 value) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From d60d6e7adfc3814f6de03c978ff1daab21478f87 Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Thu, 21 Jan 2021 21:34:05 -0500 Subject: thermal/core: Remove thermal_notify_framework thermal_notify_framework just updates for a single trip point where as thermal_zone_device_update does other bookkeeping like updating the temperature of the thermal zone and setting the next trip point. The only driver that was using thermal_notify_framework was updated in the previous patch to use thermal_zone_device_update instead. Since there are no users for thermal_notify_framework remove it. Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210122023406.3500424-3-thara.gopinath@linaro.org --- include/linux/thermal.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 169502164364..d296f3b88fb9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -390,7 +390,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); int thermal_zone_get_slope(struct thermal_zone_device *tz); int thermal_zone_get_offset(struct thermal_zone_device *tz); -void thermal_notify_framework(struct thermal_zone_device *, int); int thermal_zone_device_enable(struct thermal_zone_device *tz); int thermal_zone_device_disable(struct thermal_zone_device *tz); void thermal_zone_device_critical(struct thermal_zone_device *tz); @@ -436,10 +435,6 @@ static inline int thermal_zone_get_offset( struct thermal_zone_device *tz) { return -ENODEV; } -static inline void thermal_notify_framework(struct thermal_zone_device *tz, - int trip) -{ } - static inline int thermal_zone_device_enable(struct thermal_zone_device *tz) { return -ENODEV; } -- cgit v1.2.3 From 5421db1be3b11c5e469cce3760d5c8a013a90f2c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Apr 2021 14:44:05 +0100 Subject: KVM: arm64: Divorce the perf code from oprofile helpers KVM/arm64 is the sole user of perf_num_counters(), and really could do without it. Stop using the obsolete API by relying on the existing probing code. Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210414134409.1266357-2-maz@kernel.org --- include/kvm/arm_pmu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 6fd3cda608e4..864b9997efb2 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -61,6 +61,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); +int kvm_pmu_probe_pmuver(void); #else struct kvm_pmu { }; @@ -116,6 +117,9 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { return 0; } + +static inline int kvm_pmu_probe_pmuver(void) { return 0xf; } + #endif #endif -- cgit v1.2.3 From 7f318847a0f37b96d8927e8d30ae7b8f149b11f1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Apr 2021 14:44:09 +0100 Subject: perf: Get rid of oprofile leftovers perf_pmu_name() and perf_num_counters() are unused. Drop them. Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210414134409.1266357-6-maz@kernel.org --- include/linux/perf_event.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3f7f89ea5e51..51154ed9a206 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -951,8 +951,6 @@ extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); -extern int perf_num_counters(void); -extern const char *perf_pmu_name(void); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, -- cgit v1.2.3 From 12ce208f40c757e70d7af60100e814be0d2f01bd Mon Sep 17 00:00:00 2001 From: Neta Ostrovsky Date: Sun, 18 Apr 2021 16:41:23 +0300 Subject: RDMA/nldev: Return context information Extend the RDMA nldev return a context information, like ctx number and process ID that created that context. This functionality is helpful to find orphan contexts that are not closed for some reason. Sample output: $ rdma res show ctx dev ibp8s0f0 ctxn 0 pid 980 comm ibv_rc_pingpong dev ibp8s0f0 ctxn 1 pid 981 comm ibv_rc_pingpong dev ibp8s0f0 ctxn 2 pid 992 comm ibv_rc_pingpong dev ibp8s0f1 ctxn 0 pid 984 comm ibv_rc_pingpong dev ibp8s0f1 ctxn 1 pid 987 comm ibv_rc_pingpong $ rdma res show ctx dev ibp8s0f1 dev ibp8s0f1 ctxn 0 pid 984 comm ibv_rc_pingpong dev ibp8s0f1 ctxn 1 pid 987 comm ibv_rc_pingpong Link: https://lore.kernel.org/r/5c956acfeac4e9d532988575f3da7d64cb449374.1618753110.git.leonro@nvidia.com Signed-off-by: Neta Ostrovsky Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index d2f5b8396243..9ec4d4e241fa 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -293,6 +293,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_MR_GET_RAW, + RDMA_NLDEV_CMD_RES_CTX_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -533,6 +535,9 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_RAW, /* binary */ + RDMA_NLDEV_ATTR_RES_CTX, /* nested table */ + RDMA_NLDEV_ATTR_RES_CTX_ENTRY, /* nested table */ + /* * Always the end */ -- cgit v1.2.3 From 48f8a70e899fa4d9c8f00369f482f0382173ece9 Mon Sep 17 00:00:00 2001 From: Neta Ostrovsky Date: Sun, 18 Apr 2021 16:41:24 +0300 Subject: RDMA/restrack: Add support to get resource tracking for SRQ In order to track SRQ resources, a new restrack object is initialized and added to the resource tracking database. Link: https://lore.kernel.org/r/0db71c409f24f2f6b019bf8797a8fed96fe7079c.1618753110.git.leonro@nvidia.com Signed-off-by: Neta Ostrovsky Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 5 +++++ include/rdma/restrack.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c596882893ae..7e2f3699b898 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1610,6 +1610,11 @@ struct ib_srq { } xrc; }; } ext; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; }; enum ib_raw_packet_caps { diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 05e18839eaff..79d109c47242 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -49,6 +49,10 @@ enum rdma_restrack_type { * @RDMA_RESTRACK_COUNTER: Statistic Counter */ RDMA_RESTRACK_COUNTER, + /** + * @RDMA_RESTRACK_SRQ: Shared receive queue (SRQ) + */ + RDMA_RESTRACK_SRQ, /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ -- cgit v1.2.3 From 391c6bd5ac80094a5a8984d7ca20df7e3ec5b837 Mon Sep 17 00:00:00 2001 From: Neta Ostrovsky Date: Sun, 18 Apr 2021 16:41:25 +0300 Subject: RDMA/nldev: Return SRQ information Extend the RDMA nldev return a SRQ information, like SRQ number, SRQ type, PD number, CQ number and process ID that created that SRQ. Sample output: $ rdma res show srq dev ibp8s0f0 srqn 0 type BASIC pdn 3 comm [ib_ipoib] dev ibp8s0f0 srqn 4 type BASIC pdn 9 pid 3581 comm ibv_srq_pingpon dev ibp8s0f0 srqn 5 type BASIC pdn 10 pid 3584 comm ibv_srq_pingpon dev ibp8s0f0 srqn 6 type BASIC pdn 11 pid 3590 comm ibv_srq_pingpon dev ibp8s0f1 srqn 0 type BASIC pdn 3 comm [ib_ipoib] dev ibp8s0f1 srqn 1 type BASIC pdn 4 pid 3586 comm ibv_srq_pingpon Link: https://lore.kernel.org/r/322f9210b95812799190dd4a0fb92f3a3bba0333.1618753110.git.leonro@nvidia.com Signed-off-by: Neta Ostrovsky Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 9ec4d4e241fa..9abce20d39ad 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -295,6 +295,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_CTX_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_SRQ_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -538,6 +540,10 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_CTX, /* nested table */ RDMA_NLDEV_ATTR_RES_CTX_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SRQ, /* nested table */ + RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SRQN, /* u32 */ + /* * Always the end */ -- cgit v1.2.3 From c6c11ad3ab9fe5eb279479879e3461da99f6fdf0 Mon Sep 17 00:00:00 2001 From: Neta Ostrovsky Date: Sun, 18 Apr 2021 16:41:26 +0300 Subject: RDMA/nldev: Add QP numbers to SRQ information Add QP numbers that are associated with the SRQ to the SRQ information. The QPs are displayed in a range form. Sample output: $ rdma res show srq dev ibp8s0f0 srqn 0 type BASIC pdn 3 comm [ib_ipoib] dev ibp8s0f0 srqn 4 type BASIC lqpn 125-128,130-140 pdn 9 pid 3581 comm ibv_srq_pingpon dev ibp8s0f0 srqn 5 type BASIC lqpn 141-156 pdn 10 pid 3584 comm ibv_srq_pingpon dev ibp8s0f0 srqn 6 type BASIC lqpn 157-172 pdn 11 pid 3590 comm ibv_srq_pingpon dev ibp8s0f1 srqn 0 type BASIC pdn 3 comm [ib_ipoib] dev ibp8s0f1 srqn 1 type BASIC lqpn 329-344 pdn 4 pid 3586 comm ibv_srq_pingpon $ rdma res show srq lqpn 126-141 dev ibp8s0f0 srqn 4 type BASIC lqpn 126-128,130-140 pdn 9 pid 3581 comm ibv_srq_pingpon dev ibp8s0f0 srqn 5 type BASIC lqpn 141 pdn 10 pid 3584 comm ibv_srq_pingpon $ rdma res show srq lqpn 127 dev ibp8s0f0 srqn 4 type BASIC lqpn 127 pdn 9 pid 3581 comm ibv_srq_pingpon Link: https://lore.kernel.org/r/79a4bd4caec2248fd9583cccc26786af8e4414fc.1618753110.git.leonro@nvidia.com Signed-off-by: Neta Ostrovsky Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 9abce20d39ad..2c8d405ec924 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -544,6 +544,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, /* nested table */ RDMA_NLDEV_ATTR_RES_SRQN, /* u32 */ + RDMA_NLDEV_ATTR_MIN_RANGE, /* u32 */ + RDMA_NLDEV_ATTR_MAX_RANGE, /* u32 */ /* * Always the end */ -- cgit v1.2.3 From 1aea7808372eee4ad01f98e064c88c57f1e94855 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Thu, 22 Apr 2021 17:41:15 +0200 Subject: LSM: Infrastructure management of the superblock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move management of the superblock->sb_security blob out of the individual security modules and into the security infrastructure. Instead of allocating the blobs from within the modules, the modules tell the infrastructure how much space is required, and the space is allocated there. Cc: John Johansen Signed-off-by: Casey Schaufler Signed-off-by: Mickaël Salaün Reviewed-by: Stephen Smalley Acked-by: Serge Hallyn Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210422154123.13086-6-mic@digikod.net Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index fb7f3193753d..75715998a95f 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1573,6 +1573,7 @@ struct lsm_blob_sizes { int lbs_cred; int lbs_file; int lbs_inode; + int lbs_superblock; int lbs_ipc; int lbs_msg_msg; int lbs_task; -- cgit v1.2.3 From cb2c7d1a1776057c9a1f48ed1250d85e94d4850d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 22 Apr 2021 17:41:17 +0200 Subject: landlock: Support filesystem access-control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using Landlock objects and ruleset, it is possible to tag inodes according to a process's domain. To enable an unprivileged process to express a file hierarchy, it first needs to open a directory (or a file) and pass this file descriptor to the kernel through landlock_add_rule(2). When checking if a file access request is allowed, we walk from the requested dentry to the real root, following the different mount layers. The access to each "tagged" inodes are collected according to their rule layer level, and ANDed to create access to the requested file hierarchy. This makes possible to identify a lot of files without tagging every inodes nor modifying the filesystem, while still following the view and understanding the user has from the filesystem. Add a new ARCH_EPHEMERAL_INODES for UML because it currently does not keep the same struct inodes for the same inodes whereas these inodes are in use. This commit adds a minimal set of supported filesystem access-control which doesn't enable to restrict all file-related actions. This is the result of multiple discussions to minimize the code of Landlock to ease review. Thanks to the Landlock design, extending this access-control without breaking user space will not be a problem. Moreover, seccomp filters can be used to restrict the use of syscall families which may not be currently handled by Landlock. Cc: Al Viro Cc: Anton Ivanov Cc: James Morris Cc: Jann Horn Cc: Jeff Dike Cc: Kees Cook Cc: Richard Weinberger Cc: Serge E. Hallyn Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20210422154123.13086-8-mic@digikod.net Signed-off-by: James Morris --- include/uapi/linux/landlock.h | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 include/uapi/linux/landlock.h (limited to 'include') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h new file mode 100644 index 000000000000..b1a81b5a8b86 --- /dev/null +++ b/include/uapi/linux/landlock.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Landlock - User space API + * + * Copyright © 2017-2020 Mickaël Salaün + * Copyright © 2018-2020 ANSSI + */ + +#ifndef _UAPI_LINUX_LANDLOCK_H +#define _UAPI_LINUX_LANDLOCK_H + +/** + * DOC: fs_access + * + * A set of actions on kernel objects may be defined by an attribute (e.g. + * &struct landlock_path_beneath_attr) including a bitmask of access. + * + * Filesystem flags + * ~~~~~~~~~~~~~~~~ + * + * These flags enable to restrict a sandboxed process to a set of actions on + * files and directories. Files or directories opened before the sandboxing + * are not subject to these restrictions. + * + * A file can only receive these access rights: + * + * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file. + * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. + * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access. + * + * A directory can receive access rights related to files or directories. The + * following access right is applied to the directory itself, and the + * directories beneath it: + * + * - %LANDLOCK_ACCESS_FS_READ_DIR: Open a directory or list its content. + * + * However, the following access rights only apply to the content of a + * directory, not the directory itself: + * + * - %LANDLOCK_ACCESS_FS_REMOVE_DIR: Remove an empty directory or rename one. + * - %LANDLOCK_ACCESS_FS_REMOVE_FILE: Unlink (or rename) a file. + * - %LANDLOCK_ACCESS_FS_MAKE_CHAR: Create (or rename or link) a character + * device. + * - %LANDLOCK_ACCESS_FS_MAKE_DIR: Create (or rename) a directory. + * - %LANDLOCK_ACCESS_FS_MAKE_REG: Create (or rename or link) a regular file. + * - %LANDLOCK_ACCESS_FS_MAKE_SOCK: Create (or rename or link) a UNIX domain + * socket. + * - %LANDLOCK_ACCESS_FS_MAKE_FIFO: Create (or rename or link) a named pipe. + * - %LANDLOCK_ACCESS_FS_MAKE_BLOCK: Create (or rename or link) a block device. + * - %LANDLOCK_ACCESS_FS_MAKE_SYM: Create (or rename or link) a symbolic link. + * + * .. warning:: + * + * It is currently not possible to restrict some file-related actions + * accessible through these syscall families: :manpage:`chdir(2)`, + * :manpage:`truncate(2)`, :manpage:`stat(2)`, :manpage:`flock(2)`, + * :manpage:`chmod(2)`, :manpage:`chown(2)`, :manpage:`setxattr(2)`, + * :manpage:`utime(2)`, :manpage:`ioctl(2)`, :manpage:`fcntl(2)`, + * :manpage:`access(2)`. + * Future Landlock evolutions will enable to restrict them. + */ +#define LANDLOCK_ACCESS_FS_EXECUTE (1ULL << 0) +#define LANDLOCK_ACCESS_FS_WRITE_FILE (1ULL << 1) +#define LANDLOCK_ACCESS_FS_READ_FILE (1ULL << 2) +#define LANDLOCK_ACCESS_FS_READ_DIR (1ULL << 3) +#define LANDLOCK_ACCESS_FS_REMOVE_DIR (1ULL << 4) +#define LANDLOCK_ACCESS_FS_REMOVE_FILE (1ULL << 5) +#define LANDLOCK_ACCESS_FS_MAKE_CHAR (1ULL << 6) +#define LANDLOCK_ACCESS_FS_MAKE_DIR (1ULL << 7) +#define LANDLOCK_ACCESS_FS_MAKE_REG (1ULL << 8) +#define LANDLOCK_ACCESS_FS_MAKE_SOCK (1ULL << 9) +#define LANDLOCK_ACCESS_FS_MAKE_FIFO (1ULL << 10) +#define LANDLOCK_ACCESS_FS_MAKE_BLOCK (1ULL << 11) +#define LANDLOCK_ACCESS_FS_MAKE_SYM (1ULL << 12) + +#endif /* _UAPI_LINUX_LANDLOCK_H */ -- cgit v1.2.3 From 83e804f0bfee2247b1c0aa64845c81a38562da7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 22 Apr 2021 17:41:16 +0200 Subject: fs,security: Add sb_delete hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sb_delete security hook is called when shutting down a superblock, which may be useful to release kernel objects tied to the superblock's lifetime (e.g. inodes). This new hook is needed by Landlock to release (ephemerally) tagged struct inodes. This comes from the unprivileged nature of Landlock described in the next commit. Cc: Al Viro Cc: James Morris Signed-off-by: Mickaël Salaün Reviewed-by: Jann Horn Acked-by: Serge Hallyn Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210422154123.13086-7-mic@digikod.net Signed-off-by: James Morris --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 3 +++ include/linux/security.h | 4 ++++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 477a597db013..e8adadbf9581 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -59,6 +59,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc, struct fs_parameter *param) LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb) +LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts) LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 75715998a95f..cc2eaaaec0e4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -108,6 +108,9 @@ * allocated. * @sb contains the super_block structure to be modified. * Return 0 if operation was successful. + * @sb_delete: + * Release objects tied to a superblock (e.g. inodes). + * @sb contains the super_block structure being released. * @sb_free_security: * Deallocate and clear the sb->s_security field. * @sb contains the super_block structure to be modified. diff --git a/include/linux/security.h b/include/linux/security.h index 8aeebd6646dc..90298baa4551 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -291,6 +291,7 @@ void security_bprm_committed_creds(struct linux_binprm *bprm); int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc); int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param); int security_sb_alloc(struct super_block *sb); +void security_sb_delete(struct super_block *sb); void security_sb_free(struct super_block *sb); void security_free_mnt_opts(void **mnt_opts); int security_sb_eat_lsm_opts(char *options, void **mnt_opts); @@ -631,6 +632,9 @@ static inline int security_sb_alloc(struct super_block *sb) return 0; } +static inline void security_sb_delete(struct super_block *sb) +{ } + static inline void security_sb_free(struct super_block *sb) { } -- cgit v1.2.3 From a49f4f81cb48925e8d7cbd9e59068f516e984144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 22 Apr 2021 17:41:19 +0200 Subject: arch: Wire up Landlock syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire up the following system calls for all architectures: * landlock_create_ruleset(2) * landlock_add_rule(2) * landlock_restrict_self(2) Cc: Arnd Bergmann Cc: James Morris Cc: Jann Horn Cc: Kees Cook Cc: Serge E. Hallyn Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20210422154123.13086-10-mic@digikod.net Signed-off-by: James Morris --- include/uapi/asm-generic/unistd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index ce58cff99b66..300608b05226 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -863,9 +863,15 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) +#define __NR_landlock_create_ruleset 444 +__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) +#define __NR_landlock_add_rule 445 +__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) +#define __NR_landlock_restrict_self 446 +__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 447 /* * 32 bit systems traditionally used different -- cgit v1.2.3 From 265885daf3e5082eb9f6e2a23bdbf9ba4456a21b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 22 Apr 2021 17:41:18 +0200 Subject: landlock: Add syscall implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These 3 system calls are designed to be used by unprivileged processes to sandbox themselves: * landlock_create_ruleset(2): Creates a ruleset and returns its file descriptor. * landlock_add_rule(2): Adds a rule (e.g. file hierarchy access) to a ruleset, identified by the dedicated file descriptor. * landlock_restrict_self(2): Enforces a ruleset on the calling thread and its future children (similar to seccomp). This syscall has the same usage restrictions as seccomp(2): the caller must have the no_new_privs attribute set or have CAP_SYS_ADMIN in the current user namespace. All these syscalls have a "flags" argument (not currently used) to enable extensibility. Here are the motivations for these new syscalls: * A sandboxed process may not have access to file systems, including /dev, /sys or /proc, but it should still be able to add more restrictions to itself. * Neither prctl(2) nor seccomp(2) (which was used in a previous version) fit well with the current definition of a Landlock security policy. All passed structs (attributes) are checked at build time to ensure that they don't contain holes and that they are aligned the same way for each architecture. See the user and kernel documentation for more details (provided by a following commit): * Documentation/userspace-api/landlock.rst * Documentation/security/landlock.rst Cc: Arnd Bergmann Cc: James Morris Cc: Jann Horn Cc: Kees Cook Signed-off-by: Mickaël Salaün Acked-by: Serge Hallyn Link: https://lore.kernel.org/r/20210422154123.13086-9-mic@digikod.net Signed-off-by: James Morris --- include/linux/syscalls.h | 7 ++++++ include/uapi/linux/landlock.h | 53 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2839dc9a7c01..fa3971012e1c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,8 @@ struct io_uring_params; struct clone_args; struct open_how; struct mount_attr; +struct landlock_ruleset_attr; +enum landlock_rule_type; #include #include @@ -1041,6 +1043,11 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags); +asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr __user *attr, + size_t size, __u32 flags); +asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, + const void __user *rule_attr, __u32 flags); +asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); /* * Architecture-specific system calls diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index b1a81b5a8b86..ba946a1e40b2 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -9,6 +9,59 @@ #ifndef _UAPI_LINUX_LANDLOCK_H #define _UAPI_LINUX_LANDLOCK_H +#include + +/** + * struct landlock_ruleset_attr - Ruleset definition + * + * Argument of sys_landlock_create_ruleset(). This structure can grow in + * future versions. + */ +struct landlock_ruleset_attr { + /** + * @handled_access_fs: Bitmask of actions (cf. `Filesystem flags`_) + * that is handled by this ruleset and should then be forbidden if no + * rule explicitly allow them. This is needed for backward + * compatibility reasons. + */ + __u64 handled_access_fs; +}; + +/** + * enum landlock_rule_type - Landlock rule type + * + * Argument of sys_landlock_add_rule(). + */ +enum landlock_rule_type { + /** + * @LANDLOCK_RULE_PATH_BENEATH: Type of a &struct + * landlock_path_beneath_attr . + */ + LANDLOCK_RULE_PATH_BENEATH = 1, +}; + +/** + * struct landlock_path_beneath_attr - Path hierarchy definition + * + * Argument of sys_landlock_add_rule(). + */ +struct landlock_path_beneath_attr { + /** + * @allowed_access: Bitmask of allowed actions for this file hierarchy + * (cf. `Filesystem flags`_). + */ + __u64 allowed_access; + /** + * @parent_fd: File descriptor, open with ``O_PATH``, which identifies + * the parent directory of a file hierarchy, or just a file. + */ + __s32 parent_fd; + /* + * This struct is packed to avoid trailing reserved members. + * Cf. security/landlock/syscalls.c:build_check_abi() + */ +} __attribute__((packed)); + /** * DOC: fs_access * -- cgit v1.2.3 From 3532b0b4352ce79400b0aa68414f1a0fc422b920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 22 Apr 2021 17:41:23 +0200 Subject: landlock: Enable user space to infer supported features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new flag LANDLOCK_CREATE_RULESET_VERSION to landlock_create_ruleset(2). This enables to retreive a Landlock ABI version that is useful to efficiently follow a best-effort security approach. Indeed, it would be a missed opportunity to abort the whole sandbox building, because some features are unavailable, instead of protecting users as much as possible with the subset of features provided by the running kernel. This new flag enables user space to identify the minimum set of Landlock features supported by the running kernel without relying on a filesystem interface (e.g. /proc/version, which might be inaccessible) nor testing multiple syscall argument combinations (i.e. syscall bisection). New Landlock features will be documented and tied to a minimum version number (greater than 1). The current version will be incremented for each new kernel release supporting new Landlock features. User space libraries can leverage this information to seamlessly restrict processes as much as possible while being compatible with newer APIs. This is a much more lighter approach than the previous landlock_get_features(2): the complexity is pushed to user space libraries. This flag meets similar needs as securityfs versions: selinux/policyvers, apparmor/features/*/version* and tomoyo/version. Supporting this flag now will be convenient for backward compatibility. Cc: Arnd Bergmann Cc: James Morris Cc: Jann Horn Cc: Kees Cook Cc: Serge E. Hallyn Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20210422154123.13086-14-mic@digikod.net Signed-off-by: James Morris --- include/uapi/linux/landlock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index ba946a1e40b2..b3d952067f59 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -27,6 +27,14 @@ struct landlock_ruleset_attr { __u64 handled_access_fs; }; +/* + * sys_landlock_create_ruleset() flags: + * + * - %LANDLOCK_CREATE_RULESET_VERSION: Get the highest supported Landlock ABI + * version. + */ +#define LANDLOCK_CREATE_RULESET_VERSION (1U << 0) + /** * enum landlock_rule_type - Landlock rule type * -- cgit v1.2.3 From 81dd4d4d6178306ab31db91bdc7353d485bdafce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 24 Apr 2021 10:04:15 -0500 Subject: dmaengine: idxd: Add IDXD performance monitor support Implement the IDXD performance monitor capability (named 'perfmon' in the DSA (Data Streaming Accelerator) spec [1]), which supports the collection of information about key events occurring during DSA and IAX (Intel Analytics Accelerator) device execution, to assist in performance tuning and debugging. The idxd perfmon support is implemented as part of the IDXD driver and interfaces with the Linux perf framework. It has several features in common with the existing uncore pmu support: - it does not support sampling - does not support per-thread counting However it also has some unique features not present in the core and uncore support: - all general-purpose counters are identical, thus no event constraints - operation is always system-wide While the core perf subsystem assumes that all counters are by default per-cpu, the uncore pmus are socket-scoped and use a cpu mask to restrict counting to one cpu from each socket. IDXD counters use a similar strategy but expand the scope even further; since IDXD counters are system-wide and can be read from any cpu, the IDXD perf driver picks a single cpu to do the work (with cpu hotplug notifiers to choose a different cpu if the chosen one is taken off-line). More specifically, the perf userspace tool by default opens a counter for each cpu for an event. However, if it finds a cpumask file associated with the pmu under sysfs, as is the case with the uncore pmus, it will open counters only on the cpus specified by the cpumask. Since perfmon only needs to open a single counter per event for a given IDXD device, the perfmon driver will create a sysfs cpumask file for the device and insert the first cpu of the system into it. When a user uses perf to open an event, perf will open a single counter on the cpu specified by the cpu mask. This amounts to the default system-wide rather than per-cpu counting mentioned previously for perfmon pmu events. In order to keep the cpu mask up-to-date, the driver implements cpu hotplug support for multiple devices, as IDXD usually enumerates and registers more than one idxd device. The perfmon driver implements basic perfmon hardware capability discovery and configuration, and is initialized by the IDXD driver's probe function. During initialization, the driver retrieves the total number of supported performance counters, the pmu ID, and the device type from idxd device, and registers itself under the Linux perf framework. The perf userspace tool can be used to monitor single or multiple events depending on the given configuration, as well as event groups, which are also supported by the perfmon driver. The user configures events using the perf tool command-line interface by specifying the event and corresponding event category, along with an optional set of filters that can be used to restrict counting to specific work queues, traffic classes, page and transfer sizes, and engines (See [1] for specifics). With the configuration specified by the user, the perf tool issues a system call passing that information to the kernel, which uses it to initialize the specified event(s). The event(s) are opened and started, and following termination of the perf command, they're stopped. At that point, the perfmon driver will read the latest count for the event(s), calculate the difference between the latest counter values and previously tracked counter values, and display the final incremental count as the event count for the cycle. An overflow handler registered on the IDXD irq path is used to account for counter overflows, which are signaled by an overflow interrupt. Below are a couple of examples of perf usage for monitoring DSA events. The following monitors all events in the 'engine' category. Becuuse no filters are specified, this captures all engine events for the workload, which in this case is 19 iterations of the work generated by the kernel dmatest module. Details describing the events can be found in Appendix D of [1], Performance Monitoring Events, but briefly they are: event 0x1: total input data processed, in 32-byte units event 0x2: total data written, in 32-byte units event 0x4: number of work descriptors that read the source event 0x8: number of work descriptors that write the destination event 0x10: number of work descriptors dispatched from batch descriptors event 0x20: number of work descriptors dispatched from work queues # perf stat -e dsa0/event=0x1,event_category=0x1/, dsa0/event=0x2,event_category=0x1/, dsa0/event=0x4,event_category=0x1/, dsa0/event=0x8,event_category=0x1/, dsa0/event=0x10,event_category=0x1/, dsa0/event=0x20,event_category=0x1/ modprobe dmatest channel=dma0chan0 timeout=2000 iterations=19 run=1 wait=1 Performance counter stats for 'system wide': 5,332 dsa0/event=0x1,event_category=0x1/ 5,327 dsa0/event=0x2,event_category=0x1/ 19 dsa0/event=0x4,event_category=0x1/ 19 dsa0/event=0x8,event_category=0x1/ 0 dsa0/event=0x10,event_category=0x1/ 19 dsa0/event=0x20,event_category=0x1/ 21.977436186 seconds time elapsed The command below illustrates filter usage with a simple example. It specifies that MEM_MOVE operations should be counted for the DSA device dsa0 (event 0x8 corresponds to the EV_MEM_MOVE event - Number of Memory Move Descriptors, which is part of event category 0x3 - Operations. The detailed category and event IDs are available in Appendix D, Performance Monitoring Events, of [1]). In addition to the event and event category, a number of filters are also specified (the detailed filter values are available in Chapter 6.4 (Filter Support) of [1]), which will restrict counting to only those events that meet all of the filter criteria. In this case, the filters specify that only MEM_MOVE operations that are serviced by work queue wq0 and specifically engine number engine0 and traffic class tc0 having sizes between 0 and 4k and page size of between 0 and 1G result in a counter hit; anything else will be filtered out and not appear in the final count. Note that filters are optional - any filter not specified is assumed to be all ones and will pass anything. # perf stat -e dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7, filter_eng=0x1,event=0x8,event_category=0x3/ modprobe dmatest channel=dma0chan0 timeout=2000 iterations=19 run=1 wait=1 Performance counter stats for 'system wide': 19 dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7, filter_eng=0x1,event=0x8,event_category=0x3/ 21.865914091 seconds time elapsed The output above reflects that the unspecified workload resulted in the counting of 19 MEM_MOVE operation events that met the filter criteria. [1]: https://software.intel.com/content/www/us/en/develop/download/intel-data-streaming-accelerator-preliminary-architecture-specification.html [ Based on work originally by Jing Lin. ] Reviewed-by: Dave Jiang Reviewed-by: Kan Liang Signed-off-by: Tom Zanussi Link: https://lore.kernel.org/r/0c5080a7d541904c4ad42b848c76a1ce056ddac7.1619276133.git.zanussi@kernel.org Signed-off-by: Vinod Koul --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f14adb882338..264d911424c0 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -167,6 +167,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, + CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_CFD_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, -- cgit v1.2.3 From e4b52ca01315ad53df41877708428c1c41c1444d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:12 -0400 Subject: xprtrdma: Do not recycle MR after FastReg/LocalInv flushes Better not to touch MRs involved in a flush or post error until the Send and Receive Queues are drained and the transport is fully quiescent. Simply don't insert such MRs back onto the free list. They remain on mr_all and will be released when the connection is torn down. I had thought that recycling would prevent hardware resources from being tied up for a long time. However, since v5.7, a transport disconnect destroys the QP and other hardware-owned resources. The MRs get cleaned up nicely at that point. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c838e7ac1c2d..e38e745d13b0 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1014,7 +1014,6 @@ DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); DEFINE_ANON_MR_EVENT(unmap); -DEFINE_ANON_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, TP_PROTO( -- cgit v1.2.3 From 4ddd0fc32c94fbb77a8c0728dc507b2bdcc67edc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:31 -0400 Subject: xprtrdma: Add tracepoints showing FastReg WRs and remote invalidation The Send signaling logic is a little subtle, so add some observability around it. For every xprtrdma_mr_fastreg event, there should be an xprtrdma_mr_localinv or xprtrdma_mr_reminv event. When these tracepoints are enabled, we can see exactly when an MR is DMA-mapped, registered, invalidated (either locally or remotely) and then DMA-unmapped. kworker/u25:2-190 [000] 787.979512: xprtrdma_mr_map: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/u25:2-190 [000] 787.979515: xprtrdma_chunk_read: task:351@5 pos=148 5608@0x8679e0c8f6f56000:0x00000503 (last) kworker/u25:2-190 [000] 787.979519: xprtrdma_marshal: task:351@5 xid=0x8679e0c8: hdr=52 xdr=148/5608/0 read list/inline kworker/u25:2-190 [000] 787.979525: xprtrdma_mr_fastreg: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/u25:2-190 [000] 787.979526: xprtrdma_post_send: task:351@5 cq.id=0 cid=73 (2 SGEs) ... kworker/5:1H-219 [005] 787.980567: xprtrdma_wc_receive: cq.id=1 cid=161 status=SUCCESS (0/0x0) received=164 kworker/5:1H-219 [005] 787.980571: xprtrdma_post_recvs: peer=[192.168.100.55]:20049 r_xprt=0xffff8884974d4000: 0 new recvs, 70 active (rc 0) kworker/5:1H-219 [005] 787.980573: xprtrdma_reply: task:351@5 xid=0x8679e0c8 credits=64 kworker/5:1H-219 [005] 787.980576: xprtrdma_mr_reminv: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/5:1H-219 [005] 787.980577: xprtrdma_mr_unmap: mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) Note that I've moved the xprtrdma_post_send tracepoint so that event always appears after the xprtrdma_mr_fastreg tracepoint. Otherwise the event log looks counterintuitive (FastReg is always supposed to happen before Send). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e38e745d13b0..9462326b3535 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1010,7 +1010,9 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ) ); +DEFINE_MR_EVENT(fastreg); DEFINE_MR_EVENT(localinv); +DEFINE_MR_EVENT(reminv); DEFINE_MR_EVENT(map); DEFINE_ANON_MR_EVENT(unmap); -- cgit v1.2.3 From 6b147ea7f442e1fb31dfa25e25b7a8ca3fb817f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:38 -0400 Subject: xprtrdma: Add an rpcrdma_mr_completion_class I found it confusing that the MR_EVENT class displays the mr.id but the associated COMPLETION_EVENT class displays a cid (that happens to contain the mr.id!). To make it a little easier on humans who have to read and interpret these events, create an MR_COMPLETION class that displays the mr.id in the same way as the MR_EVENT class. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 48 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 9462326b3535..3e6e4c69b533 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -60,6 +60,46 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) +DECLARE_EVENT_CLASS(rpcrdma_mr_completion_class, + TP_PROTO( + const struct ib_wc *wc, + const struct rpc_rdma_cid *cid + ), + + TP_ARGS(wc, cid), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(unsigned long, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->status = wc->status; + if (wc->status) + __entry->vendor_err = wc->vendor_err; + else + __entry->vendor_err = 0; + ), + + TP_printk("cq.id=%u mr.id=%d status=%s (%lu/0x%x)", + __entry->cq_id, __entry->completion_id, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +#define DEFINE_MR_COMPLETION_EVENT(name) \ + DEFINE_EVENT(rpcrdma_mr_completion_class, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpc_rdma_cid *cid \ + ), \ + TP_ARGS(wc, cid)) + DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class, TP_PROTO( const struct ib_wc *wc, @@ -886,10 +926,10 @@ TRACE_EVENT(xprtrdma_post_linv_err, DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive); DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_li); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_fastreg); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_wake); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_done); TRACE_EVENT(xprtrdma_frwr_alloc, TP_PROTO( -- cgit v1.2.3 From 83189d15115467061295c0b75334b39fc64c6142 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:44 -0400 Subject: xprtrdma: Don't display r_xprt memory addresses in tracepoints The remote peer's IP address is sufficient, and does not expose details of the kernel's memory layout. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 51 ++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 3e6e4c69b533..e38b8e33be2d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -190,19 +190,17 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt, TP_ARGS(r_xprt), TP_STRUCT__entry( - __field(const void *, r_xprt) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p", - __get_str(addr), __get_str(port), __entry->r_xprt + TP_printk("peer=[%s]:%s", + __get_str(addr), __get_str(port) ) ); @@ -222,7 +220,6 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_ARGS(r_xprt, rc), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(int, rc) __field(int, connect_status) __string(addr, rpcrdma_addrstr(r_xprt)) @@ -230,15 +227,14 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->rc = rc; __entry->connect_status = r_xprt->rx_ep->re_connect_status; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("peer=[%s]:%s rc=%d connection status=%d", + __get_str(addr), __get_str(port), __entry->rc, __entry->connect_status ) ); @@ -535,22 +531,19 @@ TRACE_EVENT(xprtrdma_op_connect, TP_ARGS(r_xprt, delay), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned long, delay) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->delay = delay; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p delay=%lu", - __get_str(addr), __get_str(port), __entry->r_xprt, - __entry->delay + TP_printk("peer=[%s]:%s delay=%lu", + __get_str(addr), __get_str(port), __entry->delay ) ); @@ -565,7 +558,6 @@ TRACE_EVENT(xprtrdma_op_set_cto, TP_ARGS(r_xprt, connect, reconnect), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned long, connect) __field(unsigned long, reconnect) __string(addr, rpcrdma_addrstr(r_xprt)) @@ -573,15 +565,14 @@ TRACE_EVENT(xprtrdma_op_set_cto, ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->connect = connect; __entry->reconnect = reconnect; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("peer=[%s]:%s connect=%lu reconnect=%lu", + __get_str(addr), __get_str(port), __entry->connect / HZ, __entry->reconnect / HZ ) ); @@ -631,22 +622,19 @@ TRACE_EVENT(xprtrdma_createmrs, TP_ARGS(r_xprt, count), TP_STRUCT__entry( - __field(const void *, r_xprt) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) __field(unsigned int, count) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->count = count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs", - __get_str(addr), __get_str(port), __entry->r_xprt, - __entry->count + TP_printk("peer=[%s]:%s created %u MRs", + __get_str(addr), __get_str(port), __entry->count ) ); @@ -869,7 +857,7 @@ TRACE_EVENT(xprtrdma_post_recvs, TP_ARGS(r_xprt, count, status), TP_STRUCT__entry( - __field(const void *, r_xprt) + __field(u32, cq_id) __field(unsigned int, count) __field(int, status) __field(int, posted) @@ -878,16 +866,18 @@ TRACE_EVENT(xprtrdma_post_recvs, ), TP_fast_assign( - __entry->r_xprt = r_xprt; + const struct rpcrdma_ep *ep = r_xprt->rx_ep; + + __entry->cq_id = ep->re_attr.recv_cq->res.id; __entry->count = count; __entry->status = status; - __entry->posted = r_xprt->rx_ep->re_receive_count; + __entry->posted = ep->re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: %u new recvs, %d active (rc %d)", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active (rc %d)", + __get_str(addr), __get_str(port), __entry->cq_id, __entry->count, __entry->posted, __entry->status ) ); @@ -1289,22 +1279,19 @@ TRACE_EVENT(xprtrdma_cb_setup, TP_ARGS(r_xprt, reqs), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned int, reqs) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->reqs = reqs; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs", - __get_str(addr), __get_str(port), - __entry->r_xprt, __entry->reqs + TP_printk("peer=[%s]:%s %u reqs", + __get_str(addr), __get_str(port), __entry->reqs ) ); -- cgit v1.2.3 From e1648eb23d839bd4b9f2999296d5e81dcd93311f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:50 -0400 Subject: xprtrdma: Remove the RPC/RDMA QP event handler Clean up: The handler only recorded a trace event. If indeed no action is needed by the RPC/RDMA consumer, then the event can be ignored. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e38b8e33be2d..ef6166b840e7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -577,38 +577,6 @@ TRACE_EVENT(xprtrdma_op_set_cto, ) ); -TRACE_EVENT(xprtrdma_qp_event, - TP_PROTO( - const struct rpcrdma_ep *ep, - const struct ib_event *event - ), - - TP_ARGS(ep, event), - - TP_STRUCT__entry( - __field(unsigned long, event) - __string(name, event->device->name) - __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) - __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) - ), - - TP_fast_assign( - const struct rdma_cm_id *id = ep->re_id; - - __entry->event = event->event; - __assign_str(name, event->device->name); - memcpy(__entry->srcaddr, &id->route.addr.src_addr, - sizeof(struct sockaddr_in6)); - memcpy(__entry->dstaddr, &id->route.addr.dst_addr, - sizeof(struct sockaddr_in6)); - ), - - TP_printk("%pISpc -> %pISpc device=%s %s (%lu)", - __entry->srcaddr, __entry->dstaddr, __get_str(name), - rdma_show_ib_event(__entry->event), __entry->event - ) -); - /** ** Call events **/ -- cgit v1.2.3 From 13bcf7e32a0181095cd62010579869e87aacb332 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:21 -0400 Subject: xprtrdma: Move fr_mr field to struct rpcrdma_mr Clean up: The last remaining field in struct rpcrdma_frwr has been removed, so the struct can be eliminated. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ef6166b840e7..bd55908c1bef 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -379,7 +379,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -420,7 +420,7 @@ DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -903,7 +903,7 @@ TRACE_EVENT(xprtrdma_frwr_alloc, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->rc = rc; ), @@ -931,7 +931,7 @@ TRACE_EVENT(xprtrdma_frwr_dereg, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -964,7 +964,7 @@ TRACE_EVENT(xprtrdma_frwr_sgerr, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->nents = sg_nents; @@ -994,7 +994,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->num_mapped = num_mapped; -- cgit v1.2.3 From d83e682e301071313e390e2f5ba2f6ca2ebc1848 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Mon, 19 Apr 2021 03:55:35 +0300 Subject: RISC-V: Add EM_RISCV to kexec UAPI header Add RISC-V to the list of supported kexec architectures, we need to add the definition early-on so that later patches can use it. EM_RISCV is 243 as per ELF psABI specification here: https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md Signed-off-by: Nick Kossifidis Signed-off-by: Palmer Dabbelt --- include/uapi/linux/kexec.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 05669c87a0af..778dc191c265 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -42,6 +42,7 @@ #define KEXEC_ARCH_MIPS_LE (10 << 16) #define KEXEC_ARCH_MIPS ( 8 << 16) #define KEXEC_ARCH_AARCH64 (183 << 16) +#define KEXEC_ARCH_RISCV (243 << 16) /* The artificial cap on the number of segments passed to kexec_load. */ #define KEXEC_SEGMENT_MAX 16 -- cgit v1.2.3 From 6cc9e215eb277513719c32b9ba40e5012b02db57 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 18 Apr 2021 15:10:25 +0300 Subject: RDMA/nldev: Add copy-on-fork attribute to get sys command The new attribute indicates that the kernel copies DMA pages on fork, hence libibverbs' fork support through madvise and MADV_DONTFORK is not needed. The introduced attribute is always reported as supported since the kernel has the patch that added the copy-on-fork behavior. This allows the userspace library to identify older vs newer kernel versions. Extra care should be taken when backporting this patch as it relies on the fact that the copy-on-fork patch is merged, hence no check for support is added. Don't backport this patch unless you also have the following series: commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") and commit 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm"). Fixes: 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") Fixes: 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") Link: https://lore.kernel.org/r/20210418121025.66849-1-galpress@amazon.com Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 2c8d405ec924..75a1ae2311d8 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -546,6 +546,9 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_MIN_RANGE, /* u32 */ RDMA_NLDEV_ATTR_MAX_RANGE, /* u32 */ + + RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, /* u8 */ + /* * Always the end */ -- cgit v1.2.3 From 527139d738d7f2e9f929c752eebf3cbf0f74c754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 16 Apr 2021 20:58:38 +0000 Subject: PCI/sysfs: Convert "rom" to static attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "rom" sysfs attribute allows access to the PCI Option ROM. Previously it was dynamically created either by pci_bus_add_device() or the pci_sysfs_init() initcall, but since it doesn't need to be created or removed dynamically, we can use a static attribute so the device model takes care of addition and removal automatically. Convert "rom" to a static attribute and use the .is_bin_visible() callback to set the correct object size based on the ROM size. Remove "rom_attr" from the struct pci_dev since it is no longer needed. This attribute was added in the pre-git era by https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/drivers/pci/pci-sysfs.c?id=f6d553444da2 [bhelgaas: commit log] Suggested-by: Oliver O'Halloran Link: https://lore.kernel.org/r/20210416205856.3234481-3-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..45f1fef80b50 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -458,7 +458,6 @@ struct pci_dev { u32 saved_config_space[16]; /* Config space saved at suspend time */ struct hlist_head saved_cap_space; - struct bin_attribute *rom_attr; /* Attribute descriptor for sysfs ROM entry */ int rom_attr_enabled; /* Display of ROM attribute enabled? */ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ -- cgit v1.2.3 From 94604548aa7163fa14b837149bb0cb708bc613bc Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Tue, 27 Apr 2021 17:44:04 +0200 Subject: seg6: add counters support for SRv6 Behaviors This patch provides counters for SRv6 Behaviors as defined in [1], section 6. For each SRv6 Behavior instance, counters defined in [1] are: - the total number of packets that have been correctly processed; - the total amount of traffic in bytes of all packets that have been correctly processed; In addition, this patch introduces a new counter that counts the number of packets that have NOT been properly processed (i.e. errors) by an SRv6 Behavior instance. Counters are not only interesting for network monitoring purposes (i.e. counting the number of packets processed by a given behavior) but they also provide a simple tool for checking whether a behavior instance is working as we expect or not. Counters can be useful for troubleshooting misconfigured SRv6 networks. Indeed, an SRv6 Behavior can silently drop packets for very different reasons (i.e. wrong SID configuration, interfaces set with SID addresses, etc) without any notification/message to the user. Due to the nature of SRv6 networks, diagnostic tools such as ping and traceroute may be ineffective: paths used for reaching a given router can be totally different from the ones followed by probe packets. In addition, paths are often asymmetrical and this makes it even more difficult to keep up with the journey of the packets and to understand which behaviors are actually processing our traffic. When counters are enabled on an SRv6 Behavior instance, it is possible to verify if packets are actually processed by such behavior and what is the outcome of the processing. Therefore, the counters for SRv6 Behaviors offer an non-invasive observability point which can be leveraged for both traffic monitoring and troubleshooting purposes. [1] https://www.rfc-editor.org/rfc/rfc8986.html#name-counters Troubleshooting using SRv6 Behavior counters -------------------------------------------- Let's make a brief example to see how helpful counters can be for SRv6 networks. Let's consider a node where an SRv6 End Behavior receives an SRv6 packet whose Segment Left (SL) is equal to 0. In this case, the End Behavior (which accepts only packets with SL >= 1) discards the packet and increases the error counter. This information can be leveraged by the network operator for troubleshooting. Indeed, the error counter is telling the user that the packet: (i) arrived at the node; (ii) the packet has been taken into account by the SRv6 End behavior; (iii) but an error has occurred during the processing. The error (iii) could be caused by different reasons, such as wrong route settings on the node or due to an invalid SID List carried by the SRv6 packet. Anyway, the error counter is used to exclude that the packet did not arrive at the node or it has not been processed by the behavior at all. Turning on/off counters for SRv6 Behaviors ------------------------------------------ Each SRv6 Behavior instance can be configured, at the time of its creation, to make use of counters. This is done through iproute2 which allows the user to create an SRv6 Behavior instance specifying the optional "count" attribute as shown in the following example: $ ip -6 route add 2001:db8::1 encap seg6local action End count dev eth0 per-behavior counters can be shown by adding "-s" to the iproute2 command line, i.e.: $ ip -s -6 route show 2001:db8::1 2001:db8::1 encap seg6local action End packets 0 bytes 0 errors 0 dev eth0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Impact of counters for SRv6 Behaviors on performance ==================================================== To determine the performance impact due to the introduction of counters in the SRv6 Behavior subsystem, we have carried out extensive tests. We chose to test the throughput achieved by the SRv6 End.DX2 Behavior because, among all the other behaviors implemented so far, it reaches the highest throughput which is around 1.5 Mpps (per core at 2.4 GHz on a Xeon(R) CPU E5-2630 v3) on kernel 5.12-rc2 using packets of size ~ 100 bytes. Three different tests were conducted in order to evaluate the overall throughput of the SRv6 End.DX2 Behavior in the following scenarios: 1) vanilla kernel (without the SRv6 Behavior counters patch) and a single instance of an SRv6 End.DX2 Behavior; 2) patched kernel with SRv6 Behavior counters and a single instance of an SRv6 End.DX2 Behavior with counters turned off; 3) patched kernel with SRv6 Behavior counters and a single instance of SRv6 End.DX2 Behavior with counters turned on. All tests were performed on a testbed deployed on the CloudLab facilities [2], a flexible infrastructure dedicated to scientific research on the future of Cloud Computing. Results of tests are shown in the following table: Scenario (1): average 1504764,81 pps (~1504,76 kpps); std. dev 3956,82 pps Scenario (2): average 1501469,78 pps (~1501,47 kpps); std. dev 2979,85 pps Scenario (3): average 1501315,13 pps (~1501,32 kpps); std. dev 2956,00 pps As can be observed, throughputs achieved in scenarios (2),(3) did not suffer any observable degradation compared to scenario (1). Thanks to Jakub Kicinski and David Ahern for their valuable suggestions and comments provided during the discussion of the proposed RFCs. [2] https://www.cloudlab.us Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/seg6_local.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 3b39ef1dbb46..5ae3ace84de0 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -27,6 +27,7 @@ enum { SEG6_LOCAL_OIF, SEG6_LOCAL_BPF, SEG6_LOCAL_VRFTABLE, + SEG6_LOCAL_COUNTERS, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -78,4 +79,33 @@ enum { #define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) +/* SRv6 Behavior counters are encoded as netlink attributes guaranteeing the + * correct alignment. + * Each counter is identified by a different attribute type (i.e. + * SEG6_LOCAL_CNT_PACKETS). + * + * - SEG6_LOCAL_CNT_PACKETS: identifies a counter that counts the number of + * packets that have been CORRECTLY processed by an SRv6 Behavior instance + * (i.e., packets that generate errors or are dropped are NOT counted). + * + * - SEG6_LOCAL_CNT_BYTES: identifies a counter that counts the total amount + * of traffic in bytes of all packets that have been CORRECTLY processed by + * an SRv6 Behavior instance (i.e., packets that generate errors or are + * dropped are NOT counted). + * + * - SEG6_LOCAL_CNT_ERRORS: identifies a counter that counts the number of + * packets that have NOT been properly processed by an SRv6 Behavior instance + * (i.e., packets that generate errors or are dropped). + */ +enum { + SEG6_LOCAL_CNT_UNSPEC, + SEG6_LOCAL_CNT_PAD, /* pad for 64 bits values */ + SEG6_LOCAL_CNT_PACKETS, + SEG6_LOCAL_CNT_BYTES, + SEG6_LOCAL_CNT_ERRORS, + __SEG6_LOCAL_CNT_MAX, +}; + +#define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1) + #endif -- cgit v1.2.3 From d991bb1c8da842a2a0b9dc83b1005e655783f861 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Thu, 29 Apr 2021 22:53:50 -0700 Subject: include/linux/compiler-gcc.h: sparse can do constant folding of __builtin_bswap*() Sparse can do constant folding of __builtin_bswap*() since 2017. Also, a much recent version of Sparse is needed anyway, see commit 6ec4476ac825 ("Raise gcc version requirement to 4.9"). So, remove the comment about sparse not being yet able to constant fold __builtin_bswap*() and remove the corresponding test of __CHECKER__. Link: https://lkml.kernel.org/r/20210226092236.99369-1-luc.vanoostenryck@gmail.com Signed-off-by: Luc Van Oostenryck Acked-by: Arnd Bergmann Acked-by: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 48750243db4c..5d97ef738a57 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -90,15 +90,11 @@ */ #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) -/* - * sparse (__CHECKER__) pretends to be gcc, but can't do constant - * folding in __builtin_bswap*() (yet), so don't set these for it. - */ -#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__) +#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ #define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */ +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ #if GCC_VERSION >= 70000 #define KASAN_ABI_VERSION 5 -- cgit v1.2.3 From 8e9b16c47680f6e7d6e5864a37f313f905a91cf5 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Thu, 29 Apr 2021 22:55:08 -0700 Subject: mm: page_owner: detect page_owner recursion via task_struct Before the change page_owner recursion was detected via fetching backtrace and inspecting it for current instruction pointer. It has a few problems: - it is slightly slow as it requires extra backtrace and a linear stack scan of the result - it is too late to check if backtrace fetching required memory allocation itself (ia64's unwinder requires it). To simplify recursion tracking let's use page_owner recursion flag in 'struct task_struct'. The change make page_owner=on work on ia64 by avoiding infinite recursion in: kmalloc() -> __set_page_owner() -> save_stack() -> unwind() [ia64-specific] -> build_script() -> kmalloc() -> __set_page_owner() [we short-circuit here] -> save_stack() -> unwind() [recursion] Link: https://lkml.kernel.org/r/20210402115342.1463781-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d7d07aa0facf..9c25c8e67030 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -841,6 +841,10 @@ struct task_struct { /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif +#ifdef CONFIG_PAGE_OWNER + /* Used by page_owner=on to detect recursion in page tracking. */ + unsigned in_page_owner:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ -- cgit v1.2.3 From 63135aa3866db99fd923b716c5ff2e468879624a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Apr 2021 22:55:18 -0700 Subject: mm: provide filemap_range_needs_writeback() helper Patch series "Improve IOCB_NOWAIT O_DIRECT reads", v3. An internal workload complained because it was using too much CPU, and when I took a look, we had a lot of io_uring workers going to town. For an async buffered read like workload, I am normally expecting _zero_ offloads to a worker thread, but this one had tons of them. I'd drop caches and things would look good again, but then a minute later we'd regress back to using workers. Turns out that every minute something was reading parts of the device, which would add page cache for that inode. I put patches like these in for our kernel, and the problem was solved. Don't -EAGAIN IOCB_NOWAIT dio reads just because we have page cache entries for the given range. This causes unnecessary work from the callers side, when the IO could have been issued totally fine without blocking on writeback when there is none. This patch (of 3): For O_DIRECT reads/writes, we check if we need to issue a call to filemap_write_and_wait_range() to issue and/or wait for writeback for any page in the given range. The existing mechanism just checks for a page in the range, which is suboptimal for IOCB_NOWAIT as we'll fallback to the slow path (and needing retry) if there's just a clean page cache page in the range. Provide filemap_range_needs_writeback() which tries a little harder to check if we actually need to issue and/or wait for writeback in the range. Link: https://lkml.kernel.org/r/20210224164455.1096727-1-axboe@kernel.dk Link: https://lkml.kernel.org/r/20210224164455.1096727-2-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index bf4e90d3ab18..12766edee81f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,6 +2878,8 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +extern bool filemap_range_needs_writeback(struct address_space *, + loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, -- cgit v1.2.3 From 1c824a680b1b67ad43c0908f11a70bcf37af56d5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:55:32 -0700 Subject: mm: page-writeback: simplify memcg handling in test_clear_page_writeback() Page writeback doesn't hold a page reference, which allows truncate to free a page the second PageWriteback is cleared. This used to require special attention in test_clear_page_writeback(), where we had to be careful not to rely on the unstable page->memcg binding and look up all the necessary information before clearing the writeback flag. Since commit 073861ed77b6 ("mm: fix VM_BUG_ON(PageTail) and BUG_ON(PageWriteback)") test_clear_page_writeback() is called with an explicit reference on the page, and this dance is no longer needed. Use unlock_page_memcg() and dec_lruvec_page_state() directly. This removes the last user of the lock_page_memcg() return value, change it to void. Touch up the comments in there as well. This also removes the last extern user of __unlock_page_memcg(), make it static. Further, it removes the last user of dec_lruvec_state(), delete it, along with a few other unused helpers. Link: https://lkml.kernel.org/r/YCQbYAWg4nvBFL6h@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++-------- include/linux/vmstat.h | 24 +++--------------------- 2 files changed, 5 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c04d39a7967..ae448d955a87 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -867,8 +867,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); extern bool cgroup_memory_noswap; #endif -struct mem_cgroup *lock_page_memcg(struct page *page); -void __unlock_page_memcg(struct mem_cgroup *memcg); +void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); /* @@ -1289,12 +1288,7 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline struct mem_cgroup *lock_page_memcg(struct page *page) -{ - return NULL; -} - -static inline void __unlock_page_memcg(struct mem_cgroup *memcg) +static inline void lock_page_memcg(struct page *page) { } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 506d625163a1..3299cd69e4ca 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -512,16 +512,10 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - __mod_lruvec_state(lruvec, idx, 1); -} - -static inline void __dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) { - __mod_lruvec_state(lruvec, idx, -1); + mod_lruvec_state(lruvec, idx, 1); } static inline void __inc_lruvec_page_state(struct page *page, @@ -536,18 +530,6 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - -static inline void dec_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, -1); -} - static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { -- cgit v1.2.3 From 842ca547f706b1e05ccf3026a0ab15d24772a188 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:55:35 -0700 Subject: mm: move page_mapping_file to pagemap.h page_mapping_file() is only used by some architectures, and then it is usually only used in one place. Make it a static inline function so other architectures don't have to carry this dead code. Link: https://lkml.kernel.org/r/20210317123011.350118-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - include/linux/pagemap.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21115933b9b8..2e5c207e702c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1629,7 +1629,6 @@ static inline pgoff_t page_index(struct page *page) bool page_mapped(struct page *page); struct address_space *page_mapping(struct page *page); -struct address_space *page_mapping_file(struct page *page); /* * Return true only if the page has been allocated with diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4686f9ab0636..469fa7ffcf96 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -157,6 +157,16 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) void release_pages(struct page **pages, int nr); +/* + * For file cache pages, return the address_space, otherwise return NULL + */ +static inline struct address_space *page_mapping_file(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return NULL; + return page_mapping(page); +} + /* * speculatively take a reference to a page. * If the page is free (_refcount == 0), then _refcount is untouched, and 0 -- cgit v1.2.3 From 458a4f788f8602e5701b3d8c2fb6b021310a7301 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 29 Apr 2021 22:55:50 -0700 Subject: mm/gup: add a range variant of unpin_user_pages_dirty_lock() Add an unpin_user_page_range_dirty_lock() API which takes a starting page and how many consecutive pages we want to unpin and optionally dirty. To that end, define another iterator for_each_compound_range() that operates in page ranges as opposed to page array. For users (like RDMA mr_dereg) where each sg represents a contiguous set of pages, we're able to more efficiently unpin pages without having to supply an array of pages much of what happens today with unpin_user_pages(). Link: https://lkml.kernel.org/r/20210212130843.13865-4-joao.m.martins@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Christoph Hellwig Cc: Doug Ledford Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e5c207e702c..702c2a7379d6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1265,6 +1265,8 @@ static inline void put_page(struct page *page) void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); /** -- cgit v1.2.3 From 4066c119483af8e86a75447fd35be1d2553d370f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 29 Apr 2021 22:55:56 -0700 Subject: mm: gup: remove FOLL_SPLIT Since commit 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") and commit ba925fa35057 ("s390/gmap: improve THP splitting") FOLL_SPLIT has not been used anymore. Remove the dead code. Link: https://lkml.kernel.org/r/20210330203900.9222-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 702c2a7379d6..64be3baf861a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2791,7 +2791,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ #define FOLL_POPULATE 0x40 /* fault in page */ -#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ -- cgit v1.2.3 From a3747b53b1771a787fea71d86a2fc39aea337685 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:14 -0700 Subject: mm: memcontrol: kill mem_cgroup_nodeinfo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to encapsulate a simple struct member access. Link: https://lkml.kernel.org/r/20210209163304.77088-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Michal Koutný Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae448d955a87..149dc2fd97a6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -602,12 +602,6 @@ void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -static struct mem_cgroup_per_node * -mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) -{ - return memcg->nodeinfo[nid]; -} - /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec @@ -631,7 +625,7 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, if (!memcg) memcg = root_mem_cgroup; - mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* -- cgit v1.2.3 From a18e6e6e150a98b9ce3e9acabeff407e7b6ba0c0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:17 -0700 Subject: mm: memcontrol: privatize memcg_page_state query functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users outside of the memory controller itself. The rest of the kernel cares either about node or lruvec stats. Link: https://lkml.kernel.org/r/20210209163304.77088-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Michal Koutný Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 149dc2fd97a6..9a02aabd0000 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -864,39 +864,6 @@ extern bool cgroup_memory_noswap; void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = atomic_long_read(&memcg->vmstats[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page_state(). - */ -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, - int idx) -{ - long x = 0; - int cpu; - - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->stat[idx], cpu); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1322,17 +1289,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - return 0; -} - -static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, - int idx) -{ - return 0; -} - static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) -- cgit v1.2.3 From 2d146aa3aa842d7f5065802556b4f9a2c6e8ef12 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:26 -0700 Subject: mm: memcontrol: switch to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the memory controller's custom hierarchical stats code with the generic rstat infrastructure provided by the cgroup core. The current implementation does batched upward propagation from the write side (i.e. as stats change). The per-cpu batches introduce an error, which is multiplied by the number of subgroups in a tree. In systems with many CPUs and sizable cgroup trees, the error can be large enough to confuse users (e.g. 32 batch pages * 32 CPUs * 32 subgroups results in an error of up to 128M per stat item). This can entirely swallow allocation bursts inside a workload that the user is expecting to see reflected in the statistics. In the past, we've done read-side aggregation, where a memory.stat read would have to walk the entire subtree and add up per-cpu counts. This became problematic with lazily-freed cgroups: we could have large subtrees where most cgroups were entirely idle. Hence the switch to change-driven upward propagation. Unfortunately, it needed to trade accuracy for speed due to the write side being so hot. Rstat combines the best of both worlds: from the write side, it cheaply maintains a queue of cgroups that have pending changes, so that the read side can do selective tree aggregation. This way the reported stats will always be precise and recent as can be, while the aggregation can skip over potentially large numbers of idle cgroups. The way rstat works is that it implements a tree for tracking cgroups with pending local changes, as well as a flush function that walks the tree upwards. The controller then drives this by 1) telling rstat when a local cgroup stat changes (e.g. mod_memcg_state) and 2) when a flush is required to get uptodate hierarchy stats for a given subtree (e.g. when memory.stat is read). The controller also provides a flush callback that is called during the rstat flush walk for each cgroup and aggregates its local per-cpu counters and propagates them upwards. This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT + NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward aggregation. It removes 3 words from the per-cpu data. It eliminates memcg_exact_page_state(), since memcg_page_state() is now exact. [akpm@linux-foundation.org: merge fix] [hannes@cmpxchg.org: fix a sleep in atomic section problem] Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutný Acked-by: Balbir Singh Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 67 +++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9a02aabd0000..74910ce9a3f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -76,10 +76,27 @@ enum mem_cgroup_events_target { }; struct memcg_vmstats_percpu { - long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; }; struct mem_cgroup_reclaim_iter { @@ -287,8 +304,8 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - atomic_long_t vmstats[MEMCG_NR_STAT]; - atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; + /* memory.stat */ + struct memcg_vmstats vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -315,10 +332,6 @@ struct mem_cgroup { atomic_t moving_account; struct task_struct *move_lock_task; - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK @@ -939,10 +952,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); - void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1023,6 +1032,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, unsigned int nr); +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1131,6 +1144,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page, return lruvec == &pgdat->__lruvec; } +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} + static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1334,18 +1351,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, mod_node_page_state(page_pgdat(page), idx, val); } -static inline -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - return 0; -} - -static inline void split_page_memcg(struct page *head, unsigned int nr) -{ -} - static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) @@ -1368,8 +1373,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline void split_page_memcg(struct page *head, unsigned int nr) +{ +} + +static inline +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { + return 0; } #endif /* CONFIG_MEMCG */ -- cgit v1.2.3 From 0add0c77a9bd0ce7cd3b53894fb08154881402a4 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 29 Apr 2021 22:56:36 -0700 Subject: memcg: charge before adding to swapcache on swapin Currently the kernel adds the page, allocated for swapin, to the swapcache before charging the page. This is fine but now we want a per-memcg swapcache stat which is essential for folks who wants to transparently migrate from cgroup v1's memsw to cgroup v2's memory and swap counters. In addition charging a page before exposing it to other parts of the kernel is a step in the right direction. To correctly maintain the per-memcg swapcache stat, this patch has adopted to charge the page before adding it to swapcache. One challenge in this option is the failure case of add_to_swap_cache() on which we need to undo the mem_cgroup_charge(). Specifically undoing mem_cgroup_uncharge_swap() is not simple. To resolve the issue, this patch decouples the charging for swapin pages from mem_cgroup_charge(). Two new functions are introduced, mem_cgroup_swapin_charge_page() for just charging the swapin page and mem_cgroup_swapin_uncharge_swap() for uncharging the swap slot once the page has been successfully added to the swapcache. [shakeelb@google.com: set page->private before calling swap_readpage] Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Hugh Dickins Tested-by: Heiko Carstens Cc: Michal Hocko Cc: Stephen Rothwell Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 74910ce9a3f9..e946c96daa32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) } int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry); +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, return 0; } +static inline int mem_cgroup_swapin_charge_page(struct page *page, + struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) +{ + return 0; +} + +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ +} + static inline void mem_cgroup_uncharge(struct page *page) { } -- cgit v1.2.3 From b4e0b68fbd9d1fd7e31cbe8adca3ad6cf556e2ee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:52 -0700 Subject: mm: memcontrol: use obj_cgroup APIs to charge kmem pages Since Roman's series "The new cgroup slab memory controller" applied. All slab objects are charged via the new APIs of obj_cgroup. The new APIs introduce a struct obj_cgroup to charge slab objects. It prevents long-living objects from pinning the original memory cgroup in the memory. But there are still some corner objects (e.g. allocations larger than order-1 page on SLUB) which are not charged via the new APIs. Those objects (include the pages which are allocated from buddy allocator directly) are charged as kmem pages which still hold a reference to the memory cgroup. We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do that, we should store an object cgroup pointer to page->memcg_data for the kmem pages. Finally, page->memcg_data will have 3 different meanings. 1) For the slab pages, page->memcg_data points to an object cgroups vector. 2) For the kmem pages (exclude the slab pages), page->memcg_data points to an object cgroup. 3) For the user pages (e.g. the LRU pages), page->memcg_data points to a memory cgroup. We do not change the behavior of page_memcg() and page_memcg_rcu(). They are also suitable for LRU pages and kmem pages. Why? Because memory allocations pinning memcgs for a long time - it exists at a larger scale and is causing recurring problems in the real world: page cache doesn't get reclaimed for a long time, or is used by the second, third, fourth, ... instance of the same job that was restarted into a new cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and make page reclaim very inefficient. We can convert LRU pages and most other raw memcg pins to the objcg direction to fix this problem, and then the page->memcg will always point to an object cgroup pointer. At that time, LRU pages and kmem pages will be treated the same. The implementation of page_memcg() will remove the kmem page check. This patch aims to charge the kmem pages by using the new APIs of obj_cgroup. Finally, the page->memcg_data of the kmem page points to an object cgroup. We can use the __page_objcg() to get the object cgroup associated with a kmem page. Or we can use page_memcg() to get the memory cgroup associated with a kmem page, but caller must ensure that the returned memcg won't be released (e.g. acquire the rcu_read_lock or css_set_lock). Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Miaohe Lin Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Cc: Christian Borntraeger [songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 120 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 97 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e946c96daa32..78ca34c935ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -371,6 +371,62 @@ enum page_memcg_data_flags { #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) +static inline bool PageMemcgKmem(struct page *page); + +/* + * After the initialization objcg->memcg is always pointing at + * a valid memcg, but can be atomically swapped to the parent memcg. + * + * The caller must ensure that the returned memcg won't be released: + * e.g. acquire the rcu_read_lock or css_set_lock. + */ +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) +{ + return READ_ONCE(objcg->memcg); +} + +/* + * __page_memcg - get the memory cgroup associated with a non-kmem page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages or + * kmem pages. + */ +static inline struct mem_cgroup *__page_memcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * __page_objcg - get the object cgroup associated with a kmem page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper object cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages or + * LRU pages. + */ +static inline struct obj_cgroup *__page_objcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page); + + return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + /* * page_memcg - get the memory cgroup associated with a page * @page: a pointer to the page struct @@ -380,20 +436,23 @@ enum page_memcg_data_flags { * proper memory cgroup pointer. It's not safe to call this function * against some type of pages, e.g. slab pages or ex-slab pages. * - * Any of the following ensures page and memcg binding stability: + * For a non-kmem page any of the following ensures page and memcg binding + * stability: + * * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. */ static inline struct mem_cgroup *page_memcg(struct page *page) { - unsigned long memcg_data = page->memcg_data; - - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + if (PageMemcgKmem(page)) + return obj_cgroup_memcg(__page_objcg(page)); + else + return __page_memcg(page); } /* @@ -407,11 +466,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page) */ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { + unsigned long memcg_data = READ_ONCE(page->memcg_data); + VM_BUG_ON_PAGE(PageSlab(page), page); WARN_ON_ONCE(!rcu_read_lock_held()); - return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & - ~MEMCG_DATA_FLAGS_MASK); + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -419,15 +486,21 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) * @page: a pointer to the page struct * * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function unlike page_memcg() can take any page + * or NULL. This function unlike page_memcg() can take any page * as an argument. It has to be used in cases when it's not known if a page - * has an associated memory cgroup pointer or an object cgroups vector. + * has an associated memory cgroup pointer or an object cgroups vector or + * an object cgroup. + * + * For a non-kmem page any of the following ensures page and memcg binding + * stability: * - * Any of the following ensures page and memcg binding stability: * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. */ static inline struct mem_cgroup *page_memcg_check(struct page *page) { @@ -440,6 +513,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -718,21 +798,15 @@ static inline void obj_cgroup_get(struct obj_cgroup *objcg) percpu_ref_get(&objcg->refcnt); } -static inline void obj_cgroup_put(struct obj_cgroup *objcg) +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, + unsigned long nr) { - percpu_ref_put(&objcg->refcnt); + percpu_ref_get_many(&objcg->refcnt, nr); } -/* - * After the initialization objcg->memcg is always pointing at - * a valid memcg, but can be atomically swapped to the parent memcg. - * - * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. - */ -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) +static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - return READ_ONCE(objcg->memcg); + percpu_ref_put(&objcg->refcnt); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) -- cgit v1.2.3 From bd290e1e75d8a8b2d87031b63db56ae165677870 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 29 Apr 2021 22:56:58 -0700 Subject: mm: memcontrol: move PageMemcgKmem to the scope of CONFIG_MEMCG_KMEM The page only can be marked as kmem when CONFIG_MEMCG_KMEM is enabled. So move PageMemcgKmem() to the scope of the CONFIG_MEMCG_KMEM. As a bonus, on !CONFIG_MEMCG_KMEM build some code can be compiled out. Link: https://lkml.kernel.org/r/20210319163821.20704-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 78ca34c935ab..bcf92b99f001 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -523,6 +523,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } +#ifdef CONFIG_MEMCG_KMEM /* * PageMemcgKmem - check if the page has MemcgKmem flag set * @page: a pointer to the page struct @@ -537,7 +538,6 @@ static inline bool PageMemcgKmem(struct page *page) return page->memcg_data & MEMCG_DATA_KMEM; } -#ifdef CONFIG_MEMCG_KMEM /* * page_objcgs - get the object cgroups vector associated with a page * @page: a pointer to the page struct @@ -579,6 +579,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #else +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline struct obj_cgroup **page_objcgs(struct page *page) { return NULL; -- cgit v1.2.3 From a10e995749a6c65833edd201c55665e5d44d14fc Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 29 Apr 2021 22:57:01 -0700 Subject: linux/memcontrol.h: remove duplicate struct declaration struct mem_cgroup is declared twice. One has been declared at forward struct declaration. Remove the duplicate. Link: https://lkml.kernel.org/r/20210330020246.2265371-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Reviewed-by: Muchun Song Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bcf92b99f001..5904716f29ba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1123,8 +1123,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 -struct mem_cgroup; - static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; -- cgit v1.2.3 From f9001107820c647f65b57fb9c1ca2c0908b5fede Mon Sep 17 00:00:00 2001 From: Ovidiu Panait Date: Thu, 29 Apr 2021 22:57:26 -0700 Subject: mm, tracing: improve rss_stat tracepoint message Adjust the rss_stat tracepoint to print the name of the resident page type that got updated (e.g. MM_ANONPAGES/MM_FILEPAGES), rather than the numeric index corresponding to it (the __entry->member value): Before this patch: ------------------ rss_stat: mm_id=1216113068 curr=0 member=1 size=28672B rss_stat: mm_id=1216113068 curr=0 member=1 size=0B rss_stat: mm_id=534402304 curr=1 member=0 size=188416B rss_stat: mm_id=534402304 curr=1 member=1 size=40960B After this patch: ----------------- rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=40960B rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=663552B rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=65536B rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=647168B Use TRACE_DEFINE_ENUM()/__print_symbolic() logic to map the enum values to the strings they represent, so that userspace tools can also parse the raw data correctly. Link: https://lkml.kernel.org/r/20210310162305.4862-1-ovidiu.panait@windriver.com Signed-off-by: Ovidiu Panait Suggested-by: Steven Rostedt (VMware) Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 3a60b6b6db32..829a75692cc0 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -343,6 +343,26 @@ static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) #define __PTR_TO_HASHVAL #endif +#define TRACE_MM_PAGES \ + EM(MM_FILEPAGES) \ + EM(MM_ANONPAGES) \ + EM(MM_SWAPENTS) \ + EMe(MM_SHMEMPAGES) + +#undef EM +#undef EMe + +#define EM(a) TRACE_DEFINE_ENUM(a); +#define EMe(a) TRACE_DEFINE_ENUM(a); + +TRACE_MM_PAGES + +#undef EM +#undef EMe + +#define EM(a) { a, #a }, +#define EMe(a) { a, #a } + TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, @@ -365,10 +385,10 @@ TRACE_EVENT(rss_stat, __entry->size = (count << PAGE_SHIFT); ), - TP_printk("mm_id=%u curr=%d member=%d size=%ldB", + TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, - __entry->member, + __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ -- cgit v1.2.3 From 74ffa5a3e68504dd289135b1cf0422c19ffb3f2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:57:29 -0700 Subject: mm: add remap_pfn_range_notrack Patch series "add remap_pfn_range_notrack instead of reinventing it in i915", v2. i915 has some reason to want to avoid the track_pfn_remap overhead in remap_pfn_range. Add a function to the core VM to do just that rather than reinventing the functionality poorly in the driver. Note that the remap_io_sg path does get exercises when using Xorg on my Thinkpad X1, so this should be considered lightly tested, I've not managed to hit the remap_io_mapping path at all. This patch (of 4): Add a version of remap_pfn_range that does not call track_pfn_range. This will be used to fix horrible abuses of VM internals in the i915 driver. Link: https://lkml.kernel.org/r/20210326055505.1424432-1-hch@lst.de Link: https://lkml.kernel.org/r/20210326055505.1424432-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Chris Wilson Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 64be3baf861a..a8335cecf706 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2732,6 +2732,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); -- cgit v1.2.3 From 1fbaf8fc12a0136c7e62e7ad6fe886fe1749912c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 22:57:32 -0700 Subject: mm: add a io_mapping_map_user helper Add a helper that calls remap_pfn_range for an struct io_mapping, relying on the pgprot pre-validation done when creating the mapping instead of doing it at runtime. Link: https://lkml.kernel.org/r/20210326055505.1424432-3-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Chris Wilson Cc: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Peter Zijlstra Cc: Rodrigo Vivi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index c093e81310a9..e9743cfd8585 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -220,3 +220,6 @@ io_mapping_free(struct io_mapping *iomap) } #endif /* _LINUX_IO_MAPPING_H */ + +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size); -- cgit v1.2.3 From 14d071134c740cfe61c09fc506fd3ab052beea10 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 29 Apr 2021 22:57:48 -0700 Subject: Revert "mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio" This reverts commit cd544fd1dc9293c6702fab6effa63dac1cc67e99. As discussed in [1] this commit was a no-op because the mapping type was checked in vma_to_resize before move_vma is ever called. This meant that vm_ops->mremap() would never be called on such mappings. Furthermore, we've since expanded support of MREMAP_DONTUNMAP to non-anonymous mappings, and these special mappings are still protected by the existing check of !VM_DONTEXPAND and !VM_PFNMAP which will result in a -EINVAL. 1. https://lkml.org/lkml/2020/12/28/2340 Link: https://lkml.kernel.org/r/20210323182520.2712101-2-bgeffon@google.com Signed-off-by: Brian Geffon Acked-by: Hugh Dickins Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Alejandro Colomar Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: "Kirill A . Shutemov" Cc: Lokesh Gidra Cc: Michael Kerrisk Cc: "Michael S . Tsirkin" Cc: Mike Rapoport Cc: Minchan Kim Cc: Peter Xu Cc: Sonny Rao Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a8335cecf706..93097dbd9604 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -580,7 +580,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not -- cgit v1.2.3 From bbc180a5adb05ee8053fab7a0c0bd56c5964240e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:26 -0700 Subject: mm: HUGE_VMAP arch support cleanup This changes the awkward approach where architectures provide init functions to determine which levels they can provide large mappings for, to one where the arch is queried for each call. This removes code and indirection, and allows constant-folding of dead code for unsupported levels. This also adds a prot argument to the arch query. This is unused currently but could help with some architectures (e.g., some powerpc processors can't map uncacheable memory with large pages). Link: https://lkml.kernel.org/r/20210317062402.533919-7-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Ding Tianhong Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Christoph Hellwig Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 9 --------- include/linux/vmalloc.h | 6 ++++++ 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/io.h b/include/linux/io.h index 61ff7d6278b6..9595151d800d 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, } #endif -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -void __init ioremap_huge_init(void); -int arch_ioremap_p4d_supported(void); -int arch_ioremap_pud_supported(void); -int arch_ioremap_pmd_supported(void); -#else -static inline void ioremap_huge_init(void) { } -#endif - /* * Managed iomap interface */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3de7be6dd17c..358c51c702c0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -78,6 +78,12 @@ struct vmap_area { }; }; +#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP +static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } +static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } +static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } +#endif + /* * Highlevel APIs for driver use */ -- cgit v1.2.3 From 6f680e70b6ff58c9670769534196800233685d55 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:39 -0700 Subject: mm/vmalloc: provide fallback arch huge vmap support functions If an architecture doesn't support a particular page table level as a huge vmap page size then allow it to skip defining the support query function. Link: https://lkml.kernel.org/r/20210317062402.533919-11-npiggin@gmail.com Signed-off-by: Nicholas Piggin Suggested-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 358c51c702c0..eb5630be6783 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -78,10 +78,26 @@ struct vmap_area { }; }; -#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP -static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } -static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } -static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } +/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ +#ifndef arch_vmap_p4d_supported +static inline bool arch_vmap_p4d_supported(pgprot_t prot) +{ + return false; +} +#endif + +#ifndef arch_vmap_pud_supported +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + return false; +} +#endif + +#ifndef arch_vmap_pmd_supported +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + return false; +} #endif /* -- cgit v1.2.3 From 5e9e3d777b99aabe2f91f793a52e870a02642160 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:43 -0700 Subject: mm: move vmap_range from mm/ioremap.c to mm/vmalloc.c This is a generic kernel virtual memory mapper, not specific to ioremap. Code is unchanged other than making vmap_range non-static. Link: https://lkml.kernel.org/r/20210317062402.533919-12-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Borislav Petkov Cc: Catalin Marinas Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb5630be6783..ae9eb07d30d4 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -189,6 +189,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); #ifdef CONFIG_MMU +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift); extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, -- cgit v1.2.3 From 121e6f3258fe393e22c36f61a319be8a4f2c05ae Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:49 -0700 Subject: mm/vmalloc: hugepage vmalloc mappings Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC enables support on architectures that define HAVE_ARCH_HUGE_VMAP and supports PMD sized vmap mappings. vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or larger, and fall back to small pages if that was unsuccessful. Architectures must ensure that any arch specific vmalloc allocations that require PAGE_SIZE mappings (e.g., module allocations vs strict module rwx) use the VM_NOHUGE flag to inhibit larger mappings. This can result in more internal fragmentation and memory overhead for a given allocation, an option nohugevmalloc is added to disable at boot. [colin.king@canonical.com: fix read of uninitialized pointer area] Link: https://lkml.kernel.org/r/20210318155955.18220-1-colin.king@canonical.com Link: https://lkml.kernel.org/r/20210317062402.533919-14-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Ding Tianhong Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Miaohe Lin Cc: Michael Ellerman Cc: Russell King Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ae9eb07d30d4..b4c82f2d40dc 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -26,6 +26,7 @@ struct notifier_block; /* in notifier.h */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ +#define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. @@ -54,6 +55,9 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + unsigned int page_order; +#endif unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; @@ -188,6 +192,22 @@ void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); +static inline bool is_vm_area_hugepages(const void *addr) +{ + /* + * This may not 100% tell if the area is mapped with > PAGE_SIZE + * page table entries, if for some reason the architecture indicates + * larger sizes are available but decides not to use them, nothing + * prevents that. This only indicates the size of the physical page + * allocated in the vmalloc layer. + */ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return find_vm_area(addr)->page_order > 0; +#else + return false; +#endif +} + #ifdef CONFIG_MMU int vmap_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -205,6 +225,7 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } + #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, -- cgit v1.2.3 From b67177ecd956333029dbc1a4971a857fee0ccbb1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:58:53 -0700 Subject: mm/vmalloc: remove map_kernel_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/vmalloc: cleanup after hugepage series", v2. Christoph pointed out some overdue cleanups required after the huge vmalloc series, and I had another failure error message improvement as well. This patch (of 5): This is a shim around vmap_pages_range, get rid of it. Move the main API comment from the _noflush variant to the normal variant, and make _noflush internal to mm/. Link: https://lkml.kernel.org/r/20210322021806.892164-1-npiggin@gmail.com Link: https://lkml.kernel.org/r/20210322021806.892164-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Uladzislau Rezki Cc: Cédric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b4c82f2d40dc..fb3b9989a4c5 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -212,10 +212,6 @@ static inline bool is_vm_area_hugepages(const void *addr) int vmap_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift); -extern int map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages); -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -227,13 +223,6 @@ static inline void set_vm_flush_reset_perms(void *addr) } #else -static inline int -map_kernel_range_noflush(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return size >> PAGE_SHIFT; -} -#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { -- cgit v1.2.3 From 4ad0ae8c64ac8f81a3651bca11be7c3cb086df80 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 29 Apr 2021 22:59:01 -0700 Subject: mm/vmalloc: remove unmap_kernel_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a shim around vunmap_range, get rid of it. Move the main API comment from the _noflush variant to the normal variant, and make _noflush internal to mm/. [npiggin@gmail.com: fix nommu builds and a comment bug per sfr] Link: https://lkml.kernel.org/r/1617292598.m6g0knx24s.astroid@bobo.none [akpm@linux-foundation.org: move vunmap_range_noflush() stub inside !CONFIG_MMU, not !CONFIG_NUMA] [npiggin@gmail.com: fix nommu builds] Link: https://lkml.kernel.org/r/1617292497.o1uhq5ipxp.astroid@bobo.none Link: https://lkml.kernel.org/r/20210322021806.892164-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Christoph Hellwig Cc: Cédric Le Goater Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index fb3b9989a4c5..394d03cc0e92 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -212,8 +212,7 @@ static inline bool is_vm_area_hugepages(const void *addr) int vmap_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift); -extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); -extern void unmap_kernel_range(unsigned long addr, unsigned long size); +void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); @@ -223,11 +222,6 @@ static inline void set_vm_flush_reset_perms(void *addr) } #else -static inline void -unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ -} -#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } -- cgit v1.2.3 From 78f4841e34763079be0661744c1ca997be64eb56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:25 -0700 Subject: mm/doc: fix fault_flag_allow_retry_first kerneldoc make htmldocs reports: include/linux/mm.h:496: warning: Function parameter or member 'flags' not described in 'fault_flag_allow_retry_first' Add a description. Link: https://lkml.kernel.org/r/20210322195022.2143603-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: John Hubbard Cc: Mike Rapoport Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 93097dbd9604..382c33e7d906 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -485,6 +485,7 @@ extern pgprot_t protection_map[16]; /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition -- cgit v1.2.3 From 136dfc9949f84089217f84e6478471dabbf14ba7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:28 -0700 Subject: mm/doc: fix page_maybe_dma_pinned kerneldoc make htmldocs reports: include/linux/mm.h:1341: warning: Excess function parameter 'Return' description in 'page_maybe_dma_pinned' Fix a few other formatting nits while I'm editing this description. Link: https://lkml.kernel.org/r/20210322195022.2143603-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Reviewed-by: John Hubbard Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 382c33e7d906..5acda8761935 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1271,10 +1271,11 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, void unpin_user_pages(struct page **pages, unsigned long npages); /** - * page_maybe_dma_pinned() - report if a page is pinned for DMA. + * page_maybe_dma_pinned - Report if a page is pinned for DMA. + * @page: The page. * * This function checks if a page has been pinned via a call to - * pin_user_pages*(). + * a function in the pin_user_pages() family. * * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably @@ -1292,9 +1293,8 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * @page: pointer to page to be queried. - * @Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. */ static inline bool page_maybe_dma_pinned(struct page *page) { -- cgit v1.2.3 From da2f5eb3d344503c4d851bdf1ae2379167074413 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 22:59:31 -0700 Subject: mm/doc: turn fault flags into an enum The kernel-doc script complains about include/linux/mm.h:425: warning: wrong kernel-doc identifier on line: * Fault flag definitions. I don't know how to document a series of #defines, so turn these definitions into an enum and document that instead. Link: https://lkml.kernel.org/r/20210322195022.2143603-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Cc: John Hubbard Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5acda8761935..64b2e3a0b94d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -432,8 +432,7 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; /** - * Fault flag definitions. - * + * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. @@ -464,16 +463,18 @@ extern pgprot_t protection_map[16]; * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. */ -#define FAULT_FLAG_WRITE 0x01 -#define FAULT_FLAG_MKWRITE 0x02 -#define FAULT_FLAG_ALLOW_RETRY 0x04 -#define FAULT_FLAG_RETRY_NOWAIT 0x08 -#define FAULT_FLAG_KILLABLE 0x10 -#define FAULT_FLAG_TRIED 0x20 -#define FAULT_FLAG_USER 0x40 -#define FAULT_FLAG_REMOTE 0x80 -#define FAULT_FLAG_INSTRUCTION 0x100 -#define FAULT_FLAG_INTERRUPTIBLE 0x200 +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; /* * The default fault flags that should be used by most of the @@ -496,7 +497,7 @@ extern pgprot_t protection_map[16]; * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ -static inline bool fault_flag_allow_retry_first(unsigned int flags) +static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); @@ -531,7 +532,7 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ }; - unsigned int flags; /* FAULT_FLAG_xxx flags + enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ -- cgit v1.2.3 From 91ab1a41191ef2d4c6e123951a0f0c3876bd9376 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 29 Apr 2021 22:59:40 -0700 Subject: pagewalk: prefix struct kernel-doc descriptions The script './scripts/kernel-doc -none ./include/linux/pagewalk.h' reports: include/linux/pagewalk.h:37: warning: cannot understand function prototype: 'struct mm_walk_ops ' include/linux/pagewalk.h:85: warning: cannot understand function prototype: 'struct mm_walk ' A kernel-doc description for a structure requires to prefix the struct name with the keyword 'struct'. So, do that such that no further kernel-doc warnings are reported for this file. Link: https://lkml.kernel.org/r/20210322122542.15072-3-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Joe Perches Cc: Jonathan Corbet Cc: Ralf Ramsauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index b1cb6b753abb..ac7b38ad5903 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -7,7 +7,7 @@ struct mm_walk; /** - * mm_walk_ops - callbacks for walk_page_range + * struct mm_walk_ops - callbacks for walk_page_range * @pgd_entry: if set, called for each non-empty PGD (top-level) entry * @p4d_entry: if set, called for each non-empty P4D entry * @pud_entry: if set, called for each non-empty PUD entry @@ -71,7 +71,7 @@ enum page_walk_action { }; /** - * mm_walk - walk_page_range data + * struct mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) -- cgit v1.2.3 From a064cb00d359bc464df6fd2ab6dfb8dc4b31e361 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 22:59:49 -0700 Subject: kasan: initialize shadow to TAG_INVALID for SW_TAGS Currently, KASAN_SW_TAGS uses 0xFF as the default tag value for unallocated memory. The underlying idea is that since that memory hasn't been allocated yet, it's only supposed to be dereferenced through a pointer with the native 0xFF tag. While this is a good idea in terms on consistency, practically it doesn't bring any benefit. Since the 0xFF pointer tag is a match-all tag, it doesn't matter what tag the accessed memory has. No accesses through 0xFF-tagged pointers are considered buggy by KASAN. This patch changes the default tag value for unallocated memory to 0xFE, which is the tag KASAN uses for inaccessible memory. This doesn't affect accesses through 0xFF-tagged pointer to this memory, but this allows KASAN to detect wild and large out-of-bounds invalid memory accesses through otherwise-tagged pointers. This is a prepatory patch for the next one, which changes the tag-based KASAN modes to not poison the boot memory. Link: https://lkml.kernel.org/r/c8e93571c18b3528aac5eb33ade213bf133d10ad.1613692950.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d53ea3c047bc..9f5faefd1744 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -30,7 +30,8 @@ struct kunit_kasan_expectation { /* Software KASAN implementations use shadow memory. */ #ifdef CONFIG_KASAN_SW_TAGS -#define KASAN_SHADOW_INIT 0xFF +/* This matches KASAN_TAG_INVALID. */ +#define KASAN_SHADOW_INIT 0xFE #else #define KASAN_SHADOW_INIT 0 #endif -- cgit v1.2.3 From 1bb5eab30d68c1a3d9dbc822e1895e6c06dbe748 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:02 -0700 Subject: kasan, mm: integrate page_alloc init with HW_TAGS This change uses the previously added memory initialization feature of HW_TAGS KASAN routines for page_alloc memory when init_on_alloc/free is enabled. With this change, kernel_init_free_pages() is no longer called when both HW_TAGS KASAN and init_on_alloc/free are enabled. Instead, memory is initialized in KASAN runtime. To avoid discrepancies with which memory gets initialized that can be caused by future changes, both KASAN and kernel_init_free_pages() hooks are put together and a warning comment is added. This patch changes the order in which memory initialization and page poisoning hooks are called. This doesn't lead to any side-effects, as whenever page poisoning is enabled, memory initialization gets disabled. Combining setting allocation tags with memory initialization improves HW_TAGS KASAN performance when init_on_alloc/free is enabled. [andreyknvl@google.com: fix for "integrate page_alloc init with HW_TAGS"] Link: https://lkml.kernel.org/r/65b6028dea2e9a6e8e2cb779b5115c09457363fc.1617122211.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/e77f0d5b1b20658ef0b8288625c74c2b3690e725.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Tested-by: Vlastimil Babka Reviewed-by: Sergei Trofimovich Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9f5faefd1744..30aa2bee8400 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,6 +96,11 @@ static __always_inline bool kasan_enabled(void) return static_branch_likely(&kasan_flag_enabled); } +static inline bool kasan_has_integrated_init(void) +{ + return kasan_enabled(); +} + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) @@ -103,6 +108,11 @@ static inline bool kasan_enabled(void) return true; } +static inline bool kasan_has_integrated_init(void) +{ + return false; +} + #endif /* CONFIG_KASAN_HW_TAGS */ slab_flags_t __kasan_never_merge(void); @@ -120,20 +130,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size) __kasan_unpoison_range(addr, size); } -void __kasan_alloc_pages(struct page *page, unsigned int order); +void __kasan_alloc_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order) + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_alloc_pages(page, order); + __kasan_alloc_pages(page, order, init); } -void __kasan_free_pages(struct page *page, unsigned int order); +void __kasan_free_pages(struct page *page, unsigned int order, bool init); static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) + unsigned int order, bool init) { if (kasan_enabled()) - __kasan_free_pages(page, order); + __kasan_free_pages(page, order, init); } void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, @@ -277,13 +287,17 @@ static inline bool kasan_enabled(void) { return false; } +static inline bool kasan_has_integrated_init(void) +{ + return false; +} static inline slab_flags_t kasan_never_merge(void) { return 0; } static inline void kasan_unpoison_range(const void *address, size_t size) {} -static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} -static inline void kasan_free_pages(struct page *page, unsigned int order) {} +static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {} +static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} -- cgit v1.2.3 From da844b787245194cfd69f0f1d2fb1dd3640a8a6d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:06 -0700 Subject: kasan, mm: integrate slab init_on_alloc with HW_TAGS This change uses the previously added memory initialization feature of HW_TAGS KASAN routines for slab memory when init_on_alloc is enabled. With this change, memory initialization memset() is no longer called when both HW_TAGS KASAN and init_on_alloc are enabled. Instead, memory is initialized in KASAN runtime. The memory initialization memset() is moved into slab_post_alloc_hook() that currently directly follows the initialization loop. A new argument is added to slab_post_alloc_hook() that indicates whether to initialize the memory or not. To avoid discrepancies with which memory gets initialized that can be caused by future changes, both KASAN hook and initialization memset() are put together and a warning comment is added. Combining setting allocation tags with memory initialization improves HW_TAGS KASAN performance when init_on_alloc is enabled. Link: https://lkml.kernel.org/r/c1292aeb5d519da221ec74a0684a949b027d7720.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 30aa2bee8400..629aee484b6c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -226,12 +226,12 @@ static __always_inline void kasan_slab_free_mempool(void *ptr) } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, - void *object, gfp_t flags); + void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( - struct kmem_cache *s, void *object, gfp_t flags) + struct kmem_cache *s, void *object, gfp_t flags, bool init) { if (kasan_enabled()) - return __kasan_slab_alloc(s, object, flags); + return __kasan_slab_alloc(s, object, flags, init); return object; } @@ -320,7 +320,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object) static inline void kasan_kfree_large(void *ptr) {} static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags) + gfp_t flags, bool init) { return object; } -- cgit v1.2.3 From d57a964e09c22441e9fb497d1d7a5c1983a5d1fb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 29 Apr 2021 23:00:09 -0700 Subject: kasan, mm: integrate slab init_on_free with HW_TAGS This change uses the previously added memory initialization feature of HW_TAGS KASAN routines for slab memory when init_on_free is enabled. With this change, memory initialization memset() is no longer called when both HW_TAGS KASAN and init_on_free are enabled. Instead, memory is initialized in KASAN runtime. For SLUB, the memory initialization memset() is moved into slab_free_hook() that currently directly follows the initialization loop. A new argument is added to slab_free_hook() that indicates whether to initialize the memory or not. To avoid discrepancies with which memory gets initialized that can be caused by future changes, both KASAN hook and initialization memset() are put together and a warning comment is added. Combining setting allocation tags with memory initialization improves HW_TAGS KASAN performance when init_on_free is enabled. Link: https://lkml.kernel.org/r/190fd15c1886654afdec0d19ebebd5ade665b601.1615296150.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Joonsoo Kim Cc: Kevin Brodsky Cc: Pekka Enberg Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 629aee484b6c..b1678a61e6a7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -203,11 +203,13 @@ static __always_inline void * __must_check kasan_init_slab_obj( return (void *)object; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); -static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) +bool __kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip, bool init); +static __always_inline bool kasan_slab_free(struct kmem_cache *s, + void *object, bool init) { if (kasan_enabled()) - return __kasan_slab_free(s, object, _RET_IP_); + return __kasan_slab_free(s, object, _RET_IP_, init); return false; } @@ -313,7 +315,7 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; } -- cgit v1.2.3 From 1f9d03c5e999ed5a57fa4d8aec9fdf67a6234b80 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 29 Apr 2021 23:00:55 -0700 Subject: mm: move mem_init_print_info() into mm_init() mem_init_print_info() is called in mem_init() on each architecture, and pass NULL argument, so using void argument and move it into mm_init(). Link: https://lkml.kernel.org/r/20210317015210.33641-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Dave Hansen [x86] Reviewed-by: Christophe Leroy [powerpc] Acked-by: David Hildenbrand Tested-by: Anatoly Pugachev [sparc64] Acked-by: Russell King [arm] Acked-by: Mike Rapoport Cc: Catalin Marinas Cc: Richard Henderson Cc: Guo Ren Cc: Yoshinori Sato Cc: Huacai Chen Cc: Jonas Bonn Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: "David S. Miller" Cc: "Peter Zijlstra" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 64b2e3a0b94d..011f43605807 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2360,7 +2360,7 @@ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(const char *str); +extern void mem_init_print_info(void); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); -- cgit v1.2.3 From f73c6c8805ed0762d99122d5332fcf42b0c8fbb8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 29 Apr 2021 23:01:04 -0700 Subject: include/linux/page-flags-layout.h: correctly determine LAST_CPUPID_WIDTH The naming convention used in include/linux/page-flags-layout.h: *_SHIFT: the number of bits trying to allocate *_WIDTH: the number of bits successfully allocated So when it comes to LAST_CPUPID_WIDTH, we need to check whether all previous *_WIDTH and LAST_CPUPID_SHIFT can fit into page flags. This means we need to use NODES_WIDTH, not NODES_SHIFT. Link: https://lkml.kernel.org/r/20210303071609.797782-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags-layout.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d4ec26d8a3e..295c2c687d2c 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -83,7 +83,7 @@ #define KASAN_TAG_WIDTH 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else -- cgit v1.2.3 From 1587db62d8c0dbd943752f657b452213e1c4d8d4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 29 Apr 2021 23:01:07 -0700 Subject: include/linux/page-flags-layout.h: cleanups Tidy things up and delete comments stating the obvious with typos or making no sense. Link: https://lkml.kernel.org/r/20210303071609.797782-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags-layout.h | 62 ++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 295c2c687d2c..ef1e3e736e14 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -21,16 +21,17 @@ #elif MAX_NR_ZONES <= 8 #define ZONES_SHIFT 3 #else -#error ZONES_SHIFT -- too many zones configured adjust calculation +#error ZONES_SHIFT "Too many zones configured" #endif +#define ZONES_WIDTH ZONES_SHIFT + #ifdef CONFIG_SPARSEMEM #include - -/* SECTION_SHIFT #bits space required to store a section # */ #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) - -#endif /* CONFIG_SPARSEMEM */ +#else +#define SECTIONS_SHIFT 0 +#endif #ifndef BUILD_VDSO32_64 /* @@ -54,17 +55,28 @@ #define SECTIONS_WIDTH 0 #endif -#define ZONES_WIDTH ZONES_SHIFT - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT -#else -#ifdef CONFIG_SPARSEMEM_VMEMMAP +#elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" -#endif +#else #define NODES_WIDTH 0 #endif +/* + * Note that this #define MUST have a value so that it can be tested with + * the IS_ENABLED() macro. + */ +#if NODES_SHIFT != 0 && NODES_WIDTH == 0 +#define NODE_NOT_IN_PAGE_FLAGS 1 +#endif + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +#define KASAN_TAG_WIDTH 8 +#else +#define KASAN_TAG_WIDTH 0 +#endif + #ifdef CONFIG_NUMA_BALANCING #define LAST__PID_SHIFT 8 #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) @@ -77,36 +89,20 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -#define KASAN_TAG_WIDTH 8 -#else -#define KASAN_TAG_WIDTH 0 -#endif - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 #endif -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS -#error "Not enough bits in page flags" -#endif - -/* - * We are going to use the flags for the page to node mapping if its in - * there. This includes the case where there is no node, so it is implicit. - * Note that this #define MUST have a value so that it can be tested with - * the IS_ENABLED() macro. - */ -#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) -#define NODE_NOT_IN_PAGE_FLAGS 1 +#if LAST_CPUPID_SHIFT != 0 && LAST_CPUPID_WIDTH == 0 +#define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 -#define LAST_CPUPID_NOT_IN_PAGE_FLAGS +#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ + > BITS_PER_LONG - NR_PAGEFLAGS +#error "Not enough bits in page flags" #endif #endif -- cgit v1.2.3 From 84172f4bb752424415756351a40f8da5714e1554 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:15 -0700 Subject: mm/page_alloc: combine __alloc_pages and __alloc_pages_nodemask There are only two callers of __alloc_pages() so prune the thicket of alloc_page variants by combining the two functions together. Current callers of __alloc_pages() simply add an extra 'NULL' parameter and current callers of __alloc_pages_nodemask() call __alloc_pages() instead. Link: https://lkml.kernel.org/r/20210225150642.2582252-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8572a1474e16..f39de931bdf9 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -515,15 +515,8 @@ static inline int arch_make_page_accessible(struct page *page) } #endif -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, - nodemask_t *nodemask); - -static inline struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) -{ - return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); -} +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); /* * Allocate pages, preferring the node given as nid. The node must be valid and @@ -535,7 +528,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); - return __alloc_pages(gfp_mask, order, nid); + return __alloc_pages(gfp_mask, order, nid, NULL); } /* -- cgit v1.2.3 From d7f946d0faf90014547ee5d090e9d05018278c7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 23:01:18 -0700 Subject: mm/mempolicy: rename alloc_pages_current to alloc_pages When CONFIG_NUMA is enabled, alloc_pages() is a wrapper around alloc_pages_current(). This is pointless, just implement alloc_pages() directly. Link: https://lkml.kernel.org/r/20210225150642.2582252-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f39de931bdf9..0a88f84b08f4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -546,13 +546,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, } #ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); - -static inline struct page * -alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - return alloc_pages_current(gfp_mask, order); -} +struct page *alloc_pages(gfp_t gfp, unsigned int order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); -- cgit v1.2.3 From 387ba26fb1cb9be9e35dc14a6d97188e916eda05 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:45 -0700 Subject: mm/page_alloc: add a bulk page allocator This patch adds a new page allocator interface via alloc_pages_bulk, and __alloc_pages_bulk_nodemask. A caller requests a number of pages to be allocated and added to a list. The API is not guaranteed to return the requested number of pages and may fail if the preferred allocation zone has limited free memory, the cpuset changes during the allocation or page debugging decides to fail an allocation. It's up to the caller to request more pages in batch if necessary. Note that this implementation is not very efficient and could be improved but it would require refactoring. The intent is to make it available early to determine what semantics are required by different callers. Once the full semantics are nailed down, it can be refactored. [mgorman@techsingularity.net: fix alloc_pages_bulk() return type, per Matthew] Link: https://lkml.kernel.org/r/20210325123713.GQ3697@techsingularity.net [mgorman@techsingularity.net: fix uninit var warning] Link: https://lkml.kernel.org/r/20210330114847.GX3697@techsingularity.net [mgorman@techsingularity.net: fix comment, per Vlastimil] Link: https://lkml.kernel.org/r/20210412110255.GV3697@techsingularity.net Link: https://lkml.kernel.org/r/20210325114228.27719-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: Alexander Lobakin Tested-by: Colin Ian King Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0a88f84b08f4..a2be8f4174a9 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -518,6 +518,17 @@ static inline int arch_make_page_accessible(struct page *page) struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *list); + +/* Bulk allocate order-0 pages */ +static inline unsigned long +alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list); +} + /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). -- cgit v1.2.3 From 0f87d9d30f21390dd71114f30e63038980e6eb3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:48 -0700 Subject: mm/page_alloc: add an array-based interface to the bulk page allocator The proposed callers for the bulk allocator store pages from the bulk allocator in an array. This patch adds an array-based interface to the API to avoid multiple list iterations. The page list interface is preserved to avoid requiring all users of the bulk API to allocate and manage enough storage to store the pages. [akpm@linux-foundation.org: remove now unused local `allocated'] Link: https://lkml.kernel.org/r/20210325114228.27719-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a2be8f4174a9..26f4d907254a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *list); + struct list_head *page_list, + struct page **page_array); /* Bulk allocate order-0 pages */ static inline unsigned long -alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) { - return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list); + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); +} + +static inline unsigned long +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); } /* -- cgit v1.2.3 From be5dba25b4b27f262626ddc9079d4858a75462fd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:02:07 -0700 Subject: net: page_pool: use alloc_pages_bulk in refill code path There are cases where the page_pool need to refill with pages from the page allocator. Some workloads cause the page_pool to release pages instead of recycling these pages. For these workload it can improve performance to bulk alloc pages from the page-allocator to refill the alloc cache. For XDP-redirect workload with 100G mlx5 driver (that use page_pool) redirecting xdp_frame packets into a veth, that does XDP_PASS to create an SKB from the xdp_frame, which then cannot return the page to the page_pool. Performance results under GitHub xdp-project[1]: [1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org Mel: The patch "net: page_pool: convert to use alloc_pages_bulk_array variant" was squashed with this patch. From the test page, the array variant was superior with one of the test results as follows. Kernel XDP stats CPU pps Delta Baseline XDP-RX CPU total 3,771,046 n/a List XDP-RX CPU total 3,940,242 +4.49% Array XDP-RX CPU total 4,249,224 +12.68% Link: https://lkml.kernel.org/r/20210325114228.27719-10-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/page_pool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index b5b195305346..6d517a37c18b 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -65,7 +65,7 @@ #define PP_ALLOC_CACHE_REFILL 64 struct pp_alloc_cache { u32 count; - void *cache[PP_ALLOC_CACHE_SIZE]; + struct page *cache[PP_ALLOC_CACHE_SIZE]; }; struct page_pool_params { -- cgit v1.2.3 From 198fba4137a1803a9cb93992b56c2ecba1aa83a3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 29 Apr 2021 23:02:16 -0700 Subject: mm/mmzone.h: fix existing kernel-doc comments and link them to core-api There are a couple of kernel-doc comments in include/linux/mmzone.h but they have minor formatting issues that would cause kernel-doc warnings. Fix the formatting of those comments, add missing Return: descriptions and link include/linux/mmzone.h to Documentation/core-api/mm-api.rst Link: https://lkml.kernel.org/r/20210426141927.1314326-2-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec7584..3b2205741048 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -993,7 +993,8 @@ static inline int is_highmem_idx(enum zone_type idx) * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. - * @zone - pointer to struct zone variable + * @zone: pointer to struct zone variable + * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(struct zone *zone) { @@ -1044,7 +1045,7 @@ extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes - * @pgdat - pointer to a pg_data_t variable + * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ @@ -1052,7 +1053,7 @@ extern struct zone *next_zone(struct zone *zone); pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable + * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. @@ -1091,15 +1092,18 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point - * @z - The cursor used as a starting point for the search - * @highest_zoneidx - The zone index of the highest zone to return - * @nodes - An optional nodemask to filter the zonelist with + * @z: The cursor used as a starting point for the search + * @highest_zoneidx: The zone index of the highest zone to return + * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. + * + * Return: the next zone at or below highest_zoneidx within the allowed + * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, @@ -1112,10 +1116,9 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist - * @zonelist - The zonelist to search for a suitable zone - * @highest_zoneidx - The zone index of the highest zone to return - * @nodes - An optional nodemask to filter the zonelist with - * @return - Zoneref pointer for the first suitable zone found (see below) + * @zonelist: The zonelist to search for a suitable zone + * @highest_zoneidx: The zone index of the highest zone to return + * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be @@ -1125,6 +1128,8 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. + * + * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, @@ -1136,11 +1141,11 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask - * @zone - The current zone in the iterator - * @z - The current pointer within zonelist->_zonerefs being iterated - * @zlist - The zonelist being iterated - * @highidx - The zone index of the highest zone to return - * @nodemask - Nodemask allowed by the allocator + * @zone: The current zone in the iterator + * @z: The current pointer within zonelist->_zonerefs being iterated + * @zlist: The zonelist being iterated + * @highidx: The zone index of the highest zone to return + * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask @@ -1160,10 +1165,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index - * @zone - The current zone in the iterator - * @z - The current pointer within zonelist->zones being iterated - * @zlist - The zonelist being iterated - * @highidx - The zone index of the highest zone to return + * @zone: The current zone in the iterator + * @z: The current pointer within zonelist->zones being iterated + * @zlist: The zonelist being iterated + * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ -- cgit v1.2.3 From 384d0c68204a4a657f4bbc096c50d729ae7d9ef0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 12 Feb 2021 11:02:47 +0100 Subject: PCI/VPD: Remove pci_set_vpd_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 24a1720a0841 ("cxgb4: collect serial config version from register") removed the only usage of pci_set_vpd_size(). If a device needs to override the auto-detected VPD size, then this can be done with a PCI quirk, as is done for Chelsio devices. There's no need to allow drivers to change the VPD size. Remove pci_set_vpd_size(). [bhelgaas: squash in Arnd's fix for "'pci_vpd_set_size' defined but not used" from https://lore.kernel.org/r/20210421140334.3847155-1-arnd@kernel.org] Link: https://lore.kernel.org/r/47d86e52-9bcf-7da7-1edb-0d988a7a82ab@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..edadc62ae058 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1302,7 +1302,6 @@ void pci_unlock_rescan_remove(void); /* Vital Product Data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); -int pci_set_vpd_size(struct pci_dev *dev, size_t len); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); -- cgit v1.2.3 From 4cf0abbce69bde3d07757dfa9be6420407fdbc45 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 1 Apr 2021 18:43:15 +0200 Subject: PCI/VPD: Remove pci_vpd_find_tag() 'offset' argument All callers pass 0 as offset. Therefore remove the parameter and use a fixed offset 0 in pci_vpd_find_tag(). Link: https://lore.kernel.org/r/f62e6e19-5423-2ead-b2bd-62844b23ef8f@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index edadc62ae058..1eb35c09674e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2310,14 +2310,13 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) /** * pci_vpd_find_tag - Locates the Resource Data Type tag provided * @buf: Pointer to buffered vpd data - * @off: The offset into the buffer at which to begin the search * @len: The length of the vpd buffer * @rdt: The Resource Data Type to search for * * Returns the index where the Resource Data Type was found or * -ENOENT otherwise. */ -int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt); +int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); /** * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD -- cgit v1.2.3 From 51eac7f2f06b5f60d22dfb06c48d98a227507b8e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 May 2021 04:03:00 +0800 Subject: sctp: do asoc update earlier in sctp_sf_do_dupcook_b The same thing should be done for sctp_sf_do_dupcook_b(). Meanwhile, SCTP_CMD_UPDATE_ASSOC cmd can be removed. v1->v2: - Fix the return value in sctp_sf_do_assoc_update(). Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/command.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index e8df72e1627a..5e848884ff61 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -68,7 +68,6 @@ enum sctp_verb { SCTP_CMD_ASSOC_FAILED, /* Handle association failure. */ SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */ SCTP_CMD_GEN_SHUTDOWN, /* Generate a SHUTDOWN chunk. */ - SCTP_CMD_UPDATE_ASSOC, /* Update association information. */ SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */ SCTP_CMD_SETUP_T2, /* Hi-level, setup T2-shutdown parms. */ SCTP_CMD_RTO_PENDING, /* Set transport's rto_pending. */ -- cgit v1.2.3 From d0f9164eb294aeb884cbe36ddbbae34fa0124aa1 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 6 Apr 2021 20:04:44 +0300 Subject: vdpa: Follow kdoc comment style Follow comment style mentioned in the Writing kernel-doc document [1]. Following warnings are fixed. $ scripts/kernel-doc -v -none include/linux/vdpa.h include/linux/vdpa.h:11: warning: missing initial short description on line: * vDPA callback definition. include/linux/vdpa.h:11: info: Scanning doc for vDPA include/linux/vdpa.h:15: warning: cannot understand function prototype: 'struct vdpa_callback ' include/linux/vdpa.h:21: warning: missing initial short description on line: * vDPA notification area include/linux/vdpa.h:21: info: Scanning doc for vDPA include/linux/vdpa.h:25: warning: cannot understand function prototype: 'struct vdpa_notification_area ' include/linux/vdpa.h:31: warning: missing initial short description on line: * vDPA vq_state definition include/linux/vdpa.h:31: info: Scanning doc for vDPA include/linux/vdpa.h:34: warning: cannot understand function prototype: 'struct vdpa_vq_state ' include/linux/vdpa.h:41: info: Scanning doc for vDPA device include/linux/vdpa.h:51: warning: cannot understand function prototype: 'struct vdpa_device ' include/linux/vdpa.h:62: info: Scanning doc for vDPA IOVA range include/linux/vdpa.h:66: warning: cannot understand function prototype: 'struct vdpa_iova_range ' include/linux/vdpa.h:72: info: Scanning doc for vDPA_config_ops include/linux/vdpa.h:203: warning: cannot understand function prototype: 'struct vdpa_config_ops ' include/linux/vdpa.h:270: info: Scanning doc for vdpa_driver include/linux/vdpa.h:275: warning: cannot understand function prototype: 'struct vdpa_driver ' include/linux/vdpa.h:347: info: Scanning doc for vdpa_mgmtdev_ops include/linux/vdpa.h:360: warning: cannot understand function prototype: 'struct vdpa_mgmtdev_ops ' After this fix: scripts/kernel-doc -v -none include/linux/vdpa.h include/linux/vdpa.h:11: info: Scanning doc for struct vdpa_calllback include/linux/vdpa.h:21: info: Scanning doc for struct vdpa_notification_area include/linux/vdpa.h:31: info: Scanning doc for struct vdpa_vq_state include/linux/vdpa.h:41: info: Scanning doc for struct vdpa_device include/linux/vdpa.h:62: info: Scanning doc for struct vdpa_iova_range include/linux/vdpa.h:72: info: Scanning doc for struct vdpa_config_ops include/linux/vdpa.h:270: info: Scanning doc for struct vdpa_driver include/linux/vdpa.h:347: info: Scanning doc for struct vdpa_mgmtdev_ops [1] https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Link: https://lore.kernel.org/r/20210406170457.98481-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Acked-by: Jason Wang --- include/linux/vdpa.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 15fa085fab05..37b65ca940cf 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -8,7 +8,7 @@ #include /** - * vDPA callback definition. + * struct vdpa_calllback - vDPA callback definition. * @callback: interrupt callback function * @private: the data passed to the callback function */ @@ -18,7 +18,7 @@ struct vdpa_callback { }; /** - * vDPA notification area + * struct vdpa_notification_area - vDPA notification area * @addr: base address of the notification area * @size: size of the notification area */ @@ -28,7 +28,7 @@ struct vdpa_notification_area { }; /** - * vDPA vq_state definition + * struct vdpa_vq_state - vDPA vq_state definition * @avail_index: available index */ struct vdpa_vq_state { @@ -38,7 +38,7 @@ struct vdpa_vq_state { struct vdpa_mgmt_dev; /** - * vDPA device - representation of a vDPA device + * struct vdpa_device - representation of a vDPA device * @dev: underlying device * @dma_dev: the actual device that is performing DMA * @config: the configuration ops for this device. @@ -59,7 +59,7 @@ struct vdpa_device { }; /** - * vDPA IOVA range - the IOVA range support by the device + * struct vdpa_iova_range - the IOVA range support by the device * @first: start of the IOVA range * @last: end of the IOVA range */ @@ -69,7 +69,7 @@ struct vdpa_iova_range { }; /** - * vDPA_config_ops - operations for configuring a vDPA device. + * struct vdpa_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the * operations unless it is mentioned to be optional in the following * list. @@ -267,7 +267,7 @@ int _vdpa_register_device(struct vdpa_device *vdev, int nvqs); void _vdpa_unregister_device(struct vdpa_device *vdev); /** - * vdpa_driver - operations for a vDPA driver + * struct vdpa_driver - operations for a vDPA driver * @driver: underlying device driver * @probe: the function to call when a device is found. Returns 0 or -errno. * @remove: the function to call when a device is removed. @@ -344,18 +344,18 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, } /** - * vdpa_mgmtdev_ops - vdpa device ops - * @dev_add: Add a vdpa device using alloc and register - * @mdev: parent device to use for device addition - * @name: name of the new vdpa device - * Driver need to add a new device using _vdpa_register_device() - * after fully initializing the vdpa device. Driver must return 0 - * on success or appropriate error code. - * @dev_del: Remove a vdpa device using unregister - * @mdev: parent device to use for device removal - * @dev: vdpa device to remove - * Driver need to remove the specified device by calling - * _vdpa_unregister_device(). + * struct vdpa_mgmtdev_ops - vdpa device ops + * @dev_add: Add a vdpa device using alloc and register + * @mdev: parent device to use for device addition + * @name: name of the new vdpa device + * Driver need to add a new device using _vdpa_register_device() + * after fully initializing the vdpa device. Driver must return 0 + * on success or appropriate error code. + * @dev_del: Remove a vdpa device using unregister + * @mdev: parent device to use for device removal + * @dev: vdpa device to remove + * Driver need to remove the specified device by calling + * _vdpa_unregister_device(). */ struct vdpa_mgmtdev_ops { int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name); -- cgit v1.2.3 From 9e3bb9b79a7131a088cfffbdcc30e747dad9d090 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:41 -0400 Subject: virtio_pci_modern: introduce helper to map vq notify area This patch factors out the logic of vq notify area mapping. Following patches will switch to use this common helpers for both virtio_pci library and virtio-pci vDPA driver. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- include/linux/virtio_pci_modern.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index f26acbeec965..1b95d39b00fc 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -106,6 +106,8 @@ void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, in u32 align, u32 start, u32 size, size_t *len); +void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index); int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); #endif -- cgit v1.2.3 From a5f7a24f49d81fab9f59611814a8817cc8a876a2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:44 -0400 Subject: virtio_pci_modern: hide vp_modern_get_queue_notify_off() All users (both virtio-pci library and vp_vdpa driver) has been switched to use vp_modern_map_vq_notify(). So there's no need to export the low level helper of vp_modern_get_queue_notify_off(). Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-5-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- include/linux/virtio_pci_modern.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index 1b95d39b00fc..179a2fb4bf37 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -99,8 +99,6 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); -u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, - u16 idx); void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, size_t minlen, u32 align, -- cgit v1.2.3 From fd466b36940b22a506265edf12714bd0cf9ed836 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:45 -0400 Subject: virito_pci libray: hide vp_modern_map_capability() No user now and the capability should not be setup externally. Instead, every access to the capability should be done via virtio_pci_modern_device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-6-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- include/linux/virtio_pci_modern.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index 179a2fb4bf37..e6e7072413c1 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -99,11 +99,6 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); -void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, - size_t minlen, - u32 align, - u32 start, u32 size, - size_t *len); void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, u16 index); int vp_modern_probe(struct virtio_pci_modern_device *mdev); -- cgit v1.2.3 From 9e311bcad73dc14bd0a736db6ad3d382227e11fe Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:46 -0400 Subject: virtio-pci library: report resource address Sometimes it might be useful to report the capability physical address. One example is to report the physical address of the doorbell in order to be mapped by userspace. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-7-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_pci_modern.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index e6e7072413c1..cdfabbefacdf 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -13,6 +13,8 @@ struct virtio_pci_modern_device { void __iomem *device; /* Base of vq notifications (non-legacy mode). */ void __iomem *notify_base; + /* Physical base of vq notifications */ + resource_size_t notify_pa; /* Where to read and clear interrupt */ u8 __iomem *isr; @@ -100,7 +102,7 @@ u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, - u16 index); + u16 index, resource_size_t *pa); int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); #endif -- cgit v1.2.3 From f53d9910d009bc015b42d88114e2d86a93b0e6b7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:38 +0100 Subject: vringh: add 'iotlb_lock' to synchronize iotlb accesses Usually iotlb accesses are synchronized with a spinlock. Let's request it as a new parameter in vringh_set_iotlb() and hold it when we navigate the iotlb in iotlb_translate() to avoid race conditions with any new additions/deletions of ranges from the ioltb. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-3-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vringh.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 59bd50f99291..9c077863c8f6 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -46,6 +46,9 @@ struct vringh { /* IOTLB for this vring */ struct vhost_iotlb *iotlb; + /* spinlock to synchronize IOTLB accesses */ + spinlock_t *iotlb_lock; + /* The function to call to notify the guest about added buffers */ void (*notify)(struct vringh *); }; @@ -258,7 +261,8 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val) #if IS_REACHABLE(CONFIG_VHOST_IOTLB) -void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb); +void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb, + spinlock_t *iotlb_lock); int vringh_init_iotlb(struct vringh *vrh, u64 features, unsigned int num, bool weak_barriers, -- cgit v1.2.3 From b8c06ad4d67db56ed6bdfb685c134da74e92a2c7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:41 +0100 Subject: vringh: implement vringh_kiov_advance() In some cases, it may be useful to provide a way to skip a number of bytes in a vringh_kiov. Let's implement vringh_kiov_advance() for this purpose, reusing the code from vringh_iov_xfer(). We replace that code calling the new vringh_kiov_advance(). Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-6-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vringh.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 9c077863c8f6..755211ebd195 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -199,6 +199,8 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) kiov->iov = NULL; } +void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len); + int vringh_getdesc_kern(struct vringh *vrh, struct vringh_kiov *riov, struct vringh_kiov *wiov, -- cgit v1.2.3 From 14c9ac05ce09c8c6a89ffcca6ffb68707cba36c2 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:42 +0100 Subject: vringh: add vringh_kiov_length() helper This new helper returns the total number of bytes covered by a vringh_kiov. Suggested-by: Jason Wang Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-7-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vringh.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 755211ebd195..84db7b8f912f 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -199,6 +199,17 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) kiov->iov = NULL; } +static inline size_t vringh_kiov_length(struct vringh_kiov *kiov) +{ + size_t len = 0; + int i; + + for (i = kiov->i; i < kiov->used; i++) + len += kiov->iov[i].iov_len; + + return len; +} + void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len); int vringh_getdesc_kern(struct vringh *vrh, -- cgit v1.2.3 From 442706f9f94d28fe3c9f188ae4ebbd6b40addffe Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:44 +0100 Subject: vdpa: add get_config_size callback in vdpa_config_ops This new callback is used to get the size of the configuration space of vDPA devices. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-9-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- include/linux/vdpa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 37b65ca940cf..f311d227aa1b 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -150,6 +150,9 @@ struct vdpa_iova_range { * @set_status: Set the device status * @vdev: vdpa device * @status: virtio device status + * @get_config_size: Get the size of the configuration space + * @vdev: vdpa device + * Returns size_t: configuration size * @get_config: Read from device specific configuration space * @vdev: vdpa device * @offset: offset from the beginning of @@ -231,6 +234,7 @@ struct vdpa_config_ops { u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); + size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); void (*set_config)(struct vdpa_device *vdev, unsigned int offset, -- cgit v1.2.3 From 801c6058d14a82179a7ee17a4b532cac6fad067f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Apr 2021 15:19:37 +0000 Subject: bpf: Fix leakage of uninitialized bpf stack under speculation The current implemented mechanisms to mitigate data disclosure under speculation mainly address stack and map value oob access from the speculative domain. However, Piotr discovered that uninitialized BPF stack is not protected yet, and thus old data from the kernel stack, potentially including addresses of kernel structures, could still be extracted from that 512 bytes large window. The BPF stack is special compared to map values since it's not zero initialized for every program invocation, whereas map values /are/ zero initialized upon their initial allocation and thus cannot leak any prior data in either domain. In the non-speculative domain, the verifier ensures that every stack slot read must have a prior stack slot write by the BPF program to avoid such data leaking issue. However, this is not enough: for example, when the pointer arithmetic operation moves the stack pointer from the last valid stack offset to the first valid offset, the sanitation logic allows for any intermediate offsets during speculative execution, which could then be used to extract any restricted stack content via side-channel. Given for unprivileged stack pointer arithmetic the use of unknown but bounded scalars is generally forbidden, we can simply turn the register-based arithmetic operation into an immediate-based arithmetic operation without the need for masking. This also gives the benefit of reducing the needed instructions for the operation. Given after the work in 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask"), the aux->alu_limit already holds the final immediate value for the offset register with the known scalar. Thus, a simple mov of the immediate to AX register with using AX as the source for the original instruction is sufficient and possible now in this case. Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Tested-by: Piotr Krysiuk Reviewed-by: Piotr Krysiuk Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6023a1367853..06841517ab1e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -302,10 +302,11 @@ struct bpf_verifier_state_list { }; /* Possible states for alu_state member. */ -#define BPF_ALU_SANITIZE_SRC 1U -#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_SANITIZE_SRC (1U << 0) +#define BPF_ALU_SANITIZE_DST (1U << 1) #define BPF_ALU_NEG_VALUE (1U << 2) #define BPF_ALU_NON_POINTER (1U << 3) +#define BPF_ALU_IMMEDIATE (1U << 4) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) -- cgit v1.2.3 From cd2c7545ae1beac3b6aae033c7f31193b3255946 Mon Sep 17 00:00:00 2001 From: Changheun Lee Date: Mon, 3 May 2021 18:52:03 +0900 Subject: bio: limit bio max size bio size can grow up to 4GB when muli-page bvec is enabled. but sometimes it would lead to inefficient behaviors. in case of large chunk direct I/O, - 32MB chunk read in user space - all pages for 32MB would be merged to a bio structure if the pages physical addresses are contiguous. it makes some delay to submit until merge complete. bio max size should be limited to a proper size. When 32MB chunk read with direct I/O option is coming from userspace, kernel behavior is below now in do_direct_IO() loop. it's timeline. | bio merge for 32MB. total 8,192 pages are merged. | total elapsed time is over 2ms. |------------------ ... ----------------------->| | 8,192 pages merged a bio. | at this time, first bio submit is done. | 1 bio is split to 32 read request and issue. |---------------> |---------------> |---------------> ...... |---------------> |--------------->| total 19ms elapsed to complete 32MB read done from device. | If bio max size is limited with 1MB, behavior is changed below. | bio merge for 1MB. 256 pages are merged for each bio. | total 32 bio will be made. | total elapsed time is over 2ms. it's same. | but, first bio submit timing is fast. about 100us. |--->|--->|--->|---> ... -->|--->|--->|--->|--->| | 256 pages merged a bio. | at this time, first bio submit is done. | and 1 read request is issued for 1 bio. |---------------> |---------------> |---------------> ...... |---------------> |--------------->| total 17ms elapsed to complete 32MB read done from device. | As a result, read request issue timing is faster if bio max size is limited. Current kernel behavior with multipage bvec, super large bio can be created. And it lead to delay first I/O request issue. Signed-off-by: Changheun Lee Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20210503095203.29076-1-nanich.lee@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 +++- include/linux/blkdev.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index a0b4cfdf62a4..f1a99f0a240c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -106,6 +106,8 @@ static inline void *bio_data(struct bio *bio) return NULL; } +extern unsigned int bio_max_size(struct bio *bio); + /** * bio_full - check if the bio is full * @bio: bio to check @@ -119,7 +121,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) if (bio->bi_vcnt >= bio->bi_max_vecs) return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) + if (bio->bi_iter.bi_size > bio_max_size(bio) - len) return true; return false; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365..40c7c4d87aa1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -327,6 +327,8 @@ enum blk_bounce { }; struct queue_limits { + unsigned int bio_max_bytes; + enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; -- cgit v1.2.3 From 48582b2e3b87b794a9845d488af2c76ce055502b Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Fri, 30 Apr 2021 11:21:54 -0400 Subject: reset: add missing empty function reset_control_rearm() All other functions are defined for when CONFIG_RESET_CONTROLLER is not set. Fixes: 557acb3d2cd9 ("reset: make shared pulsed reset controls re-triggerable") Link: https://lore.kernel.org/r/20210430152156.21162-2-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v5.11+ --- include/linux/reset.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/reset.h b/include/linux/reset.h index b9109efa2a5c..9700124affa3 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -47,6 +47,11 @@ static inline int reset_control_reset(struct reset_control *rstc) return 0; } +static inline int reset_control_rearm(struct reset_control *rstc) +{ + return 0; +} + static inline int reset_control_assert(struct reset_control *rstc) { return 0; -- cgit v1.2.3 From c7d13358b6a2f49f81a34aa323a2d0878a0532a2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 30 Apr 2021 14:00:13 +0200 Subject: netfilter: xt_SECMARK: add new revision to fix structure layout This extension breaks when trying to delete rules, add a new revision to fix this. Fixes: 5e6874cdb8de ("[SECMARK]: Add xtables SECMARK target") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_SECMARK.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_SECMARK.h b/include/uapi/linux/netfilter/xt_SECMARK.h index 1f2a708413f5..beb2cadba8a9 100644 --- a/include/uapi/linux/netfilter/xt_SECMARK.h +++ b/include/uapi/linux/netfilter/xt_SECMARK.h @@ -20,4 +20,10 @@ struct xt_secmark_target_info { char secctx[SECMARK_SECCTX_MAX]; }; +struct xt_secmark_target_info_v1 { + __u8 mode; + char secctx[SECMARK_SECCTX_MAX]; + __u32 secid; +}; + #endif /*_XT_SECMARK_H_target */ -- cgit v1.2.3 From 43016d02cf6e46edfc4696452251d34bba0c0435 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 May 2021 13:51:15 +0200 Subject: netfilter: arptables: use pernet ops struct during unregister Like with iptables and ebtables, hook unregistration has to use the pernet ops struct, not the template. This triggered following splat: hook not found, pf 3 num 0 WARNING: CPU: 0 PID: 224 at net/netfilter/core.c:480 __nf_unregister_net_hook+0x1eb/0x610 net/netfilter/core.c:480 [..] nf_unregister_net_hook net/netfilter/core.c:502 [inline] nf_unregister_net_hooks+0x117/0x160 net/netfilter/core.c:576 arpt_unregister_table_pre_exit+0x67/0x80 net/ipv4/netfilter/arp_tables.c:1565 Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Reported-by: syzbot+dcccba8a1e41a38cb9df@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 2aab9612f6ab..4f9a4b3c5892 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,8 +53,7 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops); +void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -- cgit v1.2.3 From d7bce85aa7b92b5de8f69b3bcedfe51d7b1aabe1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 4 May 2021 04:17:20 -0400 Subject: virtio_pci_modern: correct sparse tags for notify When switching virtio_pci_modern to use a helper for mappings we lost an __iomem tag. Restore it. Reported-by: kernel test robot Fixes: 9e3bb9b79a71 ("virtio_pci_modern: introduce helper to map vq notify area") Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_pci_modern.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index cdfabbefacdf..6a95b58fd0f4 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -101,8 +101,8 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); -void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, - u16 index, resource_size_t *pa); +void __iomem * vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index, resource_size_t *pa); int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); #endif -- cgit v1.2.3 From c61287bf17836b67e0b649343778bb4a659bd70d Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 4 May 2021 18:59:35 +0800 Subject: clk: sifive: Add pcie_aux clock in prci driver for PCIe driver We add pcie_aux clock in this patch so that pcie driver can use clk_prepare_enable() and clk_disable_unprepare() to enable and disable pcie_aux clock. Link: https://lore.kernel.org/r/20210504105940.100004-2-greentime.hu@sifive.com Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi Acked-by: Stephen Boyd --- include/dt-bindings/clock/sifive-fu740-prci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/sifive-fu740-prci.h b/include/dt-bindings/clock/sifive-fu740-prci.h index cd7706ea5677..7899b7fee7db 100644 --- a/include/dt-bindings/clock/sifive-fu740-prci.h +++ b/include/dt-bindings/clock/sifive-fu740-prci.h @@ -19,5 +19,6 @@ #define PRCI_CLK_CLTXPLL 5 #define PRCI_CLK_TLCLK 6 #define PRCI_CLK_PCLK 7 +#define PRCI_CLK_PCIE_AUX 8 #endif /* __DT_BINDINGS_CLOCK_SIFIVE_FU740_PRCI_H */ -- cgit v1.2.3 From 6e552494fb90acae005d74ce6a2ee102d965184b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 4 May 2021 08:54:29 -0700 Subject: iomap: remove unused private field from ioend The only remaining user of ->io_private is the generic ioend merging infrastructure. The only user of that is XFS, which no longer sets ->io_private or passes an associated merge callback. Remove the unused parameter and the ->io_private field. CC: linux-fsdevel@vger.kernel.org Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d202fd2d0f91..c87d0cb0de6d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -198,7 +198,6 @@ struct iomap_ioend { struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ - void *io_private; /* file system private data */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; @@ -234,9 +233,7 @@ struct iomap_writepage_ctx { void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, - struct list_head *more_ioends, - void (*merge_private)(struct iomap_ioend *ioend, - struct iomap_ioend *next)); + struct list_head *more_ioends); void iomap_sort_ioends(struct list_head *ioend_list); int iomap_writepage(struct page *page, struct writeback_control *wbc, struct iomap_writepage_ctx *wpc, -- cgit v1.2.3 From 98635b29a73f1a49ab6882ae58d56c9cd5ecb902 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Mar 2021 10:13:54 +0100 Subject: lib: bitmap: remove the 'extern' keyword from function declarations The 'extern' keyword doesn't have any benefits for functions in header files. Remove it. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- include/linux/bitmap.h | 115 ++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 70a932470b2d..6939a8983026 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -118,54 +118,53 @@ * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. */ -extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); -extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); -extern void bitmap_free(const unsigned long *bitmap); +unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); +unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); +void bitmap_free(const unsigned long *bitmap); /* * lib/bitmap.c provides these functions: */ -extern int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern bool __pure __bitmap_or_equal(const unsigned long *src1, - const unsigned long *src2, - const unsigned long *src3, - unsigned int nbits); -extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, - unsigned int nbits); -extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits); -extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits); -extern void bitmap_cut(unsigned long *dst, const unsigned long *src, - unsigned int first, unsigned int cut, - unsigned int nbits); -extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +int __bitmap_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +bool __pure __bitmap_or_equal(const unsigned long *src1, + const unsigned long *src2, + const unsigned long *src3, + unsigned int nbits); +void __bitmap_complement(unsigned long *dst, const unsigned long *src, + unsigned int nbits); +void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); +void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); +void bitmap_cut(unsigned long *dst, const unsigned long *src, + unsigned int first, unsigned int cut, unsigned int nbits); +int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_replace(unsigned long *dst, + const unsigned long *old, const unsigned long *new, + const unsigned long *mask, unsigned int nbits); +int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); -extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern void __bitmap_replace(unsigned long *dst, - const unsigned long *old, const unsigned long *new, - const unsigned long *mask, unsigned int nbits); -extern int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); -extern void __bitmap_set(unsigned long *map, unsigned int start, int len); -extern void __bitmap_clear(unsigned long *map, unsigned int start, int len); - -extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, - unsigned long size, - unsigned long start, - unsigned int nr, - unsigned long align_mask, - unsigned long align_offset); +int __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); +void __bitmap_set(unsigned long *map, unsigned int start, int len); +void __bitmap_clear(unsigned long *map, unsigned int start, int len); + +unsigned long bitmap_find_next_zero_area_off(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask, + unsigned long align_offset); /** * bitmap_find_next_zero_area - find a contiguous aligned zero area @@ -190,33 +189,33 @@ bitmap_find_next_zero_area(unsigned long *map, align_mask, 0); } -extern int bitmap_parse(const char *buf, unsigned int buflen, +int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits); -extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, +int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); -extern int bitmap_parselist(const char *buf, unsigned long *maskp, +int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits); -extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, +int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); -extern void bitmap_remap(unsigned long *dst, const unsigned long *src, +void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); -extern int bitmap_bitremap(int oldbit, +int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); -extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, +void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); -extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, +void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); -extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); -extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); -extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); #ifdef __BIG_ENDIAN -extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); +void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); #else #define bitmap_copy_le bitmap_copy #endif -extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); -extern int bitmap_print_to_pagebuf(bool list, char *buf, +unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); +int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) @@ -265,9 +264,9 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, * therefore conversion is not needed when copying data from/to arrays of u32. */ #if BITS_PER_LONG == 64 -extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); -extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, +void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ -- cgit v1.2.3 From c13656b904b6173aad723d9680a81c60de2f5edc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Mar 2021 10:13:55 +0100 Subject: lib: bitmap: order includes alphabetically For better readability and maintenance: order the includes in bitmap source files alphabetically. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- include/linux/bitmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 6939a8983026..3282db97e06c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -4,10 +4,10 @@ #ifndef __ASSEMBLY__ -#include #include -#include #include +#include +#include /* * bitmaps provide bit arrays that consume one or more unsigned -- cgit v1.2.3 From e829c2e4744850bab4d8f8ffebd00df10b4c6c2b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Mar 2021 10:13:56 +0100 Subject: lib: bitmap: provide devm_bitmap_alloc() and devm_bitmap_zalloc() Provide managed variants of bitmap_alloc() and bitmap_zalloc(). Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- include/linux/bitmap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3282db97e06c..73d039476fa4 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -9,6 +9,8 @@ #include #include +struct device; + /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed @@ -122,6 +124,12 @@ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); void bitmap_free(const unsigned long *bitmap); +/* Managed variants of the above. */ +unsigned long *devm_bitmap_alloc(struct device *dev, + unsigned int nbits, gfp_t flags); +unsigned long *devm_bitmap_zalloc(struct device *dev, + unsigned int nbits, gfp_t flags); + /* * lib/bitmap.c provides these functions: */ -- cgit v1.2.3 From 77b8aeb9da0490357f1f5a2b0d12125e6332c37a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 4 May 2021 09:52:02 -0600 Subject: vfio/pci: Revert nvlink removal uAPI breakage Revert the uAPI changes from the below commit with notice that these regions and capabilities are no longer provided. Fixes: b392a1989170 ("vfio/pci: remove vfio_pci_nvlink2") Reported-by: Greg Kurz Signed-off-by: Alex Williamson Reviewed-by: Cornelia Huck Reviewed-by: Greg Kurz Tested-by: Greg Kurz Reviewed-by: Christoph Hellwig Message-Id: <162014341432.3807030.11054087109120670135.stgit@omen> --- include/uapi/linux/vfio.h | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 34b1f53a3901..ef33ea002b0b 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -333,10 +333,21 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) /* 10de vendor PCI sub-types */ -/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */ +/* + * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. + * + * Deprecated, region no longer provided + */ +#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) /* 1014 vendor PCI sub-types */ -/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */ +/* + * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU + * to do TLB invalidation on a GPU. + * + * Deprecated, region no longer provided + */ +#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) @@ -630,9 +641,36 @@ struct vfio_device_migration_info { */ #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 -/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */ +/* + * Capability with compressed real address (aka SSA - small system address) + * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing + * and by the userspace to associate a NVLink bridge with a GPU. + * + * Deprecated, capability no longer provided + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 + +struct vfio_region_info_cap_nvlink2_ssatgt { + struct vfio_info_cap_header header; + __u64 tgt; +}; -/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */ +/* + * Capability with an NVLink link speed. The value is read by + * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" + * property in the device tree. The value is fixed in the hardware + * and failing to provide the correct value results in the link + * not working with no indication from the driver why. + * + * Deprecated, capability no longer provided + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 + +struct vfio_region_info_cap_nvlink2_lnkspd { + struct vfio_info_cap_header header; + __u32 link_speed; + __u32 __pad; +}; /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, -- cgit v1.2.3 From 7716506adac4664793a9d6d3dfa31ffddfa98714 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:45 -0700 Subject: mm: introduce and use mapping_empty() Patch series "Remove nrexceptional tracking", v2. We actually use nrexceptional for very little these days. It's a minor pain to keep in sync with nrpages, but the pain becomes much bigger with the THP patches because we don't know how many indices a shadow entry occupies. It's easier to just remove it than keep it accurate. Also, we save 8 bytes per inode which is nothing to sneeze at; on my laptop, it would improve shmem_inode_cache from 22 to 23 objects per 16kB, and inode_cache from 26 to 27 objects. Combined, that saves a megabyte of memory from a combined usage of 25MB for both caches. Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save any memory for ext4. This patch (of 4): Instead of checking the two counters (nrpages and nrexceptional), we can just check whether i_pages is empty. Link: https://lkml.kernel.org/r/20201026151849.24232-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201026151849.24232-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 469fa7ffcf96..a4bd41128bf3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,11 @@ struct pagevec; +static inline bool mapping_empty(struct address_space *mapping) +{ + return xa_empty(&mapping->i_pages); +} + /* * Bits in mapping->flags. */ -- cgit v1.2.3 From 8bc3c481b3d0dcef2cf8e1b7c6b780af6725f7e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:54 -0700 Subject: mm: remove nrexceptional from inode We no longer track anything in nrexceptional, so remove it, saving 8 bytes per inode. Link: https://lkml.kernel.org/r/20201026151849.24232-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 12766edee81f..acef282b97c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. - * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). @@ -463,7 +462,6 @@ struct address_space { struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; - unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; -- cgit v1.2.3 From aec44e0f0213e36d4f0868a80cdc5097a510f79d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:00 -0700 Subject: hugetlb: pass vma into huge_pte_alloc() and huge_pmd_share() Patch series "hugetlb: Disable huge pmd unshare for uffd-wp", v4. This series tries to disable huge pmd unshare of hugetlbfs backed memory for uffd-wp. Although uffd-wp of hugetlbfs is still during rfc stage, the idea of this series may be needed for multiple tasks (Axel's uffd minor fault series, and Mike's soft dirty series), so I picked it out from the larger series. This patch (of 4): It is a preparation work to be able to behave differently in the per architecture huge_pte_alloc() according to different VMA attributes. Pass it deeper into huge_pmd_share() so that we can avoid the find_vma() call. [peterx@redhat.com: build fix] Link: https://lkml.kernel.org/r/20210304164653.GB397383@xz-x1Link: https://lkml.kernel.org/r/20210218230633.15028-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218230633.15028-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..653ef322fac9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud); struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); @@ -161,7 +162,7 @@ extern struct list_head huge_boot_pages; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); -- cgit v1.2.3 From c1991e0705d143be773c984b006f2078aa9f2853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:04 -0700 Subject: hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled Huge pmd sharing could bring problem to userfaultfd. The thing is that userfaultfd is running its logic based on the special bits on page table entries, however the huge pmd sharing could potentially share page table entries for different address ranges. That could cause issues on either: - When sharing huge pmd page tables for an uffd write protected range, the newly mapped huge pmd range will also be write protected unexpectedly, or, - When we try to write protect a range of huge pmd shared range, we'll first do huge_pmd_unshare() in hugetlb_change_protection(), however that also means the UFFDIO_WRITEPROTECT could be silently skipped for the shared region, which could lead to data loss. While at it, a few other things are done altogether: - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because that's definitely something that arch code would like to use too - ARM64 currently directly check against CONFIG_ARCH_WANT_HUGE_PMD_SHARE when trying to share huge pmd. Switch to the want_pmd_share() helper. - Move vma_shareable() from huge_pmd_share() into want_pmd_share(). [peterx@redhat.com: fix build with !ARCH_WANT_HUGE_PMD_SHARE] Link: https://lkml.kernel.org/r/20210310185359.88297-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218231202.15426-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Tested-by: Naresh Kamboju Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 ++ include/linux/userfaultfd_k.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 653ef322fac9..88e93809a455 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1040,4 +1040,6 @@ static inline __init void hugetlb_cma_check(void) } #endif +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a8e5f3ea9bb2..c63ccdae3eab 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,15 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } +/* + * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp + * protect information is per pgtable entry. + */ +static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_WP; +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; -- cgit v1.2.3 From 537cf30bba241ae88d5f4b0b6a5e66271b394852 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:08 -0700 Subject: mm/hugetlb: move flush_hugetlb_tlb_range() into hugetlb.h Prepare for it to be called outside of mm/hugetlb.c. Link: https://lkml.kernel.org/r/20210218231204.15474-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 88e93809a455..e43668144664 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1042,4 +1042,12 @@ static inline __init void hugetlb_cma_check(void) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); +#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +/* + * ARCHes with special requirements for evicting HUGETLB backing TLB entries can + * implement this. + */ +#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + #endif /* _LINUX_HUGETLB_H */ -- cgit v1.2.3 From 6dfeaff93be1a4cab4fb48dad7df326d05059a99 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:13 -0700 Subject: hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because userfaultfd-wp is always based on pgtable entries, so they cannot be shared. Walk the hugetlb range and unshare all such mappings if there is, right before UFFDIO_REGISTER will succeed and return to userspace. This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing is completely disabled for userfaultfd-wp registered range. Link: https://lkml.kernel.org/r/20210218231206.15524-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e43668144664..0f5813522224 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -369,6 +370,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, return 0; } +static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support -- cgit v1.2.3 From d4afd60c24f87b6275b12ec3d67d8c2ad78cb075 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:05 -0700 Subject: mm/huge_memory.c: remove unused macro TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG Commit 4958e4d86ecb ("mm: thp: remove debug_cow switch") forgot to remove TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG macro. Remove it here. Link: https://lkml.kernel.org/r/20210318122722.13135-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ba973efcd369..9626fda5efce 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -87,9 +87,6 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, -#ifdef CONFIG_DEBUG_VM - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, -#endif }; struct kobject; -- cgit v1.2.3 From 2938396771c8fd0870b5284319f9e78b4b552a79 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:52 -0700 Subject: hugetlb: add per-hstate mutex to synchronize user adjustments The helper routine hstate_next_node_to_alloc accesses and modifies the hstate variable next_nid_to_alloc. The helper is used by the routines alloc_pool_huge_page and adjust_pool_surplus. adjust_pool_surplus is called with hugetlb_lock held. However, alloc_pool_huge_page can not be called with the hugetlb lock held as it will call the page allocator. Two instances of alloc_pool_huge_page could be run in parallel or alloc_pool_huge_page could run in parallel with adjust_pool_surplus which may result in the variable next_nid_to_alloc becoming invalid for the caller and pages being allocated on the wrong node. Both alloc_pool_huge_page and adjust_pool_surplus are only called from the routine set_max_huge_pages after boot. set_max_huge_pages is only called as the reusult of a user writing to the proc/sysfs nr_hugepages, or nr_hugepages_mempolicy file to adjust the number of hugetlb pages. It makes little sense to allow multiple adjustment to the number of hugetlb pages in parallel. Add a mutex to the hstate and use it to only allow one hugetlb page adjustment at a time. This will synchronize modifications to the next_nid_to_alloc variable. Link: https://lkml.kernel.org/r/20210409205254.242291-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0f5813522224..628639422c5d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -559,6 +559,7 @@ HPAGEFLAG(Freed, freed) #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { + struct mutex resize_lock; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; -- cgit v1.2.3 From 369fa227c21949b22fd7374506c4992a0d7bb580 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:26 -0700 Subject: mm: make alloc_contig_range handle free hugetlb pages alloc_contig_range will fail if it ever sees a HugeTLB page within the range we are trying to allocate, even when that page is free and can be easily reallocated. This has proved to be problematic for some users of alloc_contic_range, e.g: CMA and virtio-mem, where those would fail the call even when those pages lay in ZONE_MOVABLE and are free. We can do better by trying to replace such page. Free hugepages are tricky to handle so as to no userspace application notices disruption, we need to replace the current free hugepage with a new one. In order to do that, a new function called alloc_and_dissolve_huge_page is introduced. This function will first try to get a new fresh hugepage, and if it succeeds, it will replace the old one in the free hugepage pool. The free page replacement is done under hugetlb_lock, so no external users of hugetlb will notice the change. To allocate the new huge page, we use alloc_buddy_huge_page(), so we do not have to deal with any counters, and prep_new_huge_page() is not called. This is valulable because in case we need to free the new page, we only need to call __free_pages(). Once we know that the page to be replaced is a genuine 0-refcounted huge page, we remove the old page from the freelist by remove_hugetlb_page(). Then, we can call __prep_new_huge_page() and __prep_account_new_huge_page() for the new huge page to properly initialize it and increment the hstate->nr_huge_pages counter (previously decremented by remove_hugetlb_page()). Once done, the page is enqueued by enqueue_huge_page() and it is ready to be used. There is one tricky case when page's refcount is 0 because it is in the process of being released. A missing PageHugeFreed bit will tell us that freeing is in flight so we retry after dropping the hugetlb_lock. The race window should be small and the next retry should make a forward progress. E.g: CPU0 CPU1 free_huge_page() isolate_or_dissolve_huge_page PageHuge() == T alloc_and_dissolve_huge_page alloc_buddy_huge_page() spin_lock_irq(hugetlb_lock) // PageHuge() && !PageHugeFreed && // !PageCount() spin_unlock_irq(hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) update_and_free_page PageHuge() == F __free_pages() 2) enqueue_huge_page SetPageHugeFreed() spin_unlock_irq(&hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) PageHuge() == F (freed by case#1 from CPU0) 2) PageHuge() == T PageHugeFreed() == T - proceed with replacing the page In the case above we retry as the window race is quite small and we have high chances to succeed next time. With regard to the allocation, we restrict it to the node the page belongs to with __GFP_THISNODE, meaning we do not fallback on other node's zones. Note that gigantic hugetlb pages are fenced off since there is a cyclic dependency between them and alloc_contig_range. Link: https://lkml.kernel.org/r/20210419075413.1064-6-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 628639422c5d..ec6a10b8860a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,6 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; +int isolate_or_dissolve_huge_page(struct page *page); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -870,6 +871,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline int isolate_or_dissolve_huge_page(struct page *page) +{ + return -ENOMEM; +} + static inline struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -- cgit v1.2.3 From ae37c7ff79f1f030e28ec76c46ee032f8fd07607 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:29 -0700 Subject: mm: make alloc_contig_range handle in-use hugetlb pages alloc_contig_range() will fail if it finds a HugeTLB page within the range, without a chance to handle them. Since HugeTLB pages can be migrated as any LRU or Movable page, it does not make sense to bail out without trying. Enable the interface to recognize in-use HugeTLB pages so we can migrate them, and have much better chances to succeed the call. Link: https://lkml.kernel.org/r/20210419075413.1064-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec6a10b8860a..d0f310ae3f82 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,7 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; -int isolate_or_dissolve_huge_page(struct page *page); +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -871,7 +871,8 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -static inline int isolate_or_dissolve_huge_page(struct page *page) +static inline int isolate_or_dissolve_huge_page(struct page *page, + struct list_head *list) { return -ENOMEM; } -- cgit v1.2.3 From 7677f7fd8be76659cd2d0db8ff4093bbb51c20e5 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:36 -0700 Subject: userfaultfd: add minor fault registration mode Patch series "userfaultfd: add minor fault handling", v9. Overview ======== This series adds a new userfaultfd feature, UFFD_FEATURE_MINOR_HUGETLBFS. When enabled (via the UFFDIO_API ioctl), this feature means that any hugetlbfs VMAs registered with UFFDIO_REGISTER_MODE_MISSING will *also* get events for "minor" faults. By "minor" fault, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s) (shared memory). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. We also add a new ioctl to resolve such faults: UFFDIO_CONTINUE. The idea is, userspace resolves the fault by either a) doing nothing if the contents are already correct, or b) updating the underlying contents using the second, non-UFFD mapping (via memcpy/memset or similar, or something fancier like RDMA, or etc...). In either case, userspace issues UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Use Case ======== Consider the use case of VM live migration (e.g. under QEMU/KVM): 1. While a VM is still running, we copy the contents of its memory to a target machine. The pages are populated on the target by writing to the non-UFFD mapping, using the setup described above. The VM is still running (and therefore its memory is likely changing), so this may be repeated several times, until we decide the target is "up to date enough". 2. We pause the VM on the source, and start executing on the target machine. During this gap, the VM's user(s) will *see* a pause, so it is desirable to minimize this window. 3. Between the last time any page was copied from the source to the target, and when the VM was paused, the contents of that page may have changed - and therefore the copy we have on the target machine is out of date. Although we can keep track of which pages are out of date, for VMs with large amounts of memory, it is "slow" to transfer this information to the target machine. We want to resume execution before such a transfer would complete. 4. So, the guest begins executing on the target machine. The first time it touches its memory (via the UFFD-registered mapping), userspace wants to intercept this fault. Userspace checks whether or not the page is up to date, and if not, copies the updated page from the source machine, via the non-UFFD mapping. Finally, whether a copy was performed or not, userspace issues a UFFDIO_CONTINUE ioctl to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". We don't have to do all of the final updates on-demand. The userfaultfd manager can, in the background, also copy over updated pages once it receives the map of which pages are up-to-date or not. Interaction with Existing APIs ============================== Because this is a feature, a registered VMA could potentially receive both missing and minor faults. I spent some time thinking through how the existing API interacts with the new feature: UFFDIO_CONTINUE cannot be used to resolve non-minor faults, as it does not allocate a new page. If UFFDIO_CONTINUE is used on a non-minor fault: - For non-shared memory or shmem, -EINVAL is returned. - For hugetlb, -EFAULT is returned. UFFDIO_COPY and UFFDIO_ZEROPAGE cannot be used to resolve minor faults. Without modifications, the existing codepath assumes a new page needs to be allocated. This is okay, since userspace must have a second non-UFFD-registered mapping anyway, thus there isn't much reason to want to use these in any case (just memcpy or memset or similar). - If UFFDIO_COPY is used on a minor fault, -EEXIST is returned. - If UFFDIO_ZEROPAGE is used on a minor fault, -EEXIST is returned (or -EINVAL in the case of hugetlb, as UFFDIO_ZEROPAGE is unsupported in any case). - UFFDIO_WRITEPROTECT simply doesn't work with shared memory, and returns -ENOENT in that case (regardless of the kind of fault). Future Work =========== This series only supports hugetlbfs. I have a second series in flight to support shmem as well, extending the functionality. This series is more mature than the shmem support at this point, and the functionality works fully on hugetlbfs, so this series can be merged first and then shmem support will follow. This patch (of 6): This feature allows userspace to intercept "minor" faults. By "minor" faults, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. This commit adds the new registration mode, and sets the relevant flag on the VMAs being registered. In the hugetlb fault path, if we find that we have huge_pte_none(), but find_lock_page() does indeed find an existing page, then we have a "minor" fault, and if the VMA has the userfaultfd registration flag, we call into userfaultfd to handle it. This is implemented as a new registration mode, instead of an API feature. This is because the alternative implementation has significant drawbacks [1]. However, doing it this was requires we allocate a VM_* flag for the new registration mode. On 32-bit systems, there are no unused bits, so this feature is only supported on architectures with CONFIG_ARCH_USES_HIGH_VMA_FLAGS. When attempting to register a VMA in MINOR mode on 32-bit architectures, we return -EINVAL. [1] https://lore.kernel.org/patchwork/patch/1380226/ [peterx@redhat.com: fix minor fault page leak] Link: https://lkml.kernel.org/r/20210322175132.36659-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210301222728.176417-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20210301222728.176417-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Chinwen Chang Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Xu Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Rostedt Cc: Steven Price Cc: Vlastimil Babka Cc: Adam Ruprecht Cc: Axel Rasmussen Cc: Cannon Matthews Cc: "Dr . David Alan Gilbert" Cc: David Rientjes Cc: Mina Almasry Cc: Oliver Upton Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ include/linux/userfaultfd_k.h | 15 ++++++++++++++- include/trace/events/mmflags.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 15 +++++++++++++-- 4 files changed, 41 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 011f43605807..1dbb53c44243 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c63ccdae3eab..0390e5ac63b3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -17,6 +17,9 @@ #include #include +/* The set of all possible UFFD-related VM flags. */ +#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want @@ -71,6 +74,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_WP; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_MINOR; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { @@ -85,7 +93,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, static inline bool userfaultfd_armed(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); + return vma->vm_flags & __VM_UFFD_FLAGS; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); @@ -132,6 +140,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return false; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..629c7a0eaff2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) #define IF_HAVE_VM_SOFTDIRTY(flag,name) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name}, +#else +# define IF_HAVE_UFFD_MINOR(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) {VM_MAYSHARE, "mayshare" }, \ {VM_GROWSDOWN, "growsdown" }, \ {VM_UFFD_MISSING, "uffd_missing" }, \ +IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 5f2d88212f7c..f24dd4fcbad9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,15 +19,19 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ - UFFD_FEATURE_THREAD_ID) + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -127,6 +131,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -171,6 +176,10 @@ struct uffdio_api { * * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -181,6 +190,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) __u64 features; __u64 ioctls; @@ -195,6 +205,7 @@ struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* -- cgit v1.2.3 From 0d9cadabd193c6008d256533f544de8206fd3a80 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:40 -0700 Subject: userfaultfd: disable huge PMD sharing for MINOR registered VMAs As the comment says: for the MINOR fault use case, although the page might be present and populated in the other (non-UFFD-registered) half of the mapping, it may be out of date, and we explicitly want userspace to get a minor fault so it can check and potentially update the page's contents. Huge PMD sharing would prevent these faults from occurring for suitably aligned areas, so disable it upon UFFD registration. Link: https://lkml.kernel.org/r/20210301222728.176417-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0390e5ac63b3..e060d5f77cc5 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -56,12 +56,19 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, } /* - * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp - * protect information is per pgtable entry. + * Never enable huge pmd sharing on some uffd registered vmas: + * + * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. + * + * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for + * VMAs which share huge pmds. (If you have two mappings to the same + * underlying pages, and fault in the non-UFFD-registered one with a write, + * with huge pmd sharing this would *also* setup the second UFFD-registered + * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { - return vma->vm_flags & VM_UFFD_WP; + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) -- cgit v1.2.3 From 714c189108244f1df579689061db1d785d92e7e2 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:45 -0700 Subject: userfaultfd: hugetlbfs: only compile UFFD helpers if config enabled For background, mm/userfaultfd.c provides a general mcopy_atomic implementation. But some types of memory (i.e., hugetlb and shmem) need a slightly different implementation, so they provide their own helpers for this. In other words, userfaultfd is the only caller of these functions. This patch achieves two things: 1. Don't spend time compiling code which will end up never being referenced anyway (a small build time optimization). 2. In patches later in this series, we extend the signature of these helpers with UFFD-specific state (a mode enumeration). Once this happens, we *have to* either not compile the helpers, or unconditionally define the UFFD-only state (which seems messier to me). This includes the declarations in the headers, as otherwise they'd yield warnings about implicitly defining the type of those arguments. Link: https://lkml.kernel.org/r/20210301222728.176417-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Mike Kravetz Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d0f310ae3f82..a1dbe4568707 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -134,11 +134,13 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +#ifdef CONFIG_USERFAULTFD int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); +#endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -310,6 +312,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, BUG(); } +#ifdef CONFIG_USERFAULTFD static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, @@ -320,6 +323,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, BUG(); return 0; } +#endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) -- cgit v1.2.3 From f619147104c8ea71e120e4936d2b68ec11a1e527 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:49 -0700 Subject: userfaultfd: add UFFDIO_CONTINUE ioctl This ioctl is how userspace ought to resolve "minor" userfaults. The idea is, userspace is notified that a minor fault has occurred. It might change the contents of the page using its second non-UFFD mapping, or not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in the minor fault case, we already have some pre-existing underlying page. Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping. We'd just use memcpy() or similar instead. It turns out hugetlb_mcopy_atomic_pte() already does very close to what we want, if an existing page is provided via `struct page **pagep`. We already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so just extend that design: add an enum for the three modes of operation, and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE case. (Basically, look up the existing page, and avoid adding the existing page to the page cache or calling set_page_huge_active() on it.) Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ include/linux/userfaultfd_k.h | 18 ++++++++++++++++++ include/uapi/linux/userfaultfd.h | 21 +++++++++++++++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a1dbe4568707..b92f25ccef58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, @@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { BUG(); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e060d5f77cc5..794d1538b8ba 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* + * The mode of operation for __mcopy_atomic and its helpers. + * + * This is almost an implementation detail (mcopy_atomic below doesn't take this + * as a parameter), but it's exposed here because memory-kind-specific + * implementations (e.g. hugetlbfs) need to know the mode of operation. + */ +enum mcopy_atomic_mode { + /* A normal copy_from_user into the destination range. */ + MCOPY_ATOMIC_NORMAL, + /* Don't copy; map the destination range to the zero page. */ + MCOPY_ATOMIC_ZEROPAGE, + /* Just install pte(s) with the existing page(s) in the page cache. */ + MCOPY_ATOMIC_CONTINUE, +}; + extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode); @@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, bool *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, bool *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index f24dd4fcbad9..bafbeb1a2624 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,10 +40,12 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ - (__u64)1 << _UFFDIO_WRITEPROTECT) + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY) + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_CONTINUE) /* * Valid ioctl command number range with this API is from 0x00 to @@ -59,6 +61,7 @@ #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -77,6 +80,8 @@ struct uffdio_zeropage) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) /* read() structure */ struct uffd_msg { @@ -268,6 +273,18 @@ struct uffdio_writeprotect { __u64 mode; }; +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + /* * Flags for the userfaultfd(2) system call itself. */ -- cgit v1.2.3 From b6676de8d7b48724d4cd3a3742c62fa525baa904 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:01 -0700 Subject: mm/vmscan: move RECLAIM* bits to uapi header It is currently not obvious that the RECLAIM_* bits are part of the uapi since they are defined in vmscan.c. Move them to a uapi header to make it obvious. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172557.08074910@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Alex Shi Cc: Daniel Wagner Cc: "Tobin C. Harding" Cc: Christoph Lameter Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8948467b3992..4832fd0b5642 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -64,5 +64,12 @@ enum { #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ -- cgit v1.2.3 From 202e35db5e719ee8af6028183403f475e243f82d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:04 -0700 Subject: mm/vmscan: replace implicit RECLAIM_ZONE checks with explicit checks RECLAIM_ZONE was assumed to be unused because it was never explicitly used in the kernel. However, there were a number of places where it was checked implicitly by checking 'node_reclaim_mode' for a zero value. These zero checks are not great because it is not obvious what a zero mode *means* in the code. Replace them with a helper which makes it more obvious: node_reclaim_enabled(). This helper also provides a handy place to explicitly check the RECLAIM_ZONE bit itself. Check it explicitly there to make it more obvious where the bit can affect behavior. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172559.BF589C44@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Alex Shi Cc: "Tobin C. Harding" Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Cc: Daniel Wagner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4cc6ec3bf0ab..42191da1bdc9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -12,6 +12,7 @@ #include #include #include +#include #include struct notifier_block; @@ -378,6 +379,12 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + extern void check_move_unevictable_pages(struct pagevec *pvec); extern int kswapd_run(int nid); -- cgit v1.2.3 From 2bfd36374edd9ed7f2ebf66cacebedf7273901cb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:11 -0700 Subject: mm: vmscan: consolidate shrinker_maps handling code The shrinker map management is not purely memcg specific, it is at the intersection between memory cgroup and shrinkers. It's allocation and assignment of a structure, and the only memcg bit is the map is being stored in a memcg structure. So move the shrinker_maps handling code into vmscan.c for tighter integration with shrinker code, and remove the "memcg_" prefix. There is no functional change. Link: https://lkml.kernel.org/r/20210311190845.9708-3-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5904716f29ba..7dbd2c9bad32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1610,10 +1610,9 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); +int alloc_shrinker_maps(struct mem_cgroup *memcg); +void free_shrinker_maps(struct mem_cgroup *memcg); +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1623,8 +1622,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) +static inline void set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif -- cgit v1.2.3 From e4262c4f51d6373447c9d89093f49ff6b1e607be Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:23 -0700 Subject: mm: memcontrol: rename shrinker_map to shrinker_info The following patch is going to add nr_deferred into shrinker_map, the change will make shrinker_map not only include map anymore, so rename it to "memcg_shrinker_info". And this should make the patch adding nr_deferred cleaner and readable and make review easier. Also remove the "memcg_" prefix. Link: https://lkml.kernel.org/r/20210311190845.9708-7-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7dbd2c9bad32..6cd800fe9a67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -117,7 +117,7 @@ struct batched_lruvec_stat { * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. */ -struct memcg_shrinker_map { +struct shrinker_info { struct rcu_head rcu; unsigned long map[]; }; @@ -145,7 +145,7 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter; - struct memcg_shrinker_map __rcu *shrinker_map; + struct shrinker_info __rcu *shrinker_info; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ @@ -1610,8 +1610,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -int alloc_shrinker_maps(struct mem_cgroup *memcg); -void free_shrinker_maps(struct mem_cgroup *memcg); +int alloc_shrinker_info(struct mem_cgroup *memcg); +void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 -- cgit v1.2.3 From 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:29 -0700 Subject: mm: vmscan: use a new flag to indicate shrinker is registered Currently registered shrinker is indicated by non-NULL shrinker->nr_deferred. This approach is fine with nr_deferred at the shrinker level, but the following patches will move MEMCG_AWARE shrinkers' nr_deferred to memcg level, so their shrinker->nr_deferred would always be NULL. This would prevent the shrinkers from unregistering correctly. Remove SHRINKER_REGISTERING since we could check if shrinker is registered successfully by the new flag. Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Kirill Tkhai Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e2..1eac79ce57d4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); -- cgit v1.2.3 From 3c6f17e6c5d048c8029578c475dd037dd5db58af Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:33 -0700 Subject: mm: vmscan: add per memcg shrinker nr_deferred Currently the number of deferred objects are per shrinker, but some slabs, for example, vfs inode/dentry cache are per memcg, this would result in poor isolation among memcgs. The deferred objects typically are generated by __GFP_NOFS allocations, one memcg with excessive __GFP_NOFS allocations may blow up deferred objects, then other innocent memcgs may suffer from over shrink, excessive reclaim latency, etc. For example, two workloads run in memcgA and memcgB respectively, workload in B is vfs heavy workload. Workload in A generates excessive deferred objects, then B's vfs cache might be hit heavily (drop half of caches) by B's limit reclaim or global reclaim. We observed this hit in our production environment which was running vfs heavy workload shown as the below tracing log: <...>-409454 [016] .... 28286961.747146: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 objects to shrink 3641681686040 gfp_flags GFP_HIGHUSER_MOVABLE|__GFP_ZERO pgs_scanned 1 lru_pgs 15721 cache items 246404277 delta 31345 total_scan 123202138 <...>-409454 [022] .... 28287105.928018: mm_shrink_slab_end: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 unused scan count 3641681686040 new scan count 3641798379189 total_scan 602 last shrinker return val 123186855 The vfs cache and page cache ratio was 10:1 on this machine, and half of caches were dropped. This also resulted in significant amount of page caches were dropped due to inodes eviction. Make nr_deferred per memcg for memcg aware shrinkers would solve the unfairness and bring better isolation. The following patch will add nr_deferred to parent memcg when memcg offline. To preserve nr_deferred when reparenting memcgs to root, root memcg needs shrinker_info allocated too. When memcg is not enabled (!CONFIG_MEMCG or memcg disabled), the shrinker's nr_deferred would be used. And non memcg aware shrinkers use shrinker's nr_deferred all the time. Link: https://lkml.kernel.org/r/20210311190845.9708-10-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6cd800fe9a67..32bd62047238 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -114,12 +114,13 @@ struct batched_lruvec_stat { }; /* - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, - * which have elements charged to this memcg. + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to this memcg. */ struct shrinker_info { struct rcu_head rcu; - unsigned long map[]; + atomic_long_t *nr_deferred; + unsigned long *map; }; /* -- cgit v1.2.3 From a178015cde69981cdcd8f109c5abc98703fead62 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:42 -0700 Subject: mm: memcontrol: reparent nr_deferred when memcg offline Now shrinker's nr_deferred is per memcg for memcg aware shrinkers, add to parent's corresponding nr_deferred when memcg offline. Link: https://lkml.kernel.org/r/20210311190845.9708-13-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 32bd62047238..c193be760709 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1614,6 +1614,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); +void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; -- cgit v1.2.3 From ef4984384172e93cc95e0e8cd102536d67e8a787 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Tue, 4 May 2021 18:36:48 -0700 Subject: mm/compaction: remove unused variable sysctl_compact_memory The sysctl_compact_memory is mostly unused in mm/compaction.c It just acts as a place holder for sysctl to store .data. But the .data itself is not needed here. So we can get ride of this variable completely and make .data as NULL. This will also eliminate the extern declaration from header file. No functionality is broken or changed this way. Link: https://lkml.kernel.org/r/1614852224-14671-1-git-send-email-pintu@codeaurora.org Signed-off-by: Pintu Kumar Signed-off-by: Pintu Agarwal Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ed4070ed41ef..4221888bdcd6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order) } #ifdef CONFIG_COMPACTION -extern int sysctl_compact_memory; extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); -- cgit v1.2.3 From d479960e44f27e0e52ba31b21740b703c538027c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:54 -0700 Subject: mm: disable LRU pagevec during the migration temporarily LRU pagevec holds refcount of pages until the pagevec are drained. It could prevent migration since the refcount of the page is greater than the expection in migration logic. To mitigate the issue, callers of migrate_pages drains LRU pagevec via migrate_prep or lru_add_drain_all before migrate_pages call. However, it's not enough because pages coming into pagevec after the draining call still could stay at the pagevec so it could keep preventing page migration. Since some callers of migrate_pages have retrial logic with LRU draining, the page would migrate at next trail but it is still fragile in that it doesn't close the fundamental race between upcoming LRU pages into pagvec and migration so the migration failure could cause contiguous memory allocation failure in the end. To close the race, this patch disables lru caches(i.e, pagevec) during ongoing migration until migrate is done. Since it's really hard to reproduce, I measured how many times migrate_pages retried with force mode(it is about a fallback to a sync migration) with below debug code. int migrate_pages(struct list_head *from, new_page_t get_new_page, .. .. if (rc && reason == MR_CONTIG_RANGE && pass > 2) { printk(KERN_ERR, "pfn 0x%lx reason %d", page_to_pfn(page), rc); dump_page(page, "fail to migrate"); } The test was repeating android apps launching with cma allocation in background every five seconds. Total cma allocation count was about 500 during the testing. With this patch, the dump_page count was reduced from 400 to 30. The new interface is also useful for memory hotplug which currently drains lru pcp caches after each migration failure. This is rather suboptimal as it has to disrupt others running during the operation. With the new interface the operation happens only once. This is also in line with pcp allocator cache which are disabled for the offlining as well. Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Chris Goldsworthy Acked-by: Michal Hocko Cc: John Dias Cc: Suren Baghdasaryan Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Oliver Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ include/linux/swap.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3a389633b68f..9e4a2dc8622c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -46,6 +46,7 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern void migrate_prep(void); +extern void migrate_finish(void); extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); @@ -67,6 +68,7 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } +static inline int migrate_finish(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } static inline void migrate_page_states(struct page *newpage, struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42191da1bdc9..f69e0f67651d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void mark_page_accessed(struct page *); + +extern atomic_t lru_disable_count; + +static inline bool lru_cache_disabled(void) +{ + return atomic_read(&lru_disable_count); +} + +static inline void lru_cache_enable(void) +{ + atomic_dec(&lru_disable_count); +} + +extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); -- cgit v1.2.3 From 361a2a229fa31ab7f2b236b5946e434964d00762 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:57 -0700 Subject: mm: replace migrate_[prep|finish] with lru_cache_[disable|enable] Currently, migrate_[prep|finish] is merely a wrapper of lru_cache_[disable|enable]. There is not much to gain from having additional abstraction. Use lru_cache_[disable|enable] instead of migrate_[prep|finish], which would be more descriptive. note: migrate_prep_local in compaction.c changed into lru_add_drain to avoid CPU schedule cost with involving many other CPUs to keep old behavior. Link: https://lkml.kernel.org/r/20210319175127.886124-2-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Chris Goldsworthy Cc: John Dias Cc: Matthew Wilcox Cc: Oliver Sang Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9e4a2dc8622c..6155d97ec76c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -45,9 +45,6 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); -extern void migrate_prep(void); -extern void migrate_finish(void); -extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -67,10 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page, static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } -static inline int migrate_prep(void) { return -ENOSYS; } -static inline int migrate_finish(void) { return -ENOSYS; } -static inline int migrate_prep_local(void) { return -ENOSYS; } - static inline void migrate_page_states(struct page *newpage, struct page *page) { } -- cgit v1.2.3 From 8cc621d2f45ddd3dc664024a647ee7adf48d79a5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:00 -0700 Subject: mm: fs: invalidate BH LRU during page migration Pages containing buffer_heads that are in one of the per-CPU buffer_head LRU caches will be pinned and thus cannot be migrated. This can prevent CMA allocations from succeeding, which are often used on platforms with co-processors (such as a DSP) that can only use physically contiguous memory. It can also prevent memory hot-unplugging from succeeding, which involves migrating at least MIN_MEMORY_BLOCK_SIZE bytes of memory, which ranges from 8 MiB to 1 GiB based on the architecture in use. Correspondingly, invalidate the BH LRU caches before a migration starts and stop any buffer_head from being cached in the LRU caches, until migration has finished. Link: https://lkml.kernel.org/r/20210319175127.886124-3-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: Chris Goldsworthy Reported-by: Laura Abbott Tested-by: Oliver Sang Cc: David Hildenbrand Cc: John Dias Cc: Matthew Wilcox Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6b47f94378c5..e7e99da31349 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(int cpu); +bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 606a6f71a25accfc960a5063c23717ff07aa43a3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:04 -0700 Subject: mm/migrate.c: make putback_movable_page() static Patch series "Cleanup and fixup for mm/migrate.c", v3. This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE and rc != MIGRATEPAGE_SUCCESS check. Also use helper function to remove some duplicated codes. What's more, this fixes potential deadlock in NUMA balancing shared exec THP case and so on. More details can be found in the respective changelogs. This patch (of 5): The putback_movable_page() is just called by putback_movable_pages() and we know the page is locked and both PageMovable() and PageIsolated() is checked right before calling putback_movable_page(). So we make it static and remove all the 3 VM_BUG_ON_PAGE(). Link: https://lkml.kernel.org/r/20210325131524.48181-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210325131524.48181-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Jerome Glisse Cc: Rafael Aquini Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6155d97ec76c..175ef15ae9e8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -43,7 +43,6 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -extern void putback_movable_page(struct page *page); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); -- cgit v1.2.3 From bbb269206f3c914d4f23e023de4ec020abea6d1b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:19 -0700 Subject: mm: vmstat: add cma statistics Since CMA is used more widely, it's worth to have CMA allocation statistics into vmstat. With it, we could know how agressively system uses cma allocation and how often it fails. Link: https://lkml.kernel.org/r/20210302183346.3707237-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: John Hubbard Cc: John Dias Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..21d7c7f72f1c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, +#endif +#ifdef CONFIG_CMA + CMA_ALLOC_SUCCESS, + CMA_ALLOC_FAIL, #endif UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ -- cgit v1.2.3 From 7bc1aec5e28765ad18742824b3b972471807a632 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 4 May 2021 18:37:25 -0700 Subject: mm: cma: add trace events for CMA alloc perf testing Add cma and migrate trace events to enable CMA allocation performance to be measured via ftrace. [georgi.djakov@linaro.org: add the CMA instance name to the cma_alloc_start trace event] Link: https://lkml.kernel.org/r/20210326155414.25006-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210324160740.15901-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 42 +++++++++++++++++++++++++++++++++++++++++- include/trace/events/migrate.h | 22 ++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5017a8829270..be1525a10457 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -8,7 +8,7 @@ #include #include -TRACE_EVENT(cma_alloc, +DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), @@ -61,6 +61,46 @@ TRACE_EVENT(cma_release, __entry->count) ); +TRACE_EVENT(cma_alloc_start, + + TP_PROTO(const char *name, unsigned int count, unsigned int align), + + TP_ARGS(name, count, align), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned int, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->count = count; + __entry->align = align; + ), + + TP_printk("name=%s count=%u align=%u", + __get_str(name), + __entry->count, + __entry->align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4d434398d64d..f2c990603888 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -81,6 +81,28 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); + +TRACE_EVENT(mm_migrate_pages_start, + + TP_PROTO(enum migrate_mode mode, int reason), + + TP_ARGS(mode, reason), + + TP_STRUCT__entry( + __field(enum migrate_mode, mode) + __field(int, reason) + ), + + TP_fast_assign( + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("mode=%s reason=%s", + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 3aab8ae7aace3388da319a233edf48f0f5d26a44 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:31 -0700 Subject: mm: cma: add the CMA instance name to cma trace events There were missing places to add cma instance name. To identify each CMA instance, let's add the name for every cma trace. This patch also changes the existing cma_trace_alloc to cma_trace_finish since we have cma_alloc_start[1]. [1] https://lore.kernel.org/linux-mm/20210324160740.15901-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210330220237.748899-1-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Liam Mark Cc: Georgi Djakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index be1525a10457..5cf385ae7c08 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -10,12 +10,13 @@ DECLARE_EVENT_CLASS(cma_alloc_class, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align), + TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) @@ -23,13 +24,15 @@ DECLARE_EVENT_CLASS(cma_alloc_class, ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), - TP_printk("pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count, @@ -38,24 +41,27 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count), - TP_ARGS(pfn, page, count), + TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; ), - TP_printk("pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count) @@ -85,20 +91,20 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc, +DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); #endif /* _TRACE_CMA_H */ -- cgit v1.2.3 From 78fa51503fdbe463c96eef4c3cf69ca54032647a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:34 -0700 Subject: mm: use proper type for cma_[alloc|release] size_t in cma_alloc is confusing since it makes people think it's byte count, not pages. Change it to unsigned long[1]. The unsigned int in cma_release is also not right so change it. Since we have unsigned long in cma_release, free_contig_range should also respect it. [1] 67a2e213e7e9, mm: cma: fix incorrect type conversion for size during dma allocation Link: https://lore.kernel.org/linux-mm/20210324043434.GP1719932@casper.infradead.org/ Link: https://lkml.kernel.org/r/20210331164018.710560-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 4 ++-- include/linux/gfp.h | 2 +- include/trace/events/cma.h | 22 +++++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 217999c8a762..53fd8c3cdbd0 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, +extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 26f4d907254a..8a5f6c3d7dba 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end, extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif -void free_contig_range(unsigned long pfn, unsigned int nr_pages); +void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5cf385ae7c08..c3d354702cb0 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -11,7 +11,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align), @@ -19,7 +19,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __entry->align = align; ), - TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, @@ -42,7 +42,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count), + unsigned long count), TP_ARGS(name, pfn, page, count), @@ -50,7 +50,7 @@ TRACE_EVENT(cma_release, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) ), TP_fast_assign( @@ -60,7 +60,7 @@ TRACE_EVENT(cma_release, __entry->count = count; ), - TP_printk("name=%s pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, @@ -69,13 +69,13 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned int count, unsigned int align), + TP_PROTO(const char *name, unsigned long count, unsigned int align), TP_ARGS(name, count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -85,7 +85,7 @@ TRACE_EVENT(cma_alloc_start, __entry->align = align; ), - TP_printk("name=%s count=%u align=%u", + TP_printk("name=%s count=%lu align=%u", __get_str(name), __entry->count, __entry->align) @@ -94,7 +94,7 @@ TRACE_EVENT(cma_alloc_start, DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); @@ -102,7 +102,7 @@ DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); -- cgit v1.2.3 From 575299ea18a8c0575d4c2ef6ad3fa4d41d529d1c Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Tue, 4 May 2021 18:38:03 -0700 Subject: x86/mm: track linear mapping split events To help with debugging the sluggishness caused by TLB miss/reload, we introduce monotonic hugepage [direct mapped] split event counts since system state: SYSTEM_RUNNING to be displayed as part of /proc/vmstat in x86 servers The lifetime split event information will be displayed at the bottom of /proc/vmstat .... swap_ra 0 swap_ra_hit 0 direct_map_level2_splits 94 direct_map_level3_splits 4 nr_unstable 0 .... One of the many lasting sources of direct hugepage splits is kernel tracing (kprobes, tracepoints). Note that the kernel's code segment [512 MB] points to the same physical addresses that have been already mapped in the kernel's direct mapping range. Source : Documentation/x86/x86_64/mm.rst When we enable kernel tracing, the kernel has to modify attributes/permissions of the text segment hugepages that are direct mapped causing them to split. Kernel's direct mapped hugepages do not coalesce back after split and remain in place for the remainder of the lifetime. An instance of direct page splits when we turn on dynamic kernel tracing .... cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 784 direct_map_level3_splits 12 bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }' cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 789 direct_map_level3_splits 12 .... Link: https://lkml.kernel.org/r/20210218235744.1040634-1-saravanand@fb.com Signed-off-by: Saravanan D Acked-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 21d7c7f72f1c..ae0dd1948c2b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -124,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, #endif NR_VM_EVENT_ITEMS }; -- cgit v1.2.3 From 1a08ae36cf8b5f26d0c64ebfe46f8eb07ea0b678 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:53 -0700 Subject: mm cma: rename PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN PF_MEMALLOC_NOCMA is used ot guarantee that the allocator will not return pages that might belong to CMA region. This is currently used for long term gup to make sure that such pins are not going to be done on any CMA pages. When PF_MEMALLOC_NOCMA has been introduced we haven't realized that it is focusing on CMA pages too much and that there is larger class of pages that need the same treatment. MOVABLE zone cannot contain any long term pins as well so it makes sense to reuse and redefine this flag for that usecase as well. Rename the flag to PF_MEMALLOC_PIN which defines an allocation context which can only get pages suitable for long-term pins. Also rename: memalloc_nocma_save()/memalloc_nocma_restore to memalloc_pin_save()/memalloc_pin_restore() and make the new functions common. [rppt@linux.ibm.com: fix renaming of PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN] Link: https://lkml.kernel.org/r/20210331163816.11517-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210215161349.246722-6-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Acked-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/sched/mm.h | 21 +++++---------------- 2 files changed, 6 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c25c8e67030..d2c881384517 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1583,7 +1583,7 @@ extern struct pid *cad_pid; #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 90b2a0bce11c..ae654819e8aa 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -271,29 +271,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } -#ifdef CONFIG_CMA -static inline unsigned int memalloc_nocma_save(void) +static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + unsigned int flags = current->flags & PF_MEMALLOC_PIN; - current->flags |= PF_MEMALLOC_NOCMA; + current->flags |= PF_MEMALLOC_PIN; return flags; } -static inline void memalloc_nocma_restore(unsigned int flags) +static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; + current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; } -#else -static inline unsigned int memalloc_nocma_save(void) -{ - return 0; -} - -static inline void memalloc_nocma_restore(unsigned int flags) -{ -} -#endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); -- cgit v1.2.3 From 8e3560d963d22ba41857f48e4114ce80373144ea Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:00 -0700 Subject: mm: honor PF_MEMALLOC_PIN for all movable pages PF_MEMALLOC_PIN is only honored for CMA pages, extend this flag to work for any allocations from ZONE_MOVABLE by removing __GFP_MOVABLE from gfp_mask when this flag is passed in the current context. Add is_pinnable_page() to return true if page is in a pinnable page. A pinnable page is not in ZONE_MOVABLE and not of MIGRATE_CMA type. Link: https://lkml.kernel.org/r/20210215161349.246722-8-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 ++++++++++++++++++ include/linux/sched/mm.h | 6 +++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dbb53c44243..d0e628f511e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1141,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} + #ifdef CONFIG_DEV_PAGEMAP_OPS void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); @@ -1550,6 +1555,19 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +#ifdef CONFIG_MIGRATION +static inline bool is_pinnable_page(struct page *page) +{ + return !is_zone_movable_page(page) && !is_migrate_cma_page(page); +} +#else +static inline bool is_pinnable_page(struct page *page) +{ + return true; +} +#endif + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index ae654819e8aa..e24b1fe348e3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence @@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; } return flags; } -- cgit v1.2.3 From 9afaf30f7a1aab2022961715a66f644275b8daec Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:04 -0700 Subject: mm/gup: do not migrate zero page On some platforms ZERO_PAGE(0) might end-up in a movable zone. Do not migrate zero page in gup during longterm pinning as migration of zero page is not allowed. For example, in x86 QEMU with 16G of memory and kernelcore=5G parameter, I see the following: Boot#1: zero_pfn 0x48a8d zero_pfn zone: ZONE_DMA32 Boot#2: zero_pfn 0x20168d zero_pfn zone: ZONE_MOVABLE On x86, empty_zero_page is declared in .bss and depending on the loader may end up in different physical locations during boots. Also, move is_zero_pfn() my_zero_pfn() functions under CONFIG_MMU, because zero_pfn that they are using is declared in memory.c which is compiled with CONFIG_MMU. Link: https://lkml.kernel.org/r/20210215161349.246722-9-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 4 ++++ include/linux/pgtable.h | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d0e628f511e4..76e27ebb28a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1559,7 +1559,8 @@ static inline unsigned long page_to_section(const struct page *page) #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !is_zone_movable_page(page) && !is_migrate_cma_page(page); + return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || + is_zero_pfn(page_to_pfn(page)); } #else static inline bool is_pinnable_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3b2205741048..92b44149d5b9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -427,6 +427,10 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create + * situations where ZERO_PAGE(0) which is allocated differently + * on different platforms may end up in a movable zone. ZERO_PAGE(0) + * cannot be migrated. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772392a379..2194a9cd885c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif +#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr) return zero_pfn; } #endif +#else +static inline int is_zero_pfn(unsigned long pfn) +{ + return 0; +} + +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_MMU -- cgit v1.2.3 From d1e153fea2a8940273174fc17733c44323d35cd5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:08 -0700 Subject: mm/gup: migrate pinned pages out of movable zone We should not pin pages in ZONE_MOVABLE. Currently, we do not pin only movable CMA pages. Generalize the function that migrates CMA pages to migrate all movable pages. Use is_pinnable_page() to check which pages need to be migrated Link: https://lkml.kernel.org/r/20210215161349.246722-10-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 + include/linux/mmzone.h | 9 +++++++-- include/trace/events/migrate.h | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 175ef15ae9e8..4bb4e519e3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -27,6 +27,7 @@ enum migrate_reason { MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, MR_CONTIG_RANGE, + MR_LONGTERM_PIN, MR_TYPES }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 92b44149d5b9..e8922a67d1a4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -407,8 +407,13 @@ enum zone_type { * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might - * essentially turn such pages unmovable. Memory offlining might - * retry a long time. + * essentially turn such pages unmovable. Therefore, we do not allow + * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and + * faulted, they come from the right zone right away. However, it is + * still possible that address space already has pages in + * ZONE_MOVABLE at the time when pages are pinned (i.e. user has + * touches that memory before pinning). In such case we migrate them + * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index f2c990603888..9fb2a3bbcdfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -20,7 +20,8 @@ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ - EMe(MR_CONTIG_RANGE, "contig_range") + EM( MR_CONTIG_RANGE, "contig_range") \ + EMe(MR_LONGTERM_PIN, "longterm_pin") /* * First define the enums in the above macros to be exported to userspace -- cgit v1.2.3 From a08a2ae3461383c2d50d0997dcc6cd1dd1fefb08 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:42 -0700 Subject: mm,memory_hotplug: allocate memmap from the added memory range Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) an existing memory is consumed for that purpose (eg: ~2MB per 128MB memory section on x86_64) This can even lead to extreme cases where system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages which has performance drawbacks. c) It might be there are no PMD_ALIGNED chunks so memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obviously things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. The current design expects that this should be OK as the hotplugged memory is considered a garbage until it is onlined. For example hibernation wouldn't save the content of those vmmemmaps into the image so it wouldn't be restored on resume but this should be OK as there no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special case them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from zhe zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succedeed. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages() before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on the fact whether the node or the zone become empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 8 +++++++- include/linux/memory_hotplug.h | 15 ++++++++++++++- include/linux/memremap.h | 2 +- include/linux/mmzone.h | 7 +++++-- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20..97e92e8b556a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. + */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7288aa5ef73b..28f32fd00fe9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -55,6 +55,14 @@ typedef int __bitwise mhp_t; */ #define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +/* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. + */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +extern void adjust_present_page_count(struct zone *zone, long nr_pages); /* VM interface that may be used by firmware interface */ +extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone); +extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + struct zone *zone); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca..45a79da89c5f 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e8922a67d1a4..917bd6c604d5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -436,6 +436,11 @@ enum zone_type { * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. + * 7. Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) @@ -1392,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr) #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); -#ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif -#endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { -- cgit v1.2.3 From 28961998f858114e51d2ae862065b858afcfa2b2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 4 May 2021 18:40:03 -0700 Subject: iov_iter: lift memzero_page() to highmem.h Patch series "btrfs: Convert kmap/memset/kunmap to memzero_user()". Lifting memzero_user(), convert it to kmap_local_page() and then use it in btrfs. This patch (of 3): memzero_page() can replace the kmap/memset/kunmap pattern in other places in the code. While zero_user() has the same interface it is not the same call and its use should be limited and some of those calls may be better converted from zero_user() to memzero_page().[1] But that is not addressed in this series. Lift memzero_page() to highmem. [1] https://lore.kernel.org/lkml/CAHk-=wijdojzo56FzYqE5TOYw2Vws7ik3LEMGj9SPQaJJ+Z73Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210309212137.2610186-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20210309212137.2610186-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Alexander Viro Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Chaitanya Kulkarni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44170f312ae7..832b49b50c7b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset, kunmap_local(to); } +static inline void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3 From 866a6dadbb027b2955a7ae00bab9705d382def12 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 4 May 2021 17:27:28 -0700 Subject: context_tracking: Move guest exit context tracking to separate helpers Provide separate context tracking helpers for guest exit, the standalone helpers will be called separately by KVM x86 in later patches to fix tick-based accounting. Suggested-by: Thomas Gleixner Signed-off-by: Wanpeng Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-2-seanjc@google.com --- include/linux/context_tracking.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index bceb06498521..b8c7313495a7 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -131,10 +131,15 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +static __always_inline void context_tracking_guest_exit(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_GUEST); +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_tracking_guest_exit(); instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) @@ -159,6 +164,8 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_end(); } +static __always_inline void context_tracking_guest_exit(void) { } + static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); -- cgit v1.2.3 From 88d8220bbf06dd8045b2ac4be1046290eaa7773a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 4 May 2021 17:27:29 -0700 Subject: context_tracking: Move guest exit vtime accounting to separate helpers Provide separate vtime accounting functions for guest exit instead of open coding the logic within the context tracking code. This will allow KVM x86 to handle vtime accounting slightly differently when using tick-based accounting. Suggested-by: Thomas Gleixner Signed-off-by: Wanpeng Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20210505002735.1684165-3-seanjc@google.com --- include/linux/context_tracking.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index b8c7313495a7..4f4556232dcf 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -137,15 +137,20 @@ static __always_inline void context_tracking_guest_exit(void) __context_tracking_exit(CONTEXT_GUEST); } -static __always_inline void guest_exit_irqoff(void) +static __always_inline void vtime_account_guest_exit(void) { - context_tracking_guest_exit(); - - instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) vtime_guest_exit(current); else current->flags &= ~PF_VCPU; +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_tracking_guest_exit(); + + instrumentation_begin(); + vtime_account_guest_exit(); instrumentation_end(); } @@ -166,12 +171,17 @@ static __always_inline void guest_enter_irqoff(void) static __always_inline void context_tracking_guest_exit(void) { } +static __always_inline void vtime_account_guest_exit(void) +{ + vtime_account_kernel(current); + current->flags &= ~PF_VCPU; +} + static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ - vtime_account_kernel(current); - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From b41c723b203e19480c26f2ec8f04eedc03d34b34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:31 -0700 Subject: sched/vtime: Move vtime accounting external declarations above inlines Move the blob of external declarations (and their stubs) above the set of inline definitions (and their stubs) for vtime accounting. This will allow a future patch to bring in more inline definitions without also having to shuffle large chunks of code. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20210505002735.1684165-5-seanjc@google.com --- include/linux/vtime.h | 74 +++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 041d6524d144..6a4317560539 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -10,6 +10,43 @@ struct task_struct; +/* + * Common vtime APIs + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void vtime_account_kernel(struct task_struct *tsk); +extern void vtime_account_idle(struct task_struct *tsk); +#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +static inline void vtime_account_kernel(struct task_struct *tsk) { } +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern void arch_vtime_task_switch(struct task_struct *tsk); +extern void vtime_user_enter(struct task_struct *tsk); +extern void vtime_user_exit(struct task_struct *tsk); +extern void vtime_guest_enter(struct task_struct *tsk); +extern void vtime_guest_exit(struct task_struct *tsk); +extern void vtime_init_idle(struct task_struct *tsk, int cpu); +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void vtime_user_enter(struct task_struct *tsk) { } +static inline void vtime_user_exit(struct task_struct *tsk) { } +static inline void vtime_guest_enter(struct task_struct *tsk) { } +static inline void vtime_guest_exit(struct task_struct *tsk) { } +static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } +#endif + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); +extern void vtime_account_softirq(struct task_struct *tsk); +extern void vtime_account_hardirq(struct task_struct *tsk); +extern void vtime_flush(struct task_struct *tsk); +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } +static inline void vtime_account_softirq(struct task_struct *tsk) { } +static inline void vtime_account_hardirq(struct task_struct *tsk) { } +static inline void vtime_flush(struct task_struct *tsk) { } +#endif + /* * vtime_accounting_enabled_this_cpu() definitions/declarations */ @@ -57,43 +94,6 @@ static inline void vtime_task_switch(struct task_struct *prev) { } #endif -/* - * Common vtime APIs - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void vtime_account_kernel(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static inline void vtime_account_kernel(struct task_struct *tsk) { } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_user_enter(struct task_struct *tsk); -extern void vtime_user_exit(struct task_struct *tsk); -extern void vtime_guest_enter(struct task_struct *tsk); -extern void vtime_guest_exit(struct task_struct *tsk); -extern void vtime_init_idle(struct task_struct *tsk, int cpu); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -static inline void vtime_user_enter(struct task_struct *tsk) { } -static inline void vtime_user_exit(struct task_struct *tsk) { } -static inline void vtime_guest_enter(struct task_struct *tsk) { } -static inline void vtime_guest_exit(struct task_struct *tsk) { } -static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } -#endif - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); -extern void vtime_account_softirq(struct task_struct *tsk); -extern void vtime_account_hardirq(struct task_struct *tsk); -extern void vtime_flush(struct task_struct *tsk); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } -static inline void vtime_account_softirq(struct task_struct *tsk) { } -static inline void vtime_account_hardirq(struct task_struct *tsk) { } -static inline void vtime_flush(struct task_struct *tsk) { } -#endif - #ifdef CONFIG_IRQ_TIME_ACCOUNTING extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset); -- cgit v1.2.3 From 6f922b89e5518143920b10e3643e556d9df58d94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:32 -0700 Subject: sched/vtime: Move guest enter/exit vtime accounting to vtime.h Provide separate helpers for guest enter vtime accounting (in addition to the existing guest exit helpers), and move all vtime accounting helpers to vtime.h where the existing #ifdef infrastructure can be leveraged to better delineate the different types of accounting. This will also allow future cleanups via deduplication of context tracking code. Opportunstically delete the vtime_account_kernel() stub now that all callers are wrapped with CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-6-seanjc@google.com --- include/linux/context_tracking.h | 17 +-------------- include/linux/vtime.h | 46 ++++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 4f4556232dcf..56c648bdbde8 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -137,14 +137,6 @@ static __always_inline void context_tracking_guest_exit(void) __context_tracking_exit(CONTEXT_GUEST); } -static __always_inline void vtime_account_guest_exit(void) -{ - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_exit(current); - else - current->flags &= ~PF_VCPU; -} - static __always_inline void guest_exit_irqoff(void) { context_tracking_guest_exit(); @@ -163,20 +155,13 @@ static __always_inline void guest_enter_irqoff(void) * to flush. */ instrumentation_begin(); - vtime_account_kernel(current); - current->flags |= PF_VCPU; + vtime_account_guest_enter(); rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); } static __always_inline void context_tracking_guest_exit(void) { } -static __always_inline void vtime_account_guest_exit(void) -{ - vtime_account_kernel(current); - current->flags &= ~PF_VCPU; -} - static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 6a4317560539..3684487d01e1 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -3,21 +3,18 @@ #define _LINUX_KERNEL_VTIME_H #include +#include + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include #endif - -struct task_struct; - /* * Common vtime APIs */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void vtime_account_kernel(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static inline void vtime_account_kernel(struct task_struct *tsk) { } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -55,6 +52,18 @@ static inline void vtime_flush(struct task_struct *tsk) { } static inline bool vtime_accounting_enabled_this_cpu(void) { return true; } extern void vtime_task_switch(struct task_struct *prev); +static __always_inline void vtime_account_guest_enter(void) +{ + vtime_account_kernel(current); + current->flags |= PF_VCPU; +} + +static __always_inline void vtime_account_guest_exit(void) +{ + vtime_account_kernel(current); + current->flags &= ~PF_VCPU; +} + #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN) /* @@ -86,12 +95,37 @@ static inline void vtime_task_switch(struct task_struct *prev) vtime_task_switch_generic(prev); } +static __always_inline void vtime_account_guest_enter(void) +{ + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; +} + +static __always_inline void vtime_account_guest_exit(void) +{ + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_exit(current); + else + current->flags &= ~PF_VCPU; +} + #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; } static inline bool vtime_accounting_enabled_this_cpu(void) { return false; } static inline void vtime_task_switch(struct task_struct *prev) { } +static __always_inline void vtime_account_guest_enter(void) +{ + current->flags |= PF_VCPU; +} + +static __always_inline void vtime_account_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} + #endif -- cgit v1.2.3 From 14296e0c447885d6c7b326e059fb528eb00526ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:33 -0700 Subject: context_tracking: Consolidate guest enter/exit wrappers Consolidate the guest enter/exit wrappers, providing and tweaking stubs as needed. This will allow moving the wrappers under KVM without having to bleed #ifdefs into the soon-to-be KVM code. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-7-seanjc@google.com --- include/linux/context_tracking.h | 65 +++++++++++++++------------------------- 1 file changed, 24 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 56c648bdbde8..aa58c2ac67ca 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -71,6 +71,19 @@ static inline void exception_exit(enum ctx_state prev_ctx) } } +static __always_inline bool context_tracking_guest_enter(void) +{ + if (context_tracking_enabled()) + __context_tracking_enter(CONTEXT_GUEST); + + return context_tracking_enabled_this_cpu(); +} + +static __always_inline void context_tracking_guest_exit(void) +{ + if (context_tracking_enabled()) + __context_tracking_exit(CONTEXT_GUEST); +} /** * ct_state() - return the current context tracking state if known @@ -92,6 +105,9 @@ static inline void user_exit_irqoff(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } +static inline bool context_tracking_guest_enter(void) { return false; } +static inline void context_tracking_guest_exit(void) { } + #endif /* !CONFIG_CONTEXT_TRACKING */ #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) @@ -102,74 +118,41 @@ extern void context_tracking_init(void); static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN /* must be called with irqs disabled */ static __always_inline void guest_enter_irqoff(void) { + /* + * This is running in ioctl context so its safe to assume that it's the + * stime pending cputime to flush. + */ instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; + vtime_account_guest_enter(); instrumentation_end(); - if (context_tracking_enabled()) - __context_tracking_enter(CONTEXT_GUEST); - - /* KVM does not hold any references to rcu protected data when it + /* + * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ - if (!context_tracking_enabled_this_cpu()) { + if (!context_tracking_guest_enter()) { instrumentation_begin(); rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); } } -static __always_inline void context_tracking_guest_exit(void) -{ - if (context_tracking_enabled()) - __context_tracking_exit(CONTEXT_GUEST); -} - static __always_inline void guest_exit_irqoff(void) { context_tracking_guest_exit(); - instrumentation_begin(); - vtime_account_guest_exit(); - instrumentation_end(); -} - -#else -static __always_inline void guest_enter_irqoff(void) -{ - /* - * This is running in ioctl context so its safe - * to assume that it's the stime pending cputime - * to flush. - */ - instrumentation_begin(); - vtime_account_guest_enter(); - rcu_virt_note_context_switch(smp_processor_id()); - instrumentation_end(); -} - -static __always_inline void context_tracking_guest_exit(void) { } - -static __always_inline void guest_exit_irqoff(void) -{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ static inline void guest_exit(void) { -- cgit v1.2.3 From 1ca0016c149be35fe19a6b75fce95c25807b7159 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:34 -0700 Subject: context_tracking: KVM: Move guest enter/exit wrappers to KVM's domain Move the guest enter/exit wrappers to kvm_host.h so that KVM can manage its context tracking vs. vtime accounting without bleeding too many KVM details into the context tracking code. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-8-seanjc@google.com --- include/linux/context_tracking.h | 45 ---------------------------------------- include/linux/kvm_host.h | 45 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index aa58c2ac67ca..4d7fced3a39f 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -118,49 +118,4 @@ extern void context_tracking_init(void); static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) -{ - /* - * This is running in ioctl context so its safe to assume that it's the - * stime pending cputime to flush. - */ - instrumentation_begin(); - vtime_account_guest_enter(); - instrumentation_end(); - - /* - * KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. - */ - if (!context_tracking_guest_enter()) { - instrumentation_begin(); - rcu_virt_note_context_switch(smp_processor_id()); - instrumentation_end(); - } -} - -static __always_inline void guest_exit_irqoff(void) -{ - context_tracking_guest_exit(); - - instrumentation_begin(); - /* Flush the guest cputime we spent on the guest */ - vtime_account_guest_exit(); - instrumentation_end(); -} - -static inline void guest_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - guest_exit_irqoff(); - local_irq_restore(flags); -} - #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8895b95b6a22..2f34487e21f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -338,6 +338,51 @@ struct kvm_vcpu { struct kvm_dirty_ring dirty_ring; }; +/* must be called with irqs disabled */ +static __always_inline void guest_enter_irqoff(void) +{ + /* + * This is running in ioctl context so its safe to assume that it's the + * stime pending cputime to flush. + */ + instrumentation_begin(); + vtime_account_guest_enter(); + instrumentation_end(); + + /* + * KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_guest_enter()) { + instrumentation_begin(); + rcu_virt_note_context_switch(smp_processor_id()); + instrumentation_end(); + } +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_tracking_guest_exit(); + + instrumentation_begin(); + /* Flush the guest cputime we spent on the guest */ + vtime_account_guest_exit(); + instrumentation_end(); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* -- cgit v1.2.3 From 1139aeb1c521eb4a050920ce6c64c36c4f2a3ab7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 5 May 2021 23:12:42 +0200 Subject: smp: Fix smp_call_function_single_async prototype As of commit 966a967116e6 ("smp: Avoid using two cache lines for struct call_single_data"), the smp code prefers 32-byte aligned call_single_data objects for performance reasons, but the block layer includes an instance of this structure in the main 'struct request' that is more senstive to size than to performance here, see 4ccafe032005 ("block: unalign call_single_data in struct request"). The result is a violation of the calling conventions that clang correctly points out: block/blk-mq.c:630:39: warning: passing 8-byte aligned argument to 32-byte aligned parameter 2 of 'smp_call_function_single_async' may result in an unaligned pointer access [-Walign-mismatch] smp_call_function_single_async(cpu, &rq->csd); It does seem that the usage of the call_single_data without cache line alignment should still be allowed by the smp code, so just change the function prototype so it accepts both, but leave the default alignment unchanged for the other users. This seems better to me than adding a local hack to shut up an otherwise correct warning in the caller. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jens Axboe Link: https://lkml.kernel.org/r/20210505211300.3174456-1-arnd@kernel.org --- include/linux/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 84a0b4828f66..f0d3ef654207 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); -int smp_call_function_single_async(int cpu, call_single_data_t *csd); +int smp_call_function_single_async(int cpu, struct __call_single_data *csd); /* * Call a function on all processors -- cgit v1.2.3 From d4455faccd6cbe11ddfdbe28723a2122453b4f4e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:16 -0700 Subject: proc: mandate ->proc_lseek in "struct proc_ops" Now that proc_ops are separate from file_operations and other operations it easy to check all instances to have ->proc_lseek hook and remove check in main code. Note: nonseekable_open() files naturally don't require ->proc_lseek. Garbage collect pde_lseek() function. [adobriyan@gmail.com: smoke test lseek()] Link: https://lkml.kernel.org/r/YG4OIhChOrVTPgdN@localhost.localdomain Link: https://lkml.kernel.org/r/YFYX0Bzwxlc7aBa/@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 000cc0533c33..069c7fd95396 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -32,6 +32,7 @@ struct proc_ops { ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); + /* mandatory unless nonseekable_open() or equivalent is used */ loff_t (*proc_lseek)(struct file *, loff_t, int); int (*proc_release)(struct inode *, struct file *); __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); -- cgit v1.2.3 From 4ee60ec156d91c315d1f62dfc1bc5799dcc6b473 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 May 2021 18:02:27 -0700 Subject: include: remove pagemap.h from blkdev.h My UEK-derived config has 1030 files depending on pagemap.h before this change. Afterwards, just 326 files need to be rebuilt when I touch pagemap.h. I think blkdev.h is probably included too widely, but untangling that dependency is harder and this solves my problem. x86 allmodconfig builds, but there may be implicit include problems on other architectures. Link: https://lkml.kernel.org/r/20210309195747.283796-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Dan Williams [nvdimm] Acked-by: Jens Axboe [block] Reviewed-by: Christoph Hellwig Acked-by: Coly Li [bcache] Acked-by: Martin K. Petersen [scsi] Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 - include/linux/swap.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365..1255823b2bc0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/swap.h b/include/linux/swap.h index f69e0f67651d..144727041e78 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 08c5188ef40ff82aed559123dc0ab2d2254b1b1c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 May 2021 18:02:30 -0700 Subject: kernel.h: drop inclusion in bitmap.h The bitmap.h header is used in a lot of code around the kernel. Besides that it includes kernel.h which sometimes makes a loop. The problem here is many unneeded loops that make header hell dependencies. For example, how may you move bitmap_zalloc() from C-file to the header? Currently it's impossible. And bitmap.h here is only the tip of an iceberg. kerne.h is a dump of everything that even has nothing in common at all. We may still have it, but in my new code I prefer to include only the headers that I want to use, without the bulk of unneeded kernel code. Break the loop by introducing align.h, including it in kernel.h and bitmap.h followed by replacing kernel.h with limits.h. Link: https://lkml.kernel.org/r/20210326170347.37441-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Yury Norov Cc: Rasmus Villemoes Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/align.h | 15 +++++++++++++++ include/linux/bitmap.h | 3 ++- include/linux/kernel.h | 9 +-------- 3 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 include/linux/align.h (limited to 'include') diff --git a/include/linux/align.h b/include/linux/align.h new file mode 100644 index 000000000000..2b4acec7b95a --- /dev/null +++ b/include/linux/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ALIGN_H +#define _LINUX_ALIGN_H + +#include + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _LINUX_ALIGN_H */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 70a932470b2d..6cbcd9d9edd2 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -4,10 +4,11 @@ #ifndef __ASSEMBLY__ +#include #include #include +#include #include -#include /* * bitmaps provide bit arrays that consume one or more unsigned diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5b7ed6dc99ac..09035ac67d4b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -3,6 +3,7 @@ #define _LINUX_KERNEL_H #include +#include #include #include #include @@ -30,14 +31,6 @@ */ #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) -/* @a is a power of 2 value */ -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) - /* generic data direction definitions */ #define READ 0 #define WRITE 1 -- cgit v1.2.3 From 112dfce8f29798192eb0be8066b54f4a68f4eb36 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 6 May 2021 18:02:33 -0700 Subject: linux/profile.h: remove unnecessary declaration Declaring struct pt_regs is unnecessary. On the one hand, there is no function using it; on the other hand, struct pt_regs has been declared in linux/kernel.h. Remove them. Link: https://lkml.kernel.org/r/20210401104834.1009157-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/profile.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/profile.h b/include/linux/profile.h index bad18ca43150..fd18ca96f557 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -15,7 +15,6 @@ #define KVM_PROFILING 4 struct proc_dir_entry; -struct pt_regs; struct notifier_block; #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS) @@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n); int profile_event_register(enum profile_type, struct notifier_block * n); int profile_event_unregister(enum profile_type, struct notifier_block * n); -struct pt_regs; - #else #define prof_on 0 -- cgit v1.2.3 From 32c93976ac2ee7ecb4b09cc032efe1445d37bd7e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:02:39 -0700 Subject: kernel/cred.c: make init_groups static init_groups is declared in both cred.h and init_task.h, but it is not actually referenced anywhere outside of cred.c where it is defined. So make it static and remove the declarations. Link: https://lkml.kernel.org/r/20210310220102.2484201-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 - include/linux/init_task.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index ac0e5f97d7d8..14971322e1a0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -53,7 +53,6 @@ do { \ groups_free(group_info); \ } while (0) -extern struct group_info init_groups; #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b2412b4d4c20..40fc5813cf93 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct group_info init_groups; extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -- cgit v1.2.3 From 586eaebea5988302c5a8b018096dd6c6f4564940 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:56 -0700 Subject: lib: extend the scope of small_const_nbits() macro find_bit would also benefit from small_const_nbits() optimizations. The detailed comment is provided by Rasmus Villemoes. Link: https://lkml.kernel.org/r/20210401003153.97325-6-yury.norov@gmail.com Suggested-by: Rasmus Villemoes Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitsperlong.h | 12 ++++++++++++ include/linux/bitmap.h | 8 -------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h index 3905c1c93dc2..1023e2a4bd37 100644 --- a/include/asm-generic/bitsperlong.h +++ b/include/asm-generic/bitsperlong.h @@ -23,4 +23,16 @@ #define BITS_PER_LONG_LONG 64 #endif +/* + * small_const_nbits(n) is true precisely when it is known at compile-time + * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows + * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps + * of size 0 are very rare, and a compile-time-known-size 0 is most likely + * a sign of error. They will be handled correctly by the bit/bitmap APIs, + * but using the out-of-line functions, so that the inline implementations + * can unconditionally dereference the pointer(s). + */ +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) + #endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 6cbcd9d9edd2..f57f4473bbe4 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -223,14 +223,6 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -/* - * The static inlines below do not handle constant nbits==0 correctly, - * so make such users (should any ever turn up) call the out-of-line - * versions. - */ -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) - static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); -- cgit v1.2.3 From 5c88af59f9abc202648a431428ad9d32e5d2a201 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:03 -0700 Subject: lib: inline _find_next_bit() wrappers lib/find_bit.c declares five single-line wrappers for _find_next_bit(). We may turn those wrappers to inline functions. It eliminates unneeded function calls and opens room for compile-time optimizations. Link: https://lkml.kernel.org/r/20210401003153.97325-8-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitops/find.h | 28 ++++++++++++++++++++++------ include/asm-generic/bitops/le.h | 17 +++++++++++++---- 2 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 9fdf21302fdf..7ad70dab8e93 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -2,6 +2,10 @@ #ifndef _ASM_GENERIC_BITOPS_FIND_H_ #define _ASM_GENERIC_BITOPS_FIND_H_ +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -12,8 +16,12 @@ * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_bit(const unsigned long *addr, unsigned long - size, unsigned long offset); +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} #endif #ifndef find_next_and_bit @@ -27,9 +35,13 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_and_bit(const unsigned long *addr1, +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} #endif #ifndef find_next_zero_bit @@ -42,8 +54,12 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ -extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned - long size, unsigned long offset); +static inline +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} #endif #ifdef CONFIG_GENERIC_FIND_FIRST_BIT diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 188d3eba3ace..21305f6cea0b 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -2,6 +2,7 @@ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ +#include #include #include @@ -32,13 +33,21 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #ifndef find_next_zero_bit_le -extern unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset); +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} #endif #ifndef find_next_bit_le -extern unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset); +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} #endif #ifndef find_first_zero_bit_le -- cgit v1.2.3 From 277a20a498d30753f5d8a607dbf967bc163552c1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:11 -0700 Subject: lib: add fast path for find_next_*_bit() Similarly to bitmap functions, find_next_*_bit() users will benefit if we'll handle a case of bitmaps that fit into a single word inline. In the very best case, the compiler may replace a function call with a few instructions. This is the quite typical find_next_bit() user: unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); } EXPORT_SYMBOL(cpumask_next); Currently, on ARM64 the generated code looks like this: 0000000000000000 : 0: a9bf7bfd stp x29, x30, [sp, #-16]! 4: 11000402 add w2, w0, #0x1 8: aa0103e0 mov x0, x1 c: d2800401 mov x1, #0x40 // #64 10: 910003fd mov x29, sp 14: 93407c42 sxtw x2, w2 18: 94000000 bl 0 1c: a8c17bfd ldp x29, x30, [sp], #16 20: d65f03c0 ret 24: d503201f nop After applying this patch: 0000000000000140 : 140: 11000400 add w0, w0, #0x1 144: 93407c00 sxtw x0, w0 148: f100fc1f cmp x0, #0x3f 14c: 54000168 b.hi 178 // b.pmore 150: f9400023 ldr x3, [x1] 154: 92800001 mov x1, #0xffffffffffffffff // #-1 158: 9ac02020 lsl x0, x1, x0 15c: 52800802 mov w2, #0x40 // #64 160: 8a030001 and x1, x0, x3 164: dac00020 rbit x0, x1 168: f100003f cmp x1, #0x0 16c: dac01000 clz x0, x0 170: 1a800040 csel w0, w2, w0, eq // eq = none 174: d65f03c0 ret 178: 52800800 mov w0, #0x40 // #64 17c: d65f03c0 ret find_next_bit() call is replaced with 6 instructions. find_next_bit() itself is 41 instructions plus function call overhead. Despite inlining, the scripts/bloat-o-meter report smaller .text size after applying the series: add/remove: 11/9 grow/shrink: 233/176 up/down: 5780/-6768 (-988) Link: https://lkml.kernel.org/r/20210401003153.97325-10-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitops/find.h | 30 ++++++++++++++++++++++++++++++ include/asm-generic/bitops/le.h | 21 +++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 7ad70dab8e93..4148c74a1e4d 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -20,6 +20,16 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); } #endif @@ -40,6 +50,16 @@ unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); } #endif @@ -58,6 +78,16 @@ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); } #endif diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 21305f6cea0b..5a28629cbf4d 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -5,6 +5,7 @@ #include #include #include +#include #if defined(__LITTLE_ENDIAN) @@ -37,6 +38,16 @@ static inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); } #endif @@ -46,6 +57,16 @@ static inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); } #endif -- cgit v1.2.3 From 2cc7b6a44ac21d31b398b03f4845c53152070416 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:14 -0700 Subject: lib: add fast path for find_first_*_bit() and find_last_bit() Similarly to bitmap functions, users would benefit if we'll handle a case of small-size bitmaps that fit into a single word. While here, move the find_last_bit() declaration to bitops/find.h where other find_*_bit() functions sit. Link: https://lkml.kernel.org/r/20210401003153.97325-11-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitops/find.h | 50 +++++++++++++++++++++++++++++++++++---- include/linux/bitops.h | 12 ---------- 2 files changed, 46 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 4148c74a1e4d..0d132ee2a291 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -5,6 +5,9 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** @@ -102,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -extern unsigned long find_first_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} /** * find_first_zero_bit - find the first cleared bit in a memory region @@ -113,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -extern unsigned long find_first_zero_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #ifndef find_first_bit @@ -126,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr, #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a5a48303b0f1..26bf15e6cd35 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, }) #endif -#ifndef find_last_bit -/** - * find_last_bit - find the last set bit in a memory region - * @addr: The address to start the search at - * @size: The number of bits to search - * - * Returns the bit number of the last set bit, or size. - */ -extern unsigned long find_last_bit(const unsigned long *addr, - unsigned long size); -#endif - #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From e18baa7cc3598999317d6c2fe255756f6b3b7562 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 6 May 2021 18:03:37 -0700 Subject: lib: crc8: pointer to data block should be const crc8() does not change the data passed to it, so the pointer argument should be declared const. This avoids callers that receive const data having to cast it to a non-const pointer to call crc8(). Link: https://lkml.kernel.org/r/20210329122409.3291-1-rf@opensource.cirrus.com Signed-off-by: Richard Fitzgerald Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crc8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/crc8.h b/include/linux/crc8.h index 13c8dabb0441..674045c59a04 100644 --- a/include/linux/crc8.h +++ b/include/linux/crc8.h @@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial); * Williams, Ross N., rossross.net * (see URL http://www.ross.net/crc/download/crc_v3.txt). */ -u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc); +u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc); #endif /* __CRC8_H_ */ -- cgit v1.2.3 From e13d04ec45b07388d3c38c0e18a4d0aa4841b0c3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 18:03:52 -0700 Subject: include/linux/compat.h: remove unneeded declaration from COMPAT_SYSCALL_DEFINEx() compat_sys##name is declared twice, just one line below. With this removal SYSCALL_DEFINEx() (defined in ) and COMPAT_SYSCALL_DEFINEx() look symmetrical. Link: https://lkml.kernel.org/r/20210223114924.854794-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index f0d2dd35d408..cfac4ec9a7df 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -75,7 +75,6 @@ __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ -- cgit v1.2.3 From 6f1f942cd5fbbe308f912fc84e3f10fbc8113a68 Mon Sep 17 00:00:00 2001 From: He Ying Date: Thu, 6 May 2021 18:04:57 -0700 Subject: smp: kernel/panic.c - silence warnings We found these warnings in kernel/panic.c by using sparse tool: warning: symbol 'panic_smp_self_stop' was not declared. warning: symbol 'nmi_panic_self_stop' was not declared. warning: symbol 'crash_smp_send_stop' was not declared. To avoid them, add declarations for these three functions in include/linux/smp.h. Link: https://lkml.kernel.org/r/20210316084150.75201-1-heying24@huawei.com Signed-off-by: He Ying Reported-by: Hulk Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 84a0b4828f66..669e35c03be2 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -55,6 +55,14 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, int smp_call_function_single_async(int cpu, call_single_data_t *csd); +/* + * Cpus stopping functions in panic. All have default weak definitions. + * Architecture-dependent code may override them. + */ +void panic_smp_self_stop(void); +void nmi_panic_self_stop(struct pt_regs *regs); +void crash_smp_send_stop(void); + /* * Call a function on all processors */ -- cgit v1.2.3 From 3d1c7fd97e4c5e54034231cd11319079dfaed60e Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 6 May 2021 18:05:00 -0700 Subject: delayacct: clear right task's flag after blkio completes When I was implementing a latency analyzer tool by using task->delays and other things, I found an issue in delayacct. The issue is it should clear the target's flag instead of current's in delayacct_blkio_end(). When I git blame delayacct, I found there're some similar issues we have fixed in delayacct_blkio_end(). - Commit c96f5471ce7d ("delayacct: Account blkio completion on the correct task") fixed the issue that it should account blkio completion on the target task instead of current. - Commit b512719f771a ("delayacct: fix crash in delayacct_blkio_end() after delayacct init failure") fixed the issue that it should check target task's delays instead of current task'. It seems that delayacct_blkio_{begin, end} are error prone. So I introduce a new paratmeter - the target task 'p' - to these helpers. After that change, the callsite will specifilly set the right task, which should make it less error prone. Link: https://lkml.kernel.org/r/20210414083720.24083-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Cc: Tejun Heo Cc: Josh Snyder Cc: Jens Axboe Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 2d3bdcccf5eb..21651f946751 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) return 0; } -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags |= flag; + if (p->delays) + p->delays->flags |= flag; } -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags &= ~flag; + if (p->delays) + p->delays->flags &= ~flag; } static inline void delayacct_tsk_init(struct task_struct *tsk) @@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { - delayacct_set_flag(DELAYACCT_PF_BLKIO); + delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p) { if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(DELAYACCT_PF_BLKIO); + delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, @@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void) } #else -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) {} -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) {} static inline void delayacct_init(void) {} -- cgit v1.2.3 From a065c0faacb1e472cd4e048986407d1b177373a2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:05:39 -0700 Subject: kernel/async.c: remove async_unregister_domain() No callers in the tree. Link: https://lkml.kernel.org/r/20210309151723.1907838-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/async.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/async.h b/include/linux/async.h index 0a17cd27f348..cce4ad31e8fc 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev, return async_schedule_node_domain(func, dev, dev_to_node(dev), domain); } -void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); -- cgit v1.2.3 From e7cb072eb988e46295512617c39d004f9e1c26f8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:05:42 -0700 Subject: init/initramfs.c: do unpacking asynchronously Patch series "background initramfs unpacking, and CONFIG_MODPROBE_PATH", v3. These two patches are independent, but better-together. The second is a rather trivial patch that simply allows the developer to change "/sbin/modprobe" to something else - e.g. the empty string, so that all request_module() during early boot return -ENOENT early, without even spawning a usermode helper, needlessly synchronizing with the initramfs unpacking. The first patch delegates decompressing the initramfs to a worker thread, allowing do_initcalls() in main.c to proceed to the device_ and late_ initcalls without waiting for that decompression (and populating of rootfs) to finish. Obviously, some of those later calls may rely on the initramfs being available, so I've added synchronization points in the firmware loader and usermodehelper paths - there might be other places that would need this, but so far no one has been able to think of any places I have missed. There's not much to win if most of the functionality needed during boot is only available as modules. But systems with a custom-made .config and initramfs can boot faster, partly due to utilizing more than one cpu earlier, partly by avoiding known-futile modprobe calls (which would still trigger synchronization with the initramfs unpacking, thus eliminating most of the first benefit). This patch (of 2): Most of the boot process doesn't actually need anything from the initramfs, until of course PID1 is to be executed. So instead of doing the decompressing and populating of the initramfs synchronously in populate_rootfs() itself, push that off to a worker thread. This is primarily motivated by an embedded ppc target, where unpacking even the rather modest sized initramfs takes 0.6 seconds, which is long enough that the external watchdog becomes unhappy that it doesn't get attention soon enough. By doing the initramfs decompression in a worker thread, we get to do the device_initcalls and hence start petting the watchdog much sooner. Normal desktops might benefit as well. On my mostly stock Ubuntu kernel, my initramfs is a 26M xz-compressed blob, decompressing to around 126M. That takes almost two seconds: [ 0.201454] Trying to unpack rootfs image as initramfs... [ 1.976633] Freeing initrd memory: 29416K Before this patch, these lines occur consecutively in dmesg. With this patch, the timestamps on these two lines is roughly the same as above, but with 172 lines inbetween - so more than one cpu has been kept busy doing work that would otherwise only happen after the populate_rootfs() finished. Should one of the initcalls done after rootfs_initcall time (i.e., device_ and late_ initcalls) need something from the initramfs (say, a kernel module or a firmware blob), it will simply wait for the initramfs unpacking to be done before proceeding, which should in theory make this completely safe. But if some driver pokes around in the filesystem directly and not via one of the official kernel interfaces (i.e. request_firmware*(), call_usermodehelper*) that theory may not hold - also, I certainly might have missed a spot when sprinkling wait_for_initramfs(). So there is an escape hatch in the form of an initramfs_async= command line parameter. Link: https://lkml.kernel.org/r/20210313212528.2956377-1-linux@rasmusvillemoes.dk Link: https://lkml.kernel.org/r/20210313212528.2956377-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Luis Chamberlain Cc: Jessica Yu Cc: Borislav Petkov Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Nick Desaulniers Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/initrd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 85c15717af34..1bbe9af48dc3 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long); #ifdef CONFIG_BLK_DEV_INITRD extern void __init reserve_initrd_mem(void); +extern void wait_for_initramfs(void); #else static inline void __init reserve_initrd_mem(void) {} +static inline void wait_for_initramfs(void) {} #endif extern phys_addr_t phys_initrd_start; -- cgit v1.2.3 From cb152a1a95606aadd81df7a537dde9ef16da4b80 Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Thu, 6 May 2021 18:05:51 -0700 Subject: mm: fix some typos and code style problems fix some typos and code style problems in mm. gfp.h: s/MAXNODES/MAX_NUMNODES mmzone.h: s/then/than rmap.c: s/__vma_split()/__vma_adjust() swap.c: s/__mod_zone_page_stat/__mod_zone_page_state, s/is is/is swap_state.c: s/whoes/whose z3fold.c: code style problem fix in z3fold_unregister_migration zsmalloc.c: s/of/or, s/give/given Link: https://lkml.kernel.org/r/20210419083057.64820-1-luoshijie1@huawei.com Signed-off-by: Shijie Luo Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- include/linux/mmzone.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8a5f6c3d7dba..11da8af06704 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -490,7 +490,7 @@ static inline int gfp_zonelist(gfp_t flags) /* * We get the zone list from the current node and the gfp_mask. - * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. + * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 917bd6c604d5..0d53eba1c383 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -55,7 +55,7 @@ enum migratetype { * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. What is important though * is that a range of pageblocks must be aligned to - * MAX_ORDER_NR_PAGES should biggest page be bigger then + * MAX_ORDER_NR_PAGES should biggest page be bigger than * a single pageblock. */ MIGRATE_CMA, -- cgit v1.2.3 From bbcd53c960713507ae764bf81970651b5577b95a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:05:55 -0700 Subject: drivers/char: remove /dev/kmem for good Patch series "drivers/char: remove /dev/kmem for good". Exploring /dev/kmem and /dev/mem in the context of memory hot(un)plug and memory ballooning, I started questioning the existence of /dev/kmem. Comparing it with the /proc/kcore implementation, it does not seem to be able to deal with things like a) Pages unmapped from the direct mapping (e.g., to be used by secretmem) -> kern_addr_valid(). virt_addr_valid() is not sufficient. b) Special cases like gart aperture memory that is not to be touched -> mem_pfn_is_ram() Unless I am missing something, it's at least broken in some cases and might fault/crash the machine. Looks like its existence has been questioned before in 2005 and 2010 [1], after ~11 additional years, it might make sense to revive the discussion. CONFIG_DEVKMEM is only enabled in a single defconfig (on purpose or by mistake?). All distributions disable it: in Ubuntu it has been disabled for more than 10 years, in Debian since 2.6.31, in Fedora at least starting with FC3, in RHEL starting with RHEL4, in SUSE starting from 15sp2, and OpenSUSE has it disabled as well. 1) /dev/kmem was popular for rootkits [2] before it got disabled basically everywhere. Ubuntu documents [3] "There is no modern user of /dev/kmem any more beyond attackers using it to load kernel rootkits.". RHEL documents in a BZ [5] "it served no practical purpose other than to serve as a potential security problem or to enable binary module drivers to access structures/functions they shouldn't be touching" 2) /proc/kcore is a decent interface to have a controlled way to read kernel memory for debugging puposes. (will need some extensions to deal with memory offlining/unplug, memory ballooning, and poisoned pages, though) 3) It might be useful for corner case debugging [1]. KDB/KGDB might be a better fit, especially, to write random memory; harder to shoot yourself into the foot. 4) "Kernel Memory Editor" [4] hasn't seen any updates since 2000 and seems to be incompatible with 64bit [1]. For educational purposes, /proc/kcore might be used to monitor value updates -- or older kernels can be used. 5) It's broken on arm64, and therefore, completely disabled there. Looks like it's essentially unused and has been replaced by better suited interfaces for individual tasks (/proc/kcore, KDB/KGDB). Let's just remove it. [1] https://lwn.net/Articles/147901/ [2] https://www.linuxjournal.com/article/10505 [3] https://wiki.ubuntu.com/Security/Features#A.2Fdev.2Fkmem_disabled [4] https://sourceforge.net/projects/kme/ [5] https://bugzilla.redhat.com/show_bug.cgi?id=154796 Link: https://lkml.kernel.org/r/20210324102351.6932-1-david@redhat.com Link: https://lkml.kernel.org/r/20210324102351.6932-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Kees Cook Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: "Alexander A. Klimov" Cc: Alexander Viro Cc: Alexandre Belloni Cc: Andrew Lunn Cc: Andrey Zhizhikin Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Corentin Labbe Cc: "David S. Miller" Cc: "Eric W. Biederman" Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Greentime Hu Cc: Gregory Clement Cc: Heiko Carstens Cc: Helge Deller Cc: Hillf Danton Cc: huang ying Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: "James E.J. Bottomley" Cc: James Troup Cc: Jiaxun Yang Cc: Jonas Bonn Cc: Jonathan Corbet Cc: Kairui Song Cc: Krzysztof Kozlowski Cc: Kuninori Morimoto Cc: Liviu Dudau Cc: Lorenzo Pieralisi Cc: Luc Van Oostenryck Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport Cc: Mikulas Patocka Cc: Minchan Kim Cc: Niklas Schnelle Cc: Oleksiy Avramchenko Cc: openrisc@lists.librecores.org Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: "Pavel Machek (CIP)" Cc: Pavel Machek Cc: "Peter Zijlstra (Intel)" Cc: Pierre Morel Cc: Randy Dunlap Cc: Richard Henderson Cc: Rich Felker Cc: Robert Richter Cc: Rob Herring Cc: Russell King Cc: Sam Ravnborg Cc: Sebastian Andrzej Siewior Cc: Sebastian Hesselbarth Cc: sparclinux@vger.kernel.org Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Steven Rostedt Cc: Sudeep Holla Cc: Theodore Dubois Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Viresh Kumar Cc: William Cohen Cc: Xiaoming Ni Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- include/linux/vmalloc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index acef282b97c6..c3c88fdb9b2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -145,7 +145,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)0x1000) -/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ +/* File is huge (eg. /dev/mem): treat loff_t as unsigned */ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) /* File is opened with O_PATH; almost nothing can be done with it */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 394d03cc0e92..f31ba59fb1ef 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -227,7 +227,7 @@ static inline void set_vm_flush_reset_perms(void *addr) } #endif -/* for /dev/kmem */ +/* for /proc/kcore */ extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); -- cgit v1.2.3 From f2e762bab9f5ec74cc9860fc24f01b7f58c98659 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:06:01 -0700 Subject: mm: remove xlate_dev_kmem_ptr() Since /dev/kmem has been removed, let's remove the xlate_dev_kmem_ptr() leftovers. Link: https://lkml.kernel.org/r/20210324102351.6932-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Geert Uytterhoeven Acked-by: Michal Hocko Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Russell King Cc: Brian Cain Cc: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Arnd Bergmann Cc: David Hildenbrand Cc: Krzysztof Kozlowski Cc: Mikulas Patocka Cc: Luc Van Oostenryck Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Luis Chamberlain Cc: Greentime Hu Cc: Sebastian Andrzej Siewior Cc: Randy Dunlap Cc: Jiaxun Yang Cc: "Peter Zijlstra (Intel)" Cc: Christophe Leroy Cc: Gerald Schaefer Cc: Niklas Schnelle Cc: Pierre Morel Cc: Ingo Molnar Cc: Kuninori Morimoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/io.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 76d456c516a1..e93375c710b9 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1064,17 +1064,6 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p) #endif #endif /* CONFIG_GENERIC_IOMAP */ -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#ifndef xlate_dev_kmem_ptr -#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr -static inline void *xlate_dev_kmem_ptr(void *addr) -{ - return addr; -} -#endif - #ifndef xlate_dev_mem_ptr #define xlate_dev_mem_ptr xlate_dev_mem_ptr static inline void *xlate_dev_mem_ptr(phys_addr_t addr) -- cgit v1.2.3 From f7c8ce44ebb113b83135ada6e496db33d8a535e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:06:06 -0700 Subject: mm/vmalloc: remove vwrite() The last user (/dev/kmem) is gone. Let's drop it. Link: https://lkml.kernel.org/r/20210324102351.6932-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Minchan Kim Cc: huang ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index f31ba59fb1ef..b6ff16393bf6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -229,7 +229,6 @@ static inline void set_vm_flush_reset_perms(void *addr) /* for /proc/kcore */ extern long vread(char *buf, char *addr, unsigned long count); -extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. -- cgit v1.2.3 From 2eb70aab25dd9b0013a0035b416dbe0e81e6ad48 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:06:24 -0700 Subject: include/linux/pgtable.h: few spelling fixes Few spelling fixes throughout the file. Link: https://lkml.kernel.org/r/20210318201404.6380-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2194a9cd885c..46b13780c2c8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -426,7 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres /* * On some architectures hardware does not set page access bit when accessing - * memory page, it is responsibilty of software setting this bit. It brings + * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms @@ -519,7 +519,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we - * can't race with CPU which sets these bits and non-atomic aproach is fine. + * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) @@ -852,7 +852,7 @@ static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the - * pte; the appropriate pte lock must be held over the transation. + * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely @@ -1281,13 +1281,13 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) * * The complete check uses is_pmd_migration_entry() in linux/swapops.h * But using that requires moving current function and pmd_trans_unstable() - * to linux/swapops.h to resovle dependency, which is too much code move. + * to linux/swapops.h to resolve dependency, which is too much code move. * * !pmd_present() is equivalent to is_pmd_migration_entry() currently, * because !pmd_present() pages can only be under migration not swapped * out. * - * pmd_none() is preseved for future condition checks on pmd migration + * pmd_none() is preserved for future condition checks on pmd migration * entries and not confusing with this function name, although it is * redundant with !pmd_present(). */ -- cgit v1.2.3 From fa60ce2cb4506701c43bd4cf3ca23d970daf1b9c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 18:06:44 -0700 Subject: treewide: remove editor modelines and cruft The section "19) Editor modelines and other cruft" in Documentation/process/coding-style.rst clearly says, "Do not include any of these in source files." I recently receive a patch to explicitly add a new one. Let's do treewide cleanups, otherwise some people follow the existing code and attempt to upstream their favoriate editor setups. It is even nicer if scripts/checkpatch.pl can check it. If we like to impose coding style in an editor-independent manner, I think editorconfig (patch [1]) is a saner solution. [1] https://lore.kernel.org/lkml/20200703073143.423557-1-danny@kdrag0n.dev/ Link: https://lkml.kernel.org/r/20210324054457.1477489-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Acked-by: Geert Uytterhoeven Reviewed-by: Miguel Ojeda [auxdisplay] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/configfs.h | 4 +--- include/linux/genl_magic_func.h | 1 - include/linux/genl_magic_struct.h | 1 - include/uapi/linux/if_bonding.h | 11 ----------- include/uapi/linux/nfs4.h | 6 ------ include/xen/interface/elfnote.h | 10 ---------- include/xen/interface/hvm/hvm_vcpu.h | 10 ---------- include/xen/interface/io/xenbus.h | 10 ---------- 8 files changed, 1 insertion(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 2e8c69b43c64..97cfd13bae51 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs.h - definitions for the device driver filesystem * * Based on sysfs: diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 6cb82301d8e9..939b1a8f571b 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -404,4 +404,3 @@ s_fields \ /* }}}1 */ #endif /* GENL_MAGIC_FUNC_H */ -/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 35d21fddaf2d..f81d48987528 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -283,4 +283,3 @@ enum { \ /* }}}1 */ #endif /* GENL_MAGIC_STRUCT_H */ -/* vim: set foldmethod=marker nofoldenable : */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index e8eb4ad03cf1..d174914a837d 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -153,14 +153,3 @@ enum { #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) #endif /* _LINUX_IF_BONDING_H */ - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ - diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index ed5415e0f1c1..800bb0ffa6e6 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -178,9 +178,3 @@ #define NFS4_MAX_BACK_CHANNEL_OPS 2 #endif /* _UAPI_LINUX_NFS4_H */ - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 9e9f9bf7c66d..449bd383cb76 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -208,13 +208,3 @@ #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/include/xen/interface/hvm/hvm_vcpu.h b/include/xen/interface/hvm/hvm_vcpu.h index 32ca83edd44d..bfc2138e0bf5 100644 --- a/include/xen/interface/hvm/hvm_vcpu.h +++ b/include/xen/interface/hvm/hvm_vcpu.h @@ -131,13 +131,3 @@ struct vcpu_hvm_context { typedef struct vcpu_hvm_context vcpu_hvm_context_t; #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */ - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h index aaf2951b1cce..fb8716112251 100644 --- a/include/xen/interface/io/xenbus.h +++ b/include/xen/interface/io/xenbus.h @@ -39,13 +39,3 @@ enum xenbus_state }; #endif /* _XEN_PUBLIC_IO_XENBUS_H */ - -/* - * Local variables: - * c-file-style: "linux" - * indent-tabs-mode: t - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ -- cgit v1.2.3 From f0953a1bbaca71e1ebbcb9864eb1b273156157ed Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 6 May 2021 18:06:47 -0700 Subject: mm: fix typos in comments Fix ~94 single-word typos in locking code comments, plus a few very obvious grammar mistakes. Link: https://lkml.kernel.org/r/20210322212624.GA1963421@gmail.com Link: https://lore.kernel.org/r/20210322205203.GB1959563@gmail.com Signed-off-by: Ingo Molnar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Cc: Bhaskar Chowdhury Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/vmalloc.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 76e27ebb28a3..322ec61d0da7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -106,7 +106,7 @@ extern int mmap_rnd_compat_bits __read_mostly; * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. - * It's defined as noop for arcitectures that don't support memory tagging. + * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b6ff16393bf6..4d668abb6391 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -33,7 +33,7 @@ struct notifier_block; /* in notifier.h */ * * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after * shadow memory has been mapped. It's used to handle allocation errors so that - * we don't try to poision shadow on free if it was never allocated. + * we don't try to poison shadow on free if it was never allocated. * * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to * determine which allocations need the module shadow freed. @@ -43,7 +43,7 @@ struct notifier_block; /* in notifier.h */ /* * Maximum alignment for ioremap() regions. - * Can be overriden by arch-specific value. + * Can be overridden by arch-specific value. */ #ifndef IOREMAP_MAX_ORDER #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ -- cgit v1.2.3 From 285c0faddcebdf360412fc9ef9cde63cf98da7f6 Mon Sep 17 00:00:00 2001 From: Bharat Jauhari Date: Thu, 25 Mar 2021 18:15:40 +0200 Subject: habanalabs: expose ASIC specific PLL index Currently the user cannot interpret the PLL information based on index as its exposed as an integer. This commit exposes ASIC specific PLL indexes and maps it to a generic FW compatible index. Signed-off-by: Bharat Jauhari Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index d3e017b5f0db..6d2d34c9f375 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -239,6 +239,39 @@ enum gaudi_engine_id { GAUDI_ENGINE_ID_SIZE }; +/* + * ASIC specific PLL index + * + * Used to retrieve in frequency info of different IPs via + * HL_INFO_PLL_FREQUENCY under HL_IOCTL_INFO IOCTL. The enums need to be + * used as an index in struct hl_pll_frequency_info + */ + +enum hl_goya_pll_index { + HL_GOYA_CPU_PLL = 0, + HL_GOYA_IC_PLL, + HL_GOYA_MC_PLL, + HL_GOYA_MME_PLL, + HL_GOYA_PCI_PLL, + HL_GOYA_EMMC_PLL, + HL_GOYA_TPC_PLL, + HL_GOYA_PLL_MAX +}; + +enum hl_gaudi_pll_index { + HL_GAUDI_CPU_PLL = 0, + HL_GAUDI_PCI_PLL, + HL_GAUDI_SRAM_PLL, + HL_GAUDI_HBM_PLL, + HL_GAUDI_NIC_PLL, + HL_GAUDI_DMA_PLL, + HL_GAUDI_MESH_PLL, + HL_GAUDI_MME_PLL, + HL_GAUDI_TPC_PLL, + HL_GAUDI_IF_PLL, + HL_GAUDI_PLL_MAX +}; + enum hl_device_status { HL_DEVICE_STATUS_OPERATIONAL, HL_DEVICE_STATUS_IN_RESET, -- cgit v1.2.3 From 0ab1438bad43d95877f848b7df551bd431680270 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 02:45:15 +0900 Subject: linux/kconfig.h: replace IF_ENABLED() with PTR_IF() in is included from all the kernel-space source files, including C, assembly, linker scripts. It is intended to contain a minimal set of macros to evaluate CONFIG options. IF_ENABLED() is an intruder here because (x ? y : z) is C code, which should not be included from assembly files or linker scripts. Also, is no longer self-contained because NULL is defined in . Move IF_ENABLED() out to as PTR_IF(). PTF_IF() takes the general boolean expression instead of a CONFIG option so that it fits better in . Signed-off-by: Masahiro Yamada Reviewed-by: Kees Cook --- include/linux/kconfig.h | 6 ------ include/linux/kernel.h | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 24a59cb06963..cc8fa109cfa3 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -70,10 +70,4 @@ */ #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option)) -/* - * IF_ENABLED(CONFIG_FOO, ptr) evaluates to (ptr) if CONFIG_FOO is set to 'y' - * or 'm', NULL otherwise. - */ -#define IF_ENABLED(option, ptr) (IS_ENABLED(option) ? (ptr) : NULL) - #endif /* __LINUX_KCONFIG_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5b7ed6dc99ac..2f9d15410c93 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -48,6 +48,8 @@ */ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) +#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) + #define u64_to_user_ptr(x) ( \ { \ typecheck(u64, (x)); \ -- cgit v1.2.3 From 35c820e71565d1fa835b82499359218b219828ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 8 May 2021 21:49:48 -0600 Subject: Revert "bio: limit bio max size" This reverts commit cd2c7545ae1beac3b6aae033c7f31193b3255946. Alex reports that the commit causes corruption with LUKS on ext4. Revert it for now so that this can be investigated properly. Link: https://lore.kernel.org/linux-block/1620493841.bxdq8r5haw.none@localhost/ Reported-by: Alex Xu (Hello71) Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 +--- include/linux/blkdev.h | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index f1a99f0a240c..a0b4cfdf62a4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -106,8 +106,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -extern unsigned int bio_max_size(struct bio *bio); - /** * bio_full - check if the bio is full * @bio: bio to check @@ -121,7 +119,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) if (bio->bi_vcnt >= bio->bi_max_vecs) return true; - if (bio->bi_iter.bi_size > bio_max_size(bio) - len) + if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 40c7c4d87aa1..b91ba6207365 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -327,8 +327,6 @@ enum blk_bounce { }; struct queue_limits { - unsigned int bio_max_bytes; - enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; -- cgit v1.2.3 From 28ec344bb8911bb0d4910456b22ba0dd4f662521 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 5 May 2021 17:44:22 -0700 Subject: usb: typec: tcpm: Don't block probing of consumers of "connector" nodes fw_devlink expects DT device nodes with "compatible" property to have struct devices created for them. Since the connector node might not be populated as a device, mark it as such so that fw_devlink knows not to wait on this fwnode being populated as a struct device. Without this patch, USB functionality can be broken on some boards. Fixes: f7514a663016 ("of: property: fw_devlink: Add support for remote-endpoint") Reported-by: John Stultz Tested-by: John Stultz Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210506004423.345199-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ed4e67a7ff1c..59828516ebaf 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -187,5 +187,6 @@ extern u32 fw_devlink_get_flags(void); extern bool fw_devlink_is_strict(void); int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); void fwnode_links_purge(struct fwnode_handle *fwnode); +void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); #endif -- cgit v1.2.3 From 63c8af5687f6b1b70e9458cac1ffb25e86db1695 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 10 May 2021 08:48:06 +0900 Subject: block: uapi: fix comment about block device ioctl Fix the comment mentioning ioctl command range used for zoned block devices to reflect the range of commands actually implemented. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210509234806.3000-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afd..4c32e97dcdf0 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -185,7 +185,7 @@ struct fsxattr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) /* - * A jump here: 130-131 are reserved for zoned block devices + * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) */ -- cgit v1.2.3 From c745253e2a691a40c66790defe85c104a887e14a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 5 May 2021 14:09:15 +0300 Subject: PM: runtime: Fix unpaired parent child_count for force_resume As pm_runtime_need_not_resume() relies also on usage_count, it can return a different value in pm_runtime_force_suspend() compared to when called in pm_runtime_force_resume(). Different return values can happen if anything calls PM runtime functions in between, and causes the parent child_count to increase on every resume. So far I've seen the issue only for omapdrm that does complicated things with PM runtime calls during system suspend for legacy reasons: omap_atomic_commit_tail() for omapdrm.0 dispc_runtime_get() wakes up 58000000.dss as it's the dispc parent dispc_runtime_resume() rpm_resume() increases parent child_count dispc_runtime_put() won't idle, PM runtime suspend blocked pm_runtime_force_suspend() for 58000000.dss, !pm_runtime_need_not_resume() __update_runtime_status() system suspended pm_runtime_force_resume() for 58000000.dss, pm_runtime_need_not_resume() pm_runtime_enable() only called because of pm_runtime_need_not_resume() omap_atomic_commit_tail() for omapdrm.0 dispc_runtime_get() wakes up 58000000.dss as it's the dispc parent dispc_runtime_resume() rpm_resume() increases parent child_count dispc_runtime_put() won't idle, PM runtime suspend blocked ... rpm_suspend for 58000000.dss but parent child_count is now unbalanced Let's fix the issue by adding a flag for needs_force_resume and use it in pm_runtime_force_resume() instead of pm_runtime_need_not_resume(). Additionally omapdrm system suspend could be simplified later on to avoid lots of unnecessary PM runtime calls and the complexity it adds. The driver can just use internal functions that are shared between the PM runtime and system suspend related functions. Fixes: 4918e1f87c5f ("PM / runtime: Rework pm_runtime_force_suspend/resume()") Signed-off-by: Tony Lindgren Reviewed-by: Ulf Hansson Tested-by: Tomi Valkeinen Cc: 4.16+ # 4.16+ Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index c9657408fee1..1d8209c09686 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -601,6 +601,7 @@ struct dev_pm_info { unsigned int idle_notification:1; unsigned int request_pending:1; unsigned int deferred_resume:1; + unsigned int needs_force_resume:1; unsigned int runtime_auto:1; bool ignore_children:1; unsigned int no_callbacks:1; -- cgit v1.2.3 From 2515dd6ce8e545b0b2eece84920048ef9ed846c4 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 19 Apr 2021 16:17:41 -0700 Subject: stack: Replace "o" output with "r" input constraint "o" isn't a common asm() constraint to use; it triggers an assertion in assert-enabled builds of LLVM that it's not recognized when targeting aarch64 (though it appears to fall back to "m"). It's fixed in LLVM 13 now, but there isn't really a good reason to use "o" in particular here. To avoid causing build issues for those using assert-enabled builds of earlier LLVM versions, the constraint needs changing. Instead, if the point is to retain the __builtin_alloca(), make ptr appear to "escape" via being an input to an empty inline asm block. This is preferable anyways, since otherwise this looks like a dead store. While the use of "r" was considered in https://lore.kernel.org/lkml/202104011447.2E7F543@keescook/ it was only tested as an output (which looks like a dead store, and wasn't sufficient). Use "r" as an input constraint instead, which behaves correctly across compilers and architectures. Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Signed-off-by: Nick Desaulniers Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Tested-by: Kees Cook Tested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://reviews.llvm.org/D100412 Link: https://bugs.llvm.org/show_bug.cgi?id=49956 Link: https://lore.kernel.org/r/20210419231741.4084415-1-keescook@chromium.org --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index fd80fab663a9..bebc911161b6 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -38,7 +38,7 @@ void *__builtin_alloca(size_t size); u32 offset = raw_cpu_read(kstack_offset); \ u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. */ \ - asm volatile("" : "=o"(*ptr) :: "memory"); \ + asm volatile("" :: "r"(ptr) : "memory"); \ } \ } while (0) -- cgit v1.2.3 From 35f3f8504c3b60a1ae5576e178b27fc0ddd6157d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 16:12:42 +0300 Subject: spi: Switch to signed types for *_native_cs SPI controller fields While fixing undefined behaviour the commit f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs") missed the case when all CSs are GPIOs and thus unused_native_cs will be evaluated to -1 in unsigned representation. This will falsely trigger a condition in the spi_get_gpio_descs(). Switch to signed types for *_native_cs SPI controller fields to fix above. Fixes: f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510131242.49455-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 360a3bc767ca..74239d65c7fd 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -644,8 +644,8 @@ struct spi_controller { int *cs_gpios; struct gpio_desc **cs_gpiods; bool use_gpio_descriptors; - u8 unused_native_cs; - u8 max_native_cs; + s8 unused_native_cs; + s8 max_native_cs; /* statistics */ struct spi_statistics statistics; -- cgit v1.2.3 From efed9a3337e341bd0989161b97453b52567bc59d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 10 May 2021 17:05:35 -0700 Subject: kyber: fix out of bounds access when preempted __blk_mq_sched_bio_merge() gets the ctx and hctx for the current CPU and passes the hctx to ->bio_merge(). kyber_bio_merge() then gets the ctx for the current CPU again and uses that to get the corresponding Kyber context in the passed hctx. However, the thread may be preempted between the two calls to blk_mq_get_ctx(), and the ctx returned the second time may no longer correspond to the passed hctx. This "works" accidentally most of the time, but it can cause us to read garbage if the second ctx came from an hctx with more ctx's than the first one (i.e., if ctx->index_hw[hctx->type] > hctx->nr_ctx). This manifested as this UBSAN array index out of bounds error reported by Jakub: UBSAN: array-index-out-of-bounds in ../kernel/locking/qspinlock.c:130:9 index 13106 is out of range for type 'long unsigned int [128]' Call Trace: dump_stack+0xa4/0xe5 ubsan_epilogue+0x5/0x40 __ubsan_handle_out_of_bounds.cold.13+0x2a/0x34 queued_spin_lock_slowpath+0x476/0x480 do_raw_spin_lock+0x1c2/0x1d0 kyber_bio_merge+0x112/0x180 blk_mq_submit_bio+0x1f5/0x1100 submit_bio_noacct+0x7b0/0x870 submit_bio+0xc2/0x3a0 btrfs_map_bio+0x4f0/0x9d0 btrfs_submit_data_bio+0x24e/0x310 submit_one_bio+0x7f/0xb0 submit_extent_page+0xc4/0x440 __extent_writepage_io+0x2b8/0x5e0 __extent_writepage+0x28d/0x6e0 extent_write_cache_pages+0x4d7/0x7a0 extent_writepages+0xa2/0x110 do_writepages+0x8f/0x180 __writeback_single_inode+0x99/0x7f0 writeback_sb_inodes+0x34e/0x790 __writeback_inodes_wb+0x9e/0x120 wb_writeback+0x4d2/0x660 wb_workfn+0x64d/0xa10 process_one_work+0x53a/0xa80 worker_thread+0x69/0x5b0 kthread+0x20b/0x240 ret_from_fork+0x1f/0x30 Only Kyber uses the hctx, so fix it by passing the request_queue to ->bio_merge() instead. BFQ and mq-deadline just use that, and Kyber can map the queues itself to avoid the mismatch. Fixes: a6088845c2bf ("block: kyber: make kyber more friendly with merging") Reported-by: Jakub Kicinski Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/c7598605401a48d5cfeadebb678abd10af22b83f.1620691329.git.osandov@fb.com Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1fe8e105b83b..dcb2f9022c1d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -34,7 +34,7 @@ struct elevator_mq_ops { void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *, unsigned int); + bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); -- cgit v1.2.3 From a1d5ff5651ea592c67054233b14b30bf4452999c Mon Sep 17 00:00:00 2001 From: Mathy Vanhoef Date: Tue, 11 May 2021 20:02:44 +0200 Subject: mac80211: properly handle A-MSDUs that start with an RFC 1042 header Properly parse A-MSDUs whose first 6 bytes happen to equal a rfc1042 header. This can occur in practice when the destination MAC address equals AA:AA:03:00:00:00. More importantly, this simplifies the next patch to mitigate A-MSDU injection attacks. Cc: stable@vger.kernel.org Signed-off-by: Mathy Vanhoef Link: https://lore.kernel.org/r/20210511200110.0b2b886492f0.I23dd5d685fe16d3b0ec8106e8f01b59f499dffed@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5224f885a99a..58c2cd417e89 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5760,7 +5760,7 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); */ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, - u8 data_offset); + u8 data_offset, bool is_amsdu); /** * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3 @@ -5772,7 +5772,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, enum nl80211_iftype iftype) { - return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype, 0); + return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype, 0, false); } /** -- cgit v1.2.3 From 47c1131633ef6210add63b8b5704497023a3462a Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 12 May 2021 08:09:08 +0900 Subject: ASoC: soc-dai.h: Align the word of comment for SND_SOC_DAIFMT_CBC_CFC Let's use "consumer" instead of "follower". Signed-off-by: Kuninori Morimoto Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/8735usc1gr.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 1358a0ceb4d0..0bc29c4516e7 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -81,7 +81,7 @@ struct snd_compr_stream; #define SND_SOC_DAIFMT_CBP_CFP (1 << 12) /* codec clk provider & frame provider */ #define SND_SOC_DAIFMT_CBC_CFP (2 << 12) /* codec clk consumer & frame provider */ #define SND_SOC_DAIFMT_CBP_CFC (3 << 12) /* codec clk provider & frame consumer */ -#define SND_SOC_DAIFMT_CBC_CFC (4 << 12) /* codec clk consumer & frame follower */ +#define SND_SOC_DAIFMT_CBC_CFC (4 << 12) /* codec clk consumer & frame consumer */ /* previous definitions kept for backwards-compatibility, do not use in new contributions */ #define SND_SOC_DAIFMT_CBM_CFM SND_SOC_DAIFMT_CBP_CFP -- cgit v1.2.3 From 190515f610946db025cdedebde93958b725fb583 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Wed, 12 May 2021 18:01:24 +0800 Subject: blkdev.h: remove unused codes blk_account_rq Last users of blk_account_rq gone with patch commit a1ce35fa49852db ("block: remove dead elevator code") and now it gets no caller, it can be safely removed. Signed-off-by: Lin Feng Link: https://lore.kernel.org/r/20210512100124.173769-1-linf@wangsu.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365..26c3e368656f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -677,11 +677,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -static inline bool blk_account_rq(struct request *rq) -{ - return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); -} - #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) -- cgit v1.2.3 From 681865a03d3ec6ac3dda147044ed2a1a0f49f7bf Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 19 Apr 2021 19:27:25 +0800 Subject: libnvdimm: Remove duplicate struct declaration struct device is declared at 133rd line. The second declaration is unnecessary, remove it. Signed-off-by: Wan Jiabing Link: https://lore.kernel.org/r/20210419112725.42145-1-wanjiabing@vivo.com Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 01f251b6e36c..89b69e645ac7 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -141,7 +141,6 @@ static inline void __iomem *devm_nvdimm_ioremap(struct device *dev, struct nvdimm_bus; struct module; -struct device; struct nd_blk_region; struct nd_blk_region_desc { int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); -- cgit v1.2.3 From 098116e7e640ba677d9e345cbee83d253c13d556 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 May 2021 10:35:21 +0200 Subject: net: really orphan skbs tied to closing sk If the owing socket is shutting down - e.g. the sock reference count already dropped to 0 and only sk_wmem_alloc is keeping the sock alive, skb_orphan_partial() becomes a no-op. When forwarding packets over veth with GRO enabled, the above causes refcount errors. This change addresses the issue with a plain skb_orphan() call in the critical scenario. Fixes: 9adc89af724f ("net: let skb_orphan_partial wake-up waiters.") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sock.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 42bc5e1a627f..0e962d8bc73b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2231,13 +2231,15 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) sk_mem_charge(sk, skb->truesize); } -static inline void skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) +static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; + return true; } + return false; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, -- cgit v1.2.3 From 860dafa902595fb5f1d23bbcce1215188c3341e6 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 13 May 2021 11:51:50 +0200 Subject: vt: Fix character height handling with VT_RESIZEX Restore the original intent of the VT_RESIZEX ioctl's `v_clin' parameter which is the number of pixel rows per character (cell) rather than the height of the font used. For framebuffer devices the two values are always the same, because the former is inferred from the latter one. For VGA used as a true text mode device these two parameters are independent from each other: the number of pixel rows per character is set in the CRT controller, while font height is in fact hardwired to 32 pixel rows and fonts of heights below that value are handled by padding their data with blanks when loaded to hardware for use by the character generator. One can change the setting in the CRT controller and it will update the screen contents accordingly regardless of the font loaded. The `v_clin' parameter is used by the `vgacon' driver to set the height of the character cell and then the cursor position within. Make the parameter explicit then, by defining a new `vc_cell_height' struct member of `vc_data', set it instead of `vc_font.height' from `v_clin' in the VT_RESIZEX ioctl, and then use it throughout the `vgacon' driver except where actual font data is accessed which as noted above is independent from the CRTC setting. This way the framebuffer console driver is free to ignore the `v_clin' parameter as irrelevant, as it always should have, avoiding any issues attempts to give the parameter a meaning there could have caused, such as one that has led to commit 988d0763361b ("vt_ioctl: make VT_RESIZEX behave like VT_RESIZE"): "syzbot is reporting UAF/OOB read at bit_putcs()/soft_cursor() [1][2], for vt_resizex() from ioctl(VT_RESIZEX) allows setting font height larger than actual font height calculated by con_font_set() from ioctl(PIO_FONT). Since fbcon_set_font() from con_font_set() allocates minimal amount of memory based on actual font height calculated by con_font_set(), use of vt_resizex() can cause UAF/OOB read for font data." The problem first appeared around Linux 2.5.66 which predates our repo history, but the origin could be identified with the old MIPS/Linux repo also at: as commit 9736a3546de7 ("Merge with Linux 2.5.66."), where VT_RESIZEX code in `vt_ioctl' was updated as follows: if (clin) - video_font_height = clin; + vc->vc_font.height = clin; making the parameter apply to framebuffer devices as well, perhaps due to the use of "font" in the name of the original `video_font_height' variable. Use "cell" in the new struct member then to avoid ambiguity. References: [1] https://syzkaller.appspot.com/bug?id=32577e96d88447ded2d3b76d71254fb855245837 [2] https://syzkaller.appspot.com/bug?id=6b8355d27b2b94fb5cedf4655e3a59162d9e48e3 Signed-off-by: Maciej W. Rozycki Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org # v2.6.12+ Signed-off-by: Linus Torvalds --- include/linux/console_struct.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 153734816b49..d5b9c8d40c18 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -101,6 +101,7 @@ struct vc_data { unsigned int vc_rows; unsigned int vc_size_row; /* Bytes per row */ unsigned int vc_scan_lines; /* # of scan lines */ + unsigned int vc_cell_height; /* CRTC character cell height */ unsigned long vc_origin; /* [!] Start of real screen */ unsigned long vc_scr_end; /* [!] End of real screen */ unsigned long vc_visible_origin; /* [!] Top of visible window */ -- cgit v1.2.3 From 640d1eaff2c09e382a23bd831094ebbfaa16fef5 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 4 May 2021 16:22:34 -0600 Subject: dyndbg: avoid calling dyndbg_emit_prefix when it has no work Wrap function in a static-inline one, which checks flags to avoid calling the function unnecessarily. And hoist its output-buffer initialization to the grand-caller, which is already allocating the buffer on the stack, and can trivially initialize it too. Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20210504222235.1033685-2-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index a57ee75342cf..dce631e678dd 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -32,6 +32,11 @@ struct _ddebug { #define _DPRINTK_FLAGS_INCL_FUNCNAME (1<<2) #define _DPRINTK_FLAGS_INCL_LINENO (1<<3) #define _DPRINTK_FLAGS_INCL_TID (1<<4) + +#define _DPRINTK_FLAGS_INCL_ANY \ + (_DPRINTK_FLAGS_INCL_MODNAME | _DPRINTK_FLAGS_INCL_FUNCNAME |\ + _DPRINTK_FLAGS_INCL_LINENO | _DPRINTK_FLAGS_INCL_TID) + #if defined DEBUG #define _DPRINTK_FLAGS_DEFAULT _DPRINTK_FLAGS_PRINT #else -- cgit v1.2.3 From c07531c01d8284aedaf95708ea90e76d11af0e21 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 10 May 2021 14:50:24 +0300 Subject: netfilter: flowtable: Remove redundant hw refresh bit Offloading conns could fail for multiple reasons and a hw refresh bit is set to try to reoffload it in next sw packet. But it could be in some cases and future points that the hw refresh bit is not set but a refresh could succeed. Remove the hw refresh bit and do offload refresh if requested. There won't be a new work entry if a work is already pending anyway as there is the hw pending bit. Fixes: 8b3646d6e0c4 ("net/sched: act_ct: Support refreshing the flow table entries") Signed-off-by: Roi Dayan Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 51d8eb99764d..48ef7460ff30 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -157,7 +157,6 @@ enum nf_flow_flags { NF_FLOW_HW, NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, - NF_FLOW_HW_REFRESH, NF_FLOW_HW_PENDING, }; -- cgit v1.2.3 From cb6f6b3384d7825d2a43f2256c5200e3b3956fc8 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 12 May 2021 13:18:21 -0700 Subject: xen/arm: move xen_swiotlb_detect to arm/swiotlb-xen.h Move xen_swiotlb_detect to a static inline function to make it available to !CONFIG_XEN builds. CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Stefano Stabellini Reviewed-by: Christoph Hellwig Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210512201823.1963-1-sstabellini@kernel.org Signed-off-by: Juergen Gross --- include/xen/arm/swiotlb-xen.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/arm/swiotlb-xen.h b/include/xen/arm/swiotlb-xen.h index 2994fe6031a0..33336ab58afc 100644 --- a/include/xen/arm/swiotlb-xen.h +++ b/include/xen/arm/swiotlb-xen.h @@ -2,6 +2,19 @@ #ifndef _ASM_ARM_SWIOTLB_XEN_H #define _ASM_ARM_SWIOTLB_XEN_H -extern int xen_swiotlb_detect(void); +#include +#include + +static inline int xen_swiotlb_detect(void) +{ + if (!xen_domain()) + return 0; + if (xen_feature(XENFEAT_direct_mapped)) + return 1; + /* legacy case */ + if (!xen_feature(XENFEAT_not_direct_mapped) && xen_initial_domain()) + return 1; + return 0; +} #endif /* _ASM_ARM_SWIOTLB_XEN_H */ -- cgit v1.2.3 From a90c57f2cedd52a511f739fb55e6244e22e1a2fb Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 14 May 2021 11:16:59 +0800 Subject: net: sched: fix packet stuck problem for lockless qdisc Lockless qdisc has below concurrent problem: cpu0 cpu1 . . q->enqueue . . . qdisc_run_begin() . . . dequeue_skb() . . . sch_direct_xmit() . . . . q->enqueue . qdisc_run_begin() . return and do nothing . . qdisc_run_end() . cpu1 enqueue a skb without calling __qdisc_run() because cpu0 has not released the lock yet and spin_trylock() return false for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb enqueued by cpu1 when calling dequeue_skb() because cpu1 may enqueue the skb after cpu0 calling dequeue_skb() and before cpu0 calling qdisc_run_end(). Lockless qdisc has below another concurrent problem when tx_action is involved: cpu0(serving tx_action) cpu1 cpu2 . . . . q->enqueue . . qdisc_run_begin() . . dequeue_skb() . . . q->enqueue . . . . sch_direct_xmit() . . . qdisc_run_begin() . . return and do nothing . . . clear __QDISC_STATE_SCHED . . qdisc_run_begin() . . return and do nothing . . . . . . qdisc_run_end() . This patch fixes the above data race by: 1. If the first spin_trylock() return false and STATE_MISSED is not set, set STATE_MISSED and retry another spin_trylock() in case other CPU may not see STATE_MISSED after it releases the lock. 2. reschedule if STATE_MISSED is set after the lock is released at the end of qdisc_run_end(). For tx_action case, STATE_MISSED is also set when cpu1 is at the end if qdisc_run_end(), so tx_action will be rescheduled again to dequeue the skb enqueued by cpu2. Clear STATE_MISSED before retrying a dequeuing when dequeuing returns NULL in order to reduce the overhead of the second spin_trylock() and __netif_schedule() calling. Also clear the STATE_MISSED before calling __netif_schedule() at the end of qdisc_run_end() to avoid doing another round of dequeuing in the pfifo_fast_dequeue(). The performance impact of this patch, tested using pktgen and dummy netdev with pfifo_fast qdisc attached: threads without+this_patch with+this_patch delta 1 2.61Mpps 2.60Mpps -0.3% 2 3.97Mpps 3.82Mpps -3.7% 4 5.62Mpps 5.59Mpps -0.5% 8 2.78Mpps 2.77Mpps -0.3% 16 2.22Mpps 2.22Mpps -0.0% Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking") Acked-by: Jakub Kicinski Tested-by: Juergen Gross Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- include/net/sch_generic.h | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f7a6e14491fb..1e625519ae96 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,7 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, + __QDISC_STATE_MISSED, }; struct qdisc_size_table { @@ -159,8 +160,33 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { + if (spin_trylock(&qdisc->seqlock)) + goto nolock_empty; + + /* If the MISSED flag is set, it means other thread has + * set the MISSED flag before second spin_trylock(), so + * we can return false here to avoid multi cpus doing + * the set_bit() and second spin_trylock() concurrently. + */ + if (test_bit(__QDISC_STATE_MISSED, &qdisc->state)) + return false; + + /* Set the MISSED flag before the second spin_trylock(), + * if the second spin_trylock() return false, it means + * other cpu holding the lock will do dequeuing for us + * or it will see the MISSED flag set after releasing + * lock and reschedule the net_tx_action() to do the + * dequeuing. + */ + set_bit(__QDISC_STATE_MISSED, &qdisc->state); + + /* Retry again in case other CPU may not see the new flag + * after it releases the lock at the end of qdisc_run_end(). + */ if (!spin_trylock(&qdisc->seqlock)) return false; + +nolock_empty: WRITE_ONCE(qdisc->empty, false); } else if (qdisc_is_running(qdisc)) { return false; @@ -176,8 +202,15 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) static inline void qdisc_run_end(struct Qdisc *qdisc) { write_seqcount_end(&qdisc->running); - if (qdisc->flags & TCQ_F_NOLOCK) + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); + + if (unlikely(test_bit(__QDISC_STATE_MISSED, + &qdisc->state))) { + clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + __netif_schedule(qdisc); + } + } } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) -- cgit v1.2.3 From 102b55ee92f9fda4dde7a45d2b20538e6e3e3d1e Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 14 May 2021 11:17:00 +0800 Subject: net: sched: fix tx action rescheduling issue during deactivation Currently qdisc_run() checks the STATE_DEACTIVATED of lockless qdisc before calling __qdisc_run(), which ultimately clear the STATE_MISSED when all the skb is dequeued. If STATE_DEACTIVATED is set before clearing STATE_MISSED, there may be rescheduling of net_tx_action() at the end of qdisc_run_end(), see below: CPU0(net_tx_atcion) CPU1(__dev_xmit_skb) CPU2(dev_deactivate) . . . . set STATE_MISSED . . __netif_schedule() . . . set STATE_DEACTIVATED . . qdisc_reset() . . . .<--------------- . synchronize_net() clear __QDISC_STATE_SCHED | . . . | . . . | . some_qdisc_is_busy() . | . return *false* . | . . test STATE_DEACTIVATED | . . __qdisc_run() *not* called | . . . | . . test STATE_MISS | . . __netif_schedule()--------| . . . . . . . . __qdisc_run() is not called by net_tx_atcion() in CPU0 because CPU2 has set STATE_DEACTIVATED flag during dev_deactivate(), and STATE_MISSED is only cleared in __qdisc_run(), __netif_schedule is called at the end of qdisc_run_end(), causing tx action rescheduling problem. qdisc_run() called by net_tx_action() runs in the softirq context, which should has the same semantic as the qdisc_run() called by __dev_xmit_skb() protected by rcu_read_lock_bh(). And there is a synchronize_net() between STATE_DEACTIVATED flag being set and qdisc_reset()/some_qdisc_is_busy in dev_deactivate(), we can safely bail out for the deactived lockless qdisc in net_tx_action(), and qdisc_reset() will reset all skb not dequeued yet. So add the rcu_read_lock() explicitly to protect the qdisc_run() and do the STATE_DEACTIVATED checking in net_tx_action() before calling qdisc_run_begin(). Another option is to do the checking in the qdisc_run_end(), but it will add unnecessary overhead for non-tx_action case, because __dev_queue_xmit() will not see qdisc with STATE_DEACTIVATED after synchronize_net(), the qdisc with STATE_DEACTIVATED can only be seen by net_tx_action() because of __netif_schedule(). The STATE_DEACTIVATED checking in qdisc_run() is to avoid race between net_tx_action() and qdisc_reset(), see: commit d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc"). As the bailout added above for deactived lockless qdisc in net_tx_action() provides better protection for the race without calling qdisc_run() at all, so remove the STATE_DEACTIVATED checking in qdisc_run(). After qdisc_reset(), there is no skb in qdisc to be dequeued, so clear the STATE_MISSED in dev_reset_queue() too. Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking") Acked-by: Jakub Kicinski Signed-off-by: Yunsheng Lin V8: Clearing STATE_MISSED before calling __netif_schedule() has avoid the endless rescheduling problem, but there may still be a unnecessary rescheduling, so adjust the commit log. Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f5c1bee0cd6a..6d7b12cba015 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -128,12 +128,7 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { - /* NOLOCK qdisc must check 'state' under the qdisc seqlock - * to avoid racing with dev_qdisc_reset() - */ - if (!(q->flags & TCQ_F_NOLOCK) || - likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - __qdisc_run(q); + __qdisc_run(q); qdisc_run_end(q); } } -- cgit v1.2.3 From 22247efd822e6d263f3c8bd327f3f769aea9b1d9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 May 2021 17:27:04 -0700 Subject: mm/hugetlb: fix F_SEAL_FUTURE_WRITE Patch series "mm/hugetlb: Fix issues on file sealing and fork", v2. Hugh reported issue with F_SEAL_FUTURE_WRITE not applied correctly to hugetlbfs, which I can easily verify using the memfd_test program, which seems that the program is hardly run with hugetlbfs pages (as by default shmem). Meanwhile I found another probably even more severe issue on that hugetlb fork won't wr-protect child cow pages, so child can potentially write to parent private pages. Patch 2 addresses that. After this series applied, "memfd_test hugetlbfs" should start to pass. This patch (of 2): F_SEAL_FUTURE_WRITE is missing for hugetlb starting from the first day. There is a test program for that and it fails constantly. $ ./memfd_test hugetlbfs memfd-hugetlb: CREATE memfd-hugetlb: BASIC memfd-hugetlb: SEAL-WRITE memfd-hugetlb: SEAL-FUTURE-WRITE mmap() didn't fail as expected Aborted (core dumped) I think it's probably because no one is really running the hugetlbfs test. Fix it by checking FUTURE_WRITE also in hugetlbfs_file_mmap() as what we do in shmem_mmap(). Generalize a helper for that. Link: https://lkml.kernel.org/r/20210503234356.9097-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210503234356.9097-2-peterx@redhat.com Fixes: ab3948f58ff84 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") Signed-off-by: Peter Xu Reported-by: Hugh Dickins Reviewed-by: Mike Kravetz Cc: Joel Fernandes (Google) Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 322ec61d0da7..c274f75efcf9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3216,5 +3216,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +/** + * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * @seals: the seals to check + * @vma: the vma to operate on + * + * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on + * the vma flags. Return 0 if check pass, or <0 for errors. + */ +static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) +{ + if (seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (vma->vm_flags & VM_SHARED) + vma->vm_flags &= ~(VM_MAYWRITE); + } + + return 0; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From 9ddb3c14afba8bc5950ed297f02d4ae05ff35cd1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 May 2021 17:27:24 -0700 Subject: mm: fix struct page layout on 32-bit systems 32-bit architectures which expect 8-byte alignment for 8-byte integers and need 64-bit DMA addresses (arm, mips, ppc) had their struct page inadvertently expanded in 2019. When the dma_addr_t was added, it forced the alignment of the union to 8 bytes, which inserted a 4 byte gap between 'flags' and the union. Fix this by storing the dma_addr_t in one or two adjacent unsigned longs. This restores the alignment to that of an unsigned long. We always store the low bits in the first word to prevent the PageTail bit from being inadvertently set on a big endian platform. If that happened, get_user_pages_fast() racing against a page which was freed and reallocated to the page_pool could dereference a bogus compound_head(), which would be hard to trace back to this cause. Link: https://lkml.kernel.org/r/20210510153211.1504886-1-willy@infradead.org Fixes: c25fff7171be ("mm: add dma_addr_t to struct page") Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Acked-by: Vlastimil Babka Tested-by: Matteo Croce Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- include/net/page_pool.h | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6613b26a8894..5aacc1c10a45 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -97,10 +97,10 @@ struct page { }; struct { /* page_pool used by netstack */ /** - * @dma_addr: might require a 64-bit value even on + * @dma_addr: might require a 64-bit value on * 32-bit architectures. */ - dma_addr_t dma_addr; + unsigned long dma_addr[2]; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 6d517a37c18b..b4b6de909c93 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - return page->dma_addr; + dma_addr_t ret = page->dma_addr[0]; + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; + return ret; +} + +static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + page->dma_addr[0] = addr; + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + page->dma_addr[1] = upper_32_bits(addr); } static inline bool is_page_pool_compiled_in(void) -- cgit v1.2.3 From 076171a67789ad0107de44c2964f2e46a7d0d7b8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 May 2021 17:27:30 -0700 Subject: mm/filemap: fix readahead return types A readahead request will not allocate more memory than can be represented by a size_t, even on systems that have HIGHMEM available. Change the length functions from returning an loff_t to a size_t. Link: https://lkml.kernel.org/r/20210510201201.1558972-1-willy@infradead.org Fixes: 32c0a6bcaa1f57 ("btrfs: add and use readahead_batch_length") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Reported-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a4bd41128bf3..e89df447fae3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -997,9 +997,9 @@ static inline loff_t readahead_pos(struct readahead_control *rac) * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ -static inline loff_t readahead_length(struct readahead_control *rac) +static inline size_t readahead_length(struct readahead_control *rac) { - return (loff_t)rac->_nr_pages * PAGE_SIZE; + return rac->_nr_pages * PAGE_SIZE; } /** @@ -1024,7 +1024,7 @@ static inline unsigned int readahead_count(struct readahead_control *rac) * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ -static inline loff_t readahead_batch_length(struct readahead_control *rac) +static inline size_t readahead_batch_length(struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } -- cgit v1.2.3 From e0652f8bb44d6294eeeac06d703185357f25d50b Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Sat, 15 May 2021 07:29:06 +0800 Subject: NFC: nci: fix memory leak in nci_allocate_device nfcmrvl_disconnect fails to free the hci_dev field in struct nci_dev. Fix this by freeing hci_dev in nci_free_device. BUG: memory leak unreferenced object 0xffff888111ea6800 (size 1024): comm "kworker/1:0", pid 19, jiffies 4294942308 (age 13.580s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 60 fd 0c 81 88 ff ff .........`...... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000004bc25d43>] kmalloc include/linux/slab.h:552 [inline] [<000000004bc25d43>] kzalloc include/linux/slab.h:682 [inline] [<000000004bc25d43>] nci_hci_allocate+0x21/0xd0 net/nfc/nci/hci.c:784 [<00000000c59cff92>] nci_allocate_device net/nfc/nci/core.c:1170 [inline] [<00000000c59cff92>] nci_allocate_device+0x10b/0x160 net/nfc/nci/core.c:1132 [<00000000006e0a8e>] nfcmrvl_nci_register_dev+0x10a/0x1c0 drivers/nfc/nfcmrvl/main.c:153 [<000000004da1b57e>] nfcmrvl_probe+0x223/0x290 drivers/nfc/nfcmrvl/usb.c:345 [<00000000d506aed9>] usb_probe_interface+0x177/0x370 drivers/usb/core/driver.c:396 [<00000000bc632c92>] really_probe+0x159/0x4a0 drivers/base/dd.c:554 [<00000000f5009125>] driver_probe_device+0x84/0x100 drivers/base/dd.c:740 [<000000000ce658ca>] __device_attach_driver+0xee/0x110 drivers/base/dd.c:846 [<000000007067d05f>] bus_for_each_drv+0xb7/0x100 drivers/base/bus.c:431 [<00000000f8e13372>] __device_attach+0x122/0x250 drivers/base/dd.c:914 [<000000009cf68860>] bus_probe_device+0xc6/0xe0 drivers/base/bus.c:491 [<00000000359c965a>] device_add+0x5be/0xc30 drivers/base/core.c:3109 [<00000000086e4bd3>] usb_set_configuration+0x9d9/0xb90 drivers/usb/core/message.c:2164 [<00000000ca036872>] usb_generic_driver_probe+0x8c/0xc0 drivers/usb/core/generic.c:238 [<00000000d40d36f6>] usb_probe_device+0x5c/0x140 drivers/usb/core/driver.c:293 [<00000000bc632c92>] really_probe+0x159/0x4a0 drivers/base/dd.c:554 Reported-by: syzbot+19bcfc64a8df1318d1c3@syzkaller.appspotmail.com Fixes: 11f54f228643 ("NFC: nci: Add HCI over NCI protocol support") Signed-off-by: Dongliang Mu Signed-off-by: David S. Miller --- include/net/nfc/nci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index bd76e8e082c0..1df0f8074c9d 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -298,6 +298,7 @@ int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, struct sk_buff **resp); struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev); +void nci_hci_deallocate(struct nci_dev *ndev); int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event, const u8 *param, size_t param_len); int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, -- cgit v1.2.3 From add0b32ef9146a8559a60aed54c37692a5f9d34f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:06:01 -0500 Subject: siginfo: Move si_trapno inside the union inside _si_fault It turns out that linux uses si_trapno very sparingly, and as such it can be considered extra information for a very narrow selection of signals, rather than information that is present with every fault reported in siginfo. As such move si_trapno inside the union inside of _si_fault. This results in no change in placement, and makes it eaiser to extend _si_fault in the future as this reduces the number of special cases. In particular with si_trapno included in the union it is no longer a concern that the union must be pointer aligned on most architectures because the union follows immediately after si_addr which is a pointer. This change results in a difference in siginfo field placement on sparc and alpha for the fields si_addr_lsb, si_lower, si_upper, si_pkey, and si_perf. These architectures do not implement the signals that would use si_addr_lsb, si_lower, si_upper, si_pkey, and si_perf. Further these architecture have not yet implemented the userspace that would use si_perf. The point of this change is in fact to correct these placement issues before sparc or alpha grow userspace that cares. This change was discussed[1] and the agreement is that this change is currently safe. [1]: https://lkml.kernel.org/r/CAK8P3a0+uKYwL1NhY6Hvtieghba2hKYGD6hcKx5n8=4Gtt+pHA@mail.gmail.com Acked-by: Marco Elver v1: https://lkml.kernel.org/r/m1tunns7yf.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-5-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/compat.h | 5 ++--- include/uapi/asm-generic/siginfo.h | 7 ++----- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index f0d2dd35d408..6af7bef15e94 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -214,12 +214,11 @@ typedef struct compat_siginfo { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */ struct { compat_uptr_t _addr; /* faulting insn/memory ref. */ -#ifdef __ARCH_SI_TRAPNO - int _trapno; /* TRAP # which caused the signal */ -#endif #define __COMPAT_ADDR_BND_PKEY_PAD (__alignof__(compat_uptr_t) < sizeof(short) ? \ sizeof(short) : __alignof__(compat_uptr_t)) union { + /* used on alpha and sparc */ + int _trapno; /* TRAP # which caused the signal */ /* * used when si_code=BUS_MCEERR_AR or * used when si_code=BUS_MCEERR_AO diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 03d6f6d2c1fe..e663bf117b46 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -63,9 +63,6 @@ union __sifields { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */ struct { void __user *_addr; /* faulting insn/memory ref. */ -#ifdef __ARCH_SI_TRAPNO - int _trapno; /* TRAP # which caused the signal */ -#endif #ifdef __ia64__ int _imm; /* immediate value for "break" */ unsigned int _flags; /* see ia64 si_flags */ @@ -75,6 +72,8 @@ union __sifields { #define __ADDR_BND_PKEY_PAD (__alignof__(void *) < sizeof(short) ? \ sizeof(short) : __alignof__(void *)) union { + /* used on alpha and sparc */ + int _trapno; /* TRAP # which caused the signal */ /* * used when si_code=BUS_MCEERR_AR or * used when si_code=BUS_MCEERR_AO @@ -150,9 +149,7 @@ typedef struct siginfo { #define si_int _sifields._rt._sigval.sival_int #define si_ptr _sifields._rt._sigval.sival_ptr #define si_addr _sifields._sigfault._addr -#ifdef __ARCH_SI_TRAPNO #define si_trapno _sifields._sigfault._trapno -#endif #define si_addr_lsb _sifields._sigfault._addr_lsb #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper -- cgit v1.2.3 From 9abcabe3111811aeae0f3a14e159b14248631875 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:29:36 -0500 Subject: signal: Implement SIL_FAULT_TRAPNO Now that si_trapno is part of the union in _si_fault and available on all architectures, add SIL_FAULT_TRAPNO and update siginfo_layout to return SIL_FAULT_TRAPNO when the code assumes si_trapno is valid. There is room for future changes to reduce when si_trapno is valid but this is all that is needed to make si_trapno and the other members of the the union in _sigfault mutually exclusive. Update the code that uses siginfo_layout to deal with SIL_FAULT_TRAPNO and have the same code ignore si_trapno in in all other cases. v1: https://lkml.kernel.org/r/m1o8dvs7s7.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-6-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-2-ebiederm@xmission.com Reviewed-by: Marco Elver Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/signal.h b/include/linux/signal.h index 1e98548d7cf6..5160fd45e5ca 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -40,6 +40,7 @@ enum siginfo_layout { SIL_TIMER, SIL_POLL, SIL_FAULT, + SIL_FAULT_TRAPNO, SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, -- cgit v1.2.3 From af5eeab7e8e8c2f0fad10e4ab8cc8092012a2d5b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 2 May 2021 14:27:24 -0500 Subject: signal: Factor force_sig_perf out of perf_sigtrap Separate filling in siginfo for TRAP_PERF from deciding that siginal needs to be sent. There are enough little details that need to be correct when properly filling in siginfo_t that it is easy to make mistakes if filling in the siginfo_t is in the same function with other logic. So factor out force_sig_perf to reduce the cognative load of on reviewers, maintainers and implementors. v1: https://lkml.kernel.org/r/m17dkjqqxz.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-10-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-3-ebiederm@xmission.com Reviewed-by: Marco Elver Acked-by: Peter Zijlstra (Intel) Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3f6a0fcaa10c..7f4278fa21fe 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -326,6 +326,7 @@ int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); +int force_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); -- cgit v1.2.3 From 0683b53197b55343a166f1507086823030809a19 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 2 May 2021 17:28:31 -0500 Subject: signal: Deliver all of the siginfo perf data in _perf Don't abuse si_errno and deliver all of the perf data in _perf member of siginfo_t. Note: The data field in the perf data structures in a u64 to allow a pointer to be encoded without needed to implement a 32bit and 64bit version of the same structure. There already exists a 32bit and 64bit versions siginfo_t, and the 32bit version can not include a 64bit member as it only has 32bit alignment. So unsigned long is used in siginfo_t instead of a u64 as unsigned long can encode a pointer on all architectures linux supports. v1: https://lkml.kernel.org/r/m11rarqqx2.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210503203814.25487-10-ebiederm@xmission.com v3: https://lkml.kernel.org/r/20210505141101.11519-11-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-4-ebiederm@xmission.com Reviewed-by: Marco Elver Signed-off-by: "Eric W. Biederman" --- include/linux/compat.h | 5 ++++- include/uapi/asm-generic/siginfo.h | 8 ++++++-- include/uapi/linux/perf_event.h | 2 +- include/uapi/linux/signalfd.h | 4 ++-- 4 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 6af7bef15e94..a27fffaae121 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -236,7 +236,10 @@ typedef struct compat_siginfo { u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - compat_ulong_t _perf; + struct { + compat_ulong_t _data; + u32 _type; + } _perf; }; } _sigfault; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index e663bf117b46..5a3c221f4c9d 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -91,7 +91,10 @@ union __sifields { __u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - unsigned long _perf; + struct { + unsigned long _data; + __u32 _type; + } _perf; }; } _sigfault; @@ -154,7 +157,8 @@ typedef struct siginfo { #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper #define si_pkey _sifields._sigfault._addr_pkey._pkey -#define si_perf _sifields._sigfault._perf +#define si_perf_data _sifields._sigfault._perf._data +#define si_perf_type _sifields._sigfault._perf._type #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #define si_call_addr _sifields._sigsys._call_addr diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e54e639248c8..7b14753b3d38 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -464,7 +464,7 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via - * siginfo_t::si_perf, e.g. to permit user to identify the event. + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. */ __u64 sig_data; }; diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h index 7e333042c7e3..e78dddf433fc 100644 --- a/include/uapi/linux/signalfd.h +++ b/include/uapi/linux/signalfd.h @@ -39,8 +39,8 @@ struct signalfd_siginfo { __s32 ssi_syscall; __u64 ssi_call_addr; __u32 ssi_arch; - __u32 __pad3; - __u64 ssi_perf; + __u32 ssi_perf_type; + __u64 ssi_perf_data; /* * Pad strcture to 128 bytes. Remember to update the -- cgit v1.2.3 From 922e3013046b79b444c87eda5baf43afae1326a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 May 2021 12:52:43 -0500 Subject: signalfd: Remove SIL_PERF_EVENT fields from signalfd_siginfo With the addition of ssi_perf_data and ssi_perf_type struct signalfd_siginfo is dangerously close to running out of space. All that remains is just enough space for two additional 64bit fields. A practice of adding all possible siginfo_t fields into struct singalfd_siginfo can not be supported as adding the missing fields ssi_lower, ssi_upper, and ssi_pkey would require two 64bit fields and one 32bit fields. In practice the fields ssi_perf_data and ssi_perf_type can never be used by signalfd as the signal that generates them always delivers them synchronously to the thread that triggers them. Therefore until someone actually needs the fields ssi_perf_data and ssi_perf_type in signalfd_siginfo remove them. This leaves a bit more room for future expansion. v1: https://lkml.kernel.org/r/20210503203814.25487-12-ebiederm@xmission.com v2: https://lkml.kernel.org/r/20210505141101.11519-12-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-5-ebiederm@xmission.com Reviewed-by: Marco Elver Signed-off-by: "Eric W. Biederman" --- include/uapi/linux/signalfd.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h index e78dddf433fc..83429a05b698 100644 --- a/include/uapi/linux/signalfd.h +++ b/include/uapi/linux/signalfd.h @@ -39,8 +39,6 @@ struct signalfd_siginfo { __s32 ssi_syscall; __u64 ssi_call_addr; __u32 ssi_arch; - __u32 ssi_perf_type; - __u64 ssi_perf_data; /* * Pad strcture to 128 bytes. Remember to update the @@ -51,7 +49,7 @@ struct signalfd_siginfo { * comes out of a read(2) and we really don't want to have * a compat on read(2). */ - __u8 __pad[16]; + __u8 __pad[28]; }; -- cgit v1.2.3 From 3410fbcd47dc6479af4309febf760ccaa5efb472 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 12 May 2021 13:52:27 +0300 Subject: {net, RDMA}/mlx5: Fix override of log_max_qp by other device mlx5_core_dev holds pointer to static profile, hence when the log_max_qp of the profile is override by some device, then it effect all other mlx5 devices that share the same profile. Fix it by having a profile instance for every mlx5 device. Fixes: 883371c453b9 ("net/mlx5: Check FW limitations on log_max_qp before setting it") Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f8e8d7e90616..020a8f7fdbdd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -703,6 +703,27 @@ struct mlx5_hv_vhca; #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity)) #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) +enum { + MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, + MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, +}; + +enum { + MR_CACHE_LAST_STD_ENTRY = 20, + MLX5_IMR_MTT_CACHE_ENTRY, + MLX5_IMR_KSM_CACHE_ENTRY, + MAX_MR_CACHE_ENTRIES +}; + +struct mlx5_profile { + u64 mask; + u8 log_max_qp; + struct { + int size; + int limit; + } mr_cache[MAX_MR_CACHE_ENTRIES]; +}; + struct mlx5_core_dev { struct device *device; enum mlx5_coredev_type coredev_type; @@ -731,7 +752,7 @@ struct mlx5_core_dev { struct mutex intf_state_mutex; unsigned long intf_state; struct mlx5_priv priv; - struct mlx5_profile *profile; + struct mlx5_profile profile; u32 issi; struct mlx5e_resources mlx5e_res; struct mlx5_dm *dm; @@ -1083,18 +1104,6 @@ static inline u8 mlx5_mkey_variant(u32 mkey) return mkey & 0xff; } -enum { - MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, - MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, -}; - -enum { - MR_CACHE_LAST_STD_ENTRY = 20, - MLX5_IMR_MTT_CACHE_ENTRY, - MLX5_IMR_KSM_CACHE_ENTRY, - MAX_MR_CACHE_ENTRIES -}; - /* Async-atomic event notifier used by mlx5 core to forward FW * evetns recived from event queue to mlx5 consumers. * Optimise event queue dipatching. @@ -1148,15 +1157,6 @@ int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev, struct ib_device *device, struct rdma_netdev_alloc_params *params); -struct mlx5_profile { - u64 mask; - u8 log_max_qp; - struct { - int size; - int limit; - } mr_cache[MAX_MR_CACHE_ENTRIES]; -}; - enum { MLX5_PCI_DEV_IS_VF = 1 << 0, }; -- cgit v1.2.3 From 7c9f131f366ab414691907fa0407124ea2b2f3bc Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 22 Apr 2021 15:48:10 +0300 Subject: {net,vdpa}/mlx5: Configure interface MAC into mpfs L2 table net/mlx5: Expose MPFS configuration API MPFS is the multi physical function switch that bridges traffic between the physical port and any physical functions associated with it. The driver is required to add or remove MAC entries to properly forward incoming traffic to the correct physical function. We export the API to control MPFS so that other drivers, such as mlx5_vdpa are able to add MAC addresses of their network interfaces. The MAC address of the vdpa interface must be configured into the MPFS L2 address. Failing to do so could cause, in some NIC configurations, failure to forward packets to the vdpa network device instance. Fix this by adding calls to update the MPFS table. CC: CC: CC: Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mpfs.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/linux/mlx5/mpfs.h (limited to 'include') diff --git a/include/linux/mlx5/mpfs.h b/include/linux/mlx5/mpfs.h new file mode 100644 index 000000000000..bf700c8d5516 --- /dev/null +++ b/include/linux/mlx5/mpfs.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * Copyright (c) 2021 Mellanox Technologies Ltd. + */ + +#ifndef _MLX5_MPFS_ +#define _MLX5_MPFS_ + +struct mlx5_core_dev; + +#ifdef CONFIG_MLX5_MPFS +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac); +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac); +#else /* #ifndef CONFIG_MLX5_MPFS */ +static inline int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +static inline int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +#endif + +#endif -- cgit v1.2.3 From ba6e1d8422bd476ad79da409639a773c02f0cbad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 May 2021 22:04:36 +0200 Subject: platform/surface: aggregator: avoid clang -Wconstant-conversion warning Clang complains about the assignment of SSAM_ANY_IID to ssam_device_uid->instance: drivers/platform/surface/surface_aggregator_registry.c:478:25: error: implicit conversion from 'int' to '__u8' (aka 'unsigned char') changes value from 65535 to 255 [-Werror,-Wconstant-conversion] { SSAM_VDEV(HUB, 0x02, SSAM_ANY_IID, 0x00) }, ~ ^~~~~~~~~~~~ include/linux/surface_aggregator/device.h:71:23: note: expanded from macro 'SSAM_ANY_IID' #define SSAM_ANY_IID 0xffff ^~~~~~ include/linux/surface_aggregator/device.h:126:63: note: expanded from macro 'SSAM_VDEV' SSAM_DEVICE(SSAM_DOMAIN_VIRTUAL, SSAM_VIRTUAL_TC_##cat, tid, iid, fun) ^~~ include/linux/surface_aggregator/device.h:102:41: note: expanded from macro 'SSAM_DEVICE' .instance = ((iid) != SSAM_ANY_IID) ? (iid) : 0, \ ^~~ The assignment doesn't actually happen, but clang checks the type limits before checking whether this assignment is reached. Replace the ?: operator with a __builtin_choose_expr() invocation that avoids the warning for the untaken part. Fixes: eb0e90a82098 ("platform/surface: aggregator: Add dedicated bus and device type") Cc: platform-driver-x86@vger.kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Reviewed-by: Maximilian Luz Link: https://lore.kernel.org/r/20210514200453.1542978-1-arnd@kernel.org Signed-off-by: Hans de Goede --- include/linux/surface_aggregator/device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h index 4441ad667c3f..6ff9c58b3e17 100644 --- a/include/linux/surface_aggregator/device.h +++ b/include/linux/surface_aggregator/device.h @@ -98,9 +98,9 @@ struct ssam_device_uid { | (((fun) != SSAM_ANY_FUN) ? SSAM_MATCH_FUNCTION : 0), \ .domain = d, \ .category = cat, \ - .target = ((tid) != SSAM_ANY_TID) ? (tid) : 0, \ - .instance = ((iid) != SSAM_ANY_IID) ? (iid) : 0, \ - .function = ((fun) != SSAM_ANY_FUN) ? (fun) : 0 \ + .target = __builtin_choose_expr((tid) != SSAM_ANY_TID, (tid), 0), \ + .instance = __builtin_choose_expr((iid) != SSAM_ANY_IID, (iid), 0), \ + .function = __builtin_choose_expr((fun) != SSAM_ANY_FUN, (fun), 0) /** * SSAM_VDEV() - Initialize a &struct ssam_device_id as virtual device with -- cgit v1.2.3 From 6c60ff048ca1e0739f39aa25996543c6e662a46c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 May 2021 15:18:41 +0200 Subject: block: prevent block device lookups at the beginning of del_gendisk As an artifact of how gendisk lookup used to work in earlier kernels, GENHD_FL_UP is only cleared very late in del_gendisk, and a global lock is used to prevent opens from succeeding while del_gendisk is tearing down the gendisk. Switch to clearing the flag early and under bd_mutex so that callers can use bd_mutex to stabilize the flag, which removes the need for the global mutex. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210514131842.1600568-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7e9660ea967d..6fc26f7bdf71 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -306,8 +306,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ -extern struct rw_semaphore bdev_lookup_sem; - dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK -- cgit v1.2.3 From 80dd33cf72d1ab4f0af303f1fa242c6d6c8d328f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 May 2021 14:10:15 +0200 Subject: drivers: base: Fix device link removal When device_link_free() drops references to the supplier and consumer devices of the device link going away and the reference being dropped turns out to be the last one for any of those device objects, its ->release callback will be invoked and it may sleep which goes against the SRCU callback execution requirements. To address this issue, make the device link removal code carry out the device_link_free() actions preceded by SRCU synchronization from a separate work item (the "long" workqueue is used for that, because it does not matter when the device link memory is released and it may take time to get to that point) instead of using SRCU callbacks. While at it, make the code work analogously when SRCU is not enabled to reduce the differences between the SRCU and non-SRCU cases. Fixes: 843e600b8a2b ("driver core: Fix sleeping in invalid context during device link deletion") Cc: stable Reported-by: chenxiang (M) Tested-by: chenxiang (M) Reviewed-by: Saravana Kannan Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/5722787.lOV4Wx5bFT@kreacher Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 38a2071cf776..f1a00040fa53 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -570,7 +570,7 @@ struct device { * @flags: Link flags. * @rpm_active: Whether or not the consumer device is runtime-PM-active. * @kref: Count repeated addition of the same link. - * @rcu_head: An RCU head to use for deferred execution of SRCU callbacks. + * @rm_work: Work structure used for removing the link. * @supplier_preactivated: Supplier has been made active before consumer probe. */ struct device_link { @@ -583,9 +583,7 @@ struct device_link { u32 flags; refcount_t rpm_active; struct kref kref; -#ifdef CONFIG_SRCU - struct rcu_head rcu_head; -#endif + struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ }; -- cgit v1.2.3 From f747e6667ebb2ffb8133486c9cd19800d72b0d98 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sat, 22 May 2021 17:42:02 -0700 Subject: linux/bits.h: fix compilation error with GENMASK GENMASK() has an input check which uses __builtin_choose_expr() to enable a compile time sanity check of its inputs if they are known at compile time. However, it turns out that __builtin_constant_p() does not always return a compile time constant [0]. It was thought this problem was fixed with gcc 4.9 [1], but apparently this is not the case [2]. Switch to use __is_constexpr() instead which always returns a compile time constant, regardless of its inputs. Link: https://lore.kernel.org/lkml/42b4342b-aefc-a16a-0d43-9f9c0d63ba7a@rasmusvillemoes.dk [0] Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=19449 [1] Link: https://lore.kernel.org/lkml/1ac7bbc2-45d9-26ed-0b33-bf382b8d858b@I-love.SAKURA.ne.jp [2] Link: https://lkml.kernel.org/r/20210511203716.117010-1-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Reported-by: Tetsuo Handa Acked-by: Arnd Bergmann Reviewed-by: Andy Shevchenko Cc: Ard Biesheuvel Cc: Yury Norov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bits.h | 2 +- include/linux/const.h | 8 ++++++++ include/linux/minmax.h | 10 ++-------- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bits.h b/include/linux/bits.h index 7f475d59a097..87d112650dfb 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -22,7 +22,7 @@ #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ - __builtin_constant_p((l) > (h)), (l) > (h), 0))) + __is_constexpr((l) > (h)), (l) > (h), 0))) #else /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, diff --git a/include/linux/const.h b/include/linux/const.h index 81b8aae5a855..435ddd72d2c4 100644 --- a/include/linux/const.h +++ b/include/linux/const.h @@ -3,4 +3,12 @@ #include +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + #endif /* _LINUX_CONST_H */ diff --git a/include/linux/minmax.h b/include/linux/minmax.h index c0f57b0c64d9..5433c08fcc68 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -2,6 +2,8 @@ #ifndef _LINUX_MINMAX_H #define _LINUX_MINMAX_H +#include + /* * min()/max()/clamp() macros must accomplish three things: * @@ -17,14 +19,6 @@ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - #define __no_side_effects(x, y) \ (__is_constexpr(x) && __is_constexpr(y)) -- cgit v1.2.3 From 08b2b6fdf6b26032f025084ce2893924a0cdb4a2 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 24 May 2021 16:29:43 +0800 Subject: cgroup: fix spelling mistakes Fix some spelling mistakes in comments: hierarhcy ==> hierarchy automtically ==> automatically overriden ==> overridden In absense of .. or ==> In absence of .. and assocaited ==> associated taget ==> target initate ==> initiate succeded ==> succeeded curremt ==> current udpated ==> updated Signed-off-by: Zhen Lei Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 6 +++--- include/linux/cgroup.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 559ee05f86b2..fb8f6d2cd104 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -232,7 +232,7 @@ struct css_set { struct list_head task_iters; /* - * On the default hierarhcy, ->subsys[ssid] may point to a css + * On the default hierarchy, ->subsys[ssid] may point to a css * attached to an ancestor instead of the cgroup this css_set is * associated with. The following node is anchored at * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to @@ -668,7 +668,7 @@ struct cgroup_subsys { */ bool threaded:1; - /* the following two fields are initialized automtically during boot */ + /* the following two fields are initialized automatically during boot */ int id; const char *name; @@ -757,7 +757,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. * On boot, sock_cgroup_data records the cgroup that the sock was created * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overriden to carry prioidx and/or + * net_cls starts being used, the area is overridden to carry prioidx and/or * classid. The two modes are distinguished by whether the lowest bit is * set. Clear bit indicates cgroup pointer while set bit prioidx and * classid. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f2f79de083e..6bc9c76680b2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -32,7 +32,7 @@ struct kernel_clone_args; #ifdef CONFIG_CGROUPS /* - * All weight knobs on the default hierarhcy should use the following min, + * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ -- cgit v1.2.3 From 1cb61759d40716643281b8e0f8c7afebc8699249 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 May 2021 09:26:10 +0200 Subject: init: verify that function is initcall_t at compile-time In the spirit of making it hard to misuse an interface, add a compile-time assertion in the CONFIG_HAVE_ARCH_PREL32_RELOCATIONS case to verify the initcall function matches initcall_t, because the inline asm bypasses any type-checking the compiler would otherwise do. This will help developers catch incorrect API use in all configurations. A recent example of this is: https://lkml.kernel.org/r/20210514140015.2944744-1-arnd@kernel.org Signed-off-by: Marco Elver Reviewed-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Sami Tolvanen Tested-by: Paul E. McKenney Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210521072610.2880286-1-elver@google.com --- include/linux/init.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/init.h b/include/linux/init.h index 045ad1650ed1..d82b4b2e1d25 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -242,7 +242,8 @@ extern bool initcall_debug; asm(".section \"" __sec "\", \"a\" \n" \ __stringify(__name) ": \n" \ ".long " __stringify(__stub) " - . \n" \ - ".previous \n"); + ".previous \n"); \ + static_assert(__same_type(initcall_t, &fn)); #else #define ____define_initcall(fn, __unused, __name, __sec) \ static initcall_t __name __used \ -- cgit v1.2.3 From a8b98c808eab3ec8f1b5a64be967b0f4af4cae43 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 24 May 2021 16:53:21 +0300 Subject: fanotify: fix permission model of unprivileged group Reporting event->pid should depend on the privileges of the user that initialized the group, not the privileges of the user reading the events. Use an internal group flag FANOTIFY_UNPRIV to record the fact that the group was initialized by an unprivileged user. To be on the safe side, the premissions to setup filesystem and mount marks now require that both the user that initialized the group and the user setting up the mark have CAP_SYS_ADMIN. Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiA77_P5vtv7e83g0+9d7B5W9ZTE4GfQEYbWmfT1rA=VA@mail.gmail.com/ Fixes: 7cea2a3c505e ("fanotify: support limited functionality for unprivileged users") Cc: # v5.12+ Link: https://lore.kernel.org/r/20210524135321.2190062-1-amir73il@gmail.com Reviewed-by: Matthew Bobrowski Acked-by: Christian Brauner Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fanotify.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index bad41bcb25df..a16dbeced152 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -51,6 +51,10 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_INIT_FLAGS (FANOTIFY_ADMIN_INIT_FLAGS | \ FANOTIFY_USER_INIT_FLAGS) +/* Internal group flags */ +#define FANOTIFY_UNPRIV 0x80000000 +#define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV) + #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ FAN_MARK_FILESYSTEM) -- cgit v1.2.3 From 9453d45ecb6c2199d72e73c993e9d98677a2801b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 25 May 2021 16:21:52 +0300 Subject: net: zero-initialize tc skb extension on allocation Function skb_ext_add() doesn't initialize created skb extension with any value and leaves it up to the user. However, since extension of type TC_SKB_EXT originally contained only single value tc_skb_ext->chain its users used to just assign the chain value without setting whole extension memory to zero first. This assumption changed when TC_SKB_EXT extension was extended with additional fields but not all users were updated to initialize the new fields which leads to use of uninitialized memory afterwards. UBSAN log: [ 778.299821] UBSAN: invalid-load in net/openvswitch/flow.c:899:28 [ 778.301495] load of value 107 is not a valid value for type '_Bool' [ 778.303215] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc7+ #2 [ 778.304933] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 778.307901] Call Trace: [ 778.308680] [ 778.309358] dump_stack+0xbb/0x107 [ 778.310307] ubsan_epilogue+0x5/0x40 [ 778.311167] __ubsan_handle_load_invalid_value.cold+0x43/0x48 [ 778.312454] ? memset+0x20/0x40 [ 778.313230] ovs_flow_key_extract.cold+0xf/0x14 [openvswitch] [ 778.314532] ovs_vport_receive+0x19e/0x2e0 [openvswitch] [ 778.315749] ? ovs_vport_find_upcall_portid+0x330/0x330 [openvswitch] [ 778.317188] ? create_prof_cpu_mask+0x20/0x20 [ 778.318220] ? arch_stack_walk+0x82/0xf0 [ 778.319153] ? secondary_startup_64_no_verify+0xb0/0xbb [ 778.320399] ? stack_trace_save+0x91/0xc0 [ 778.321362] ? stack_trace_consume_entry+0x160/0x160 [ 778.322517] ? lock_release+0x52e/0x760 [ 778.323444] netdev_frame_hook+0x323/0x610 [openvswitch] [ 778.324668] ? ovs_netdev_get_vport+0xe0/0xe0 [openvswitch] [ 778.325950] __netif_receive_skb_core+0x771/0x2db0 [ 778.327067] ? lock_downgrade+0x6e0/0x6f0 [ 778.328021] ? lock_acquire+0x565/0x720 [ 778.328940] ? generic_xdp_tx+0x4f0/0x4f0 [ 778.329902] ? inet_gro_receive+0x2a7/0x10a0 [ 778.330914] ? lock_downgrade+0x6f0/0x6f0 [ 778.331867] ? udp4_gro_receive+0x4c4/0x13e0 [ 778.332876] ? lock_release+0x52e/0x760 [ 778.333808] ? dev_gro_receive+0xcc8/0x2380 [ 778.334810] ? lock_downgrade+0x6f0/0x6f0 [ 778.335769] __netif_receive_skb_list_core+0x295/0x820 [ 778.336955] ? process_backlog+0x780/0x780 [ 778.337941] ? mlx5e_rep_tc_netdevice_event_unregister+0x20/0x20 [mlx5_core] [ 778.339613] ? seqcount_lockdep_reader_access.constprop.0+0xa7/0xc0 [ 778.341033] ? kvm_clock_get_cycles+0x14/0x20 [ 778.342072] netif_receive_skb_list_internal+0x5f5/0xcb0 [ 778.343288] ? __kasan_kmalloc+0x7a/0x90 [ 778.344234] ? mlx5e_handle_rx_cqe_mpwrq+0x9e0/0x9e0 [mlx5_core] [ 778.345676] ? mlx5e_xmit_xdp_frame_mpwqe+0x14d0/0x14d0 [mlx5_core] [ 778.347140] ? __netif_receive_skb_list_core+0x820/0x820 [ 778.348351] ? mlx5e_post_rx_mpwqes+0xa6/0x25d0 [mlx5_core] [ 778.349688] ? napi_gro_flush+0x26c/0x3c0 [ 778.350641] napi_complete_done+0x188/0x6b0 [ 778.351627] mlx5e_napi_poll+0x373/0x1b80 [mlx5_core] [ 778.352853] __napi_poll+0x9f/0x510 [ 778.353704] ? mlx5_flow_namespace_set_mode+0x260/0x260 [mlx5_core] [ 778.355158] net_rx_action+0x34c/0xa40 [ 778.356060] ? napi_threaded_poll+0x3d0/0x3d0 [ 778.357083] ? sched_clock_cpu+0x18/0x190 [ 778.358041] ? __common_interrupt+0x8e/0x1a0 [ 778.359045] __do_softirq+0x1ce/0x984 [ 778.359938] __irq_exit_rcu+0x137/0x1d0 [ 778.360865] irq_exit_rcu+0xa/0x20 [ 778.361708] common_interrupt+0x80/0xa0 [ 778.362640] [ 778.363212] asm_common_interrupt+0x1e/0x40 [ 778.364204] RIP: 0010:native_safe_halt+0xe/0x10 [ 778.365273] Code: 4f ff ff ff 4c 89 e7 e8 50 3f 40 fe e9 dc fe ff ff 48 89 df e8 43 3f 40 fe eb 90 cc e9 07 00 00 00 0f 00 2d 74 05 62 00 fb f4 90 e9 07 00 00 00 0f 00 2d 64 05 62 00 f4 c3 cc cc 0f 1f 44 00 [ 778.369355] RSP: 0018:ffffffff84407e48 EFLAGS: 00000246 [ 778.370570] RAX: ffff88842de46a80 RBX: ffffffff84425840 RCX: ffffffff83418468 [ 778.372143] RDX: 000000000026f1da RSI: 0000000000000004 RDI: ffffffff8343af5e [ 778.373722] RBP: fffffbfff0884b08 R08: 0000000000000000 R09: ffff88842de46bcb [ 778.375292] R10: ffffed1085bc8d79 R11: 0000000000000001 R12: 0000000000000000 [ 778.376860] R13: ffffffff851124a0 R14: 0000000000000000 R15: dffffc0000000000 [ 778.378491] ? rcu_eqs_enter.constprop.0+0xb8/0xe0 [ 778.379606] ? default_idle_call+0x5e/0xe0 [ 778.380578] default_idle+0xa/0x10 [ 778.381406] default_idle_call+0x96/0xe0 [ 778.382350] do_idle+0x3d4/0x550 [ 778.383153] ? arch_cpu_idle_exit+0x40/0x40 [ 778.384143] cpu_startup_entry+0x19/0x20 [ 778.385078] start_kernel+0x3c7/0x3e5 [ 778.385978] secondary_startup_64_no_verify+0xb0/0xbb Fix the issue by providing new function tc_skb_ext_alloc() that allocates tc skb extension and initializes its memory to 0 before returning it to the caller. Change all existing users to use new API instead of calling skb_ext_add() directly. Fixes: 038ebb1a713d ("net/sched: act_ct: fix miss set mru for ovs after defrag in act_ct") Fixes: d29334c15d33 ("net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct") Signed-off-by: Vlad Buslov Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 255e4f4b521f..ec7823921bd2 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -709,6 +709,17 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->extack = extack; } +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) +{ + struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); + + if (tc_skb_ext) + memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); + return tc_skb_ext; +} +#endif + enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, -- cgit v1.2.3 From e86be3a04bc4aeaf12f93af35f08f8d4385bcd98 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 25 May 2021 18:43:38 -0400 Subject: SUNRPC: More fixes for backlog congestion Ensure that we fix the XPRT_CONGESTED starvation issue for RDMA as well as socket based transports. Ensure we always initialise the request after waking up from the backlog list. Fixes: e877a88d1f06 ("SUNRPC in case of backlog, hand free slots directly to waiting task") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d81fe8b364d0..61b622e334ee 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -368,6 +368,8 @@ struct rpc_xprt * xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_req); void xprt_free(struct rpc_xprt *); +void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); +bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); static inline int xprt_enable_swap(struct rpc_xprt *xprt) -- cgit v1.2.3 From 62f3415db237b8d2aa9a804ff84ce2efa87df179 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 26 May 2021 11:46:17 -0700 Subject: net: phy: Document phydev::dev_flags bits allocation Document the phydev::dev_flags bit allocation to allow bits 15:0 to define PHY driver specific behavior, bits 23:16 to be reserved for now, and bits 31:24 to hold generic PHY driver flags. Signed-off-by: Florian Fainelli Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20210526184617.3105012-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 60d2b26026a2..852743f07e3e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -496,6 +496,11 @@ struct macsec_ops; * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. + * Bits [15:0] are free to use by the PHY driver to communicate + * driver specific behavior. + * Bits [23:16] are currently reserved for future use. + * Bits [31:24] are reserved for defining generic + * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY -- cgit v1.2.3 From 6bd5b743686243dae7351d5dcceeb7f171201bb4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:31 -0700 Subject: KVM: PPC: exit halt polling on need_resched() This is inspired by commit 262de4102c7bb8 (kvm: exit halt polling on need_resched() as well). Due to PPC implements an arch specific halt polling logic, we have to the need_resched() check there as well. This patch adds a helper function that can be shared between book3s and generic halt-polling loops. Reviewed-by: David Matlack Reviewed-by: Venkatesh Srinivas Cc: Ben Segall Cc: Venkatesh Srinivas Cc: Jim Mattson Cc: David Matlack Cc: Paul Mackerras Cc: Suraj Jitindar Singh Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-1-git-send-email-wanpengli@tencent.com> [Make the function inline. - Paolo] Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2f34487e21f2..5d4b96b36ec0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,11 @@ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) return !!map->hva; } +static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) +{ + return single_task_running() && !need_resched() && ktime_before(cur, stop); +} + /* * Sometimes a large or cross-page mmio needs to be broken up into separate * exits for userspace servicing. -- cgit v1.2.3 From fb1070d18edb37daf3979662975bc54625a19953 Mon Sep 17 00:00:00 2001 From: Joe Richey Date: Fri, 21 May 2021 01:58:43 -0700 Subject: KVM: X86: Use _BITUL() macro in UAPI headers Replace BIT() in KVM's UPAI header with _BITUL(). BIT() is not defined in the UAPI headers and its usage may cause userspace build errors. Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking") Signed-off-by: Joe Richey Message-Id: <20210521085849.37676-3-joerichey94@gmail.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3fd9a7e9d90c..79d9c44d1ad7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -8,6 +8,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ +#include #include #include #include @@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd { * conversion after harvesting an entry. Also, it must not skip any * dirty bits, so that dirty bits are always harvested in sequence. */ -#define KVM_DIRTY_GFN_F_DIRTY BIT(0) -#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) +#define KVM_DIRTY_GFN_F_RESET _BITUL(1) #define KVM_DIRTY_GFN_F_MASK 0x3 /* -- cgit v1.2.3 From 084071d5e9226add45a6031928bf10e6afc855fd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 May 2021 10:41:17 -0300 Subject: KVM: rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK KVM_REQ_UNBLOCK will be used to exit a vcpu from its inner vcpu halt emulation loop. Rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK, switch PowerPC to arch specific request bit. Signed-off-by: Marcelo Tosatti Message-Id: <20210525134321.303768132@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5d4b96b36ec0..76102efbf079 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -147,7 +147,7 @@ static inline bool is_error_page(struct page *page) */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 #define KVM_REQUEST_ARCH_BASE 8 -- cgit v1.2.3