From c2eb8effb265ac5cdd960d8e61ecb931e9c767cd Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Oct 2018 06:50:34 -0400 Subject: media: videodev2.h: add v4l2_timeval_to_ns inline function We want to be able to uniquely identify buffers for stateless codecs. The internal timestamp (a u64) as stored internally in the kernel is a suitable candidate for that, but in struct v4l2_buffer it is represented as a struct timeval. Add a v4l2_timeval_to_ns() function that converts the struct timeval into a u64 in the same way that the kernel does. This makes it possible to use this u64 elsewhere as a unique identifier of the buffer. Since timestamps are also copied from the output buffer to the corresponding capture buffer(s) by M2M devices, the u64 can be used to refer to both output and capture buffers. The plan is that in the future we redesign struct v4l2_buffer and use u64 for the timestamp instead of a struct timeval (which has lots of problems with 32 vs 64 bit and y2038 layout changes), and then there is no more need to use this function. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index b5671ce2724f..d6eed479c3a6 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -973,6 +973,18 @@ struct v4l2_buffer { }; }; +/** + * v4l2_timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * parameter. + */ +static inline __u64 v4l2_timeval_to_ns(const struct timeval *tv) +{ + return (__u64)tv->tv_sec * 1000000000ULL + tv->tv_usec * 1000; +} + /* Flags for 'flags' field */ /* Buffer is mapped (flag) */ #define V4L2_BUF_FLAG_MAPPED 0x00000001 -- cgit v1.2.3 From 4b837c6d7ee771f68a30f362c9f68171a95be222 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 14 Jan 2019 09:01:54 -0500 Subject: media: v4l: uAPI: V4L2_BUF_TYPE_META_OUTPUT is an output buffer type V4L2_BUF_TYPE_META_OUTPUT was added by commit 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") but the patch missed adding the type to the macro telling whether a given type is an output type or not. Do that now. Getting this wrong leads to handling the buffer as a capture buffer in a lot of places. Fixes: 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index d6eed479c3a6..c0c36c165bf4 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -161,7 +161,8 @@ enum v4l2_buf_type { || (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY \ || (type) == V4L2_BUF_TYPE_VBI_OUTPUT \ || (type) == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT \ - || (type) == V4L2_BUF_TYPE_SDR_OUTPUT) + || (type) == V4L2_BUF_TYPE_SDR_OUTPUT \ + || (type) == V4L2_BUF_TYPE_META_OUTPUT) enum v4l2_tuner_type { V4L2_TUNER_RADIO = 1, -- cgit v1.2.3 From 50656bad786d001b294764e9f047c5d5b3e4db75 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 10 Jan 2019 11:56:09 -0500 Subject: media: v4l2-ctrl: Add control to enable h.264 constrained intra prediction Allow to enable h.264 constrained intra prediction (macroblocks using intra prediction modes are not allowed to use residual data and decoded samples of neighboring macroblocks coded using inter prediction modes). This control directly corresponds to the constrained_intra_pred_flag field in the h.264 picture parameter set. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-controls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 3dcfc6148f99..fd65c710b144 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -533,6 +533,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type { }; #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER (V4L2_CID_MPEG_BASE+381) #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP (V4L2_CID_MPEG_BASE+382) +#define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (V4L2_CID_MPEG_BASE+383) #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP (V4L2_CID_MPEG_BASE+400) #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP (V4L2_CID_MPEG_BASE+401) #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP (V4L2_CID_MPEG_BASE+402) -- cgit v1.2.3 From d034696cbe5a6e00f76ca4b7869c6cdef66aebd5 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 10 Jan 2019 11:56:10 -0500 Subject: media: v4l2-ctrl: Add control for h.264 chroma qp offset Allow to add fixed quantization parameter offset between luma and chroma quantization parameters. This control directly corresponds to the chroma_qp_index_offset field of the h.264 picture parameter set. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-controls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index fd65c710b144..06479f2fb3ae 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -534,6 +534,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type { #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER (V4L2_CID_MPEG_BASE+381) #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP (V4L2_CID_MPEG_BASE+382) #define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (V4L2_CID_MPEG_BASE+383) +#define V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (V4L2_CID_MPEG_BASE+384) #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP (V4L2_CID_MPEG_BASE+400) #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP (V4L2_CID_MPEG_BASE+401) #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP (V4L2_CID_MPEG_BASE+402) -- cgit v1.2.3 From 1c3721b1f22286033abeda30b7e12439b083ed0f Mon Sep 17 00:00:00 2001 From: Steve Longerbeam Date: Wed, 9 Jan 2019 13:30:04 -0500 Subject: media: videodev2.h: Add more field helper macros Adds two helper macros: V4L2_FIELD_IS_SEQUENTIAL: returns true if the given field type is 'sequential', that is a full frame is transmitted, or exists in memory, as all top field lines followed by all bottom field lines, or vice-versa. V4L2_FIELD_IS_INTERLACED: returns true if the given field type is 'interlaced', that is a full frame is transmitted, or exists in memory, as top field lines interlaced with bottom field lines. Signed-off-by: Steve Longerbeam Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c0c36c165bf4..9a920f071ff9 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -130,6 +130,13 @@ enum v4l2_field { ((field) == V4L2_FIELD_BOTTOM ||\ (field) == V4L2_FIELD_TOP ||\ (field) == V4L2_FIELD_ALTERNATE) +#define V4L2_FIELD_IS_INTERLACED(field) \ + ((field) == V4L2_FIELD_INTERLACED ||\ + (field) == V4L2_FIELD_INTERLACED_TB ||\ + (field) == V4L2_FIELD_INTERLACED_BT) +#define V4L2_FIELD_IS_SEQUENTIAL(field) \ + ((field) == V4L2_FIELD_SEQ_TB ||\ + (field) == V4L2_FIELD_SEQ_BT) enum v4l2_buf_type { V4L2_BUF_TYPE_VIDEO_CAPTURE = 1, -- cgit v1.2.3 From 6bf8f22aea0ddd93af822aed8afeeee4acdf7694 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:56 +0200 Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD and its initial implementation. This object is from type class FD and will be used to read DEVX async commands completion. The core layer should allow the driver to set object from type FD in a safe mode, this option was added with a matching comment in place. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index b8d121d457f1..6ceae29d77cd 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -113,11 +113,20 @@ enum mlx5_ib_devx_umem_methods { MLX5_IB_METHOD_DEVX_UMEM_DEREG, }; +enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_async_cmd_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + enum mlx5_ib_objects { MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_OBJECT_DEVX_OBJ, MLX5_IB_OBJECT_DEVX_UMEM, MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, }; enum mlx5_ib_flow_matcher_create_attrs { -- cgit v1.2.3 From a124edba26270697540f1058bfcd490c1c65b116 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:57 +0200 Subject: IB/mlx5: Introduce async DEVX obj query API Introduce async DEVX obj query API to get the command response back to user space once it's ready without blocking when calling the firmware. The event's data includes a header with some meta data then the firmware output command data. The header includes: - The input 'wr_id' to let application recognizing the response. The input FD attribute is used to have the event data ready on. Downstream patches from this series will implement the file ops to let application read it. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 +++++++++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 +++++ 2 files changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 6ceae29d77cd..8149d224030b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -84,6 +84,14 @@ enum mlx5_ib_devx_obj_query_attrs { MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, }; +enum mlx5_ib_devx_obj_query_async_attrs { + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, +}; + enum mlx5_ib_devx_query_eqn_attrs { MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, @@ -94,6 +102,7 @@ enum mlx5_ib_devx_obj_methods { MLX5_IB_METHOD_DEVX_OBJ_DESTROY, MLX5_IB_METHOD_DEVX_OBJ_MODIFY, MLX5_IB_METHOD_DEVX_OBJ_QUERY, + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, }; enum mlx5_ib_devx_umem_reg_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 4ef62c0e8452..4a701033b93f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -51,5 +51,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; +struct mlx5_ib_uapi_devx_async_cmd_hdr { + __aligned_u64 wr_id; + __u8 out_data[]; +}; + #endif -- cgit v1.2.3 From 1194c4133195dfcb6c5fc0935d54bbed872a5285 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 29 Jan 2019 00:56:17 +0000 Subject: nfit: Add Hyper-V NVDIMM DSM command set to white list Add the Hyper-V _DSM command set to the white list of NVDIMM command sets. This command set is documented at http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901"). Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index f57c9e434d2d..de5d90212409 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -243,6 +243,7 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_HPE1 1 #define NVDIMM_FAMILY_HPE2 2 #define NVDIMM_FAMILY_MSFT 3 +#define NVDIMM_FAMILY_HYPERV 4 #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3 From d0a060be573bfbf8753a15dca35497db5e968bb0 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 30 Jan 2019 12:02:44 +0000 Subject: arm64: add ptrace regsets for ptrauth key management Add two new ptrace regsets, which can be used to request and change the pointer authentication keys of a thread. NT_ARM_PACA_KEYS gives access to the instruction/data address keys, and NT_ARM_PACG_KEYS to the generic authentication key. The keys are also part of the core dump file of the process. The regsets are only exposed if the kernel is compiled with CONFIG_CHECKPOINT_RESTORE=y, as the only intended use case is checkpointing and restoring processes that are using pointer authentication. (This can be changed later if there are other use cases.) Reviewed-by: Dave Martin Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- include/uapi/linux/elf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index e4d6ddd93567..34c02e4290fe 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -421,6 +421,8 @@ typedef struct elf64_shdr { #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ -- cgit v1.2.3 From 52a72e2a395fa3c5ab5df41058a8511e87215730 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 08:48:42 +0200 Subject: IB/uverbs: Expose XRC ODP device capabilities Expose XRC ODP capabilities as part of the extended device capabilities. Signed-off-by: Moni Shoua Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 480d9a60b68e..0474c7400268 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -270,6 +270,8 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_tm_caps tm_caps; struct ib_uverbs_cq_moderation_caps cq_moderation_caps; __aligned_u64 max_dm_size; + __u32 xrc_odp_caps; + __u32 reserved; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From 668aa15b5bf87f156ec805cb7348c785c56b82ab Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 29 Jan 2019 12:08:50 +0200 Subject: RDMA/rxe: Improve loopback marking Currently a packet is marked for loopback only if the source and destination addresses equals. This is not enough when multiple gids are present in rxe device's gid table and the traffic is from one gid to another. Fix it by marking the packet for loopback if the destination MAC address is equal to the source MAC address. Signed-off-by: Kamal Heib Reviewed-by: Yuval Shaia Tested-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 44ef6a3b7afc..aae2e696bb38 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -58,8 +58,7 @@ struct rxe_global_route { struct rxe_av { __u8 port_num; __u8 network_type; - __u16 reserved1; - __u32 reserved2; + __u8 dmac[6]; struct rxe_global_route grh; union { struct sockaddr_in _sockaddr_in; -- cgit v1.2.3 From f76903d574b26bc596951a5c5e757eb02c67abbd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:11 -0800 Subject: RDMA/IWPM: refactor the IWPM message attribute names In order to add new IWPM_NL attributes, the enums for the IWPM commands attributes are refactored such that a new attribute can be added without breaking ABI version 3. Instead of sharing nl attribute enums for both request and response messages, we create separate enums for each IWPM message request and reply. This allows us to extend any given IWPM message by adding new attributes for just that message. These new enums are created, though, in a way to avoid breaking ABI version 3. Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 2e18b77a817f..42d53e182d5f 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -83,13 +83,20 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, - IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_MANAGE_MAPPING_MAX +}; + +enum { + IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_RMANAGE_MAPPING_SEQ, + IWPM_NLA_RMANAGE_ADDR, + IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + /* The following maintains bisectability of rdma-core */ + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, IWPM_NLA_RMANAGE_MAPPING_ERR, IWPM_NLA_RMANAGE_MAPPING_MAX }; -#define IWPM_NLA_MANAGE_MAPPING_MAX 3 -#define IWPM_NLA_QUERY_MAPPING_MAX 4 #define IWPM_NLA_MAPINFO_SEND_MAX 3 enum { @@ -97,6 +104,14 @@ enum { IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_MAPPING_MAX, +}; + +enum { + IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_RQUERY_MAPPING_SEQ, + IWPM_NLA_RQUERY_LOCAL_ADDR, + IWPM_NLA_RQUERY_REMOTE_ADDR, IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, IWPM_NLA_RQUERY_MAPPED_REM_ADDR, IWPM_NLA_RQUERY_MAPPING_ERR, -- cgit v1.2.3 From b0bad9ad514fc1dd8890f1749f5d2425a73270e3 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:16 -0800 Subject: RDMA/IWPM: Support no port mapping requirements A soft iwarp driver that uses the host TCP stack via a kernel mode socket does not need port mapping. In fact, if the port map daemon, iwpmd, is running, then iwpmd must not try and create/bind a socket to the actual port for a soft iwarp connection, since the driver already has that socket bound. Yet if the soft iwarp driver wants to interoperate with hard iwarp devices that -are- using port mapping, then the soft iwarp driver's mappings still need to be maintained and advertised by the iwpm protocol. This patch enhances the rdma driver<->iwcm interface to allow an iwarp driver to specify that it does not want port mapping. The iwpm kernel<->iwpmd interface is also enhanced to pass up this information on map requests. Care is taken to interoperate with the current iwpmd version (ABI version 3) and only use the new NL attributes if iwpmd supports ABI version 4. The ABI version define has also been created in rdma_netlink.h so both kernel and user code can share it. The iwcm and iwpmd negotiate the ABI version to use with a new HELLO netlink message. Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 42d53e182d5f..0f5263767fb4 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -35,6 +35,19 @@ enum { RDMA_NL_RDMA_CM_NUM_ATTR, }; +/* The minimum version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION_MIN 3 + +/* The latest version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION 4 + +/* iwarp port mapper message flags */ +enum { + + /* Do not map the port for this IWPM request */ + IWPM_FLAGS_NO_PORT_MAP = (1 << 0), +}; + /* iwarp port mapper op-codes */ enum { RDMA_NL_IWPM_REG_PID = 0, @@ -45,6 +58,7 @@ enum { RDMA_NL_IWPM_HANDLE_ERR, RDMA_NL_IWPM_MAPINFO, RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_HELLO, RDMA_NL_IWPM_NUM_OPS }; @@ -83,6 +97,7 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_FLAGS, IWPM_NLA_MANAGE_MAPPING_MAX }; @@ -98,12 +113,14 @@ enum { }; #define IWPM_NLA_MAPINFO_SEND_MAX 3 +#define IWPM_NLA_REMOVE_MAPPING_MAX 3 enum { IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_FLAGS, IWPM_NLA_QUERY_MAPPING_MAX, }; @@ -129,6 +146,7 @@ enum { IWPM_NLA_MAPINFO_UNSPEC = 0, IWPM_NLA_MAPINFO_LOCAL_ADDR, IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_FLAGS, IWPM_NLA_MAPINFO_MAX }; @@ -147,6 +165,12 @@ enum { IWPM_NLA_ERR_MAX }; +enum { + IWPM_NLA_HELLO_UNSPEC = 0, + IWPM_NLA_HELLO_ABI_VERSION, + IWPM_NLA_HELLO_MAX +}; + /* * Local service operations: * RESOLVE - The client requests the local service to resolve a path. -- cgit v1.2.3 From a78e8723a50530d15faa25cc0b6f009bcd251c20 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 16 Jan 2019 09:55:41 +0200 Subject: RDMA/cma: Remove CM_ID statistics provided by rdma-cm module Netlink statistics exported by rdma-cm never had any working user space component published to the mailing list or to any open source project. Canvassing various proprietary users, and the original requester, we find that there are no real users of this interface. This patch simply removes all occurrences of RDMA CM netlink in favour of modern nldev implementation, which provides the same information and accompanied by widely used user space component. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0f5263767fb4..3a9e681e4257 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -5,8 +5,7 @@ #include enum { - RDMA_NL_RDMA_CM = 1, - RDMA_NL_IWCM, + RDMA_NL_IWCM = 2, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ RDMA_NL_NLDEV, /* RDMA device interface */ @@ -14,8 +13,7 @@ enum { }; enum { - RDMA_NL_GROUP_CM = 1, - RDMA_NL_GROUP_IWPM, + RDMA_NL_GROUP_IWPM = 2, RDMA_NL_GROUP_LS, RDMA_NL_NUM_GROUPS }; @@ -24,17 +22,6 @@ enum { #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) -enum { - RDMA_NL_RDMA_CM_ID_STATS = 0, - RDMA_NL_RDMA_CM_NUM_OPS -}; - -enum { - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR, - RDMA_NL_RDMA_CM_NUM_ATTR, -}; - /* The minimum version that the iwpm kernel supports */ #define IWPM_UABI_VERSION_MIN 3 -- cgit v1.2.3 From 95b86d1c91ad3b19f882d9e70aa37c8e99e8dc17 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 7 Feb 2019 01:31:27 -0500 Subject: RDMA/bnxt_re: Update kernel user abi to pass chip context User space verbs provider library would need chip context. Changing the ABI to add chip version details in structure. Furthermore, changing the kernel driver ucontext allocation code to initialize the abi structure with appropriate values. As suggested by community, appended the new fields at the bottom of the ABI structure and retaining to older fields as those were in the older versions. Keeping the ABI version at 1 and adding a new field in the ucontext response structure to hold the component mask. The user space library should check pre-defined flags to figure out if a certain feature is supported on not. Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/bnxt_re-abi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index a7a6111e50c7..dc52e3cf574c 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -44,6 +44,14 @@ #define BNXT_RE_ABI_VERSION 1 +#define BNXT_RE_CHIP_ID0_CHIP_NUM_SFT 0x00 +#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT 0x10 +#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT 0x18 + +enum { + BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL +}; + struct bnxt_re_uctx_resp { __u32 dev_id; __u32 max_qp; @@ -51,6 +59,9 @@ struct bnxt_re_uctx_resp { __u32 cqe_sz; __u32 max_cqd; __u32 rsvd; + __aligned_u64 comp_mask; + __u32 chip_id0; + __u32 chip_id1; }; /* -- cgit v1.2.3 From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Thu, 24 Jan 2019 14:18:15 +0200 Subject: IB/cma: Define option to set ack timeout and pack tos_set Define new option in 'rdma_set_option' to override calculated QP timeout when requested to provide QP attributes to modify a QP. At the same time, pack tos_set to be bitfield. Signed-off-by: Danit Goldberg Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_cm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 0d1e78ebad05..e42940a215a3 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -300,6 +300,10 @@ enum { RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_ID_ACK_TIMEOUT = 3 +}; + +enum { RDMA_OPTION_IB_PATH = 1 }; -- cgit v1.2.3 From d9a9ea94f748f47b1d75c6c5e33edcf74476c445 Mon Sep 17 00:00:00 2001 From: Chad Austin Date: Mon, 7 Jan 2019 16:53:17 -0800 Subject: fuse: support clients that don't implement 'opendir' Allow filesystems to return ENOSYS from opendir, preventing the kernel from sending opendir and releasedir messages in the future. This avoids userspace transitions when filesystems don't need to keep track of state per directory handle. A new capability flag, FUSE_NO_OPENDIR_SUPPORT, parallels FUSE_NO_OPEN_SUPPORT, indicating the new semantics for returning ENOSYS from opendir. Signed-off-by: Chad Austin Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index b4967d48bfda..2ac598614a8f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -122,6 +122,9 @@ * - add FOPEN_CACHE_DIR * - add FUSE_MAX_PAGES, add max_pages to init_out * - add FUSE_CACHE_SYMLINKS + * + * 7.29 + * - add FUSE_NO_OPENDIR_SUPPORT flag */ #ifndef _LINUX_FUSE_H @@ -157,7 +160,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 28 +#define FUSE_KERNEL_MINOR_VERSION 29 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -259,6 +262,7 @@ struct fuse_file_lock { * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages * FUSE_CACHE_SYMLINKS: cache READLINK responses + * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -284,6 +288,7 @@ struct fuse_file_lock { #define FUSE_ABORT_ERROR (1 << 21) #define FUSE_MAX_PAGES (1 << 22) #define FUSE_CACHE_SYMLINKS (1 << 23) +#define FUSE_NO_OPENDIR_SUPPORT (1 << 24) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From a7fe4ca72b1fe7877de5672640d0b4e023d0fdca Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 7 Feb 2019 22:18:43 -0500 Subject: media: v4l: Add 32-bit packed YUV formats The formats added in this patch include: V4L2_PIX_FMT_AYUV32 V4L2_PIX_FMT_XYUV32 V4L2_PIX_FMT_VUYA32 V4L2_PIX_FMT_VUYX32 These formats enable the trasmission of alpha channel data to other drivers and userspace applications in addition to YUV data. For example, buffers generated by drivers in one of these formats can be used by the Weston compositor to display as a texture or flipped directly onto the overlay planes with the help of a DRM driver. Signed-off-by: Vivek Kasireddy Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9a920f071ff9..1db220da3bcc 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -562,6 +562,10 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_YUV555 v4l2_fourcc('Y', 'U', 'V', 'O') /* 16 YUV-5-5-5 */ #define V4L2_PIX_FMT_YUV565 v4l2_fourcc('Y', 'U', 'V', 'P') /* 16 YUV-5-6-5 */ #define V4L2_PIX_FMT_YUV32 v4l2_fourcc('Y', 'U', 'V', '4') /* 32 YUV-8-8-8-8 */ +#define V4L2_PIX_FMT_AYUV32 v4l2_fourcc('A', 'Y', 'U', 'V') /* 32 AYUV-8-8-8-8 */ +#define V4L2_PIX_FMT_XYUV32 v4l2_fourcc('X', 'Y', 'U', 'V') /* 32 XYUV-8-8-8-8 */ +#define V4L2_PIX_FMT_VUYA32 v4l2_fourcc('V', 'U', 'Y', 'A') /* 32 VUYA-8-8-8-8 */ +#define V4L2_PIX_FMT_VUYX32 v4l2_fourcc('V', 'U', 'Y', 'X') /* 32 VUYX-8-8-8-8 */ #define V4L2_PIX_FMT_HI240 v4l2_fourcc('H', 'I', '2', '4') /* 8 8-bit color */ #define V4L2_PIX_FMT_HM12 v4l2_fourcc('H', 'M', '1', '2') /* 8 YUV 4:2:0 16x16 macroblocks */ #define V4L2_PIX_FMT_M420 v4l2_fourcc('M', '4', '2', '0') /* 12 YUV 4:2:0 2 lines y, 1 line uv interleaved */ -- cgit v1.2.3 From 721074b03411327e7bf41555d4cc7c18f49313f7 Mon Sep 17 00:00:00 2001 From: Patrick Lerda Date: Thu, 17 Jan 2019 03:50:13 -0500 Subject: media: rc: rcmm decoder and encoder media: add support for RCMM infrared remote controls. Signed-off-by: Patrick Lerda Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/lirc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 6b319581882f..45fcbf99d72e 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -192,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -218,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif -- cgit v1.2.3 From 517b773e0f612d608cbc62a08c55601bd56f73f6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:49 +0200 Subject: RDMA/nldev: Share with user-space object IDs Give to the user space tools unique identifier for PD, MR, CQ and CM_ID objects, so they can be able to query on them with .doit callbacks. QP .doit is not supported yet, till all drivers will be updated to provide their LQPN to be equal to their restrack ID. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 3a9e681e4257..43362132e0d7 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -456,6 +456,15 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */ RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ + /* + * Indexes to get/set secific entry, + * for QP use RDMA_NLDEV_ATTR_RES_LQPN + */ + RDMA_NLDEV_ATTR_RES_PDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ + RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + /* * Always the end */ -- cgit v1.2.3 From c3d02788b45ab4a2d8f243b98c04b549c8193af6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:50 +0200 Subject: RDMA/nldev: Provide parent IDs for PD, MR and QP objects PD, MR and QP objects have parents objects: contexts and PDs. The exposed parent IDs allow to correlate various objects and simplify debug investigation. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 43362132e0d7..4ebbcfb2c6ef 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -464,6 +464,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ /* * Always the end -- cgit v1.2.3 From 3856ec4b93c9463d36ee39098dde1fbbd29ec6dd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 15 Feb 2019 11:03:53 -0800 Subject: RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support Add support for new LINK messages to allow adding and deleting rdma interfaces. This will be used initially for soft rdma drivers which instantiate device instances dynamically by the admin specifying a netdev device to use. The rdma_rxe module will be the first user of these messages. The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers register with the rdma core if they provide link add/delete functions. Each driver registers with a unique "type" string, that is used to dispatch messages coming from user space. A new RDMA_NLDEV_ATTR is defined for the "type" string. User mode will pass 3 attributes in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface to use for this link. The DELLINK message will contain the RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Michael J. Ruhl Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4ebbcfb2c6ef..5cc592728071 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -255,9 +255,11 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET, /* can dump */ RDMA_NLDEV_CMD_SET, - /* 3 - 4 are free to use */ + RDMA_NLDEV_CMD_NEWLINK, - RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ + RDMA_NLDEV_CMD_DELLINK, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ /* 6 - 8 are free to use */ @@ -465,6 +467,10 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ + /* + * Identifies the rdma driver. eg: "rxe" or "siw" + */ + RDMA_NLDEV_ATTR_LINK_TYPE, /* string */ /* * Always the end -- cgit v1.2.3 From fadccd8fc2d06cf7fd222245d7e04b00fae946cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:37:13 +0100 Subject: nvme_ioctl.h: remove duplicate GPL boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have a ЅPDX header, so no need to duplicate the information. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- include/uapi/linux/nvme_ioctl.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 6e74b1eaf541..1c215ea1798e 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -2,15 +2,6 @@ /* * Definitions for the NVM Express ioctl interface * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _UAPI_LINUX_NVME_IOCTL_H -- cgit v1.2.3 From 61697a6abd24acba941359c6268a94f4afe4a53d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 18 Jan 2019 14:19:26 -0500 Subject: dm: eliminate 'split_discard_bios' flag from DM target interface There is no need to have DM core split discards on behalf of a DM target now that blk_queue_split() handles splitting discards based on the queue_limits. A DM target just needs to set max_discard_sectors, discard_granularity, etc, in queue_limits. Signed-off-by: Mike Snitzer --- include/uapi/linux/dm-ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index d1e49514977b..f396a82dfd3e 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -270,9 +270,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 39 +#define DM_VERSION_MINOR 40 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2018-04-03)" +#define DM_VERSION_EXTRA "-ioctl (2019-01-18)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 663586c0a8929db81e617c775823efb9d65f2bc2 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 7 Nov 2018 23:16:19 +0100 Subject: ubi: Expose the bitrot interface Using UBI_IOCRPEB and UBI_IOCSPEB userspace can force reading and scrubbing of PEBs. In case of bitflips UBI will automatically take action and move data to a different PEB. This interface allows a daemon to foster your NAND. Signed-off-by: Richard Weinberger --- include/uapi/mtd/ubi-user.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h index aad3b6201fc0..b69e9ba6742b 100644 --- a/include/uapi/mtd/ubi-user.h +++ b/include/uapi/mtd/ubi-user.h @@ -171,6 +171,11 @@ /* Re-name volumes */ #define UBI_IOCRNVOL _IOW(UBI_IOC_MAGIC, 3, struct ubi_rnvol_req) +/* Read the specified PEB and scrub it if there are bitflips */ +#define UBI_IOCRPEB _IOW(UBI_IOC_MAGIC, 4, __s32) +/* Force scrubbing on the specified PEB */ +#define UBI_IOCSPEB _IOW(UBI_IOC_MAGIC, 5, __s32) + /* ioctl commands of the UBI control character device */ #define UBI_CTRL_IOC_MAGIC 'o' -- cgit v1.2.3 From e5567f5f67621877726f99be040af9fbedda37dc Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 19 Feb 2019 11:04:51 -0800 Subject: PCI/ATS: Add pci_prg_resp_pasid_required() interface. Return the PRG Response PASID Required bit in the Page Request Status Register. As per PCIe spec r4.0, sec 10.5.2.3, if this bit is Set, the device expects a PASID TLP Prefix on PRG Response Messages when the corresponding Page Requests had a PASID TLP Prefix. If Clear, the device does not expect PASID TLP Prefixes on any PRG Response Message, and the device behavior is undefined if the device receives a PRG Response Message with a PASID TLP Prefix. Also the device behavior is undefined if this bit is Set and the device receives a PRG Response Message with no PASID TLP Prefix when the corresponding Page Requests had a PASID TLP Prefix. This function will be used by drivers like IOMMU, if it is required to check the status of the PRG Response PASID Required bit before enabling the PASID support of the device. Cc: Ashok Raj Cc: Jacob Pan Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Bjorn Helgaas Signed-off-by: Joerg Roedel --- include/uapi/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index e1e9888c85e6..898be572b010 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -880,6 +880,7 @@ #define PCI_PRI_STATUS_RF 0x001 /* Response Failure */ #define PCI_PRI_STATUS_UPRGI 0x002 /* Unexpected PRG index */ #define PCI_PRI_STATUS_STOPPED 0x100 /* PRI Stopped */ +#define PCI_PRI_STATUS_PASID 0x8000 /* PRG Response PASID Required */ #define PCI_PRI_MAX_REQ 0x08 /* PRI max reqs supported */ #define PCI_PRI_ALLOC_REQ 0x0c /* PRI max reqs allowed */ #define PCI_EXT_CAP_PRI_SIZEOF 16 -- cgit v1.2.3 From 8c938ddc6df3bbe72809db1be6c9f3af83f5d7a9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 19 Feb 2019 11:06:09 -0800 Subject: PCI/ATS: Add pci_ats_page_aligned() interface Return the Page Aligned Request bit in the ATS Capability Register. As per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit is set, it indicates the Untranslated Addresses generated by the device are always aligned to a 4096 byte boundary. An IOMMU that can only translate page-aligned addresses can only be used with devices that always produce aligned Untranslated Addresses. This interface will be used by drivers for such IOMMUs to determine whether devices can use the ATS service. Cc: Ashok Raj Cc: Jacob Pan Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Bjorn Helgaas Signed-off-by: Joerg Roedel --- include/uapi/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 898be572b010..5c98133f2c94 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -866,6 +866,7 @@ #define PCI_ATS_CAP 0x04 /* ATS Capability Register */ #define PCI_ATS_CAP_QDEP(x) ((x) & 0x1f) /* Invalidate Queue Depth */ #define PCI_ATS_MAX_QDEP 32 /* Max Invalidate Queue Depth */ +#define PCI_ATS_CAP_PAGE_ALIGNED 0x0020 /* Page Aligned Request */ #define PCI_ATS_CTRL 0x06 /* ATS Control Register */ #define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */ #define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */ -- cgit v1.2.3 From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/asm-generic/unistd.h | 6 ++- include/uapi/linux/io_uring.h | 95 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/io_uring.h (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..87871e7b7ea7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 427 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h new file mode 100644 index 000000000000..ac692823d6f4 --- /dev/null +++ b/include/uapi/linux/io_uring.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. + * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* as of now unused */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + __u64 off; /* offset into file */ + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 __resv; + }; + __u64 user_data; /* data to be passed back at completion time */ + __u64 __pad2[3]; +}; + +#define IORING_OP_NOP 0 +#define IORING_OP_READV 1 +#define IORING_OP_WRITEV 2 + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL +#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 resv[7]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +#endif -- cgit v1.2.3 From c992fe2925d776be066d9f6cc13f9ea11d78b657 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Jan 2019 09:43:02 -0700 Subject: io_uring: add fsync support Add a new fsync opcode, which either syncs a range if one is passed, or the whole file if the offset and length fields are both cleared to zero. A flag is provided to use fdatasync semantics, that is only force out metadata which is required to retrieve the file data, but not others like metadata. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac692823d6f4..4589d56d0b68 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -24,7 +24,7 @@ struct io_uring_sqe { __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; - __u32 __resv; + __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ __u64 __pad2[3]; @@ -33,6 +33,12 @@ struct io_uring_sqe { #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 +#define IORING_OP_FSYNC 3 + +/* + * sqe->fsync_flags + */ +#define IORING_FSYNC_DATASYNC (1U << 0) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.2.3 From def596e9557c91d9846fc4d84d26f2c564644416 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 08:59:42 -0700 Subject: io_uring: support for IO polling Add support for a polled io_uring instance. When a read or write is submitted to a polled io_uring, the application must poll for completions on the CQ ring through io_uring_enter(2). Polled IO may not generate IRQ completions, hence they need to be actively found by the application itself. To use polling, io_uring_setup() must be used with the IORING_SETUP_IOPOLL flag being set. It is illegal to mix and match polled and non-polled IO on an io_uring. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4589d56d0b68..5c457ea396e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -30,6 +30,11 @@ struct io_uring_sqe { __u64 __pad2[3]; }; +/* + * io_uring_setup() flags + */ +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ + #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 -- cgit v1.2.3 From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 09:16:05 -0700 Subject: io_uring: add support for pre-mapped user IO buffers If we have fixed user buffers, we can map them into the kernel when we setup the io_uring. That avoids the need to do get_user_pages() for each and every IO. To utilize this feature, the application must call io_uring_register() after having setup an io_uring instance, passing in IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to an iovec array, and the nr_args should contain how many iovecs the application wishes to map. If successful, these buffers are now mapped into the kernel, eligible for IO. To use these fixed buffers, the application must use the IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len must point to somewhere inside the indexed buffer. The application may register buffers throughout the lifetime of the io_uring instance. It can call io_uring_register() with IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of buffers, and then register a new set. The application need not unregister buffers explicitly before shutting down the io_uring instance. It's perfectly valid to setup a larger buffer, and then sometimes only use parts of it for an IO. As long as the range is within the originally mapped region, it will work just fine. For now, buffers must not be file backed. If file backed buffers are passed in, the registration will fail with -1/EOPNOTSUPP. This restriction may be relaxed in the future. RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat arbitrary 1G per buffer size is also imposed. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/asm-generic/unistd.h | 4 +++- include/uapi/linux/io_uring.h | 13 ++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 87871e7b7ea7..d346229a1eb0 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) #define __NR_io_uring_enter 426 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) #undef __NR_syscalls -#define __NR_syscalls 427 +#define __NR_syscalls 428 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5c457ea396e6..cf28f7a11f12 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -27,7 +27,10 @@ struct io_uring_sqe { __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ - __u64 __pad2[3]; + union { + __u16 buf_index; /* index into fixed buffers, if used */ + __u64 __pad2[3]; + }; }; /* @@ -39,6 +42,8 @@ struct io_uring_sqe { #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 #define IORING_OP_FSYNC 3 +#define IORING_OP_READ_FIXED 4 +#define IORING_OP_WRITE_FIXED 5 /* * sqe->fsync_flags @@ -103,4 +108,10 @@ struct io_uring_params { struct io_cqring_offsets cq_off; }; +/* + * io_uring_register(2) opcodes and arguments + */ +#define IORING_REGISTER_BUFFERS 0 +#define IORING_UNREGISTER_BUFFERS 1 + #endif -- cgit v1.2.3 From 6b06314c47e141031be043539900d80d2c7ba10f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 22:13:58 -0700 Subject: io_uring: add file set registration We normally have to fget/fput for each IO we do on a file. Even with the batching we do, the cost of the atomic inc/dec of the file usage count adds up. This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes for the io_uring_register(2) system call. The arguments passed in must be an array of __s32 holding file descriptors, and nr_args should hold the number of file descriptors the application wishes to pin for the duration of the io_uring instance (or until IORING_UNREGISTER_FILES is called). When used, the application must set IOSQE_FIXED_FILE in the sqe->flags member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd to the index in the array passed in to IORING_REGISTER_FILES. Files are automatically unregistered when the io_uring instance is torn down. An application need only unregister if it wishes to register a new set of fds. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf28f7a11f12..6257478d55e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -16,7 +16,7 @@ */ struct io_uring_sqe { __u8 opcode; /* type of operation for this sqe */ - __u8 flags; /* as of now unused */ + __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ __u64 off; /* offset into file */ @@ -33,6 +33,11 @@ struct io_uring_sqe { }; }; +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ + /* * io_uring_setup() flags */ @@ -113,5 +118,7 @@ struct io_uring_params { */ #define IORING_REGISTER_BUFFERS 0 #define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define IORING_UNREGISTER_FILES 3 #endif -- cgit v1.2.3 From 6c271ce2f1d572f7fa225700a13cfe7ced492434 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 11:22:30 -0700 Subject: io_uring: add submission polling This enables an application to do IO, without ever entering the kernel. By using the SQ ring to fill in new sqes and watching for completions on the CQ ring, we can submit and reap IOs without doing a single system call. The kernel side thread will poll for new submissions, and in case of HIPRI/polled IO, it'll also poll for completions. By default, we allow 1 second of active spinning. This can by changed by passing in a different grace period at io_uring_register(2) time. If the thread exceeds this idle time without having any work to do, it will set: sq_ring->flags |= IORING_SQ_NEED_WAKEUP. The application will have to call io_uring_enter() to start things back up again. If IO is kept busy, that will never be needed. Basically an application that has this feature enabled will guard it's io_uring_enter(2) call with: read_barrier(); if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP) io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP); instead of calling it unconditionally. It's mandatory to use fixed files with this feature. Failure to do so will result in the application getting an -EBADF CQ entry when submitting IO. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6257478d55e9..0ec74bab8dbe 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -42,6 +42,8 @@ struct io_uring_sqe { * io_uring_setup() flags */ #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 @@ -86,6 +88,11 @@ struct io_sqring_offsets { __u64 resv2; }; +/* + * sq_ring->flags + */ +#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + struct io_cqring_offsets { __u32 head; __u32 tail; @@ -100,6 +107,7 @@ struct io_cqring_offsets { * io_uring_enter(2) flags */ #define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -108,7 +116,9 @@ struct io_uring_params { __u32 sq_entries; __u32 cq_entries; __u32 flags; - __u32 resv[7]; + __u32 sq_thread_cpu; + __u32 sq_thread_idle; + __u32 resv[5]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; -- cgit v1.2.3 From 3eb39f47934f9d5a3027fe00d906a45fe3a15fad Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Nov 2018 00:51:56 +0100 Subject: signal: add pidfd_send_signal() syscall The kill() syscall operates on process identifiers (pid). After a process has exited its pid can be reused by another process. If a caller sends a signal to a reused pid it will end up signaling the wrong process. This issue has often surfaced and there has been a push to address this problem [1]. This patch uses file descriptors (fd) from proc/ as stable handles on struct pid. Even if a pid is recycled the handle will not change. The fd can be used to send signals to the process it refers to. Thus, the new syscall pidfd_send_signal() is introduced to solve this problem. Instead of pids it operates on process fds (pidfd). /* prototype and argument /* long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags); /* syscall number 424 */ The syscall number was chosen to be 424 to align with Arnd's rework in his y2038 to minimize merge conflicts (cf. [25]). In addition to the pidfd and signal argument it takes an additional siginfo_t and flags argument. If the siginfo_t argument is NULL then pidfd_send_signal() is equivalent to kill(, ). If it is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo(). The flags argument is added to allow for future extensions of this syscall. It currently needs to be passed as 0. Failing to do so will cause EINVAL. /* pidfd_send_signal() replaces multiple pid-based syscalls */ The pidfd_send_signal() syscall currently takes on the job of rt_sigqueueinfo(2) and parts of the functionality of kill(2), Namely, when a positive pid is passed to kill(2). It will however be possible to also replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended. /* sending signals to threads (tid) and process groups (pgid) */ Specifically, the pidfd_send_signal() syscall does currently not operate on process groups or threads. This is left for future extensions. In order to extend the syscall to allow sending signal to threads and process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and PIDFD_TYPE_TID) should be added. This implies that the flags argument will determine what is signaled and not the file descriptor itself. Put in other words, grouping in this api is a property of the flags argument not a property of the file descriptor (cf. [13]). Clarification for this has been requested by Eric (cf. [19]). When appropriate extensions through the flags argument are added then pidfd_send_signal() can additionally replace the part of kill(2) which operates on process groups as well as the tgkill(2) and rt_tgsigqueueinfo(2) syscalls. How such an extension could be implemented has been very roughly sketched in [14], [15], and [16]. However, this should not be taken as a commitment to a particular implementation. There might be better ways to do it. Right now this is intentionally left out to keep this patchset as simple as possible (cf. [4]). /* naming */ The syscall had various names throughout iterations of this patchset: - procfd_signal() - procfd_send_signal() - taskfd_send_signal() In the last round of reviews it was pointed out that given that if the flags argument decides the scope of the signal instead of different types of fds it might make sense to either settle for "procfd_" or "pidfd_" as prefix. The community was willing to accept either (cf. [17] and [18]). Given that one developer expressed strong preference for the "pidfd_" prefix (cf. [13]) and with other developers less opinionated about the name we should settle for "pidfd_" to avoid further bikeshedding. The "_send_signal" suffix was chosen to reflect the fact that the syscall takes on the job of multiple syscalls. It is therefore intentional that the name is not reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the fomer because it might imply that pidfd_send_signal() is a replacement for kill(2), and not the latter because it is a hassle to remember the correct spelling - especially for non-native speakers - and because it is not descriptive enough of what the syscall actually does. The name "pidfd_send_signal" makes it very clear that its job is to send signals. /* zombies */ Zombies can be signaled just as any other process. No special error will be reported since a zombie state is an unreliable state (cf. [3]). However, this can be added as an extension through the @flags argument if the need ever arises. /* cross-namespace signals */ The patch currently enforces that the signaler and signalee either are in the same pid namespace or that the signaler's pid namespace is an ancestor of the signalee's pid namespace. This is done for the sake of simplicity and because it is unclear to what values certain members of struct siginfo_t would need to be set to (cf. [5], [6]). /* compat syscalls */ It became clear that we would like to avoid adding compat syscalls (cf. [7]). The compat syscall handling is now done in kernel/signal.c itself by adding __copy_siginfo_from_user_generic() which lets us avoid compat syscalls (cf. [8]). It should be noted that the addition of __copy_siginfo_from_user_any() is caused by a bug in the original implementation of rt_sigqueueinfo(2) (cf. 12). With upcoming rework for syscall handling things might improve significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain any additional callers. /* testing */ This patch was tested on x64 and x86. /* userspace usage */ An asciinema recording for the basic functionality can be found under [9]. With this patch a process can be killed via: #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include static inline int do_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { #ifdef __NR_pidfd_send_signal return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); #else return -ENOSYS; #endif } int main(int argc, char *argv[]) { int fd, ret, saved_errno, sig; if (argc < 3) exit(EXIT_FAILURE); fd = open(argv[1], O_DIRECTORY | O_CLOEXEC); if (fd < 0) { printf("%s - Failed to open \"%s\"\n", strerror(errno), argv[1]); exit(EXIT_FAILURE); } sig = atoi(argv[2]); printf("Sending signal %d to process %s\n", sig, argv[1]); ret = do_pidfd_send_signal(fd, sig, NULL, 0); saved_errno = errno; close(fd); errno = saved_errno; if (ret < 0) { printf("%s - Failed to send signal %d to process %s\n", strerror(errno), sig, argv[1]); exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); } /* Q&A * Given that it seems the same questions get asked again by people who are * late to the party it makes sense to add a Q&A section to the commit * message so it's hopefully easier to avoid duplicate threads. * * For the sake of progress please consider these arguments settled unless * there is a new point that desperately needs to be addressed. Please make * sure to check the links to the threads in this commit message whether * this has not already been covered. */ Q-01: (Florian Weimer [20], Andrew Morton [21]) What happens when the target process has exited? A-01: Sending the signal will fail with ESRCH (cf. [22]). Q-02: (Andrew Morton [21]) Is the task_struct pinned by the fd? A-02: No. A reference to struct pid is kept. struct pid - as far as I understand - was created exactly for the reason to not require to pin struct task_struct (cf. [22]). Q-03: (Andrew Morton [21]) Does the entire procfs directory remain visible? Just one entry within it? A-03: The same thing that happens right now when you hold a file descriptor to /proc/ open (cf. [22]). Q-04: (Andrew Morton [21]) Does the pid remain reserved? A-04: No. This patchset guarantees a stable handle not that pids are not recycled (cf. [22]). Q-05: (Andrew Morton [21]) Do attempts to signal that fd return errors? A-05: See {Q,A}-01. Q-06: (Andrew Morton [22]) Is there a cleaner way of obtaining the fd? Another syscall perhaps. A-06: Userspace can already trivially retrieve file descriptors from procfs so this is something that we will need to support anyway. Hence, there's no immediate need to add another syscalls just to make pidfd_send_signal() not dependent on the presence of procfs. However, adding a syscalls to get such file descriptors is planned for a future patchset (cf. [22]). Q-07: (Andrew Morton [21] and others) This fd-for-a-process sounds like a handy thing and people may well think up other uses for it in the future, probably unrelated to signals. Are the code and the interface designed to permit such future applications? A-07: Yes (cf. [22]). Q-08: (Andrew Morton [21] and others) Now I think about it, why a new syscall? This thing is looking rather like an ioctl? A-08: This has been extensively discussed. It was agreed that a syscall is preferred for a variety or reasons. Here are just a few taken from prior threads. Syscalls are safer than ioctl()s especially when signaling to fds. Processes are a core kernel concept so a syscall seems more appropriate. The layout of the syscall with its four arguments would require the addition of a custom struct for the ioctl() thereby causing at least the same amount or even more complexity for userspace than a simple syscall. The new syscall will replace multiple other pid-based syscalls (see description above). The file-descriptors-for-processes concept introduced with this syscall will be extended with other syscalls in the future. See also [22], [23] and various other threads already linked in here. Q-09: (Florian Weimer [24]) What happens if you use the new interface with an O_PATH descriptor? A-09: pidfds opened as O_PATH fds cannot be used to send signals to a process (cf. [2]). Signaling processes through pidfds is the equivalent of writing to a file. Thus, this is not an operation that operates "purely at the file descriptor level" as required by the open(2) manpage. See also [4]. /* References */ [1]: https://lore.kernel.org/lkml/20181029221037.87724-1-dancol@google.com/ [2]: https://lore.kernel.org/lkml/874lbtjvtd.fsf@oldenburg2.str.redhat.com/ [3]: https://lore.kernel.org/lkml/20181204132604.aspfupwjgjx6fhva@brauner.io/ [4]: https://lore.kernel.org/lkml/20181203180224.fkvw4kajtbvru2ku@brauner.io/ [5]: https://lore.kernel.org/lkml/20181121213946.GA10795@mail.hallyn.com/ [6]: https://lore.kernel.org/lkml/20181120103111.etlqp7zop34v6nv4@brauner.io/ [7]: https://lore.kernel.org/lkml/36323361-90BD-41AF-AB5B-EE0D7BA02C21@amacapital.net/ [8]: https://lore.kernel.org/lkml/87tvjxp8pc.fsf@xmission.com/ [9]: https://asciinema.org/a/IQjuCHew6bnq1cr78yuMv16cy [11]: https://lore.kernel.org/lkml/F53D6D38-3521-4C20-9034-5AF447DF62FF@amacapital.net/ [12]: https://lore.kernel.org/lkml/87zhtjn8ck.fsf@xmission.com/ [13]: https://lore.kernel.org/lkml/871s6u9z6u.fsf@xmission.com/ [14]: https://lore.kernel.org/lkml/20181206231742.xxi4ghn24z4h2qki@brauner.io/ [15]: https://lore.kernel.org/lkml/20181207003124.GA11160@mail.hallyn.com/ [16]: https://lore.kernel.org/lkml/20181207015423.4miorx43l3qhppfz@brauner.io/ [17]: https://lore.kernel.org/lkml/CAGXu5jL8PciZAXvOvCeCU3wKUEB_dU-O3q0tDw4uB_ojMvDEew@mail.gmail.com/ [18]: https://lore.kernel.org/lkml/20181206222746.GB9224@mail.hallyn.com/ [19]: https://lore.kernel.org/lkml/20181208054059.19813-1-christian@brauner.io/ [20]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/ [21]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/ [22]: https://lore.kernel.org/lkml/20181228233725.722tdfgijxcssg76@brauner.io/ [23]: https://lwn.net/Articles/773459/ [24]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/ [25]: https://lore.kernel.org/lkml/CAK8P3a0ej9NcJM8wXNPbcGUyOUZYX+VLoDFdbenW3s3114oQZw@mail.gmail.com/ Cc: "Eric W. Biederman" Cc: Jann Horn Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Oleg Nesterov Cc: Al Viro Cc: Florian Weimer Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Reviewed-by: Kees Cook Reviewed-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Serge Hallyn Acked-by: Aleksa Sarai --- include/uapi/asm-generic/unistd.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..c861e7d1053b 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,11 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 425 /* * 32 bit systems traditionally used different -- cgit v1.2.3 From 221c5eb2338232f7340386de1c43decc32682e58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Jan 2019 09:41:58 -0700 Subject: io_uring: add support for IORING_OP_POLL This is basically a direct port of bfe4037e722e, which implements a one-shot poll command through aio. Description below is based on that commit as well. However, instead of adding a POLL command and relying on io_cancel(2) to remove it, we mimic the epoll(2) interface of having a command to add a poll notification, IORING_OP_POLL_ADD, and one to remove it again, IORING_OP_POLL_REMOVE. To poll for a file descriptor the application should submit an sqe of type IORING_OP_POLL. It will poll the fd for the events specified in the poll_events field. Unlike poll or epoll without EPOLLONESHOT this interface always works in one shot mode, that is once the sqe is completed, it will have to be resubmitted. Reviewed-by: Hannes Reinecke Based-on-code-from: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ec74bab8dbe..e23408692118 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -25,6 +25,7 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; + __u16 poll_events; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -51,6 +52,8 @@ struct io_uring_sqe { #define IORING_OP_FSYNC 3 #define IORING_OP_READ_FIXED 4 #define IORING_OP_WRITE_FIXED 5 +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 /* * sqe->fsync_flags -- cgit v1.2.3 From a623a7a1a5670c25a16881f5078072d272d96b71 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Mar 2019 16:38:17 +0100 Subject: y2038: fix socket.h header inclusion Referencing the __kernel_long_t type caused some user space applications to stop compiling when they had not already included linux/posix_types.h, e.g. s/multicast.c -o ext/sockets/multicast.lo In file included from /builddir/build/BUILD/php-7.3.3/main/php.h:468, from /builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c:27: /builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c: In function 'zm_startup_sockets': /builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c:776:40: error: '__kernel_long_t' undeclared (first use in this function) 776 | REGISTER_LONG_CONSTANT("SO_SNDTIMEO", SO_SNDTIMEO, CONST_CS | CONST_PERSISTENT); It is safe to include that header here, since it only contains kernel internal types that do not conflict with other user space types. It's still possible that some related build failures remain, but those are likely to be for code that is not already y2038 safe. Reported-by: Laura Abbott Fixes: a9beb86ae6e5 ("sock: Add SO_RCVTIMEO_NEW and SO_SNDTIMEO_NEW") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/uapi/asm-generic/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index c8b430cb6dc4..8c1391c89171 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -2,8 +2,8 @@ #ifndef __ASM_GENERIC_SOCKET_H #define __ASM_GENERIC_SOCKET_H +#include #include -#include /* For setsockopt(2) */ #define SOL_SOCKET 1 -- cgit v1.2.3 From dbafd7ddd62369b2f3926ab847cbf8fc40e800b7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 12 Mar 2019 10:23:04 -0700 Subject: bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper Add a new helper "struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)" which returns a bpf_sock in TCP_LISTEN state. It will trace back to the listener sk from a request_sock if possible. It returns NULL for all other cases. No reference is taken because the helper ensures the sk is in SOCK_RCU_FREE (where the TCP_LISTEN sock should be in). Hence, bpf_sk_release() is unnecessary and the verifier does not allow bpf_sk_release(listen_sk) to be called either. The following is also allowed because the bpf_prog is run under rcu_read_lock(): sk = bpf_sk_lookup_tcp(); /* if (!sk) { ... } */ listen_sk = bpf_get_listener_sock(sk); /* if (!listen_sk) { ... } */ bpf_sk_release(sk); src_port = listen_sk->src_port; /* Allowed */ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c38ac9a92a7..983b25cb608d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2366,6 +2366,14 @@ union bpf_attr { * current value is ect (ECN capable). Works with IPv6 and IPv4. * Return * 1 if set, 0 if not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in TCP_LISTEN state. + * bpf_sk_release() is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2465,7 +2473,8 @@ union bpf_attr { FN(spin_unlock), \ FN(sk_fullsock), \ FN(tcp_sock), \ - FN(skb_ecn_set_ce), + FN(skb_ecn_set_ce), \ + FN(get_listener_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 62369db2df8d1edfa040878203b446e023a16802 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 14 Mar 2019 12:38:39 +0000 Subject: bpf: fix documentation for eBPF helpers Another round of minor fixes for the documentation of the BPF helpers located in the UAPI bpf.h header file. Changes include: - Moving around description of some helpers, to keep the descriptions in the same order as helpers are declared (bpf_map_push_elem(), leftover from commit 90b1023f68c7 ("bpf: fix documentation for eBPF helpers"), bpf_rc_keydown(), and bpf_skb_ancestor_cgroup_id()). - Fixing typos ("contex" -> "context"). - Harmonising return types ("void* " -> "void *", "uint64_t" -> "u64"). - Addition of the "bpf_" prefix to bpf_get_storage(). - Light additions of RST markup on some keywords. - Empty line deletion between description and return value for bpf_tcp_sock(). - Edit for the description for bpf_skb_ecn_set_ce() (capital letters, acronym expansion, no effect if ECT not set, more details on return value). Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 128 ++++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 63 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 983b25cb608d..4465d00d3493 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -502,16 +502,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. - * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1435,14 +1425,14 @@ union bpf_attr { * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * A 8-byte long non-decreasing number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. * @@ -2098,52 +2088,52 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * int bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded key press with *scancode*, - * *toggle* value in the given *protocol*. The scancode will be - * translated to a keycode using the rc keymap, and reported as - * an input key down event. After a period a key up event is - * generated. This period can be extended by calling either - * **bpf_rc_keydown**\ () again with the same values, or calling - * **bpf_rc_repeat**\ (). + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. * - * Some protocols include a toggle bit, in case the button was - * released and pressed again between consecutive scancodes. + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. * * The *ctx* should point to the lirc sample as passed into * the program. * - * The *protocol* is the decoded protocol number (see - * **enum rc_proto** for some predefined values). - * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * int bpf_rc_repeat(void *ctx) + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. This delays - * the generation of a key up event for previously generated - * key down event. + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into * the program. * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () @@ -2159,30 +2149,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * - * void* get_local_storage(void *map, u64 flags) + * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. * The type and the size of the local storage is defined @@ -2209,6 +2181,24 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child @@ -2289,6 +2279,16 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. @@ -2346,33 +2346,35 @@ union bpf_attr { * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such - * that all the fields in bpf_sock can be accessed. + * that all the fields in this **bpf_sock** can be accessed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. * * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. - * * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * * int bpf_skb_ecn_set_ce(struct sk_buf *skb) - * Description - * Sets ECN of IP header to ce (congestion encountered) if - * current value is ect (ECN capable). Works with IPv6 and IPv4. - * Return - * 1 if set, 0 if not set. + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. * * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) * Description - * Return a **struct bpf_sock** pointer in TCP_LISTEN state. - * bpf_sk_release() is unnecessary and not allowed. + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ -- cgit v1.2.3 From 0eb0978528d47699edd091dc2c337952ad8da436 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 14 Mar 2019 12:38:40 +0000 Subject: bpf: add documentation for helpers bpf_spin_lock(), bpf_spin_unlock() Add documentation for the BPF spinlock-related helpers to the doc in bpf.h. I added the constraints and restrictions coming with the use of spinlocks for BPF: not all of it is directly related to the use of the helper, but I thought it would be nice for users to find them in the man page. This list of restrictions is nearly a verbatim copy of the list in Alexei's commit log for those helpers. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4465d00d3493..929c8e537a14 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2343,6 +2343,61 @@ union bpf_attr { * Return * 0 * + * int bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such -- cgit v1.2.3 From 037fc3368be46dc1a2a90f6e50c8cbce49d75fd6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 17 Mar 2019 11:01:09 +0900 Subject: kbuild: force all architectures except um to include mandatory-y Currently, every arch/*/include/uapi/asm/Kbuild explicitly includes the common Kbuild.asm file. Factor out the duplicated include directives to scripts/Makefile.asm-generic so that no architecture would opt out of the mandatory-y mechanism. um is not forced to include mandatory-y since it is a very exceptional case which does not support UAPI. Signed-off-by: Masahiro Yamada --- include/uapi/asm-generic/Kbuild | 36 ++++++++++++++++++++++++++++++++++++ include/uapi/asm-generic/Kbuild.asm | 34 ---------------------------------- 2 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 include/uapi/asm-generic/Kbuild delete mode 100644 include/uapi/asm-generic/Kbuild.asm (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/Kbuild b/include/uapi/asm-generic/Kbuild new file mode 100644 index 000000000000..ebb180aac74e --- /dev/null +++ b/include/uapi/asm-generic/Kbuild @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Headers that are mandatory in usr/include/asm/ +# (This file is not included when SRCARCH=um since UML does not support UAPI.) + +mandatory-y += auxvec.h +mandatory-y += bitsperlong.h +mandatory-y += bpf_perf_event.h +mandatory-y += byteorder.h +mandatory-y += errno.h +mandatory-y += fcntl.h +mandatory-y += ioctl.h +mandatory-y += ioctls.h +mandatory-y += ipcbuf.h +mandatory-y += mman.h +mandatory-y += msgbuf.h +mandatory-y += param.h +mandatory-y += poll.h +mandatory-y += posix_types.h +mandatory-y += ptrace.h +mandatory-y += resource.h +mandatory-y += sembuf.h +mandatory-y += setup.h +mandatory-y += shmbuf.h +mandatory-y += sigcontext.h +mandatory-y += siginfo.h +mandatory-y += signal.h +mandatory-y += socket.h +mandatory-y += sockios.h +mandatory-y += stat.h +mandatory-y += statfs.h +mandatory-y += swab.h +mandatory-y += termbits.h +mandatory-y += termios.h +mandatory-y += types.h +mandatory-y += unistd.h diff --git a/include/uapi/asm-generic/Kbuild.asm b/include/uapi/asm-generic/Kbuild.asm deleted file mode 100644 index 355c4ac2c0b0..000000000000 --- a/include/uapi/asm-generic/Kbuild.asm +++ /dev/null @@ -1,34 +0,0 @@ -# -# Headers that are mandatory in usr/include/asm/ -# -mandatory-y += auxvec.h -mandatory-y += bitsperlong.h -mandatory-y += bpf_perf_event.h -mandatory-y += byteorder.h -mandatory-y += errno.h -mandatory-y += fcntl.h -mandatory-y += ioctl.h -mandatory-y += ioctls.h -mandatory-y += ipcbuf.h -mandatory-y += mman.h -mandatory-y += msgbuf.h -mandatory-y += param.h -mandatory-y += poll.h -mandatory-y += posix_types.h -mandatory-y += ptrace.h -mandatory-y += resource.h -mandatory-y += sembuf.h -mandatory-y += setup.h -mandatory-y += shmbuf.h -mandatory-y += sigcontext.h -mandatory-y += siginfo.h -mandatory-y += signal.h -mandatory-y += socket.h -mandatory-y += sockios.h -mandatory-y += stat.h -mandatory-y += statfs.h -mandatory-y += swab.h -mandatory-y += termbits.h -mandatory-y += termios.h -mandatory-y += types.h -mandatory-y += unistd.h -- cgit v1.2.3 From 0532a1b0d045115521a93acf28f1270df89ad806 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 22 Mar 2019 09:19:34 +0100 Subject: virt: vbox: Implement passing requestor info to the host for VirtualBox 6.0.x VirtualBox 6.0.x has a new feature where the guest kernel driver passes info about the origin of the request (e.g. userspace or kernelspace) to the hypervisor. If we do not pass this information then when running the 6.0.x userspace guest-additions tools on a 6.0.x host, some requests will get denied with a VERR_VERSION_MISMATCH error, breaking vboxservice.service and the mounting of shared folders marked to be auto-mounted. This commit implements passing the requestor info to the host, fixing this. Signed-off-by: Hans de Goede Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/vbox_vmmdev_types.h | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h index 0e68024f36c7..26f39816af14 100644 --- a/include/uapi/linux/vbox_vmmdev_types.h +++ b/include/uapi/linux/vbox_vmmdev_types.h @@ -102,6 +102,66 @@ enum vmmdev_request_type { #define VMMDEVREQ_HGCM_CALL VMMDEVREQ_HGCM_CALL32 #endif +/* vmmdev_request_header.requestor defines */ + +/* Requestor user not given. */ +#define VMMDEV_REQUESTOR_USR_NOT_GIVEN 0x00000000 +/* The kernel driver (vboxguest) is the requestor. */ +#define VMMDEV_REQUESTOR_USR_DRV 0x00000001 +/* Some other kernel driver is the requestor. */ +#define VMMDEV_REQUESTOR_USR_DRV_OTHER 0x00000002 +/* The root or a admin user is the requestor. */ +#define VMMDEV_REQUESTOR_USR_ROOT 0x00000003 +/* Regular joe user is making the request. */ +#define VMMDEV_REQUESTOR_USR_USER 0x00000006 +/* User classification mask. */ +#define VMMDEV_REQUESTOR_USR_MASK 0x00000007 + +/* Kernel mode request. Note this is 0, check for !USERMODE instead. */ +#define VMMDEV_REQUESTOR_KERNEL 0x00000000 +/* User mode request. */ +#define VMMDEV_REQUESTOR_USERMODE 0x00000008 +/* User or kernel mode classification mask. */ +#define VMMDEV_REQUESTOR_MODE_MASK 0x00000008 + +/* Don't know the physical console association of the requestor. */ +#define VMMDEV_REQUESTOR_CON_DONT_KNOW 0x00000000 +/* + * The request originates with a process that is NOT associated with the + * physical console. + */ +#define VMMDEV_REQUESTOR_CON_NO 0x00000010 +/* Requestor process is associated with the physical console. */ +#define VMMDEV_REQUESTOR_CON_YES 0x00000020 +/* Console classification mask. */ +#define VMMDEV_REQUESTOR_CON_MASK 0x00000030 + +/* Requestor is member of special VirtualBox user group. */ +#define VMMDEV_REQUESTOR_GRP_VBOX 0x00000080 + +/* Note: trust level is for windows guests only, linux always uses not-given */ +/* Requestor trust level: Unspecified */ +#define VMMDEV_REQUESTOR_TRUST_NOT_GIVEN 0x00000000 +/* Requestor trust level: Untrusted (SID S-1-16-0) */ +#define VMMDEV_REQUESTOR_TRUST_UNTRUSTED 0x00001000 +/* Requestor trust level: Untrusted (SID S-1-16-4096) */ +#define VMMDEV_REQUESTOR_TRUST_LOW 0x00002000 +/* Requestor trust level: Medium (SID S-1-16-8192) */ +#define VMMDEV_REQUESTOR_TRUST_MEDIUM 0x00003000 +/* Requestor trust level: Medium plus (SID S-1-16-8448) */ +#define VMMDEV_REQUESTOR_TRUST_MEDIUM_PLUS 0x00004000 +/* Requestor trust level: High (SID S-1-16-12288) */ +#define VMMDEV_REQUESTOR_TRUST_HIGH 0x00005000 +/* Requestor trust level: System (SID S-1-16-16384) */ +#define VMMDEV_REQUESTOR_TRUST_SYSTEM 0x00006000 +/* Requestor trust level >= Protected (SID S-1-16-20480, S-1-16-28672) */ +#define VMMDEV_REQUESTOR_TRUST_PROTECTED 0x00007000 +/* Requestor trust level mask */ +#define VMMDEV_REQUESTOR_TRUST_MASK 0x00007000 + +/* Requestor is using the less trusted user device node (/dev/vboxuser) */ +#define VMMDEV_REQUESTOR_USER_DEVICE 0x00008000 + /** HGCM service location types. */ enum vmmdev_hgcm_service_location_type { VMMDEV_HGCM_LOC_INVALID = 0, -- cgit v1.2.3 From 3d9683cf3bfb6d4e4605a153958dfca7e18b52f2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Mar 2019 18:08:12 +0900 Subject: KVM: export and iif KVM is supported I do not see any consistency about headers_install of and . According to my analysis of Linux 5.1-rc1, there are 3 groups: [1] Both and are exported alpha, arm, hexagon, mips, powerpc, s390, sparc, x86 [2] is exported, but is not arc, arm64, c6x, h8300, ia64, m68k, microblaze, nios2, openrisc, parisc, sh, unicore32, xtensa [3] Neither nor is exported csky, nds32, riscv This does not match to the actual KVM support. At least, [2] is half-baked. Nor do arch maintainers look like they care about this. For example, commit 0add53713b1c ("microblaze: Add missing kvm_para.h to Kbuild") exported to user-space in order to fix an in-kernel build error. We have two ways to make this consistent: [A] export both and for all architectures, irrespective of the KVM support [B] Match the header export of and to the KVM support My first attempt was [A] because the code looks cleaner, but Paolo suggested [B]. So, this commit goes with [B]. For most architectures, was moved to the kernel-space. I changed include/uapi/linux/Kbuild so that it checks generated asm/kvm_para.h as well as check-in ones. After this commit, there will be two groups: [1] Both and are exported arm, arm64, mips, powerpc, s390, x86 [2] Neither nor is exported alpha, arc, c6x, csky, h8300, hexagon, ia64, m68k, microblaze, nds32, nios2, openrisc, parisc, riscv, sh, sparc, unicore32, xtensa Signed-off-by: Masahiro Yamada Acked-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- include/uapi/linux/Kbuild | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 5f24b50c9e88..059dc2bedaf6 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -7,5 +7,7 @@ no-export-headers += kvm.h endif ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),) +ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),) no-export-headers += kvm_para.h endif +endif -- cgit v1.2.3