From 882227626459ce7a24191f13c6d9749a00992059 Mon Sep 17 00:00:00 2001 From: Scott Branden Date: Wed, 20 Jan 2021 09:58:15 -0800 Subject: bcm-vk: add bcm_vk UAPI Add user space api for bcm-vk driver. Provide ioctl api to load images and issue reset command to card. FW status registers in PCI BAR space also defined as part of API so that user space is able to interpret these memory locations as needed via direct PCIe access. Acked-by: Olof Johansson Signed-off-by: Scott Branden Link: https://lore.kernel.org/r/20210120175827.14820-2-scott.branden@broadcom.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/misc/bcm_vk.h | 84 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 include/uapi/linux/misc/bcm_vk.h (limited to 'include/uapi') diff --git a/include/uapi/linux/misc/bcm_vk.h b/include/uapi/linux/misc/bcm_vk.h new file mode 100644 index 000000000000..ec28e0bd46a9 --- /dev/null +++ b/include/uapi/linux/misc/bcm_vk.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright 2018-2020 Broadcom. + */ + +#ifndef __UAPI_LINUX_MISC_BCM_VK_H +#define __UAPI_LINUX_MISC_BCM_VK_H + +#include +#include + +#define BCM_VK_MAX_FILENAME 64 + +struct vk_image { + __u32 type; /* Type of image */ +#define VK_IMAGE_TYPE_BOOT1 1 /* 1st stage (load to SRAM) */ +#define VK_IMAGE_TYPE_BOOT2 2 /* 2nd stage (load to DDR) */ + __u8 filename[BCM_VK_MAX_FILENAME]; /* Filename of image */ +}; + +struct vk_reset { + __u32 arg1; + __u32 arg2; +}; + +#define VK_MAGIC 0x5e + +/* Load image to Valkyrie */ +#define VK_IOCTL_LOAD_IMAGE _IOW(VK_MAGIC, 0x2, struct vk_image) + +/* Send Reset to Valkyrie */ +#define VK_IOCTL_RESET _IOW(VK_MAGIC, 0x4, struct vk_reset) + +/* + * Firmware Status accessed directly via BAR space + */ +#define VK_BAR_FWSTS 0x41c +#define VK_BAR_COP_FWSTS 0x428 +/* VK_FWSTS definitions */ +#define VK_FWSTS_RELOCATION_ENTRY (1UL << 0) +#define VK_FWSTS_RELOCATION_EXIT (1UL << 1) +#define VK_FWSTS_INIT_START (1UL << 2) +#define VK_FWSTS_ARCH_INIT_DONE (1UL << 3) +#define VK_FWSTS_PRE_KNL1_INIT_DONE (1UL << 4) +#define VK_FWSTS_PRE_KNL2_INIT_DONE (1UL << 5) +#define VK_FWSTS_POST_KNL_INIT_DONE (1UL << 6) +#define VK_FWSTS_INIT_DONE (1UL << 7) +#define VK_FWSTS_APP_INIT_START (1UL << 8) +#define VK_FWSTS_APP_INIT_DONE (1UL << 9) +#define VK_FWSTS_MASK 0xffffffff +#define VK_FWSTS_READY (VK_FWSTS_INIT_START | \ + VK_FWSTS_ARCH_INIT_DONE | \ + VK_FWSTS_PRE_KNL1_INIT_DONE | \ + VK_FWSTS_PRE_KNL2_INIT_DONE | \ + VK_FWSTS_POST_KNL_INIT_DONE | \ + VK_FWSTS_INIT_DONE | \ + VK_FWSTS_APP_INIT_START | \ + VK_FWSTS_APP_INIT_DONE) +/* Deinit */ +#define VK_FWSTS_APP_DEINIT_START (1UL << 23) +#define VK_FWSTS_APP_DEINIT_DONE (1UL << 24) +#define VK_FWSTS_DRV_DEINIT_START (1UL << 25) +#define VK_FWSTS_DRV_DEINIT_DONE (1UL << 26) +#define VK_FWSTS_RESET_DONE (1UL << 27) +#define VK_FWSTS_DEINIT_TRIGGERED (VK_FWSTS_APP_DEINIT_START | \ + VK_FWSTS_APP_DEINIT_DONE | \ + VK_FWSTS_DRV_DEINIT_START | \ + VK_FWSTS_DRV_DEINIT_DONE) +/* Last nibble for reboot reason */ +#define VK_FWSTS_RESET_REASON_SHIFT 28 +#define VK_FWSTS_RESET_REASON_MASK (0xf << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_SYS_PWRUP (0x0 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_MBOX_DB (0x1 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_M7_WDOG (0x2 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_TEMP (0x3 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_PCI_FLR (0x4 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_PCI_HOT (0x5 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_PCI_WARM (0x6 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_PCI_COLD (0x7 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_L1 (0x8 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_L0 (0x9 << VK_FWSTS_RESET_REASON_SHIFT) +#define VK_FWSTS_RESET_UNKNOWN (0xf << VK_FWSTS_RESET_REASON_SHIFT) + +#endif /* __UAPI_LINUX_MISC_BCM_VK_H */ -- cgit v1.2.3 From 8544717cdacc2f33f0f53a3b34c5125b37e13ce9 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 14 Jan 2021 19:07:48 +0200 Subject: bus: fsl-mc: move fsl_mc_command struct in a uapi header Define "struct fsl_mc_command" as a structure that can cross the user/kernel boundary. Acked-by: Laurentiu Tudor Signed-off-by: Ioana Ciornei Link: https://lore.kernel.org/r/20210114170752.2927915-2-ciorneiioana@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/fsl_mc.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 include/uapi/linux/fsl_mc.h (limited to 'include/uapi') diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h new file mode 100644 index 000000000000..cf56d46f052e --- /dev/null +++ b/include/uapi/linux/fsl_mc.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Management Complex (MC) userspace public interface + * + * Copyright 2021 NXP + * + */ +#ifndef _UAPI_FSL_MC_H_ +#define _UAPI_FSL_MC_H_ + +#include + +#define MC_CMD_NUM_OF_PARAMS 7 + +/** + * struct fsl_mc_command - Management Complex (MC) command structure + * @header: MC command header + * @params: MC command parameters + */ +struct fsl_mc_command { + __le64 header; + __le64 params[MC_CMD_NUM_OF_PARAMS]; +}; + +#endif /* _UAPI_FSL_MC_H_ */ -- cgit v1.2.3 From 2cf1e703f066cfa82eb5a358ae84c29fe15a3b3a Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 14 Jan 2021 19:07:50 +0200 Subject: bus: fsl-mc: add fsl-mc userspace support Adding userspace support for the MC (Management Complex) means exporting an ioctl capable device file representing the root resource container. This new functionality in the fsl-mc bus driver intends to provide userspace applications an interface to interact with the MC firmware. Commands that are composed in userspace are sent to the MC firmware through the FSL_MC_SEND_MC_COMMAND ioctl. By default the implicit MC I/O portal is used for this operation, but if the implicit one is busy, a dynamic portal is allocated and then freed upon execution. The command received through the ioctl interface is checked against a known whitelist of accepted MC commands. Commands that attempt a change in hardware configuration will need CAP_NET_ADMIN, while commands used in debugging do not need it. Acked-by: Laurentiu Tudor Signed-off-by: Ioana Ciornei Link: https://lore.kernel.org/r/20210114170752.2927915-4-ciorneiioana@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/fsl_mc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h index cf56d46f052e..e57451570033 100644 --- a/include/uapi/linux/fsl_mc.h +++ b/include/uapi/linux/fsl_mc.h @@ -16,10 +16,19 @@ * struct fsl_mc_command - Management Complex (MC) command structure * @header: MC command header * @params: MC command parameters + * + * Used by FSL_MC_SEND_MC_COMMAND */ struct fsl_mc_command { __le64 header; __le64 params[MC_CMD_NUM_OF_PARAMS]; }; +#define FSL_MC_SEND_CMD_IOCTL_TYPE 'R' +#define FSL_MC_SEND_CMD_IOCTL_SEQ 0xE0 + +#define FSL_MC_SEND_MC_COMMAND \ + _IOWR(FSL_MC_SEND_CMD_IOCTL_TYPE, FSL_MC_SEND_CMD_IOCTL_SEQ, \ + struct fsl_mc_command) + #endif /* _UAPI_FSL_MC_H_ */ -- cgit v1.2.3 From c209e742141bc0064ecdb73ef0dd825310de28cb Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 3 Dec 2020 17:12:09 +0200 Subject: habanalabs: allow user to pass a staged submission seq In order to support the staged submission feature, user must be allowed to use the same CS sequence for all submissions in the same staged submission. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index dba3827c43ca..b9afe1cdac24 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -604,11 +604,14 @@ struct hl_cs_chunk { }; /* SIGNAL and WAIT/COLLECTIVE_WAIT flags are mutually exclusive */ -#define HL_CS_FLAGS_FORCE_RESTORE 0x1 -#define HL_CS_FLAGS_SIGNAL 0x2 -#define HL_CS_FLAGS_WAIT 0x4 -#define HL_CS_FLAGS_COLLECTIVE_WAIT 0x8 -#define HL_CS_FLAGS_TIMESTAMP 0x20 +#define HL_CS_FLAGS_FORCE_RESTORE 0x1 +#define HL_CS_FLAGS_SIGNAL 0x2 +#define HL_CS_FLAGS_WAIT 0x4 +#define HL_CS_FLAGS_COLLECTIVE_WAIT 0x8 +#define HL_CS_FLAGS_TIMESTAMP 0x20 +#define HL_CS_FLAGS_STAGED_SUBMISSION 0x40 +#define HL_CS_FLAGS_STAGED_SUBMISSION_FIRST 0x80 +#define HL_CS_FLAGS_STAGED_SUBMISSION_LAST 0x100 #define HL_CS_STATUS_SUCCESS 0 @@ -622,10 +625,17 @@ struct hl_cs_in { /* holds address of array of hl_cs_chunk for execution phase */ __u64 chunks_execute; - /* this holds address of array of hl_cs_chunk for store phase - - * Currently not in use - */ - __u64 chunks_store; + union { + /* this holds address of array of hl_cs_chunk for store phase - + * Currently not in use + */ + __u64 chunks_store; + + /* Sequence number of a staged submission CS + * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set + */ + __u64 seq; + }; /* Number of chunks in restore phase array. Maximum number is * HL_MAX_JOBS_PER_CS -- cgit v1.2.3 From 0eda23d77e1beede9c61b0cba6d9267d3a92bd4e Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Mon, 7 Dec 2020 09:10:34 +0200 Subject: habanalabs: report dram_page_size in hw_ip_info ioctl Instead of having it hard-coded as a define, pass it to the user in runtime. Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index b9afe1cdac24..b431a70e1b8b 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -320,6 +320,8 @@ struct hl_info_hw_ip_info { __u8 pad[2]; __u8 cpucp_version[HL_INFO_VERSION_MAX_LEN]; __u8 card_name[HL_INFO_CARD_NAME_MAX_LEN]; + __u64 reserved2; + __u64 dram_page_size; }; struct hl_info_dram_usage { -- cgit v1.2.3 From e1fa724dd17a6a9b9934636226e683912d12c876 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 6 Jan 2021 15:40:37 +0200 Subject: habanalabs: add user available interrupt to hw_ip In order to support completions that arrive directly to the user, the driver needs to supply the user with the first available msix interrupt available. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index b431a70e1b8b..866355a53188 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -309,7 +309,9 @@ struct hl_info_hw_ip_info { __u32 num_of_events; __u32 device_id; /* PCI Device ID */ __u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */ - __u32 reserved[2]; + __u32 reserved; + __u16 first_available_interrupt_id; + __u16 reserved2; __u32 cpld_version; __u32 psoc_pci_pll_nr; __u32 psoc_pci_pll_nf; @@ -320,7 +322,7 @@ struct hl_info_hw_ip_info { __u8 pad[2]; __u8 cpucp_version[HL_INFO_VERSION_MAX_LEN]; __u8 card_name[HL_INFO_CARD_NAME_MAX_LEN]; - __u64 reserved2; + __u64 reserved3; __u64 dram_page_size; }; -- cgit v1.2.3 From d00697fbe13ccc1dfb7adb82204e1469a7783b07 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 5 Jan 2021 12:55:06 +0200 Subject: habanalabs: add new mem ioctl op for mapping hw blocks For future ASIC support the driver allows user to map certain regions in the device's configuration space for direct access from userspace. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 866355a53188..b1c09eba8ac2 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -718,6 +718,8 @@ union hl_wait_cs_args { #define HL_MEM_OP_MAP 2 /* Opcode to unmap previously mapped host and device memory */ #define HL_MEM_OP_UNMAP 3 +/* Opcode to map a hw block */ +#define HL_MEM_OP_MAP_BLOCK 4 /* Memory flags */ #define HL_MEM_CONTIGUOUS 0x1 @@ -772,6 +774,17 @@ struct hl_mem_in { __u64 mem_size; } map_host; + /* HL_MEM_OP_MAP_BLOCK - map a hw block */ + struct { + /* + * HW block address to map, a handle will be returned + * to the user and will be used to mmap the relevant + * block. Only addresses from configuration space are + * allowed. + */ + __u64 block_addr; + } map_block; + /* HL_MEM_OP_UNMAP - unmap host memory */ struct { /* Virtual address returned from HL_MEM_OP_MAP */ @@ -798,8 +811,9 @@ struct hl_mem_out { __u64 device_virt_addr; /* - * Used for HL_MEM_OP_ALLOC. This is the assigned - * handle for the allocated memory + * Used for HL_MEM_OP_ALLOC and HL_MEM_OP_MAP_BLOCK. + * This is the assigned handle for the allocated memory + * or mapped block */ __u64 handle; }; -- cgit v1.2.3 From cf30339d3f44a64115e88d46a932fdc3d3644785 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 17 Jan 2021 16:01:56 +0200 Subject: habanalabs: modify device_idle interface Currently this API uses single 64 bits mask for engines idle indication. Recently, it was observed that more bits are needed for some ASICs. This patch modifies the use of the idle mask and the idle_extensions mask. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index b1c09eba8ac2..ebde42b37b43 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -331,6 +331,8 @@ struct hl_info_dram_usage { __u64 ctx_dram_mem; }; +#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2 + struct hl_info_hw_idle { __u32 is_idle; /* @@ -343,7 +345,7 @@ struct hl_info_hw_idle { * Extended Bitmask of busy engines. * Bits definition is according to `enum _enging_id'. */ - __u64 busy_engines_mask_ext; + __u64 busy_engines_mask_ext[HL_BUSY_ENGINES_MASK_EXT_SIZE]; }; struct hl_info_device_status { -- cgit v1.2.3 From c98fe7c2a2038af2bc24f039ccdb5a65c22ba11a Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:04 -0800 Subject: vfio: option to unmap all For the UNMAP_DMA ioctl, delete all mappings if VFIO_DMA_UNMAP_FLAG_ALL is set. Define the corresponding VFIO_UNMAP_ALL extension. Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d1812777139f..78462599e5ed 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -46,6 +46,9 @@ */ #define VFIO_NOIOMMU_IOMMU 8 +/* Supports VFIO_DMA_UNMAP_FLAG_ALL */ +#define VFIO_UNMAP_ALL 9 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -1102,6 +1105,7 @@ struct vfio_bitmap { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. + * * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap * before unmapping IO virtual addresses. When this flag is set, the user must * provide a struct vfio_bitmap in data[]. User must provide zero-allocated @@ -1111,11 +1115,15 @@ struct vfio_bitmap { * indicates that the page at that offset from iova is dirty. A Bitmap of the * pages in the range of unmapped size is returned in the user-provided * vfio_bitmap.data. + * + * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses. iova and size + * must be 0. This cannot be combined with the get-dirty-bitmap flag. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) +#define VFIO_DMA_UNMAP_FLAG_ALL (1 << 1) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ __u8 data[]; -- cgit v1.2.3 From 441e8106a238920f28e2e79cff462044551f60e2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 29 Jan 2021 08:54:07 -0800 Subject: vfio: interfaces to update vaddr Define interfaces that allow the underlying memory object of an iova range to be mapped to a new host virtual address in the host process: - VFIO_DMA_UNMAP_FLAG_VADDR for VFIO_IOMMU_UNMAP_DMA - VFIO_DMA_MAP_FLAG_VADDR flag for VFIO_IOMMU_MAP_DMA - VFIO_UPDATE_VADDR extension for VFIO_CHECK_EXTENSION Unmap vaddr invalidates the host virtual address in an iova range, and blocks vfio translation of host virtual addresses. DMA to already-mapped pages continues. Map vaddr updates the base VA and resumes translation. See comments in uapi/linux/vfio.h for more details. Signed-off-by: Steve Sistare Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 78462599e5ed..8ce36c1d53ca 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -49,6 +49,9 @@ /* Supports VFIO_DMA_UNMAP_FLAG_ALL */ #define VFIO_UNMAP_ALL 9 +/* Supports the vaddr flag for DMA map and unmap */ +#define VFIO_UPDATE_VADDR 10 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -1077,12 +1080,22 @@ struct vfio_iommu_type1_info_dma_avail { * * Map process virtual addresses to IO virtual addresses using the * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required. + * + * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and + * unblock translation of host virtual addresses in the iova range. The vaddr + * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To + * maintain memory consistency within the user application, the updated vaddr + * must address the same memory object as originally mapped. Failure to do so + * will result in user memory corruption and/or device misbehavior. iova and + * size must match those in the original MAP_DMA call. Protection is not + * changed, and the READ & WRITE flags must be 0. */ struct vfio_iommu_type1_dma_map { __u32 argsz; __u32 flags; #define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ +#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2) __u64 vaddr; /* Process virtual address */ __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ @@ -1118,12 +1131,18 @@ struct vfio_bitmap { * * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses. iova and size * must be 0. This cannot be combined with the get-dirty-bitmap flag. + * + * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host + * virtual addresses in the iova range. Tasks that attempt to translate an + * iova's vaddr will block. DMA to already-mapped pages continues. This + * cannot be combined with the get-dirty-bitmap flag. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) #define VFIO_DMA_UNMAP_FLAG_ALL (1 << 1) +#define VFIO_DMA_UNMAP_FLAG_VADDR (1 << 2) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ __u8 data[]; -- cgit v1.2.3 From e52606d2f5363f4900cfe8419e391644b0229c6f Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 27 Jan 2021 16:34:37 +0200 Subject: habanalabs: support fetching first available user CQ User must be aware of the available CQs when it needs to use them. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index ebde42b37b43..64ae83b5f8e5 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -414,10 +414,13 @@ struct hl_pll_frequency_info { * struct hl_info_sync_manager - sync manager information * @first_available_sync_object: first available sob * @first_available_monitor: first available monitor + * @first_available_cq: first available cq */ struct hl_info_sync_manager { __u32 first_available_sync_object; __u32 first_available_monitor; + __u32 first_available_cq; + __u32 reserved; }; /** -- cgit v1.2.3 From 6df50d274363aa189a31435024339b781a6e32a9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 5 Feb 2021 16:04:34 +0200 Subject: habanalabs: return block size + block ID When user gives us a block address to get its ID to mmap it, he also needs to get from us the block size to pass to the driver in the mmap function. Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 64ae83b5f8e5..5a86b521a450 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -782,10 +782,10 @@ struct hl_mem_in { /* HL_MEM_OP_MAP_BLOCK - map a hw block */ struct { /* - * HW block address to map, a handle will be returned - * to the user and will be used to mmap the relevant - * block. Only addresses from configuration space are - * allowed. + * HW block address to map, a handle and size will be + * returned to the user and will be used to mmap the + * relevant block. Only addresses from configuration + * space are allowed. */ __u64 block_addr; } map_block; @@ -816,11 +816,26 @@ struct hl_mem_out { __u64 device_virt_addr; /* - * Used for HL_MEM_OP_ALLOC and HL_MEM_OP_MAP_BLOCK. + * Used in HL_MEM_OP_ALLOC * This is the assigned handle for the allocated memory - * or mapped block */ __u64 handle; + + struct { + /* + * Used in HL_MEM_OP_MAP_BLOCK. + * This is the assigned handle for the mapped block + */ + __u64 block_handle; + + /* + * Used in HL_MEM_OP_MAP_BLOCK + * This is the size of the mapped block + */ + __u32 block_size; + + __u32 pad; + }; }; }; -- cgit v1.2.3 From 9c5137aedd112f78a968bdd2325de2ea06df46c0 Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:28 +0800 Subject: virt: acrn: Introduce VM management interfaces The VM management interfaces expose several VM operations to ACRN userspace via ioctls. For example, creating VM, starting VM, destroying VM and so on. The ACRN Hypervisor needs to exchange data with the ACRN userspace during the VM operations. HSM provides VM operation ioctls to the ACRN userspace and communicates with the ACRN Hypervisor for VM operations via hypercalls. HSM maintains a list of User VM. Each User VM will be bound to an existing file descriptor of /dev/acrn_hsm. The User VM will be destroyed when the file descriptor is closed. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-7-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 include/uapi/linux/acrn.h (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h new file mode 100644 index 000000000000..32521168075f --- /dev/null +++ b/include/uapi/linux/acrn.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interface for /dev/acrn_hsm - ACRN Hypervisor Service Module + * + * This file can be used by applications that need to communicate with the HSM + * via the ioctl interface. + * + * Copyright (C) 2021 Intel Corporation. All rights reserved. + */ + +#ifndef _UAPI_ACRN_H +#define _UAPI_ACRN_H + +#include +#include + +/** + * struct acrn_vm_creation - Info to create a User VM + * @vmid: User VM ID returned from the hypervisor + * @reserved0: Reserved and must be 0 + * @vcpu_num: Number of vCPU in the VM. Return from hypervisor. + * @reserved1: Reserved and must be 0 + * @uuid: UUID of the VM. Pass to hypervisor directly. + * @vm_flag: Flag of the VM creating. Pass to hypervisor directly. + * @ioreq_buf: Service VM GPA of I/O request buffer. Pass to + * hypervisor directly. + * @cpu_affinity: CPU affinity of the VM. Pass to hypervisor directly. + * It's a bitmap which indicates CPUs used by the VM. + */ +struct acrn_vm_creation { + __u16 vmid; + __u16 reserved0; + __u16 vcpu_num; + __u16 reserved1; + guid_t uuid; + __u64 vm_flag; + __u64 ioreq_buf; + __u64 cpu_affinity; +}; + +/* The ioctl type, documented in ioctl-number.rst */ +#define ACRN_IOCTL_TYPE 0xA2 + +/* + * Common IOCTL IDs definition for ACRN userspace + */ +#define ACRN_IOCTL_CREATE_VM \ + _IOWR(ACRN_IOCTL_TYPE, 0x10, struct acrn_vm_creation) +#define ACRN_IOCTL_DESTROY_VM \ + _IO(ACRN_IOCTL_TYPE, 0x11) +#define ACRN_IOCTL_START_VM \ + _IO(ACRN_IOCTL_TYPE, 0x12) +#define ACRN_IOCTL_PAUSE_VM \ + _IO(ACRN_IOCTL_TYPE, 0x13) +#define ACRN_IOCTL_RESET_VM \ + _IO(ACRN_IOCTL_TYPE, 0x15) + +#endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From 2ad2aaee1bc9568d0c146463483d2c926ef20055 Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:29 +0800 Subject: virt: acrn: Introduce an ioctl to set vCPU registers state A virtual CPU of User VM has different context due to the different registers state. ACRN userspace needs to set the virtual CPU registers state (e.g. giving a initial registers state to a virtual BSP of a User VM). HSM provides an ioctl ACRN_IOCTL_SET_VCPU_REGS to do the virtual CPU registers state setting. The ioctl passes the registers state from ACRN userspace to the hypervisor directly. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-8-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 119 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index 32521168075f..775c58bad026 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -38,6 +38,123 @@ struct acrn_vm_creation { __u64 cpu_affinity; }; +/** + * struct acrn_gp_regs - General registers of a User VM + * @rax: Value of register RAX + * @rcx: Value of register RCX + * @rdx: Value of register RDX + * @rbx: Value of register RBX + * @rsp: Value of register RSP + * @rbp: Value of register RBP + * @rsi: Value of register RSI + * @rdi: Value of register RDI + * @r8: Value of register R8 + * @r9: Value of register R9 + * @r10: Value of register R10 + * @r11: Value of register R11 + * @r12: Value of register R12 + * @r13: Value of register R13 + * @r14: Value of register R14 + * @r15: Value of register R15 + */ +struct acrn_gp_regs { + __le64 rax; + __le64 rcx; + __le64 rdx; + __le64 rbx; + __le64 rsp; + __le64 rbp; + __le64 rsi; + __le64 rdi; + __le64 r8; + __le64 r9; + __le64 r10; + __le64 r11; + __le64 r12; + __le64 r13; + __le64 r14; + __le64 r15; +}; + +/** + * struct acrn_descriptor_ptr - Segment descriptor table of a User VM. + * @limit: Limit field. + * @base: Base field. + * @reserved: Reserved and must be 0. + */ +struct acrn_descriptor_ptr { + __le16 limit; + __le64 base; + __le16 reserved[3]; +} __attribute__ ((__packed__)); + +/** + * struct acrn_regs - Registers structure of a User VM + * @gprs: General registers + * @gdt: Global Descriptor Table + * @idt: Interrupt Descriptor Table + * @rip: Value of register RIP + * @cs_base: Base of code segment selector + * @cr0: Value of register CR0 + * @cr4: Value of register CR4 + * @cr3: Value of register CR3 + * @ia32_efer: Value of IA32_EFER MSR + * @rflags: Value of regsiter RFLAGS + * @reserved_64: Reserved and must be 0 + * @cs_ar: Attribute field of code segment selector + * @cs_limit: Limit field of code segment selector + * @reserved_32: Reserved and must be 0 + * @cs_sel: Value of code segment selector + * @ss_sel: Value of stack segment selector + * @ds_sel: Value of data segment selector + * @es_sel: Value of extra segment selector + * @fs_sel: Value of FS selector + * @gs_sel: Value of GS selector + * @ldt_sel: Value of LDT descriptor selector + * @tr_sel: Value of TSS descriptor selector + */ +struct acrn_regs { + struct acrn_gp_regs gprs; + struct acrn_descriptor_ptr gdt; + struct acrn_descriptor_ptr idt; + + __le64 rip; + __le64 cs_base; + __le64 cr0; + __le64 cr4; + __le64 cr3; + __le64 ia32_efer; + __le64 rflags; + __le64 reserved_64[4]; + + __le32 cs_ar; + __le32 cs_limit; + __le32 reserved_32[3]; + + __le16 cs_sel; + __le16 ss_sel; + __le16 ds_sel; + __le16 es_sel; + __le16 fs_sel; + __le16 gs_sel; + __le16 ldt_sel; + __le16 tr_sel; +}; + +/** + * struct acrn_vcpu_regs - Info of vCPU registers state + * @vcpu_id: vCPU ID + * @reserved: Reserved and must be 0 + * @vcpu_regs: vCPU registers state + * + * This structure will be passed to hypervisor directly. + */ +struct acrn_vcpu_regs { + __u16 vcpu_id; + __u16 reserved[3]; + struct acrn_regs vcpu_regs; +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -54,5 +171,7 @@ struct acrn_vm_creation { _IO(ACRN_IOCTL_TYPE, 0x13) #define ACRN_IOCTL_RESET_VM \ _IO(ACRN_IOCTL_TYPE, 0x15) +#define ACRN_IOCTL_SET_VCPU_REGS \ + _IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs) #endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From 88f537d5e8ddc89c2622f4a2bc1eb28455e8339c Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:30 +0800 Subject: virt: acrn: Introduce EPT mapping management The HSM provides hypervisor services to the ACRN userspace. While launching a User VM, ACRN userspace needs to allocate memory and request the ACRN Hypervisor to set up the EPT mapping for the VM. A mapping cache is introduced for accelerating the translation between the Service VM kernel virtual address and User VM physical address. >From the perspective of the hypervisor, the types of GPA of User VM can be listed as following: 1) RAM region, which is used by User VM as system ram. 2) MMIO region, which is recognized by User VM as MMIO. MMIO region is used to be utilized for devices emulation. Generally, User VM RAM regions mapping is set up before VM started and is released in the User VM destruction. MMIO regions mapping may be set and unset dynamically during User VM running. To achieve this, ioctls ACRN_IOCTL_SET_MEMSEG and ACRN_IOCTL_UNSET_MEMSEG are introduced in HSM. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-9-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index 775c58bad026..ec4c61e7c170 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -155,6 +155,50 @@ struct acrn_vcpu_regs { struct acrn_regs vcpu_regs; }; +#define ACRN_MEM_ACCESS_RIGHT_MASK 0x00000007U +#define ACRN_MEM_ACCESS_READ 0x00000001U +#define ACRN_MEM_ACCESS_WRITE 0x00000002U +#define ACRN_MEM_ACCESS_EXEC 0x00000004U +#define ACRN_MEM_ACCESS_RWX (ACRN_MEM_ACCESS_READ | \ + ACRN_MEM_ACCESS_WRITE | \ + ACRN_MEM_ACCESS_EXEC) + +#define ACRN_MEM_TYPE_MASK 0x000007C0U +#define ACRN_MEM_TYPE_WB 0x00000040U +#define ACRN_MEM_TYPE_WT 0x00000080U +#define ACRN_MEM_TYPE_UC 0x00000100U +#define ACRN_MEM_TYPE_WC 0x00000200U +#define ACRN_MEM_TYPE_WP 0x00000400U + +/* Memory mapping types */ +#define ACRN_MEMMAP_RAM 0 +#define ACRN_MEMMAP_MMIO 1 + +/** + * struct acrn_vm_memmap - A EPT memory mapping info for a User VM. + * @type: Type of the memory mapping (ACRM_MEMMAP_*). + * Pass to hypervisor directly. + * @attr: Attribute of the memory mapping. + * Pass to hypervisor directly. + * @user_vm_pa: Physical address of User VM. + * Pass to hypervisor directly. + * @service_vm_pa: Physical address of Service VM. + * Pass to hypervisor directly. + * @vma_base: VMA address of Service VM. Pass to hypervisor directly. + * @len: Length of the memory mapping. + * Pass to hypervisor directly. + */ +struct acrn_vm_memmap { + __u32 type; + __u32 attr; + __u64 user_vm_pa; + union { + __u64 service_vm_pa; + __u64 vma_base; + }; + __u64 len; +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -174,4 +218,9 @@ struct acrn_vcpu_regs { #define ACRN_IOCTL_SET_VCPU_REGS \ _IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs) +#define ACRN_IOCTL_SET_MEMSEG \ + _IOW(ACRN_IOCTL_TYPE, 0x41, struct acrn_vm_memmap) +#define ACRN_IOCTL_UNSET_MEMSEG \ + _IOW(ACRN_IOCTL_TYPE, 0x42, struct acrn_vm_memmap) + #endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From 72f293de3ff40b57db573c1bf623f494f3446f74 Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:31 +0800 Subject: virt: acrn: Introduce I/O request management An I/O request of a User VM, which is constructed by the hypervisor, is distributed by the ACRN Hypervisor Service Module to an I/O client corresponding to the address range of the I/O request. For each User VM, there is a shared 4-KByte memory region used for I/O requests communication between the hypervisor and Service VM. An I/O request is a 256-byte structure buffer, which is 'struct acrn_io_request', that is filled by an I/O handler of the hypervisor when a trapped I/O access happens in a User VM. ACRN userspace in the Service VM first allocates a 4-KByte page and passes the GPA (Guest Physical Address) of the buffer to the hypervisor. The buffer is used as an array of 16 I/O request slots with each I/O request slot being 256 bytes. This array is indexed by vCPU ID. An I/O client, which is 'struct acrn_ioreq_client', is responsible for handling User VM I/O requests whose accessed GPA falls in a certain range. Multiple I/O clients can be associated with each User VM. There is a special client associated with each User VM, called the default client, that handles all I/O requests that do not fit into the range of any other I/O clients. The ACRN userspace acts as the default client for each User VM. The state transitions of a ACRN I/O request are as follows. FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ... FREE: this I/O request slot is empty PENDING: a valid I/O request is pending in this slot PROCESSING: the I/O request is being processed COMPLETE: the I/O request has been processed An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM and ACRN userspace are in charge of processing the others. The processing flow of I/O requests are listed as following: a) The I/O handler of the hypervisor will fill an I/O request with PENDING state when a trapped I/O access happens in a User VM. b) The hypervisor makes an upcall, which is a notification interrupt, to the Service VM. c) The upcall handler schedules a worker to dispatch I/O requests. d) The worker looks for the PENDING I/O requests, assigns them to different registered clients based on the address of the I/O accesses, updates their state to PROCESSING, and notifies the corresponding client to handle. e) The notified client handles the assigned I/O requests. f) The HSM updates I/O requests states to COMPLETE and notifies the hypervisor of the completion via hypercalls. Cc: Davidlohr Bueso Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Acked-by: Davidlohr Bueso Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-10-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 150 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index ec4c61e7c170..b0f73ab5e4e8 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -14,6 +14,145 @@ #include #include +#define ACRN_IO_REQUEST_MAX 16 + +#define ACRN_IOREQ_STATE_PENDING 0 +#define ACRN_IOREQ_STATE_COMPLETE 1 +#define ACRN_IOREQ_STATE_PROCESSING 2 +#define ACRN_IOREQ_STATE_FREE 3 + +#define ACRN_IOREQ_TYPE_PORTIO 0 +#define ACRN_IOREQ_TYPE_MMIO 1 + +#define ACRN_IOREQ_DIR_READ 0 +#define ACRN_IOREQ_DIR_WRITE 1 + +/** + * struct acrn_mmio_request - Info of a MMIO I/O request + * @direction: Access direction of this request (ACRN_IOREQ_DIR_*) + * @reserved: Reserved for alignment and should be 0 + * @address: Access address of this MMIO I/O request + * @size: Access size of this MMIO I/O request + * @value: Read/write value of this MMIO I/O request + */ +struct acrn_mmio_request { + __u32 direction; + __u32 reserved; + __u64 address; + __u64 size; + __u64 value; +}; + +/** + * struct acrn_pio_request - Info of a PIO I/O request + * @direction: Access direction of this request (ACRN_IOREQ_DIR_*) + * @reserved: Reserved for alignment and should be 0 + * @address: Access address of this PIO I/O request + * @size: Access size of this PIO I/O request + * @value: Read/write value of this PIO I/O request + */ +struct acrn_pio_request { + __u32 direction; + __u32 reserved; + __u64 address; + __u64 size; + __u32 value; +}; + +/** + * struct acrn_io_request - 256-byte ACRN I/O request + * @type: Type of this request (ACRN_IOREQ_TYPE_*). + * @completion_polling: Polling flag. Hypervisor will poll completion of the + * I/O request if this flag set. + * @reserved0: Reserved fields. + * @reqs: Union of different types of request. Byte offset: 64. + * @reqs.pio_request: PIO request data of the I/O request. + * @reqs.mmio_request: MMIO request data of the I/O request. + * @reqs.data: Raw data of the I/O request. + * @reserved1: Reserved fields. + * @kernel_handled: Flag indicates this request need be handled in kernel. + * @processed: The status of this request (ACRN_IOREQ_STATE_*). + * + * The state transitions of ACRN I/O request: + * + * FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ... + * + * An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM and + * ACRN userspace are in charge of processing the others. + * + * On basis of the states illustrated above, a typical lifecycle of ACRN IO + * request would look like: + * + * Flow (assume the initial state is FREE) + * | + * | Service VM vCPU 0 Service VM vCPU x User vCPU y + * | + * | hypervisor: + * | fills in type, addr, etc. + * | pauses the User VM vCPU y + * | sets the state to PENDING (a) + * | fires an upcall to Service VM + * | + * | HSM: + * | scans for PENDING requests + * | sets the states to PROCESSING (b) + * | assigns the requests to clients (c) + * V + * | client: + * | scans for the assigned requests + * | handles the requests (d) + * | HSM: + * | sets states to COMPLETE + * | notifies the hypervisor + * | + * | hypervisor: + * | resumes User VM vCPU y (e) + * | + * | hypervisor: + * | post handling (f) + * V sets states to FREE + * + * Note that the procedures (a) to (f) in the illustration above require to be + * strictly processed in the order. One vCPU cannot trigger another request of + * I/O emulation before completing the previous one. + * + * Atomic and barriers are required when HSM and hypervisor accessing the state + * of &struct acrn_io_request. + * + */ +struct acrn_io_request { + __u32 type; + __u32 completion_polling; + __u32 reserved0[14]; + union { + struct acrn_pio_request pio_request; + struct acrn_mmio_request mmio_request; + __u64 data[8]; + } reqs; + __u32 reserved1; + __u32 kernel_handled; + __u32 processed; +} __attribute__((aligned(256))); + +struct acrn_io_request_buffer { + union { + struct acrn_io_request req_slot[ACRN_IO_REQUEST_MAX]; + __u8 reserved[4096]; + }; +}; + +/** + * struct acrn_ioreq_notify - The structure of ioreq completion notification + * @vmid: User VM ID + * @reserved: Reserved and should be 0 + * @vcpu: vCPU ID + */ +struct acrn_ioreq_notify { + __u16 vmid; + __u16 reserved; + __u32 vcpu; +}; + /** * struct acrn_vm_creation - Info to create a User VM * @vmid: User VM ID returned from the hypervisor @@ -218,6 +357,17 @@ struct acrn_vm_memmap { #define ACRN_IOCTL_SET_VCPU_REGS \ _IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs) +#define ACRN_IOCTL_NOTIFY_REQUEST_FINISH \ + _IOW(ACRN_IOCTL_TYPE, 0x31, struct acrn_ioreq_notify) +#define ACRN_IOCTL_CREATE_IOREQ_CLIENT \ + _IO(ACRN_IOCTL_TYPE, 0x32) +#define ACRN_IOCTL_ATTACH_IOREQ_CLIENT \ + _IO(ACRN_IOCTL_TYPE, 0x33) +#define ACRN_IOCTL_DESTROY_IOREQ_CLIENT \ + _IO(ACRN_IOCTL_TYPE, 0x34) +#define ACRN_IOCTL_CLEAR_VM_IOREQ \ + _IO(ACRN_IOCTL_TYPE, 0x35) + #define ACRN_IOCTL_SET_MEMSEG \ _IOW(ACRN_IOCTL_TYPE, 0x41, struct acrn_vm_memmap) #define ACRN_IOCTL_UNSET_MEMSEG \ -- cgit v1.2.3 From 3c4c331667d4d9f1b5f3fdff9c4db36776da30ae Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:32 +0800 Subject: virt: acrn: Introduce PCI configuration space PIO accesses combiner A User VM can access its virtual PCI configuration spaces via port IO approach, which has two following steps: 1) writes address into port 0xCF8 2) put/get data in/from port 0xCFC To distribute a complete PCI configuration space access one time, HSM need to combine such two accesses together. Combine two paired PIO I/O requests into one PCI I/O request and continue the I/O request distribution. Cc: Greg Kroah-Hartman Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-11-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index b0f73ab5e4e8..da40f7ad13d9 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -23,6 +23,7 @@ #define ACRN_IOREQ_TYPE_PORTIO 0 #define ACRN_IOREQ_TYPE_MMIO 1 +#define ACRN_IOREQ_TYPE_PCICFG 2 #define ACRN_IOREQ_DIR_READ 0 #define ACRN_IOREQ_DIR_WRITE 1 @@ -59,6 +60,30 @@ struct acrn_pio_request { __u32 value; }; +/** + * struct acrn_pci_request - Info of a PCI I/O request + * @direction: Access direction of this request (ACRN_IOREQ_DIR_*) + * @reserved: Reserved for alignment and should be 0 + * @size: Access size of this PCI I/O request + * @value: Read/write value of this PIO I/O request + * @bus: PCI bus value of this PCI I/O request + * @dev: PCI device value of this PCI I/O request + * @func: PCI function value of this PCI I/O request + * @reg: PCI config space offset of this PCI I/O request + * + * Need keep same header layout with &struct acrn_pio_request. + */ +struct acrn_pci_request { + __u32 direction; + __u32 reserved[3]; + __u64 size; + __u32 value; + __u32 bus; + __u32 dev; + __u32 func; + __u32 reg; +}; + /** * struct acrn_io_request - 256-byte ACRN I/O request * @type: Type of this request (ACRN_IOREQ_TYPE_*). @@ -67,6 +92,7 @@ struct acrn_pio_request { * @reserved0: Reserved fields. * @reqs: Union of different types of request. Byte offset: 64. * @reqs.pio_request: PIO request data of the I/O request. + * @reqs.pci_request: PCI configuration space request data of the I/O request. * @reqs.mmio_request: MMIO request data of the I/O request. * @reqs.data: Raw data of the I/O request. * @reserved1: Reserved fields. @@ -126,6 +152,7 @@ struct acrn_io_request { __u32 reserved0[14]; union { struct acrn_pio_request pio_request; + struct acrn_pci_request pci_request; struct acrn_mmio_request mmio_request; __u64 data[8]; } reqs; -- cgit v1.2.3 From ce011e1363a1fe43de0ca05abc394022ee4fefeb Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:33 +0800 Subject: virt: acrn: Introduce interfaces for PCI device passthrough PCI device passthrough enables an OS in a virtual machine to directly access a PCI device in the host. It promises almost the native performance, which is required in performance-critical scenarios of ACRN. HSM provides the following ioctls: - Assign - ACRN_IOCTL_ASSIGN_PCIDEV Pass data struct acrn_pcidev from userspace to the hypervisor, and inform the hypervisor to assign a PCI device to a User VM. - De-assign - ACRN_IOCTL_DEASSIGN_PCIDEV Pass data struct acrn_pcidev from userspace to the hypervisor, and inform the hypervisor to de-assign a PCI device from a User VM. - Set a interrupt of a passthrough device - ACRN_IOCTL_SET_PTDEV_INTR Pass data struct acrn_ptdev_irq from userspace to the hypervisor, and inform the hypervisor to map a INTx interrupt of passthrough device of User VM. - Reset passthrough device interrupt - ACRN_IOCTL_RESET_PTDEV_INTR Pass data struct acrn_ptdev_irq from userspace to the hypervisor, and inform the hypervisor to unmap a INTx interrupt of passthrough device of User VM. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-12-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index da40f7ad13d9..b25ca8c33b92 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -365,6 +365,58 @@ struct acrn_vm_memmap { __u64 len; }; +/* Type of interrupt of a passthrough device */ +#define ACRN_PTDEV_IRQ_INTX 0 +#define ACRN_PTDEV_IRQ_MSI 1 +#define ACRN_PTDEV_IRQ_MSIX 2 +/** + * struct acrn_ptdev_irq - Interrupt data of a passthrough device. + * @type: Type (ACRN_PTDEV_IRQ_*) + * @virt_bdf: Virtual Bus/Device/Function + * @phys_bdf: Physical Bus/Device/Function + * @intx: Info of interrupt + * @intx.virt_pin: Virtual IOAPIC pin + * @intx.phys_pin: Physical IOAPIC pin + * @intx.is_pic_pin: Is PIC pin or not + * + * This structure will be passed to hypervisor directly. + */ +struct acrn_ptdev_irq { + __u32 type; + __u16 virt_bdf; + __u16 phys_bdf; + + struct { + __u32 virt_pin; + __u32 phys_pin; + __u32 is_pic_pin; + } intx; +}; + +/* Type of PCI device assignment */ +#define ACRN_PTDEV_QUIRK_ASSIGN (1U << 0) + +#define ACRN_PCI_NUM_BARS 6 +/** + * struct acrn_pcidev - Info for assigning or de-assigning a PCI device + * @type: Type of the assignment + * @virt_bdf: Virtual Bus/Device/Function + * @phys_bdf: Physical Bus/Device/Function + * @intr_line: PCI interrupt line + * @intr_pin: PCI interrupt pin + * @bar: PCI BARs. + * + * This structure will be passed to hypervisor directly. + */ +struct acrn_pcidev { + __u32 type; + __u16 virt_bdf; + __u16 phys_bdf; + __u8 intr_line; + __u8 intr_pin; + __u32 bar[ACRN_PCI_NUM_BARS]; +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -400,4 +452,13 @@ struct acrn_vm_memmap { #define ACRN_IOCTL_UNSET_MEMSEG \ _IOW(ACRN_IOCTL_TYPE, 0x42, struct acrn_vm_memmap) +#define ACRN_IOCTL_SET_PTDEV_INTR \ + _IOW(ACRN_IOCTL_TYPE, 0x53, struct acrn_ptdev_irq) +#define ACRN_IOCTL_RESET_PTDEV_INTR \ + _IOW(ACRN_IOCTL_TYPE, 0x54, struct acrn_ptdev_irq) +#define ACRN_IOCTL_ASSIGN_PCIDEV \ + _IOW(ACRN_IOCTL_TYPE, 0x55, struct acrn_pcidev) +#define ACRN_IOCTL_DEASSIGN_PCIDEV \ + _IOW(ACRN_IOCTL_TYPE, 0x56, struct acrn_pcidev) + #endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From c7cf8d27244f2ccdde30c79eb6314c943bbeac28 Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:34 +0800 Subject: virt: acrn: Introduce interrupt injection interfaces ACRN userspace need to inject virtual interrupts into a User VM in devices emulation. HSM needs provide interfaces to do so. Introduce following interrupt injection interfaces: ioctl ACRN_IOCTL_SET_IRQLINE: Pass data from userspace to the hypervisor, and inform the hypervisor to inject a virtual IOAPIC GSI interrupt to a User VM. ioctl ACRN_IOCTL_INJECT_MSI: Pass data struct acrn_msi_entry from userspace to the hypervisor, and inform the hypervisor to inject a virtual MSI to a User VM. ioctl ACRN_IOCTL_VM_INTR_MONITOR: Set a 4-Kbyte aligned shared page for statistics information of interrupts of a User VM. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-13-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index b25ca8c33b92..b1c06b28ebdc 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -417,6 +417,16 @@ struct acrn_pcidev { __u32 bar[ACRN_PCI_NUM_BARS]; }; +/** + * struct acrn_msi_entry - Info for injecting a MSI interrupt to a VM + * @msi_addr: MSI addr[19:12] with dest vCPU ID + * @msi_data: MSI data[7:0] with vector + */ +struct acrn_msi_entry { + __u64 msi_addr; + __u64 msi_data; +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -436,6 +446,13 @@ struct acrn_pcidev { #define ACRN_IOCTL_SET_VCPU_REGS \ _IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs) +#define ACRN_IOCTL_INJECT_MSI \ + _IOW(ACRN_IOCTL_TYPE, 0x23, struct acrn_msi_entry) +#define ACRN_IOCTL_VM_INTR_MONITOR \ + _IOW(ACRN_IOCTL_TYPE, 0x24, unsigned long) +#define ACRN_IOCTL_SET_IRQLINE \ + _IOW(ACRN_IOCTL_TYPE, 0x25, __u64) + #define ACRN_IOCTL_NOTIFY_REQUEST_FINISH \ _IOW(ACRN_IOCTL_TYPE, 0x31, struct acrn_ioreq_notify) #define ACRN_IOCTL_CREATE_IOREQ_CLIENT \ -- cgit v1.2.3 From 3d679d5aec648f50e645702929890b9611998a0b Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:35 +0800 Subject: virt: acrn: Introduce interfaces to query C-states and P-states allowed by hypervisor The C-states and P-states data are used to support CPU power management. The hypervisor controls C-states and P-states for a User VM. ACRN userspace need to query the data from the hypervisor to build ACPI tables for a User VM. HSM provides ioctls for ACRN userspace to query C-states and P-states data obtained from the hypervisor. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-14-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index b1c06b28ebdc..e997d80a0cc7 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -427,6 +427,58 @@ struct acrn_msi_entry { __u64 msi_data; }; +struct acrn_acpi_generic_address { + __u8 space_id; + __u8 bit_width; + __u8 bit_offset; + __u8 access_size; + __u64 address; +} __attribute__ ((__packed__)); + +/** + * struct acrn_cstate_data - A C state package defined in ACPI + * @cx_reg: Register of the C state object + * @type: Type of the C state object + * @latency: The worst-case latency to enter and exit this C state + * @power: The average power consumption when in this C state + */ +struct acrn_cstate_data { + struct acrn_acpi_generic_address cx_reg; + __u8 type; + __u32 latency; + __u64 power; +}; + +/** + * struct acrn_pstate_data - A P state package defined in ACPI + * @core_frequency: CPU frequency (in MHz). + * @power: Power dissipation (in milliwatts). + * @transition_latency: The worst-case latency in microseconds that CPU is + * unavailable during a transition from any P state to + * this P state. + * @bus_master_latency: The worst-case latency in microseconds that Bus Masters + * are prevented from accessing memory during a transition + * from any P state to this P state. + * @control: The value to be written to Performance Control Register + * @status: Transition status. + */ +struct acrn_pstate_data { + __u64 core_frequency; + __u64 power; + __u64 transition_latency; + __u64 bus_master_latency; + __u64 control; + __u64 status; +}; + +#define PMCMD_TYPE_MASK 0x000000ff +enum acrn_pm_cmd_type { + ACRN_PMCMD_GET_PX_CNT, + ACRN_PMCMD_GET_PX_DATA, + ACRN_PMCMD_GET_CX_CNT, + ACRN_PMCMD_GET_CX_DATA, +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -478,4 +530,7 @@ struct acrn_msi_entry { #define ACRN_IOCTL_DEASSIGN_PCIDEV \ _IOW(ACRN_IOCTL_TYPE, 0x56, struct acrn_pcidev) +#define ACRN_IOCTL_PM_GET_CPU_STATE \ + _IOWR(ACRN_IOCTL_TYPE, 0x60, __u64) + #endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From d8ad515156b66e7e79a6e4c814f997ee54eb47c7 Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:37 +0800 Subject: virt: acrn: Introduce ioeventfd ioeventfd is a mechanism to register PIO/MMIO regions to trigger an eventfd signal when written to by a User VM. ACRN userspace can register any arbitrary I/O address with a corresponding eventfd and then pass the eventfd to a specific end-point of interest for handling. Vhost is a kernel-level virtio server which uses eventfd for signalling. To support vhost on ACRN, ioeventfd is introduced in HSM. A new I/O client dedicated to ioeventfd is associated with a User VM during VM creation. HSM provides ioctls to associate an I/O region with a eventfd. The I/O client signals a eventfd once its corresponding I/O region is matched with an I/O request. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-16-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index e997d80a0cc7..132c5ff95967 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -479,6 +479,32 @@ enum acrn_pm_cmd_type { ACRN_PMCMD_GET_CX_DATA, }; +#define ACRN_IOEVENTFD_FLAG_PIO 0x01 +#define ACRN_IOEVENTFD_FLAG_DATAMATCH 0x02 +#define ACRN_IOEVENTFD_FLAG_DEASSIGN 0x04 +/** + * struct acrn_ioeventfd - Data to operate a &struct hsm_ioeventfd + * @fd: The fd of eventfd associated with a hsm_ioeventfd + * @flags: Logical-OR of ACRN_IOEVENTFD_FLAG_* + * @addr: The start address of IO range of ioeventfd + * @len: The length of IO range of ioeventfd + * @reserved: Reserved and should be 0 + * @data: Data for data matching + * + * Without flag ACRN_IOEVENTFD_FLAG_DEASSIGN, ioctl ACRN_IOCTL_IOEVENTFD + * creates a &struct hsm_ioeventfd with properties originated from &struct + * acrn_ioeventfd. With flag ACRN_IOEVENTFD_FLAG_DEASSIGN, ioctl + * ACRN_IOCTL_IOEVENTFD destroys the &struct hsm_ioeventfd matching the fd. + */ +struct acrn_ioeventfd { + __u32 fd; + __u32 flags; + __u64 addr; + __u32 len; + __u32 reserved; + __u64 data; +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -533,4 +559,7 @@ enum acrn_pm_cmd_type { #define ACRN_IOCTL_PM_GET_CPU_STATE \ _IOWR(ACRN_IOCTL_TYPE, 0x60, __u64) +#define ACRN_IOCTL_IOEVENTFD \ + _IOW(ACRN_IOCTL_TYPE, 0x70, struct acrn_ioeventfd) + #endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From aa3b483ff1d71c50b33db154048dff9a8f08ac71 Mon Sep 17 00:00:00 2001 From: Shuo Liu Date: Sun, 7 Feb 2021 11:10:38 +0800 Subject: virt: acrn: Introduce irqfd irqfd is a mechanism to inject a specific interrupt to a User VM using a decoupled eventfd mechanism. Vhost is a kernel-level virtio server which uses eventfd for interrupt injection. To support vhost on ACRN, irqfd is introduced in HSM. HSM provides ioctls to associate a virtual Message Signaled Interrupt (MSI) with an eventfd. The corresponding virtual MSI will be injected into a User VM once the eventfd got signal. Cc: Zhi Wang Cc: Zhenyu Wang Cc: Yu Wang Cc: Reinette Chatre Cc: Greg Kroah-Hartman Reviewed-by: Zhi Wang Reviewed-by: Reinette Chatre Signed-off-by: Shuo Liu Link: https://lore.kernel.org/r/20210207031040.49576-17-shuo.a.liu@intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index 132c5ff95967..353b2a2e4536 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -505,6 +505,19 @@ struct acrn_ioeventfd { __u64 data; }; +#define ACRN_IRQFD_FLAG_DEASSIGN 0x01 +/** + * struct acrn_irqfd - Data to operate a &struct hsm_irqfd + * @fd: The fd of eventfd associated with a hsm_irqfd + * @flags: Logical-OR of ACRN_IRQFD_FLAG_* + * @msi: Info of MSI associated with the irqfd + */ +struct acrn_irqfd { + __s32 fd; + __u32 flags; + struct acrn_msi_entry msi; +}; + /* The ioctl type, documented in ioctl-number.rst */ #define ACRN_IOCTL_TYPE 0xA2 @@ -561,5 +574,7 @@ struct acrn_ioeventfd { #define ACRN_IOCTL_IOEVENTFD \ _IOW(ACRN_IOCTL_TYPE, 0x70, struct acrn_ioeventfd) +#define ACRN_IOCTL_IRQFD \ + _IOW(ACRN_IOCTL_TYPE, 0x71, struct acrn_irqfd) #endif /* _UAPI_ACRN_H */ -- cgit v1.2.3 From 0566752c3e8681ec47fee37374cb38081d801e95 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 7 Feb 2021 14:05:43 +0100 Subject: uapi: map_to_7segment: Update example in documentation The device_attribute .show() and .store() methods gained an extra parameter in v2.6.13, but the example in the documentation for the 7-segment header file was never updated. Add the missing parameters. While at it, get rid of the (misspelled) deprecated symbolic permissions, and switch to DEVICE_ATTR_RW(), which was introduced in v3.11 Fixes: 54b6f35c99974e99 ("[PATCH] Driver core: change device_attribute callbacks") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20210207130543.2128980-1-geert@linux-m68k.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/map_to_7segment.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/map_to_7segment.h b/include/uapi/linux/map_to_7segment.h index 13a06e5e966e..8b02088f96e3 100644 --- a/include/uapi/linux/map_to_7segment.h +++ b/include/uapi/linux/map_to_7segment.h @@ -45,17 +45,22 @@ * In device drivers it is recommended, if required, to make the char map * accessible via the sysfs interface using the following scheme: * - * static ssize_t show_map(struct device *dev, char *buf) { + * static ssize_t map_seg7_show(struct device *dev, + * struct device_attribute *attr, char *buf) + * { * memcpy(buf, &map_seg7, sizeof(map_seg7)); * return sizeof(map_seg7); * } - * static ssize_t store_map(struct device *dev, const char *buf, size_t cnt) { + * static ssize_t map_seg7_store(struct device *dev, + * struct device_attribute *attr, const char *buf, + * size_t cnt) + * { * if(cnt != sizeof(map_seg7)) * return -EINVAL; * memcpy(&map_seg7, buf, cnt); * return cnt; * } - * static DEVICE_ATTR(map_seg7, PERMS_RW, show_map, store_map); + * static DEVICE_ATTR_RW(map_seg7); * * History: * 2005-05-31 RFC linux-kernel@vger.kernel.org -- cgit v1.2.3 From 583fa5e71caeb79e04e477e9837e2f7fa53b71e4 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 16 Feb 2021 20:09:53 -0800 Subject: cxl/mem: Add basic IOCTL interface Add a straightforward IOCTL that provides a mechanism for userspace to query the supported memory device commands. CXL commands as they appear to userspace are described as part of the UAPI kerneldoc. The command list returned via this IOCTL will contain the full set of commands that the driver supports, however, some of those commands may not be available for use by userspace. Memory device commands first appear in the CXL 2.0 specification. They are submitted through a mailbox mechanism specified in the CXL 2.0 specification. The send command allows userspace to issue mailbox commands directly to the hardware. The list of available commands to send are the output of the query command. The driver verifies basic properties of the command and possibly inspect the input (or output) payload to determine whether or not the command is allowed (or might taint the kernel). Reported-by: kernel test robot # bug in earlier revision Reported-by: Stephen Rothwell Signed-off-by: Ben Widawsky Reviewed-by: Dan Williams (v2) Cc: Al Viro Link: https://lore.kernel.org/r/20210217040958.1354670-5-ben.widawsky@intel.com Signed-off-by: Dan Williams --- include/uapi/linux/cxl_mem.h | 156 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 include/uapi/linux/cxl_mem.h (limited to 'include/uapi') diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h new file mode 100644 index 000000000000..887781dc3b6c --- /dev/null +++ b/include/uapi/linux/cxl_mem.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * CXL IOCTLs for Memory Devices + */ + +#ifndef _UAPI_CXL_MEM_H_ +#define _UAPI_CXL_MEM_H_ + +#include + +/** + * DOC: UAPI + * + * Not all of all commands that the driver supports are always available for use + * by userspace. Userspace must check the results from the QUERY command in + * order to determine the live set of commands. + */ + +#define CXL_MEM_QUERY_COMMANDS _IOR(0xCE, 1, struct cxl_mem_query_commands) +#define CXL_MEM_SEND_COMMAND _IOWR(0xCE, 2, struct cxl_send_command) + +#define CXL_CMDS \ + ___C(INVALID, "Invalid Command"), \ + ___C(IDENTIFY, "Identify Command"), \ + ___C(MAX, "invalid / last command") + +#define ___C(a, b) CXL_MEM_COMMAND_ID_##a +enum { CXL_CMDS }; + +#undef ___C +#define ___C(a, b) { b } +static const struct { + const char *name; +} cxl_command_names[] = { CXL_CMDS }; + +/* + * Here's how this actually breaks out: + * cxl_command_names[] = { + * [CXL_MEM_COMMAND_ID_INVALID] = { "Invalid Command" }, + * [CXL_MEM_COMMAND_ID_IDENTIFY] = { "Identify Command" }, + * ... + * [CXL_MEM_COMMAND_ID_MAX] = { "invalid / last command" }, + * }; + */ + +#undef ___C + +/** + * struct cxl_command_info - Command information returned from a query. + * @id: ID number for the command. + * @flags: Flags that specify command behavior. + * @size_in: Expected input size, or -1 if variable length. + * @size_out: Expected output size, or -1 if variable length. + * + * Represents a single command that is supported by both the driver and the + * hardware. This is returned as part of an array from the query ioctl. The + * following would be a command that takes a variable length input and returns 0 + * bytes of output. + * + * - @id = 10 + * - @flags = 0 + * - @size_in = -1 + * - @size_out = 0 + * + * See struct cxl_mem_query_commands. + */ +struct cxl_command_info { + __u32 id; + + __u32 flags; +#define CXL_MEM_COMMAND_FLAG_MASK GENMASK(0, 0) + + __s32 size_in; + __s32 size_out; +}; + +/** + * struct cxl_mem_query_commands - Query supported commands. + * @n_commands: In/out parameter. When @n_commands is > 0, the driver will + * return min(num_support_commands, n_commands). When @n_commands + * is 0, driver will return the number of total supported commands. + * @rsvd: Reserved for future use. + * @commands: Output array of supported commands. This array must be allocated + * by userspace to be at least min(num_support_commands, @n_commands) + * + * Allow userspace to query the available commands supported by both the driver, + * and the hardware. Commands that aren't supported by either the driver, or the + * hardware are not returned in the query. + * + * Examples: + * + * - { .n_commands = 0 } // Get number of supported commands + * - { .n_commands = 15, .commands = buf } // Return first 15 (or less) + * supported commands + * + * See struct cxl_command_info. + */ +struct cxl_mem_query_commands { + /* + * Input: Number of commands to return (space allocated by user) + * Output: Number of commands supported by the driver/hardware + * + * If n_commands is 0, kernel will only return number of commands and + * not try to populate commands[], thus allowing userspace to know how + * much space to allocate + */ + __u32 n_commands; + __u32 rsvd; + + struct cxl_command_info __user commands[]; /* out: supported commands */ +}; + +/** + * struct cxl_send_command - Send a command to a memory device. + * @id: The command to send to the memory device. This must be one of the + * commands returned by the query command. + * @flags: Flags for the command (input). + * @rsvd: Must be zero. + * @retval: Return value from the memory device (output). + * @in: Parameters associated with input payload. + * @in.size: Size of the payload to provide to the device (input). + * @in.rsvd: Must be zero. + * @in.payload: Pointer to memory for payload input, payload is little endian. + * @out: Parameters associated with output payload. + * @out.size: Size of the payload received from the device (input/output). This + * field is filled in by userspace to let the driver know how much + * space was allocated for output. It is populated by the driver to + * let userspace know how large the output payload actually was. + * @out.rsvd: Must be zero. + * @out.payload: Pointer to memory for payload output, payload is little endian. + * + * Mechanism for userspace to send a command to the hardware for processing. The + * driver will do basic validation on the command sizes. In some cases even the + * payload may be introspected. Userspace is required to allocate large enough + * buffers for size_out which can be variable length in certain situations. + */ +struct cxl_send_command { + __u32 id; + __u32 flags; + __u32 rsvd; + __u32 retval; + + struct { + __s32 size; + __u32 rsvd; + __u64 payload; + } in; + + struct { + __s32 size; + __u32 rsvd; + __u64 payload; + } out; +}; + +#endif -- cgit v1.2.3 From 13237183c735f5cba4ae26bc782c613ae0d4e4d3 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 16 Feb 2021 20:09:54 -0800 Subject: cxl/mem: Add a "RAW" send command The CXL memory device send interface will have a number of supported commands. The raw command is not such a command. Raw commands allow userspace to send a specified opcode to the underlying hardware and bypass all driver checks on the command. The primary use for this command is to [begrudgingly] allow undocumented vendor specific hardware commands. While not the main motivation, it also allows prototyping new hardware commands without a driver patch and rebuild. While this all sounds very powerful it comes with a couple of caveats: 1. Bug reports using raw commands will not get the same level of attention as bug reports using supported commands (via taint). 2. Supported commands will be rejected by the RAW command. With this comes new debugfs knob to allow full access to your toes with your weapon of choice. Signed-off-by: Ben Widawsky Reviewed-by: Dan Williams (v2) Reviewed-by: Jonathan Cameron Cc: Ariel Sibley Link: https://lore.kernel.org/r/20210217040958.1354670-6-ben.widawsky@intel.com Signed-off-by: Dan Williams --- include/uapi/linux/cxl_mem.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index 887781dc3b6c..c316028730e7 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -22,6 +22,7 @@ #define CXL_CMDS \ ___C(INVALID, "Invalid Command"), \ ___C(IDENTIFY, "Identify Command"), \ + ___C(RAW, "Raw device command"), \ ___C(MAX, "invalid / last command") #define ___C(a, b) CXL_MEM_COMMAND_ID_##a @@ -115,6 +116,9 @@ struct cxl_mem_query_commands { * @id: The command to send to the memory device. This must be one of the * commands returned by the query command. * @flags: Flags for the command (input). + * @raw: Special fields for raw commands + * @raw.opcode: Opcode passed to hardware when using the RAW command. + * @raw.rsvd: Must be zero. * @rsvd: Must be zero. * @retval: Return value from the memory device (output). * @in: Parameters associated with input payload. @@ -137,7 +141,13 @@ struct cxl_mem_query_commands { struct cxl_send_command { __u32 id; __u32 flags; - __u32 rsvd; + union { + struct { + __u16 opcode; + __u16 rsvd; + } raw; + __u32 rsvd; + }; __u32 retval; struct { -- cgit v1.2.3 From 472b1ce6e9d6396ab3f11fc5101c6b63b934a018 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 16 Feb 2021 20:09:55 -0800 Subject: cxl/mem: Enable commands via CEL CXL devices identified by the memory-device class code must implement the Device Command Interface (described in 8.2.9 of the CXL 2.0 spec). While the driver already maintains a list of commands it supports, there is still a need to be able to distinguish between commands that the driver knows about from commands that are optionally supported by the hardware. The Command Effects Log (CEL) is specified in the CXL 2.0 specification. The CEL is one of two types of logs, the other being vendor specific. They are distinguished in hardware/spec via UUID. The CEL is useful for 2 things: 1. Determine which optional commands are supported by the CXL device. 2. Enumerate any vendor specific commands The CEL is used by the driver to determine which commands are available in the hardware and therefore which commands userspace is allowed to execute. The set of enabled commands might be a subset of commands which are advertised in UAPI via CXL_MEM_SEND_COMMAND IOCTL. With the CEL enabling comes a internal flag to indicate a base set of commands that are enabled regardless of CEL. Such commands are required for basic interaction with the hardware and thus can be useful in debug cases, for example if the CEL is corrupted. The implementation leaves the statically defined table of commands and supplements it with a bitmap to determine commands that are enabled. This organization was chosen for the following reasons: - Smaller memory footprint. Doesn't need a table per device. - Reduce memory allocation complexity. - Fixed command IDs to opcode mapping for all devices makes development and debugging easier. - Certain helpers are easily achievable, like cxl_for_each_cmd(). Signed-off-by: Ben Widawsky Reviewed-by: Dan Williams (v2) Reviewed-by: Jonathan Cameron (v3) Link: https://lore.kernel.org/r/20210217040958.1354670-7-ben.widawsky@intel.com Signed-off-by: Dan Williams --- include/uapi/linux/cxl_mem.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index c316028730e7..59227f82a4c1 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -23,6 +23,7 @@ ___C(INVALID, "Invalid Command"), \ ___C(IDENTIFY, "Identify Command"), \ ___C(RAW, "Raw device command"), \ + ___C(GET_SUPPORTED_LOGS, "Get Supported Logs"), \ ___C(MAX, "invalid / last command") #define ___C(a, b) CXL_MEM_COMMAND_ID_##a -- cgit v1.2.3 From 57ee605b976c30a86613648935d255bbe704aeab Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 16 Feb 2021 20:09:56 -0800 Subject: cxl/mem: Add set of informational commands Add initial set of formal commands beyond basic identify and command enumeration. Signed-off-by: Ben Widawsky Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron (v2) Link: https://lore.kernel.org/r/20210217040958.1354670-8-ben.widawsky@intel.com Signed-off-by: Dan Williams --- include/uapi/linux/cxl_mem.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index 59227f82a4c1..3155382dfc9b 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -24,6 +24,11 @@ ___C(IDENTIFY, "Identify Command"), \ ___C(RAW, "Raw device command"), \ ___C(GET_SUPPORTED_LOGS, "Get Supported Logs"), \ + ___C(GET_FW_INFO, "Get FW Info"), \ + ___C(GET_PARTITION_INFO, "Get Partition Information"), \ + ___C(GET_LSA, "Get Label Storage Area"), \ + ___C(GET_HEALTH_INFO, "Get Health Info"), \ + ___C(GET_LOG, "Get Log"), \ ___C(MAX, "invalid / last command") #define ___C(a, b) CXL_MEM_COMMAND_ID_##a -- cgit v1.2.3 From 33b347503f014ebf76257327cbc7001c6b721956 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:00 +0200 Subject: vdpa: Define vdpa mgmt device, ops and a netlink interface To add one or more VDPA devices, define a management device which allows adding or removing vdpa device. A management device defines set of callbacks to manage vdpa devices. To begin with, it defines add and remove callbacks through which a user defined vdpa device can be added or removed. A unique management device is identified by its unique handle identified by management device name and optionally the bus name. Hence, introduce routine through which driver can register a management device and its callback operations for adding and remove a vdpa device. Introduce vdpa netlink socket family so that user can query management device and its attributes. Example of show vdpa management device which allows creating vdpa device of networking class (device id = 0x1) of virtio specification 1.1 section 5.1.1. $ vdpa mgmtdev show vdpasim_net: supported_classes: net Example of showing vdpa management device in JSON format. $ vdpa mgmtdev show -jp { "show": { "vdpasim_net": { "supported_classes": [ "net" ] } } } Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-4-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Including a bugfix: vpda: correctly size vdpa_nl_policy We need to ensure last entry of vdpa_nl_policy[] is zero, otherwise out-of-bounds access is hurting us. Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Parav Pandit Cc: Eli Cohen Cc: Jason Wang Cc: Michael S. Tsirkin Link: https://lore.kernel.org/r/20210210134911.4119555-1-eric.dumazet@gmail.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vdpa.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/uapi/linux/vdpa.h (limited to 'include/uapi') diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h new file mode 100644 index 000000000000..d44d82e567b1 --- /dev/null +++ b/include/uapi/linux/vdpa.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * vdpa device management interface + * Copyright (c) 2020 Mellanox Technologies Ltd. All rights reserved. + */ + +#ifndef _UAPI_LINUX_VDPA_H_ +#define _UAPI_LINUX_VDPA_H_ + +#define VDPA_GENL_NAME "vdpa" +#define VDPA_GENL_VERSION 0x1 + +enum vdpa_command { + VDPA_CMD_UNSPEC, + VDPA_CMD_MGMTDEV_NEW, + VDPA_CMD_MGMTDEV_GET, /* can dump */ +}; + +enum vdpa_attr { + VDPA_ATTR_UNSPEC, + + /* bus name (optional) + dev name together make the parent device handle */ + VDPA_ATTR_MGMTDEV_BUS_NAME, /* string */ + VDPA_ATTR_MGMTDEV_DEV_NAME, /* string */ + VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ + + /* new attributes must be added above here */ + VDPA_ATTR_MAX, +}; + +#endif -- cgit v1.2.3 From 903f7bcaedb84ca47998e609015a34ddde93742e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:01 +0200 Subject: vdpa: Enable a user to add and delete a vdpa device Add the ability to add and delete a vdpa device. Examples: Create a vdpa device of type network named "foo2" from the management device vdpasim: $ vdpa dev add mgmtdev vdpasim_net name foo2 Delete the vdpa device after its use: $ vdpa dev del foo2 Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-5-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vdpa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index d44d82e567b1..bb4a1f00eb1c 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -14,6 +14,8 @@ enum vdpa_command { VDPA_CMD_UNSPEC, VDPA_CMD_MGMTDEV_NEW, VDPA_CMD_MGMTDEV_GET, /* can dump */ + VDPA_CMD_DEV_NEW, + VDPA_CMD_DEV_DEL, }; enum vdpa_attr { @@ -24,6 +26,8 @@ enum vdpa_attr { VDPA_ATTR_MGMTDEV_DEV_NAME, /* string */ VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ + VDPA_ATTR_DEV_NAME, /* string */ + /* new attributes must be added above here */ VDPA_ATTR_MAX, }; -- cgit v1.2.3 From bc0d90ee021f1baecd6aaa010d787eb373aa74dd Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 5 Jan 2021 12:32:02 +0200 Subject: vdpa: Enable user to query vdpa device info Enable user to query vdpa device information. $ vdpa dev add mgmtdev vdpasim_net name foo2 Show the newly created vdpa device by its name: $ vdpa dev show foo2 foo2: type network mgmtdev vdpasim_net vendor_id 0 max_vqs 2 max_vq_size 256 $ vdpa dev show foo2 -jp { "dev": { "foo2": { "type": "network", "mgmtdev": "vdpasim_net", "vendor_id": 0, "max_vqs": 2, "max_vq_size": 256 } } } Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Jason Wang Link: https://lore.kernel.org/r/20210105103203.82508-6-parav@nvidia.com Including a memory leak fix: Link: https://lore.kernel.org/r/20210217060614.59561-1-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vdpa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index bb4a1f00eb1c..66a41e4ec163 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -16,6 +16,7 @@ enum vdpa_command { VDPA_CMD_MGMTDEV_GET, /* can dump */ VDPA_CMD_DEV_NEW, VDPA_CMD_DEV_DEL, + VDPA_CMD_DEV_GET, /* can dump */ }; enum vdpa_attr { @@ -27,6 +28,10 @@ enum vdpa_attr { VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ VDPA_ATTR_DEV_NAME, /* string */ + VDPA_ATTR_DEV_ID, /* u32 */ + VDPA_ATTR_DEV_VENDOR_ID, /* u32 */ + VDPA_ATTR_DEV_MAX_VQS, /* u32 */ + VDPA_ATTR_DEV_MAX_VQ_SIZE, /* u16 */ /* new attributes must be added above here */ VDPA_ATTR_MAX, -- cgit v1.2.3 From 1c0aa1fae1acb77c5f9917adb0e4cb4500b9f3a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 20 Feb 2021 11:55:28 -0700 Subject: io_uring: flag new native workers with IORING_FEAT_NATIVE_WORKERS A few reasons to do this: - The naming of the manager and worker have changed. That's a user visible change, so makes sense to flag it. - Opening certain files that use ->signal (like /proc/self or /dev/tty) now works, and the flag tells the application upfront that this is the case. - Related to the above, using signalfd will now work as well. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac4e1738a9af..2514eb6b1cf2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -262,6 +262,7 @@ struct io_uring_params { #define IORING_FEAT_POLL_32BITS (1U << 6) #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) #define IORING_FEAT_EXT_ARG (1U << 8) +#define IORING_FEAT_NATIVE_WORKERS (1U << 9) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From bda420b985054a3badafef23807c4b4fa38a3dff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 24 Feb 2021 12:09:43 -0800 Subject: numa balancing: migrate on fault among multiple bound nodes Now, NUMA balancing can only optimize the page placement among the NUMA nodes if the default memory policy is used. Because the memory policy specified explicitly should take precedence. But this seems too strict in some situations. For example, on a system with 4 NUMA nodes, if the memory of an application is bound to the node 0 and 1, NUMA balancing can potentially migrate the pages between the node 0 and 1 to reduce cross-node accessing without breaking the explicit memory binding policy. So in this patch, we add MPOL_F_NUMA_BALANCING mode flag to set_mempolicy() when mode is MPOL_BIND. With the flag specified, NUMA balancing will be enabled within the thread to optimize the page placement within the constrains of the specified memory binding policy. With the newly added flag, the NUMA balancing control mechanism becomes, - sysctl knob numa_balancing can enable/disable the NUMA balancing globally. - even if sysctl numa_balancing is enabled, the NUMA balancing will be disabled for the memory areas or applications with the explicit memory policy by default. - MPOL_F_NUMA_BALANCING can be used to enable the NUMA balancing for the applications when specifying the explicit memory policy (MPOL_BIND). Various page placement optimization based on the NUMA balancing can be done with these flags. As the first step, in this patch, if the memory of the application is bound to multiple nodes (MPOL_BIND), and in the hint page fault handler the accessing node are in the policy nodemask, the page will be tried to be migrated to the accessing node to reduce the cross-node accessing. If the newly added MPOL_F_NUMA_BALANCING flag is specified by an application on an old kernel version without its support, set_mempolicy() will return -1 and errno will be set to EINVAL. The application can use this behavior to run on both old and new kernel versions. And if the MPOL_F_NUMA_BALANCING flag is specified for the mode other than MPOL_BIND, set_mempolicy() will return -1 and errno will be set to EINVAL as before. Because we don't support optimization based on the NUMA balancing for these modes. In the previous version of the patch, we tried to reuse MPOL_MF_LAZY for mbind(). But that flag is tied to MPOL_MF_MOVE.*, so it seems not a good API/ABI for the purpose of the patch. And because it's not clear whether it's necessary to enable NUMA balancing for a specific memory area inside an application, so we only add the flag at the thread level (set_mempolicy()) instead of the memory area level (mbind()). We can do that when it become necessary. To test the patch, we run a test case as follows on a 4-node machine with 192 GB memory (48 GB per node). 1. Change pmbench memory accessing benchmark to call set_mempolicy() to bind its memory to node 1 and 3 and enable NUMA balancing. Some related code snippets are as follows, #include #include struct bitmask *bmp; int ret; bmp = numa_parse_nodestring("1,3"); ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1); /* If MPOL_F_NUMA_BALANCING isn't supported, fall back to MPOL_BIND */ if (ret < 0 && errno == EINVAL) ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1); if (ret < 0) { perror("Failed to call set_mempolicy"); exit(-1); } 2. Run a memory eater on node 3 to use 40 GB memory before running pmbench. 3. Run pmbench with 64 processes, the working-set size of each process is 640 MB, so the total working-set size is 64 * 640 MB = 40 GB. The CPU and the memory (as in step 1.) of all pmbench processes is bound to node 1 and 3. So, after CPU usage is balanced, some pmbench processes run on the CPUs of the node 3 will access the memory of the node 1. 4. After the pmbench processes run for 100 seconds, kill the memory eater. Now it's possible for some pmbench processes to migrate their pages from node 1 to node 3 to reduce cross-node accessing. Test results show that, with the patch, the pages can be migrated from node 1 to node 3 after killing the memory eater, and the pmbench score can increase about 17.5%. Link: https://lkml.kernel.org/r/20210120061235.148637-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Rik van Riel Cc: Johannes Weiner Cc: "Matthew Wilcox (Oracle)" Cc: Dave Hansen Cc: Andi Kleen Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 3354774af61e..8948467b3992 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -28,12 +28,14 @@ enum { /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */ /* * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to * either set_mempolicy() or mbind(). */ -#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) +#define MPOL_MODE_FLAGS \ + (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING) /* Flags for get_mempolicy */ #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ -- cgit v1.2.3 From df54714f579a77662054132161612ce3da876b0d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Feb 2021 17:20:56 -0800 Subject: include/linux: remove repeated words Drop the doubled word "for" in a comment. {firewire-cdev.h} Drop the doubled word "in" in a comment. {input.h} Drop the doubled word "a" in a comment. {mdev.h} Drop the doubled word "the" in a comment. {ptrace.h} Link: https://lkml.kernel.org/r/20210126232444.22861-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Stefan Richter Cc: Dmitry Torokhov Cc: Kirti Wankhede Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/firewire-cdev.h | 2 +- include/uapi/linux/input.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 7e5b5c10a49c..5effa9832802 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -844,7 +844,7 @@ struct fw_cdev_queue_iso { * struct fw_cdev_start_iso - Start an isochronous transmission or reception * @cycle: Cycle in which to start I/O. If @cycle is greater than or * equal to 0, the I/O will start on that cycle. - * @sync: Determines the value to wait for for receive packets that have + * @sync: Determines the value to wait for receive packets that have * the %FW_CDEV_ISO_SYNC bit set * @tags: Tag filter bit mask. Only valid for isochronous reception. * Determines the tag values for which packets will be accepted. diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 9a61c28ed3ae..ee3127461ee0 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -84,7 +84,7 @@ struct input_id { * in units per radian. * When INPUT_PROP_ACCELEROMETER is set the resolution changes. * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in - * in units per g (units/g) and in units per degree per second + * units per g (units/g) and in units per degree per second * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ). */ struct input_absinfo { -- cgit v1.2.3 From 30b5c851af7991ad08abe90c1e7c31615fa98a1a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 1 Mar 2021 12:53:09 +0000 Subject: KVM: x86/xen: Add support for vCPU runstate information This is how Xen guests do steal time accounting. The hypervisor records the amount of time spent in each of running/runnable/blocked/offline states. In the Xen accounting, a vCPU is still in state RUNSTATE_running while in Xen for a hypercall or I/O trap, etc. Only if Xen explicitly schedules does the state become RUNSTATE_blocked. In KVM this means that even when the vCPU exits the kvm_run loop, the state remains RUNSTATE_running. The VMM can explicitly set the vCPU to RUNSTATE_blocked by using the KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT attribute, and can also use KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST to retrospectively add a given amount of time to the blocked state and subtract it from the running state. The state_entry_time corresponds to get_kvmclock_ns() at the time the vCPU entered the current state, and the total times of all four states should always add up to state_entry_time. Co-developed-by: Joao Martins Signed-off-by: Joao Martins Signed-off-by: David Woodhouse Message-Id: <20210301125309.874953-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8b281f722e5b..f6afee209620 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1154,6 +1154,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) struct kvm_xen_hvm_config { __u32 flags; @@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr { union { __u64 gpa; __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; } u; }; /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { -- cgit v1.2.3