From 2131c1517f3004da208b7f1a3b06b8119172e194 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:44 +0800 Subject: hisi_acc_vfio_pci: adapt to new migration configuration On new platforms greater than QM_HW_V3, the migration region has been relocated from the VF to the PF. The VF's own configuration space is restored to the complete 64KB, and there is no need to divide the size of the BAR configuration space equally. The driver should be modified accordingly to adapt to the new hardware device. On the older hardware platform QM_HW_V3, the live migration configuration region is placed in the latter 32K portion of the VF's BAR2 configuration space. On the new hardware platform QM_HW_V4, the live migration configuration region also exists in the same 32K area immediately following the VF's BAR2, just like on QM_HW_V3. However, access to this region is now controlled by hardware. Additionally, a copy of the live migration configuration region is present in the PF's BAR2 configuration space. On the new hardware platform QM_HW_V4, when an older version of the driver is loaded, it behaves like QM_HW_V3 and uses the configuration region in the VF, ensuring that the live migration function continues to work normally. When the new version of the driver is loaded, it directly uses the configuration region in the PF. Meanwhile, hardware configuration disables the live migration configuration region in the VF's BAR2: reads return all 0xF values, and writes are silently ignored. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20251030015744.131771-3-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 130 ++++++++++++++++++------- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 23 ++++- 2 files changed, 114 insertions(+), 39 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99e..498cb7d1c9e5 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. + * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? pf_qm : NULL; } +static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev, + unsigned int index) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + hisi_acc_drvdata(vdev->pdev); + + /* + * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device + * BAR2 region encompasses both functional register space + * and migration control register space. + * only the functional region should be report to Guest. + */ + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + return (pci_resource_len(vdev->pdev, index) >> 1); + /* + * On the new HW device, the migration control register + * has been moved to the PF device BAR2 region. + * The VF device BAR2 is entirely functional register space. + */ + return pci_resource_len(vdev->pdev, index); +} + static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, size_t count, loff_t *ppos, size_t *new_count) @@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, if (index == VFIO_PCI_BAR2_REGION_INDEX) { loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); /* Check if access is for migration control region */ if (pos >= end) return -EINVAL; @@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev, index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); if (index == VFIO_PCI_BAR2_REGION_INDEX) { u64 req_len, pgoff, req_start; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); @@ -1330,7 +1391,6 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; unsigned long minsz; @@ -1345,12 +1405,7 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - /* - * ACC VF dev BAR2 region consists of both functional - * register space and migration control register space. - * Report only the functional region to Guest. - */ - info.size = pci_resource_len(pdev, info.index) / 2; + info.size = hisi_acc_get_resource_len(vdev, info.index); info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | @@ -1521,7 +1576,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc1..cd55eba64dfb 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. + */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. * It is set by Guest driver for the ACC VF dev indicating -- cgit v1.2.3 From 2f03f21fe7516902283b135de272d3c7b10672de Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Fri, 31 Oct 2025 17:06:02 +0000 Subject: vfio: Fix ksize arg while copying user struct in vfio_df_ioctl_bind_iommufd() For the cases where user includes a non-zero value in 'token_uuid_ptr' field of 'struct vfio_device_bind_iommufd', the copy_struct_from_user() in vfio_df_ioctl_bind_iommufd() fails with -E2BIG. For the 'minsz' passed, copy_struct_from_user() expects the newly introduced field to be zero-ed, which would be incorrect in this case. Fix this by passing the actual size of the kernel struct. If working with a newer userspace, copy_struct_from_user() would copy the 'token_uuid_ptr' field, and if working with an old userspace, it would zero out this field, thus still retaining backward compatibility. Fixes: 86624ba3b522 ("vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD") Cc: stable@vger.kernel.org Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251031170603.2260022-2-rananta@google.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 480cac3a0c27..8ceca24ac136 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, return ret; if (user_size < minsz) return -EINVAL; - ret = copy_struct_from_user(&bind, minsz, arg, user_size); + ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size); if (ret) return ret; -- cgit v1.2.3 From 0ed3a30fd996cb0cac872432cf25185fda7e5316 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Fri, 31 Oct 2025 17:06:03 +0000 Subject: hisi_acc_vfio_pci: Add .match_token_uuid callback in hisi_acc_vfio_pci_migrn_ops The commit, <86624ba3b522> ("vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD") accidentally ignored including the .match_token_uuid callback in the hisi_acc_vfio_pci_migrn_ops struct. Introduce the missed callback here. Fixes: 86624ba3b522 ("vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD") Cc: stable@vger.kernel.org Suggested-by: Longfang Liu Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Longfang Liu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251031170603.2260022-3-rananta@google.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 498cb7d1c9e5..fe2ffcd00d6e 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1620,6 +1620,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, -- cgit v1.2.3 From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO int oa function. Later patches will improve the function signature to consolidate more code. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 9 ++++++--- drivers/vfio/vfio_main.c | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..1dc350003f07 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -996,9 +996,11 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; @@ -1132,6 +1134,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1458,7 +1461,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); + return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..a390163ce706 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1296,7 +1296,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + if (!device->ops->get_region_info) + goto ioctl_fallback; + ret = device->ops->get_region_info(device, uptr); + break; + default: +ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else -- cgit v1.2.3 From e238f147d517681e5dcb3dc9ab149211d66b00a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:18 -0400 Subject: vfio/hisi: Convert to the get_region_info op Change the function signature of hisi_acc_vfio_pci_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 45 ++++++++++++-------------- 1 file changed, 21 insertions(+), 24 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fe2ffcd00d6e..30c438595eda 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1385,37 +1385,33 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, return vfio_pci_core_read(core_vdev, buf, new_count, ppos); } -static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { - if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct vfio_region_info info; + unsigned long minsz; - minsz = offsetofend(struct vfio_region_info, offset); + minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + if (info.index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, arg); - info.size = hisi_acc_get_resource_len(vdev, info.index); + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP; + info.size = hisi_acc_get_resource_len(vdev, info.index); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1613,7 +1609,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, -- cgit v1.2.3 From c044eefa47864fb436254cb330e8d90cb6a3a870 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:19 -0400 Subject: vfio/virtio: Convert to the get_region_info op Remove virtiovf_vfio_pci_core_ioctl() and change the signature of virtiovf_pci_ioctl_get_region_info(). Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/common.h | 4 +--- drivers/vfio/pci/virtio/legacy_io.c | 20 ++++---------------- drivers/vfio/pci/virtio/main.c | 3 ++- 3 files changed, 7 insertions(+), 20 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index c7d7e27af386..a10f2d92cb62 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -109,10 +109,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); + struct vfio_region_info __user *arg); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 832af5ba267c..d735d5c4bd77 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,15 +281,14 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) + struct vfio_region_info __user *arg) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; struct vfio_region_info info = {}; - if (copy_from_user(&info, uarg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -301,20 +300,9 @@ int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, info.size = virtvdev->bar0_virtual_buf_size; info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, arg); } } diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 8084f3e36a9f..92b525e52abe 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -108,7 +108,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, - .ioctl = virtiovf_vfio_pci_core_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, -- cgit v1.2.3 From 5ac7206474777dff56f79f1d6bc9973e988f7587 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:20 -0400 Subject: vfio/nvgrace: Convert to the get_region_info op Change the signature of nvgrace_gpu_ioctl_get_region_info() Reviewed-by: Kevin Tian Reviewed-by: Ankit Agrawal Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..d3a5253473e0 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -205,9 +205,9 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static long +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned long arg) + struct vfio_region_info __user *arg) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, @@ -220,7 +220,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, u32 size; int ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -232,8 +232,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, */ memregion = nvgrace_gpu_memregion(info.index, nvdev); if (!memregion) - return vfio_pci_core_ioctl(core_vdev, - VFIO_DEVICE_GET_REGION_INFO, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, arg); size = struct_size(sparse, areas, 1); @@ -285,16 +284,13 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); case VFIO_DEVICE_IOEVENTFD: return -ENOTTY; case VFIO_DEVICE_RESET: @@ -690,6 +686,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, + .get_region_info = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, -- cgit v1.2.3 From f3fddb71dd50ed31ae474238551e8623a1bc16db Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:21 -0400 Subject: vfio/pci: Fill in the missing get_region_info ops Now that every variant driver provides a get_region_info op remove the ioctl based dispatch from vfio_pci_core_ioctl(). Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/nvgrace-gpu/main.c | 1 + drivers/vfio/pci/pds/vfio_dev.c | 1 + drivers/vfio/pci/qat/main.c | 1 + drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 2 -- drivers/vfio/pci/virtio/main.c | 2 ++ 8 files changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 30c438595eda..1fd3a2075ee4 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1631,6 +1631,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e..b7f941f8047e 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index d3a5253473e0..cab743a30dc3 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -707,6 +707,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f3ccb0008f67..1946bc75d99b 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index a19b68043eb2..8452d9c1d11d 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2..2d9122efc10b 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1dc350003f07..f21d9026068c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1460,8 +1460,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 92b525e52abe..d68096bc5252 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -131,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, -- cgit v1.2.3 From d4635df279f57de1dd301b864724a930551028b2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:25 -0400 Subject: vfio/platform: Provide a get_region_info op Move it out of vfio_platform_ioctl() and re-indent it. Add it to all platform drivers. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 1 + drivers/vfio/platform/vfio_platform.c | 1 + drivers/vfio/platform/vfio_platform_common.c | 50 +++++++++++++++------------ drivers/vfio/platform/vfio_platform_private.h | 2 ++ 4 files changed, 32 insertions(+), 22 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 9f5c527baa8a..d600deaf23b6 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -115,6 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 512533501eb7..0e85c914b651 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3bf1043cd795..3ebd50fb78fb 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -272,6 +272,34 @@ err_irq: } EXPORT_SYMBOL_GPL(vfio_platform_open_device); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct vfio_region_info info; + unsigned long minsz; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); + long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -300,28 +328,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 8d8fab516849..a6008320e77b 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -85,6 +85,8 @@ int vfio_platform_open_device(struct vfio_device *core_vdev); void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); -- cgit v1.2.3 From 6cdae5d0c326f0ec8f0f50571df0374f2f0b8815 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:26 -0400 Subject: vfio/fsl: Provide a get_region_info op Move it out of vfio_fsl_mc_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 56 ++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 24 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 76ccbab0e3d6..d38e51a57f07 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,37 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int +vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = vdev->mc_dev; + struct vfio_region_info info; + unsigned long minsz; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +180,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -589,6 +596,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, -- cgit v1.2.3 From b9827eff6b4aab8203a1aa720ae3a6d3731f1a4d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:27 -0400 Subject: vfio/cdx: Provide a get_region_info op Change the signature of vfio_cdx_ioctl_get_region_info() and hook it to the op. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad7686..506d849139d3 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,9 +129,11 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, struct vfio_region_info __user *arg) { + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); struct vfio_region_info info; @@ -219,8 +221,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +284,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, -- cgit v1.2.3 From f97859503859fd217d27fc4496f294955e197eaf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:30 -0400 Subject: vfio: Require drivers to implement get_region_info Remove the fallback through the ioctl callback, no drivers use this now. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a390163ce706..f056e82ba350 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1297,13 +1297,13 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (!device->ops->get_region_info) - goto ioctl_fallback; - ret = device->ops->get_region_info(device, uptr); + if (unlikely(!device->ops->get_region_info)) + ret = -EINVAL; + else + ret = device->ops->get_region_info(device, uptr); break; default: -ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else -- cgit v1.2.3 From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 56 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f056e82ba350..48d034aede46 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1259,6 +1259,57 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + if (device->ops->get_region_info_caps) { + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, + caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)) { + ret = -EFAULT; + goto out_free; + } + } else if (device->ops->get_region_info) { + ret = device->ops->get_region_info(device, arg); + if (ret) + return ret; + } else { + return -EINVAL; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1297,10 +1348,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (unlikely(!device->ops->get_region_info)) - ret = -EINVAL; - else - ret = device->ops->get_region_info(device, uptr); + ret = vfio_get_region_info(device, uptr); break; default: -- cgit v1.2.3 From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 30 +++---- drivers/vfio/pci/mlx5/main.c | 2 +- drivers/vfio/pci/nvgrace-gpu/main.c | 51 +++--------- drivers/vfio/pci/pds/vfio_dev.c | 2 +- drivers/vfio/pci/qat/main.c | 2 +- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 103 ++++++++++--------------- drivers/vfio/pci/virtio/common.h | 3 +- drivers/vfio/pci/virtio/legacy_io.c | 26 ++----- drivers/vfio/pci/virtio/main.c | 6 +- 10 files changed, 78 insertions(+), 149 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 1fd3a2075ee4..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1386,32 +1386,22 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, } static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - if (info.index != VFIO_PCI_BAR2_REGION_INDEX) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + info->size = hisi_acc_get_resource_len(vdev, info->index); - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - - info.size = hisi_acc_get_resource_len(vdev, info.index); - - info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1610,7 +1600,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = hisi_acc_vfio_ioctl_get_region, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, @@ -1631,7 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b7f941f8047e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,7 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index cab743a30dc3..5a6f77d5c81e 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -205,34 +205,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static int -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -251,40 +242,22 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, @@ -686,7 +659,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, - .get_region_info = nvgrace_gpu_ioctl_get_region_info, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -707,7 +680,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 1946bc75d99b..be103c74e969 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,7 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 8452d9c1d11d..8fbdf7c6d666 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,7 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d9122efc10b..a3e49d42c771 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,7 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f21d9026068c..57c0766fb9f8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -997,43 +997,35 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, } int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1045,9 +1037,9 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1057,16 +1049,17 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. */ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1075,10 +1068,10 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1087,52 +1080,34 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index a10f2d92cb62..cb3d5e57d3a3 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -110,7 +110,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index d735d5c4bd77..1ed349a55629 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,29 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_region_info info = {}; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_ioctl_get_region_info(core_vdev, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d68096bc5252..d2e5cbca13c8 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,7 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -110,7 +110,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = virtiovf_pci_ioctl_get_region_info, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -132,7 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, -- cgit v1.2.3 From 182c62861ba5a2301c7178484d2f424aaae4283b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:36 -0400 Subject: vfio/platform: Convert to get_region_info_caps Remove the duplicate code and change info to a pointer. caps are not used. Reviewed-by: Kevin Tian Reviewed-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 2 +- drivers/vfio/platform/vfio_platform.c | 2 +- drivers/vfio/platform/vfio_platform_common.c | 24 +++++++----------------- drivers/vfio/platform/vfio_platform_private.h | 3 ++- 4 files changed, 11 insertions(+), 20 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index d600deaf23b6..fa754f203b2d 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -115,7 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, - .get_region_info = vfio_platform_ioctl_get_region_info, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 0e85c914b651..a4d3ace3e02d 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,7 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, - .get_region_info = vfio_platform_ioctl_get_region_info, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3ebd50fb78fb..c2990b7e900f 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -273,30 +273,20 @@ err_irq: EXPORT_SYMBOL_GPL(vfio_platform_open_device); int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) + if (info->index >= vdev->num_regions) return -EINVAL; /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index a6008320e77b..05084212a76e 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -86,7 +86,8 @@ void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); -- cgit v1.2.3 From dc10734610e29bb6464498e933fc0d622227153d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:37 -0400 Subject: vfio: Move the remaining drivers to get_region_info_caps Remove the duplicate code and change info to a pointer. caps are not used. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/21-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/main.c | 24 ++++++++---------------- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 29 ++++++++--------------------- 2 files changed, 16 insertions(+), 37 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 506d849139d3..253031b86b60 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -130,29 +130,21 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, } static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_cdx_device *vdev = container_of(core_vdev, struct vfio_cdx_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -284,7 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, - .get_region_info = vfio_cdx_ioctl_get_region_info, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index d38e51a57f07..ba47100f28c1 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,34 +117,21 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } -static int -vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_fsl_mc_device *vdev = container_of(core_vdev, struct vfio_fsl_mc_device, vdev); struct fsl_mc_device *mc_dev = vdev->mc_dev; - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) + if (info->index >= mc_dev->obj_desc.region_count) return -EINVAL; /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user(arg, &info, minsz)) - return -EFAULT; + info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; return 0; } @@ -596,7 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, - .get_region_info = vfio_fsl_mc_ioctl_get_region_info, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, -- cgit v1.2.3 From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 50 +++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 48d034aede46..b8fe1a75e48a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1267,42 +1267,36 @@ static long vfio_get_region_info(struct vfio_device *device, struct vfio_info_cap caps = {}; int ret; + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (device->ops->get_region_info_caps) { - ret = device->ops->get_region_info_caps(device, &info, &caps); - if (ret) - goto out_free; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, - caps.size)) { - ret = -EFAULT; - goto out_free; - } - info.cap_offset = sizeof(info); + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; } + info.cap_offset = sizeof(info); } + } - if (copy_to_user(arg, &info, minsz)) { - ret = -EFAULT; - goto out_free; - } - } else if (device->ops->get_region_info) { - ret = device->ops->get_region_info(device, arg); - if (ret) - return ret; - } else { - return -EINVAL; + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; } out_free: -- cgit v1.2.3 From 64a5dedcff801072154a806102d731ecdf0e7552 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:26 +0200 Subject: vfio: Export vfio device get and put registration helpers These helpers are useful for managing additional references taken on the device from other associated VFIO modules. Original-patch-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-7-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..9aa4a5d081e8 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device) if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } +EXPORT_SYMBOL_GPL(vfio_device_put_registration); bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } +EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); /* * VFIO driver API -- cgit v1.2.3 From 47d13c939d89d348966857b24bb15092398ed8bb Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:27 +0200 Subject: vfio/pci: Share the core device pointer while invoking feature functions There is no need to share the main device pointer (struct vfio_device *) with all the feature functions as they only need the core device pointer. Therefore, extract the core device pointer once in the caller (vfio_pci_core_ioctl_feature) and share it instead. Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-8-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..ca9a95716a85 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -299,11 +299,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -320,12 +318,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, } static int vfio_pci_core_pm_entry_with_wakeup( - struct vfio_device *device, u32 flags, + struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_low_power_entry_with_wakeup __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); struct vfio_device_low_power_entry_with_wakeup entry; struct eventfd_ctx *efdctx; int ret; @@ -376,11 +372,9 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) up_write(&vdev->memory_lock); } -static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -1473,11 +1467,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - uuid_t __user *arg, size_t argsz) +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev, + u32 flags, uuid_t __user *arg, + size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); uuid_t uuid; int ret; @@ -1504,16 +1497,19 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: - return vfio_pci_core_pm_entry(device, flags, arg, argsz); + return vfio_pci_core_pm_entry(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: - return vfio_pci_core_pm_entry_with_wakeup(device, flags, + return vfio_pci_core_pm_entry_with_wakeup(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: - return vfio_pci_core_pm_exit(device, flags, arg, argsz); + return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: - return vfio_pci_core_feature_token(device, flags, arg, argsz); + return vfio_pci_core_feature_token(vdev, flags, arg, argsz); default: return -ENOTTY; } -- cgit v1.2.3 From 35c3503908d320989d76b735580fd93aa497185f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:28 +0200 Subject: vfio/pci: Enable peer-to-peer DMA transactions by default Make sure that all VFIO PCI devices have peer-to-peer capabilities enables, so we would be able to export their MMIO memory through DMABUF, VFIO has always supported P2P mappings with itself. VFIO type 1 insecurely reads PFNs directly out of a VMA's PTEs and programs them into the IOMMU allowing any two VFIO devices to perform P2P to each other. All existing VMMs use this capability to export P2P into a VM where the VM could setup any kind of DMA it likes. Projects like DPDK/SPDK are also known to make use of this, though less frequently. As a first step to more properly integrating VFIO with the P2P subsystem unconditionally enable P2P support for VFIO PCI devices. The struct p2pdma_provider will act has a handle to the P2P subsystem to do things like DMA mapping. While real PCI devices have to support P2P (they can't even tell if an IOVA is P2P or not) there may be fake PCI devices that may trigger some kind of catastrophic system failure. To date VFIO has never tripped up on such a case, but if one is discovered the plan is to add a PCI quirk and have pcim_p2pdma_init() fail. This will fully block the broken device throughout any users of the P2P subsystem in the kernel. Thus P2P through DMABUF will follow the historical VFIO model and be unconditionally enabled by vfio-pci. Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-9-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ca9a95716a85..142b84b3f225 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,6 +28,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_EEH) #include #endif @@ -2081,6 +2082,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + int ret; vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; @@ -2090,6 +2092,9 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); + ret = pcim_p2pdma_init(vdev->pdev); + if (ret && ret != -EOPNOTSUPP) + return ret; init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); -- cgit v1.2.3 From 5d74781ebc86c5fa9e9d6934024c505412de9b52 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:29 +0200 Subject: vfio/pci: Add dma-buf export support for MMIO regions Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations. The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace. Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMA. When VFIO shuts down these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export a FD based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening an UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built in synchronous invalidation mechanism called move_notify. VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is being called revoke. This synchronous invalidation fully prevents any lifecycle problems. VFIO will do this before unbinding its driver ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience a MCE type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform. A DMABUF FD is a preferred handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste a lot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. Signed-off-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/Kconfig | 3 + drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 5 + drivers/vfio/pci/vfio_pci_config.c | 22 ++- drivers/vfio/pci/vfio_pci_core.c | 18 ++- drivers/vfio/pci/vfio_pci_dmabuf.c | 316 +++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_priv.h | 23 +++ 7 files changed, 383 insertions(+), 5 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..2b9fca00e9e8 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..53f59226ae01 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2..6d41cf26b539 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -147,6 +147,10 @@ static const struct vfio_device_ops vfio_pci_ops = { .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -161,6 +165,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4..1f6008eabf23 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 142b84b3f225..9449cf44c18a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -287,6 +287,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -370,6 +372,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -690,6 +694,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); @@ -1222,7 +1228,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1511,6 +1520,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -2095,6 +2106,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) ret = pcim_p2pdma_init(vdev->pdev); if (ret && ret != -EOPNOTSUPP) return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2459,6 +2471,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2487,8 +2500,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, err_undo: list_for_each_entry_from_reverse(vdev, &dev_set->device_list, - vdev.dev_set_list) + vdev.dev_set_list) { + if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); + } list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) pm_runtime_put(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..6698f540bdac --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + */ +#include +#include +#include + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct dma_buf_phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + return 0; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + return dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + dma_buf_free_sgt(attachment, sgt, dir); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .map_dma_buf = vfio_pci_dma_buf_map, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, + .release = vfio_pci_dma_buf_release, +}; + +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + phys_addr_t max_addr; + unsigned int i; + + max_addr = start + len; + for (i = 0; i < nr_ranges; i++) { + phys_addr_t end; + + if (!dma_ranges[i].length) + return -EINVAL; + + if (check_add_overflow(start, dma_ranges[i].offset, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, + dma_ranges[i].length, &end)) + return -EOVERFLOW; + if (end > max_addr) + return -EINVAL; + + phys_vec[i].len = dma_ranges[i].length; + } + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec); + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct pci_dev *pdev = vdev->pdev; + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + return vfio_pci_core_fill_phys_vec( + phys_vec, dma_ranges, nr_ranges, + pci_resource_start(pdev, region_index), + pci_resource_len(pdev, region_index)); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); + +static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + size_t *lengthp) +{ + size_t length = 0; + u32 i; + + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(length, len, &length)) + return -EINVAL; + } + + /* + * dma_iova_try_alloc() will WARN on if userspace proposes a size that + * is too big, eg with lots of ranges. + */ + if ((u64)(length) & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + *lengthp = length; + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct vfio_pci_dma_buf *priv; + size_t length; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges || get_dma_buf.flags) + return -EINVAL; + + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); + if (ret) + goto err_free_phys; + + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference, when the file closes the dmabuf + * will be released. + */ + ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + if (ret < 0) + goto err_dma_buf; + return ret; + +err_dma_buf: + dma_buf_put(priv->dmabuf); +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + fput(priv->dmabuf->file); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + fput(priv->dmabuf->file); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb293..28a405f8b97c 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #endif -- cgit v1.2.3 From 5415d887db0e059920cb5673a32cc4d66daa280f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Nov 2025 11:28:30 +0200 Subject: vfio/nvgrace: Support get_dmabuf_phys Call vfio_pci_core_fill_phys_vec() with the proper physical ranges for the synthetic BAR 2 and BAR 4 regions. Otherwise use the normal flow based on the PCI bar. This demonstrates a DMABUF that follows the region info report to only allow mapping parts of the region that are mmapable. Since the BAR is power of two sized and the "CXL" region is just page aligned the there can be a padding region at the end that is not mmaped or passed into the DMABUF. The "CXL" ranges that are remapped into BAR 2 and BAR 4 areas are not PCI MMIO, they actually run over the CXL-like coherent interconnect and for the purposes of DMA behave identically to DRAM. We don't try to model this distinction between true PCI BAR memory that takes a real PCI path and the "CXL" memory that takes a different path in the p2p framework for now. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Reviewed-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-11-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..f19a679b5bf2 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,7 @@ #include #include #include +#include /* * The device memory usable to the workloads running in the VM is cached @@ -683,6 +684,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, return vfio_pci_core_write(core_vdev, buf, count, ppos); } +static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct nvgrace_gpu_pci_core_device *nvdev = container_of( + core_vdev, struct nvgrace_gpu_pci_core_device, core_device); + struct pci_dev *pdev = core_vdev->pdev; + struct mem_region *mem_region; + + /* + * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { + * The P2P properties of the non-BAR memory is the same as the + * BAR memory, so just use the provider for index 0. Someday + * when CXL gets P2P support we could create CXLish providers + * for the non-BAR memory. + * } else if (region_index == USEMEM_REGION_INDEX) { + * This is actually cachable memory and isn't treated as P2P in + * the chip. For now we have no way to push cachable memory + * through everything and the Grace HW doesn't care what caching + * attribute is programmed into the SMMU. So use BAR 0. + * } + */ + mem_region = nvgrace_gpu_memregion(region_index, nvdev); + if (mem_region) { + *provider = pcim_p2pdma_provider(pdev, 0); + if (!*provider) + return -EINVAL; + return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, + nr_ranges, + mem_region->memphys, + mem_region->memlength); + } + + return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, + phys_vec, dma_ranges, nr_ranges); +} + +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { + .get_dmabuf_phys = nvgrace_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .name = "nvgrace-gpu-vfio-pci", .init = vfio_pci_core_init_dev, @@ -703,6 +748,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .name = "nvgrace-gpu-vfio-pci-core", .init = vfio_pci_core_init_dev, @@ -965,6 +1014,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, memphys, memlength); if (ret) goto out_put_vdev; + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; + } else { + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; } ret = vfio_pci_core_register_device(&nvdev->core_device); -- cgit v1.2.3 From 98693e0897f754e3f51ce6626ed5f785f625ba2b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 24 Nov 2025 15:36:22 -0700 Subject: vfio/pci: Use RCU for error/request triggers to avoid circular locking Thanks to a device generating an ACS violation during bus reset, lockdep reported the following circular locking issue: CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem CPU2: AER: holds pci_bus_sem, acquires igate This results in a potential 3-way deadlock. Remove the pci_bus_sem->igate leg of the triangle by using RCU to peek at the eventfd rather than locking it with igate. Fixes: 3be3a074cf5b ("vfio-pci: Don't use device_lock around AER interrupt setup") Signed-off-by: Alex Williamson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 68 +++++++++++++++++++++++++++------------ drivers/vfio/pci/vfio_pci_intrs.c | 52 +++++++++++++++++++----------- drivers/vfio/pci/vfio_pci_priv.h | 4 +++ 3 files changed, 85 insertions(+), 39 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 79a1a50a4ef7..2b01bfbce3ea 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -42,6 +42,40 @@ static bool nointxmask; static bool disable_vga; static bool disable_idle_d3; +static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu) +{ + struct vfio_pci_eventfd *eventfd = + container_of(rcu, struct vfio_pci_eventfd, rcu); + + eventfd_ctx_put(eventfd->ctx); + kfree(eventfd); +} + +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx) +{ + struct vfio_pci_eventfd *new = NULL; + struct vfio_pci_eventfd *old; + + lockdep_assert_held(&vdev->igate); + + if (ctx) { + new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->ctx = ctx; + } + + old = rcu_replace_pointer(*peventfd, new, + lockdep_is_held(&vdev->igate)); + if (old) + call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free); + + return 0; +} + /* List of PF's that vfio_pci_core_sriov_configure() has been called on */ static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); @@ -697,14 +731,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vfio_pci_dma_buf_cleanup(vdev); mutex_lock(&vdev->igate); - if (vdev->err_trigger) { - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; - } - if (vdev->req_trigger) { - eventfd_ctx_put(vdev->req_trigger); - vdev->req_trigger = NULL; - } + vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL); + vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL); mutex_unlock(&vdev->igate); } EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); @@ -1784,21 +1812,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->req_trigger) { + rcu_read_lock(); + eventfd = rcu_dereference(vdev->req_trigger); + if (eventfd) { if (!(count % 10)) pci_notice_ratelimited(pdev, "Relaying device request to user (#%u)\n", count); - eventfd_signal(vdev->req_trigger); + eventfd_signal(eventfd->ctx); } else if (count == 0) { pci_warn(pdev, "No device request channel registered, blocked until released by user\n"); } - - mutex_unlock(&vdev->igate); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(vfio_pci_core_request); @@ -2216,13 +2244,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger); - - mutex_unlock(&vdev->igate); + rcu_read_lock(); + eventfd = rcu_dereference(vdev->err_trigger); + if (eventfd) + eventfd_signal(eventfd->ctx); + rcu_read_unlock(); return PCI_ERS_RESULT_CAN_RECOVER; } diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 30d3e921cb0d..c76e753b3cec 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, +static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, unsigned int count, uint32_t flags, void *data) { /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (*ctx) { - if (count) { - eventfd_signal(*ctx); - } else { - eventfd_ctx_put(*ctx); - *ctx = NULL; - } + struct vfio_pci_eventfd *eventfd; + + eventfd = rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (!eventfd) + return -EINVAL; + + if (count) { + eventfd_signal(eventfd->ctx); return 0; } + + return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger; @@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, return -EINVAL; trigger = *(uint8_t *)data; - if (trigger && *ctx) - eventfd_signal(*ctx); + + if (trigger) { + struct vfio_pci_eventfd *eventfd = + rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (eventfd) + eventfd_signal(eventfd->ctx); + } return 0; } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { @@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, fd = *(int32_t *)data; if (fd == -1) { - if (*ctx) - eventfd_ctx_put(*ctx); - *ctx = NULL; + return vfio_pci_eventfd_replace_locked(vdev, + peventfd, NULL); } else if (fd >= 0) { struct eventfd_ctx *efdctx; + int ret; efdctx = eventfd_ctx_fdget(fd); if (IS_ERR(efdctx)) return PTR_ERR(efdctx); - if (*ctx) - eventfd_ctx_put(*ctx); + ret = vfio_pci_eventfd_replace_locked(vdev, + peventfd, efdctx); + if (ret) + eventfd_ctx_put(efdctx); - *ctx = efdctx; + return ret; } - return 0; } return -EINVAL; @@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger, count, flags, data); } @@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger, count, flags, data); } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 28a405f8b97c..6681389518a7 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd { bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx); + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); -- cgit v1.2.3 From 9b92bc7554b543dc00a0a0b62904a9ef2ad5c4b0 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:27 +0000 Subject: vfio: refactor vfio_pci_mmap_huge_fault function Refactor vfio_pci_mmap_huge_fault to take out the implementation to map the VMA to the PTE/PMD/PUD as a separate function. Export the new function to be used by nvgrace-gpu module. Move the alignment check code to verify that pfn and VMA VA is aligned to the page order to the header file and make it inline. No functional change is intended. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-2-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 2b01bfbce3ea..9bb700d25ccb 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1652,49 +1652,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, - unsigned int order) +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, + unsigned long pfn, + unsigned int order) { - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_core_device *vdev = vma->vm_private_data; - unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); - unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - unsigned long pfn = vma_to_pfn(vma) + pgoff; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (order && (addr < vma->vm_start || - addr + (PAGE_SIZE << order) > vma->vm_end || - pfn & ((1 << order) - 1))) { - ret = VM_FAULT_FALLBACK; - goto out; - } - - down_read(&vdev->memory_lock); + lockdep_assert_held_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) - goto out_unlock; + return VM_FAULT_SIGBUS; switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn); - break; + return vmf_insert_pfn(vmf->vma, vmf->address, pfn); #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, pfn, false); - break; + return vmf_insert_pfn_pmd(vmf, pfn, false); #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, pfn, false); + return vmf_insert_pfn_pud(vmf, pfn, false); break; #endif default: - ret = VM_FAULT_FALLBACK; + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn); + +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); } -out_unlock: - up_read(&vdev->memory_lock); -out: dev_dbg_ratelimited(&vdev->pdev->dev, "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", __func__, order, -- cgit v1.2.3 From 9db65489b87298f20baef683e70143c108959793 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:28 +0000 Subject: vfio/nvgrace-gpu: Add support for huge pfnmap NVIDIA's Grace based systems have large device memory. The device memory is mapped as VM_PFNMAP in the VMM VMA. The nvgrace-gpu module could make use of the huge PFNMAP support added in mm [1]. To make use of the huge pfnmap support, fault/huge_fault ops based mapping mechanism needs to be implemented. Currently nvgrace-gpu module relies on remap_pfn_range to do the mapping during VM bootup. Replace it to instead rely on fault and use vfio_pci_vmf_insert_pfn to setup the mapping. Moreover to enable huge pfnmap, nvgrace-gpu module is updated by adding huge_fault ops implementation. The implementation establishes mapping according to the order request. Note that if the PFN or the VMA address is unaligned to the order, the mapping fallbacks to the PTE level. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Cc: Vikram Sethi Reviewed-by: Zhi Wang Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-3-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 81 +++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 22 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e33f24fbb0a4..3034d6adf576 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -131,6 +131,59 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -138,10 +191,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -158,17 +209,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. @@ -185,23 +237,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. - * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_pfn; + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + vma->vm_private_data = nvdev; return 0; } -- cgit v1.2.3 From 7f5764e179c66f4d3776b4fcb78b383ebcd99fb3 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:29 +0000 Subject: vfio: use vfio_pci_core_setup_barmap to map bar in mmap Remove code duplication in vfio_pci_core_mmap by calling vfio_pci_core_setup_barmap to perform the bar mapping. No functional change is intended. Cc: Donald Dutile Reviewed-by: Shameer Kolothum Reviewed-by: Zhi Wang Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-4-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9bb700d25ccb..3a11e6f450f7 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1761,18 +1761,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } + ret = vfio_pci_core_setup_barmap(vdev, index); + if (ret) + return ret; vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); -- cgit v1.2.3 From 7d055071d73bac7acc3a0b441f0632f564f048fe Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:30 +0000 Subject: vfio/nvgrace-gpu: split the code to wait for GPU ready Split the function that check for the GPU device being ready on the probe. Move the code to wait for the GPU to be ready through BAR0 register reads to a separate function. This would help reuse the code. This also fixes a bug where the return status in case of timeout gets overridden by return from pci_enable_device. With the fix, a timeout generate an error as initially intended. Fixes: d85f69d520e6 ("vfio/nvgrace-gpu: Check the HBM training and C2C link status") Reviewed-by: Zhi Wang Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-5-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 3034d6adf576..5f122929dd44 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -131,6 +131,20 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + static unsigned long addr_to_pgoff(struct vm_area_struct *vma, unsigned long addr) { @@ -950,11 +964,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * Ensure that the BAR0 region is enabled before accessing the * registers. */ -static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) { - unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); void __iomem *io; - int ret = -ETIME; + int ret; ret = pci_enable_device(pdev); if (ret) @@ -970,16 +983,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) goto iomap_exit; } - do { - if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && - (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { - ret = 0; - goto reg_check_exit; - } - msleep(POLL_QUANTUM_MS); - } while (!time_after(jiffies, timeout)); + ret = nvgrace_gpu_wait_device_ready(io); -reg_check_exit: pci_iounmap(pdev, io); iomap_exit: pci_release_selected_regions(pdev, 1 << 0); @@ -996,7 +1001,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; - ret = nvgrace_gpu_wait_device_ready(pdev); + ret = nvgrace_gpu_probe_check_device_ready(pdev); if (ret) return ret; -- cgit v1.2.3 From dfe765499abfbdb9df0f2e70ebb1199c7d2fc65e Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:31 +0000 Subject: vfio/nvgrace-gpu: Inform devmem unmapped after reset Introduce a new flag reset_done to notify that the GPU has just been reset and the mapping to the GPU memory is zapped. Implement the reset_done handler to set this new variable. It will be used later in the patches to wait for the GPU memory to be ready before doing any mapping or access. Cc: Jason Gunthorpe Reviewed-by: Shameer Kolothum Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-6-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 5f122929dd44..df360e12b299 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -59,6 +59,8 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -1067,12 +1069,34 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); +/* + * The GPU reset is required to be serialized against the *first* mapping + * faults and read/writes accesses to prevent potential RAS events logging. + * + * First fault or access after a reset needs to poll device readiness, + * flag that a reset has occurred. + */ +static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + + nvdev->reset_done = true; +} + +static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { + .reset_done = nvgrace_gpu_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { .name = KBUILD_MODNAME, .id_table = nvgrace_gpu_vfio_pci_table, .probe = nvgrace_gpu_probe, .remove = nvgrace_gpu_remove, - .err_handler = &vfio_pci_core_err_handlers, + .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, .driver_managed_dma = true, }; -- cgit v1.2.3 From a23b10608d420346e5af7eda6c46726a61572469 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:32 +0000 Subject: vfio/nvgrace-gpu: wait for the GPU mem to be ready Speculative prefetches from CPU to GPU memory until the GPU is ready after reset can cause harmless corrected RAS events to be logged on Grace systems. It is thus preferred that the mapping not be re-established until the GPU is ready post reset. The GPU readiness can be checked through BAR0 registers similar to the checking at the time of device probe. It can take several seconds for the GPU to be ready. So it is desirable that the time overlaps as much of the VM startup as possible to reduce impact on the VM bootup time. The GPU readiness state is thus checked on the first fault/huge_fault request or read/write access which amortizes the GPU readiness time. The first fault and read/write checks the GPU state when the reset_done flag - which denotes whether the GPU has just been reset. The memory_lock is taken across map/access to avoid races with GPU reset. Also check if the memory is enabled, before waiting for GPU to be ready. Otherwise the readiness check would block for 30s. Lastly added PM handling wrapping on read/write access. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Cc: Vikram Sethi Reviewed-by: Shameer Kolothum Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-7-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 103 +++++++++++++++++++++++++++++++----- drivers/vfio/pci/vfio_pci_config.c | 1 + drivers/vfio/pci/vfio_pci_priv.h | 1 - 3 files changed, 92 insertions(+), 13 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index df360e12b299..84d142a47ec6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -8,6 +8,7 @@ #include #include #include +#include /* * The device memory usable to the workloads running in the VM is cached @@ -105,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. + */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -147,6 +161,34 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io) return -ETIME; } +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. + */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + static unsigned long addr_to_pgoff(struct vm_area_struct *vma, unsigned long addr) { @@ -176,8 +218,13 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); if (is_aligned_for_order(vma, addr, pfn, order)) { - scoped_guard(rwsem_read, &vdev->memory_lock) + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } } dev_dbg_ratelimited(&vdev->pdev->dev, @@ -533,6 +580,7 @@ static ssize_t nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct mem_region *memregion; @@ -559,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } /* * Only the device memory present on the hardware is mapped, which may @@ -586,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); @@ -650,6 +711,7 @@ static ssize_t nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, size_t count, loff_t *ppos, const char __user *buf) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; struct mem_region *memregion; @@ -679,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } exitfn: *ppos += count; @@ -695,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); @@ -1074,7 +1149,11 @@ MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); * faults and read/writes accesses to prevent potential RAS events logging. * * First fault or access after a reset needs to poll device readiness, - * flag that a reset has occurred. + * flag that a reset has occurred. The readiness test is done by holding + * the memory_lock read lock and we expect all vfio-pci initiated resets to + * hold the memory_lock write lock to avoid races. However, .reset_done + * extends beyond the scope of vfio-pci initiated resets therefore we + * cannot assert this behavior and use lockdep_assert_held_write. */ static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) { diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 1f6008eabf23..dc4e510e6e1b 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) return pdev->current_state < PCI_D3hot && (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } +EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled); /* * Restore the *real* BARs after we detect a FLR or backdoor reset. diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 6681389518a7..27ac280f00b9 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -64,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev); int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, -- cgit v1.2.3