From 4868d2d52df6f724b01531843805a3b1322e2dd9 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:43 +0800 Subject: crypto: hisilicon - qm updates BAR configuration On new platforms greater than QM_HW_V3, the configuration region for the live migration function of the accelerator device is no longer placed in the VF, but is instead placed in the PF. Therefore, the configuration region of the live migration function needs to be opened when the QM driver is loaded. When the QM driver is uninstalled, the driver needs to clear this configuration. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- include/linux/hisi_acc_qm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c4690e365ade..ca1ec437a3ca 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ -- cgit v1.2.3 From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO int oa function. Later patches will improve the function signature to consolidate more code. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 ++ include/linux/vfio_pci_core.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..be5fcf8432e8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -132,6 +132,8 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info)(struct vfio_device *vdev, + struct vfio_region_info __user *arg); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..160bc2e31ece 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -115,6 +115,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be5fcf8432e8..6311ddc83770 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -134,6 +135,9 @@ struct vfio_device_ops { unsigned long arg); int (*get_region_info)(struct vfio_device *vdev, struct vfio_region_info __user *arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); -- cgit v1.2.3 From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 160bc2e31ece..e74f94c17fbe 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -116,7 +116,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6311ddc83770..8e1ddb48b9b5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -133,8 +133,6 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*get_region_info)(struct vfio_device *vdev, - struct vfio_region_info __user *arg); int (*get_region_info_caps)(struct vfio_device *vdev, struct vfio_region_info *info, struct vfio_info_cap *caps); -- cgit v1.2.3 From f58ef9d1d1355b15443719df95081f193067ab88 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:20 +0200 Subject: PCI/P2PDMA: Separate the mmap() support from the core logic Currently the P2PDMA code requires a pgmap and a struct page to function. The was serving three important purposes: - DMA API compatibility, where scatterlist required a struct page as input - Life cycle management, the percpu_ref is used to prevent UAF during device hot unplug - A way to get the P2P provider data through the pci_p2pdma_pagemap The DMA API now has a new flow, and has gained phys_addr_t support, so it no longer needs struct pages to perform P2P mapping. Lifecycle management can be delegated to the user, DMABUF for instance has a suitable invalidation protocol that does not require struct page. Finding the P2P provider data can also be managed by the caller without need to look it up from the phys_addr. Split the P2PDMA code into two layers. The optional upper layer, effectively, provides a way to mmap() P2P memory into a VMA by providing struct page, pgmap, a genalloc and sysfs. The lower layer provides the actual P2P infrastructure and is wrapped up in a new struct p2pdma_provider. Rework the mmap layer to use new p2pdma_provider based APIs. Drivers that do not want to put P2P memory into VMA's can allocate a struct p2pdma_provider after probe() starts and free it before remove() completes. When DMA mapping the driver must convey the struct p2pdma_provider to the DMA mapping code along with a phys_addr of the MMIO BAR slice to map. The driver must ensure that no DMA mapping outlives the lifetime of the struct p2pdma_provider. The intended target of this new API layer is DMABUF. There is usually only a single p2pdma_provider for a DMABUF exporter. Most drivers can establish the p2pdma_provider during probe, access the single instance during DMABUF attach and use that to drive the DMA mapping. DMABUF provides an invalidation mechanism that can guarantee all DMA is halted and the DMA mappings are undone prior to destroying the struct p2pdma_provider. This ensures there is no UAF through DMABUFs that are lingering past driver removal. The new p2pdma_provider layer cannot be used to create P2P memory that can be mapped into VMA's, be used with pin_user_pages(), O_DIRECT, and so on. These use cases must still use the mmap() layer. The p2pdma_provider layer is principally for DMABUF-like use cases where DMABUF natively manages the life cycle and access instead of vmas/pin_user_pages()/struct page. In addition, remove the bus_off field from pci_p2pdma_map_state since it duplicates information already available in the pgmap structure. The bus_offset is only used in one location (pci_p2pdma_bus_addr_map) and is always identical to pgmap->bus_offset. Signed-off-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-1-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 951f81a38f3a..1400f3ad4299 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -16,6 +16,16 @@ struct block_device; struct scatterlist; +/** + * struct p2pdma_provider + * + * A p2pdma provider is a range of MMIO address space available to the CPU. + */ +struct p2pdma_provider { + struct device *owner; + u64 bus_offset; +}; + #ifdef CONFIG_PCI_P2PDMA int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); @@ -139,11 +149,11 @@ enum pci_p2pdma_map_type { }; struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; + struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; - u64 bus_off; }; + /* helper for pci_p2pdma_state(), do not use directly */ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page); @@ -162,8 +172,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - if (state->pgmap != page_pgmap(page)) - __pci_p2pdma_update_state(state, dev, page); + __pci_p2pdma_update_state(state, dev, page); return state->map; } return PCI_P2PDMA_MAP_NONE; @@ -181,7 +190,7 @@ static inline dma_addr_t pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) { WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->bus_off; + return paddr + state->mem->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ -- cgit v1.2.3 From d4504262f745e48c1739c8b864f779b4b0f9de80 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:21 +0200 Subject: PCI/P2PDMA: Simplify bus address mapping API Update the pci_p2pdma_bus_addr_map() function to take a direct pointer to the p2pdma_provider structure instead of the pci_p2pdma_map_state. This simplifies the API by removing the need for callers to extract the provider from the state structure. The change updates all callers across the kernel (block layer, IOMMU, DMA direct, and HMM) to pass the provider pointer directly, making the code more explicit and reducing unnecessary indirection. This also removes the runtime warning check since callers now have direct control over which provider they use. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-2-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 1400f3ad4299..9516ef97b17a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -181,16 +181,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, /** * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. - * @state: P2P state structure + * @provider: P2P provider structure * @paddr: physical address to map * * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. */ static inline dma_addr_t -pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr) { - WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->mem->bus_offset; + return paddr + provider->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ -- cgit v1.2.3 From 372d6d1b8ae3cdfe6b0638a0a848c6865ec94567 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:22 +0200 Subject: PCI/P2PDMA: Refactor to separate core P2P functionality from memory allocation Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA functionality from the optional memory allocation layer. This creates a two-tier architecture: The core layer provides P2P mapping functionality for physical addresses based on PCI device MMIO BARs and integrates with the DMA API for mapping operations. This layer is required for all P2PDMA users. The optional upper layer provides memory allocation capabilities including gen_pool allocator, struct page support, and sysfs interface for user space access. This separation allows subsystems like DMABUF to use only the core P2P mapping functionality without the overhead of memory allocation features they don't need. The core functionality is now available through the new pcim_p2pdma_provider() function that returns a p2pdma_provider structure. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-3-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 9516ef97b17a..15471252817b 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -27,6 +27,8 @@ struct p2pdma_provider { }; #ifdef CONFIG_PCI_P2PDMA +int pcim_p2pdma_init(struct pci_dev *pdev); +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, @@ -44,6 +46,15 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); #else /* CONFIG_PCI_P2PDMA */ +static inline int pcim_p2pdma_init(struct pci_dev *pdev) +{ + return -EOPNOTSUPP; +} +static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, + int bar) +{ + return NULL; +} static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { -- cgit v1.2.3 From 395698bd2cd7639b85784a4a8f5ddb7a581e353c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:23 +0200 Subject: PCI/P2PDMA: Provide an access to pci_p2pdma_map_type() function Provide an access to pci_p2pdma_map_type() function to allow subsystems to determine the appropriate mapping type for P2PDMA transfers between a provider and target device. The pci_p2pdma_map_type() function is the core P2P layer version of the existing public, but struct page focused, pci_p2pdma_state() function. It returns the same result. It is required to use the p2p subsystem from drivers that don't use the struct page layer. Like __pci_p2pdma_update_state() it is not an exported function. The idea is that only subsystem code will implement mapping helpers for taking in phys_addr_t lists, this is deliberately not made accessible to every driver to prevent abuse. Following patches will use this function to implement a shared DMA mapping helper for DMABUF. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-4-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 15471252817b..517e121d2598 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -26,6 +26,45 @@ struct p2pdma_provider { u64 bus_offset; }; +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. + */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. + */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + #ifdef CONFIG_PCI_P2PDMA int pcim_p2pdma_init(struct pci_dev *pdev); struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); @@ -45,6 +84,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev); #else /* CONFIG_PCI_P2PDMA */ static inline int pcim_p2pdma_init(struct pci_dev *pdev) { @@ -106,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } +static inline enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +{ + return PCI_P2PDMA_MAP_NOT_SUPPORTED; +} #endif /* CONFIG_PCI_P2PDMA */ @@ -120,45 +166,6 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before - * the mapping type has been calculated. Exported routines for the API - * will never return this value. - */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * Not a PCI P2PDMA transfer. - */ - PCI_P2PDMA_MAP_NONE, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. - */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - struct pci_p2pdma_map_state { struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; -- cgit v1.2.3 From 3aa31a8bb11e47c0ff2b306988d1756b810c1c3c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:25 +0200 Subject: dma-buf: provide phys_vec to scatter-gather mapping routine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dma_buf_phys_vec_to_sgt() and dma_buf_free_sgt() helpers to convert an array of MMIO physical address ranges into scatter-gather tables with proper DMA mapping. These common functions are a starting point and support any PCI drivers creating mappings from their BAR's MMIO addresses. VFIO is one case, as shortly will be RDMA. We can review existing DRM drivers to refactor them separately. We hope this will evolve into routines to help common DRM that include mixed CPU and MMIO mappings. Compared to the dma_map_resource() abuse this implementation handles the complicated PCI P2P scenarios properly, especially when an IOMMU is enabled: - Direct bus address mapping without IOVA allocation for PCI_P2PDMA_MAP_BUS_ADDR, using pci_p2pdma_bus_addr_map(). This happens if the IOMMU is enabled but the PCIe switch ACS flags allow transactions to avoid the host bridge. Further, this handles the slightly obscure, case of MMIO with a phys_addr_t that is different from the physical BAR programming (bus offset). The phys_addr_t is converted to a dma_addr_t and accommodates this effect. This enables certain real systems to work, especially on ARM platforms. - Mapping through host bridge with IOVA allocation and DMA_ATTR_MMIO attribute for MMIO memory regions (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE). This happens when the IOMMU is enabled and the ACS flags are forcing all traffic to the IOMMU - ie for virtualization systems. - Cases where P2P is not supported through the host bridge/CPU. The P2P subsystem is the proper place to detect this and block it. Helper functions fill_sg_entry() and calc_sg_nents() handle the scatter-gather table construction, splitting large regions into UINT_MAX-sized chunks to fit within sg->length field limits. Since the physical address based DMA API forbids use of the CPU list of the scatterlist this will produce a mangled scatterlist that has a fully zero-length and NULL'd CPU list. The list is 0 length, all the struct page pointers are NULL and zero sized. This is stronger and more robust than the existing mangle_sg_table() technique. It is a future project to migrate DMABUF as a subsystem away from using scatterlist for this data structure. Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Christian König Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-6-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/dma-buf-mapping.h | 17 +++++++++++++++++ include/linux/dma-buf.h | 11 +++++++++++ 2 files changed, 28 insertions(+) create mode 100644 include/linux/dma-buf-mapping.h (limited to 'include/linux') diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h new file mode 100644 index 000000000000..a3c0ce2d3a42 --- /dev/null +++ b/include/linux/dma-buf-mapping.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * DMA BUF Mapping Helpers + * + */ +#ifndef __DMA_BUF_MAPPING_H__ +#define __DMA_BUF_MAPPING_H__ +#include + +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir); +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir); +#endif diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d58e329ac0e7..0bc492090237 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -530,6 +531,16 @@ struct dma_buf_export_info { void *priv; }; +/** + * struct dma_buf_phys_vec - describe continuous chunk of memory + * @paddr: physical address of that chunk + * @len: Length of this chunk + */ +struct dma_buf_phys_vec { + phys_addr_t paddr; + size_t len; +}; + /** * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters * @name: export-info name -- cgit v1.2.3 From 64a5dedcff801072154a806102d731ecdf0e7552 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:26 +0200 Subject: vfio: Export vfio device get and put registration helpers These helpers are useful for managing additional references taken on the device from other associated VFIO modules. Original-patch-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-7-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..217ba4ef1752 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -297,6 +297,8 @@ static inline void vfio_put_device(struct vfio_device *device) int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +void vfio_device_put_registration(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); -- cgit v1.2.3 From 5d74781ebc86c5fa9e9d6934024c505412de9b52 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:29 +0200 Subject: vfio/pci: Add dma-buf export support for MMIO regions Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations. The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace. Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMA. When VFIO shuts down these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export a FD based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening an UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built in synchronous invalidation mechanism called move_notify. VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is being called revoke. This synchronous invalidation fully prevents any lifecycle problems. VFIO will do this before unbinding its driver ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience a MCE type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform. A DMABUF FD is a preferred handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste a lot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. Signed-off-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..c9466ba323fa 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -26,6 +26,8 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct p2pdma_provider; +struct dma_buf_phys_vec; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -49,9 +51,48 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_pci_device_ops { + int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +}; + +#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF) +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len); +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +#else +static inline int +vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + return -EINVAL; +} +static inline int vfio_pci_core_get_dmabuf_phys( + struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider, + unsigned int region_index, struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, size_t nr_ranges) +{ + return -EOPNOTSUPP; +} +#endif + struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; + const struct vfio_pci_device_ops *pci_ops; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; @@ -94,6 +135,7 @@ struct vfio_pci_core_device { struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct rw_semaphore memory_lock; + struct list_head dmabufs; }; /* Will be exported for vfio pci drivers usage */ -- cgit v1.2.3 From 98693e0897f754e3f51ce6626ed5f785f625ba2b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 24 Nov 2025 15:36:22 -0700 Subject: vfio/pci: Use RCU for error/request triggers to avoid circular locking Thanks to a device generating an ACS violation during bus reset, lockdep reported the following circular locking issue: CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem CPU2: AER: holds pci_bus_sem, acquires igate This results in a potential 3-way deadlock. Remove the pci_bus_sem->igate leg of the triangle by using RCU to peek at the eventfd rather than locking it with igate. Fixes: 3be3a074cf5b ("vfio-pci: Don't use device_lock around AER interrupt setup") Signed-off-by: Alex Williamson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 88fd2fd895d0..a1eddd55dab8 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,11 @@ struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_phys_vec; +struct vfio_pci_eventfd { + struct eventfd_ctx *ctx; + struct rcu_head rcu; +}; + struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -124,8 +130,8 @@ struct vfio_pci_core_device { struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; - struct eventfd_ctx *err_trigger; - struct eventfd_ctx *req_trigger; + struct vfio_pci_eventfd __rcu *err_trigger; + struct vfio_pci_eventfd __rcu *req_trigger; struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; -- cgit v1.2.3 From 9b92bc7554b543dc00a0a0b62904a9ef2ad5c4b0 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:27 +0000 Subject: vfio: refactor vfio_pci_mmap_huge_fault function Refactor vfio_pci_mmap_huge_fault to take out the implementation to map the VMA to the PTE/PMD/PUD as a separate function. Export the new function to be used by nvgrace-gpu module. Move the alignment check code to verify that pfn and VMA VA is aligned to the page order to the header file and make it inline. No functional change is intended. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-2-ankita@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a1eddd55dab8..5569488ec4dc 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -170,6 +170,9 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, unsigned long pfn, + unsigned int order); int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); @@ -212,4 +215,14 @@ VFIO_IOREAD_DECLARATION(32) VFIO_IOREAD_DECLARATION(64) #endif +static inline bool is_aligned_for_order(struct vm_area_struct *vma, + unsigned long addr, + unsigned long pfn, + unsigned int order) +{ + return !(order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + !IS_ALIGNED(pfn, 1 << order))); +} + #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From a23b10608d420346e5af7eda6c46726a61572469 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:32 +0000 Subject: vfio/nvgrace-gpu: wait for the GPU mem to be ready Speculative prefetches from CPU to GPU memory until the GPU is ready after reset can cause harmless corrected RAS events to be logged on Grace systems. It is thus preferred that the mapping not be re-established until the GPU is ready post reset. The GPU readiness can be checked through BAR0 registers similar to the checking at the time of device probe. It can take several seconds for the GPU to be ready. So it is desirable that the time overlaps as much of the VM startup as possible to reduce impact on the VM bootup time. The GPU readiness state is thus checked on the first fault/huge_fault request or read/write access which amortizes the GPU readiness time. The first fault and read/write checks the GPU state when the reset_done flag - which denotes whether the GPU has just been reset. The memory_lock is taken across map/access to avoid races with GPU reset. Also check if the memory is enabled, before waiting for GPU to be ready. Otherwise the readiness check would block for 30s. Lastly added PM handling wrapping on read/write access. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Cc: Vikram Sethi Reviewed-by: Shameer Kolothum Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-7-ankita@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5569488ec4dc..336a0e58b443 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -188,6 +188,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite); +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, loff_t *buf_offset, -- cgit v1.2.3