From e71e00127110dedc6a9e746178282b4dac97ed96 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 11:25:36 -0400 Subject: iommupt: Add the RISC-V page table format The RISC-V format is a fairly simple 5 level page table not unlike the x86 one. It has optional support for a single contiguous page size of 64k (16 x 4k). The specification describes a 32-bit format, the general code can support it via a #define but the iommu side implementation has been left off until a user comes. Tested-by: Vincent Chen Acked-by: Paul Walmsley # arch/riscv Reviewed-by: Tomasz Jeznach Tested-by: Tomasz Jeznach Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 16 ++++++++++++++++ include/linux/generic_pt/iommu.h | 11 +++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 6a9a1acb5aad..fc5d0b5edadc 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -175,6 +175,22 @@ enum { PT_FEAT_VTDSS_FORCE_WRITEABLE, }; +struct pt_riscv_32 { + struct pt_common common; +}; + +struct pt_riscv_64 { + struct pt_common common; +}; + +enum { + /* + * Support the 64k contiguous page size following the Svnapot extension. 
+ */ + PT_FEAT_RISCV_SVNAPOT_64K = PT_FEAT_FMT_START, + +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd0..49d9addb98c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -275,6 +275,17 @@ struct pt_iommu_vtdss_hw_info { IOMMU_FORMAT(vtdss, vtdss_pt); +struct pt_iommu_riscv_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_riscv_64_hw_info { + u64 ppn; + u8 fsc_iosatp_mode; +}; + +IOMMU_FORMAT(riscv_64, riscv_64pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; /* 4 is a 57 bit 5 level table */ -- cgit v1.2.3 From 99fb8afa16add85ed016baee9735231bca0c32b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:10 -0400 Subject: iommupt: Directly call iommupt's unmap_range() The common algorithm in iommupt does not require the iommu_pgsize() calculations, it can directly unmap any arbitrary range. Add a new function pointer to directly call an iommupt unmap_range op and make __iommu_unmap() call it directly. Gives about a 5% gain on single page unmappings. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map/unmap_pages() style interfaces. 
Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 37 +++++++++++++++++++++++++++++++------ include/linux/iommu.h | 1 + 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 49d9addb98c5..0da971134a37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -66,6 +66,13 @@ struct pt_iommu { struct device *iommu_device; }; +static inline struct pt_iommu *iommupt_from_domain(struct iommu_domain *domain) +{ + if (!IS_ENABLED(CONFIG_IOMMU_PT) || !domain->is_iommupt) + return NULL; + return container_of(domain, struct pt_iommu, domain); +} + /** * struct pt_iommu_info - Details about the IOMMU page table * @@ -80,6 +87,29 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @unmap_range: Make a range of IOVA empty/not present + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @len: Length of the range starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_range() will remove a translation created by map_range(). It + * cannot subdivide a mapping created by map_range(), so it should be + * called with IOVA ranges that match those passed to map_pages. The + * IOVA range can aggregate contiguous map_range() calls so long as no + * individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the + * point unmapping stopped. 
+ */ + size_t (*unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, + struct iommu_iotlb_gather *iotlb_gather); + /** * @set_dirty: Make the iova write dirty * @iommu_table: Table to manipulate @@ -198,10 +228,6 @@ struct pt_iommu_cfg { unsigned long iova, phys_addr_t paddr, \ size_t pgsize, size_t pgcount, \ int prot, gfp_t gfp, size_t *mapped); \ - size_t pt_iommu_##fmt##_unmap_pages( \ - struct iommu_domain *domain, unsigned long iova, \ - size_t pgsize, size_t pgcount, \ - struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -223,8 +249,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .map_pages = &pt_iommu_##fmt##_map_pages #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..7ca648c01336 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,7 @@ enum iommu_domain_cookie_type { struct iommu_domain { unsigned type; enum iommu_domain_cookie_type cookie_type; + bool is_iommupt; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ -- cgit v1.2.3 From d6c65b0fd6218bd21ed0be7a8d3218e8f6dc91de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:11 -0400 Subject: iommupt: Avoid rewalking during map Currently the core code provides a simplified interface to drivers where it fragments a requested multi-page map into single page size steps after doing all the calculations to figure out what page size is appropriate. Each step rewalks the page tables from the start. 
Since iommupt has a single implementation of the mapping algorithm it can internally compute each step as it goes while retaining its current position in the walk. Add a new function pt_pgsz_count() which computes the same page size fragment of a large mapping operation. Compute the next fragment when all the leaf entries of the current fragment have been written, then continue walking from the current point. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map_pages() style interfaces. Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0da971134a37..dd0edd02a48a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -87,6 +87,33 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @map_range: Install translation for an IOVA range + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @len: Length of the range starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * + * The range starting at IOVA will have paddr installed into it. The + * range is automatically segmented into optimally sized table entries, + * and can have any valid alignment. + * + * On error the caller will probably want to invoke unmap on the range + * from iova up to the amount indicated by @mapped to return the table + * back to an unchanged state. + * + * Context: The caller must hold a write range lock that includes + * the whole range. 
+ * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA + * that were mapped are added to @mapped, @mapped is not zeroed first. + */ + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped); + /** * @unmap_range: Make a range of IOVA empty/not present * @iommu_table: Table to manipulate @@ -224,10 +251,6 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ - unsigned long iova, phys_addr_t paddr, \ - size_t pgsize, size_t pgcount, \ - int prot, gfp_t gfp, size_t *mapped); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -248,8 +271,7 @@ struct pt_iommu_cfg { * iommu_pt */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty -- cgit v1.2.3 From a82efb8747d1b8a7c0a377dc79c2aac204eae788 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:02 +0000 Subject: iommu: Add device ATS supported capability PCIe ATS may be disabled by platform firmware, root complex limitations, or kernel policy even when a device advertises the ATS capability in its PCI configuration space. Add a new IOMMU_CAP_PCI_ATS_SUPPORTED capability to allow IOMMU drivers to report the effective ATS decision for a device. When this capability is true for a device, ATS may be enabled for that device, but it does not imply that ATS is currently enabled. A subsequent patch will extend iommufd to expose the effective ATS status to userspace. 
Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ca648c01336..a904821ed169 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -272,6 +272,8 @@ enum iommu_cap { */ IOMMU_CAP_DEFERRED_FLUSH, IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ + /* ATS is supported and may be enabled for this device */ + IOMMU_CAP_PCI_ATS_SUPPORTED, }; /* These are the possible reserved region types */ -- cgit v1.2.3 From 90c5def10bea574b101b7a520c015ca81742183f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 2 Mar 2026 18:22:52 -0400 Subject: iommu: Do not call drivers for empty gathers An empty gather is coded with start=U64_MAX, end=0 and several drivers go on to convert that to a size with: end - start + 1 Which gives 2 for an empty gather. This then causes Weird Stuff to happen (for example an UBSAN splat in VT-d) that is hopefully harmless, but maybe not. Prevent drivers from being called right in iommu_iotlb_sync(). Auditing shows that AMD, Intel, Mediatek and RISC-V drivers all do things on these empty gathers. Further, there are several callers that can trigger empty gathers, especially in unusual conditions. For example iommu_map_nosync() will call a 0 size unmap on some error paths. Also in VFIO, iommupt and other places. 
Cc: stable@vger.kernel.org Reported-by: Janusz Krzysztofik Closes: https://lore.kernel.org/r/11145826.aFP6jjVeTY@jkrzyszt-mobl2.ger.corp.intel.com Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Samiullah Khawaja Reviewed-by: Robin Murphy Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..555597b54083 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -980,7 +980,8 @@ static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { - if (domain->ops->iotlb_sync) + if (domain->ops->iotlb_sync && + likely(iotlb_gather->start < iotlb_gather->end)) domain->ops->iotlb_sync(domain, iotlb_gather); iommu_iotlb_gather_init(iotlb_gather); -- cgit v1.2.3