From a75b2be249d60eff6015737f6c3e94935b541068 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:18 -0800 Subject: iommu: Add iommu_driver_get_domain_for_dev() helper There is a need to stage a resetting PCI device to temporarily the blocked domain and then attach back to its previously attached domain after reset. This can be simply done by keeping the "previously attached domain" in the iommu_group->domain pointer while adding an iommu_group->resetting_domain, which gives troubles to IOMMU drivers using the iommu_get_domain_for_dev() for a device's physical domain in order to program IOMMU hardware. And in such for-driver use cases, the iommu_group->mutex must be held, so it doesn't fit in external callers that don't hold the iommu_group->mutex. Introduce a new iommu_driver_get_domain_for_dev() helper, exclusively for driver use cases that hold the iommu_group->mutex, to separate from those external use cases. Add a lockdep_assert_not_held to the existing iommu_get_domain_for_dev() and highlight that in a kdoc. Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8c66284a91a8..ff097df318b9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -910,6 +910,7 @@ extern int iommu_attach_device(struct iommu_domain *domain, extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); +struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); -- cgit v1.2.3 From c279e83953d937470f8a6e69b69f62608714f13f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:19 -0800 Subject: iommu: Introduce pci_dev_reset_iommu_prepare/done() PCIe permits a device to ignore ATS invalidation TLPs while processing a reset. This creates a problem visible to the OS where an ATS invalidation command will time out. E.g. an SVA domain will have no coordination with a reset event and can racily issue ATS invalidations to a resetting device. The OS should do something to mitigate this as we do not want production systems to be reporting critical ATS failures, especially in a hypervisor environment. Broadly, OS could arrange to ignore the timeouts, block page table mutations to prevent invalidations, or disable and block ATS. The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and block ATS before initiating a Function Level Reset. It also mentions that other reset methods could have the same vulnerability as well. Provide a callback from the PCI subsystem that will enclose the reset and have the iommu core temporarily change all the attached RID/PASID domains group->blocking_domain so that the IOMMU hardware would fence any incoming ATS queries. And IOMMU drivers should also synchronously stop issuing new ATS invalidations and wait for all ATS invalidations to complete. This can avoid any ATS invaliation timeouts. However, if there is a domain attachment/replacement happening during an ongoing reset, ATS routines may be re-activated between the two function calls. So, introduce a new resetting_domain in the iommu_group structure to reject any concurrent attach_dev/set_dev_pasid call during a reset for a concern of compatibility failure. Since this changes the behavior of an attach operation, update the uAPI accordingly. Note that there are two corner cases: 1. Devices in the same iommu_group Since an attachment is always per iommu_group, this means that any sibling devices in the iommu_group cannot change domain, to prevent race conditions. 2. An SR-IOV PF that is being reset while its VF is not In such case, the VF itself is already broken. So, there is no point in preventing PF from going through the iommu reset. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 13 +++++++++++++ include/uapi/linux/vfio.h | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ff097df318b9..54b8b48c762e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1188,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); + +/* PCI device reset functions */ +int pci_dev_reset_iommu_prepare(struct pci_dev *pdev); +void pci_dev_reset_iommu_done(struct pci_dev *pdev); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1511,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) } static inline void iommu_free_global_pasid(ioasid_t pasid) {} + +static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) +{ + return 0; +} + +static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev) +{ +} #endif /* CONFIG_IOMMU_API */ #ifdef CONFIG_IRQ_MSI_IOMMU diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ac2329f24141..bb7b89330d35 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd { * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. + * + * When a device is resetting, -EBUSY will be returned to reject any concurrent + * attachment to the resetting device itself or any sibling device in the IOMMU + * group having the resetting device. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; -- cgit v1.2.3 From 466ae6978a5b8c6022bd4537fbfd00e94bb07219 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:02 +0000 Subject: iommu: Add page_ext for IOMMU_DEBUG_PAGEALLOC Add a new config IOMMU_DEBUG_PAGEALLOC, which registers new data to page_ext. This config will be used by the IOMMU API to track pages mapped in the IOMMU to catch drivers trying to free kernel memory that they still map in their domains, causing all types of memory corruption. This behaviour is disabled by default and can be enabled using kernel cmdline iommu.debug_pagealloc. Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Pranjal Shrivastava Reviewed-by: Lu Baolu Signed-off-by: Mostafa Saleh Signed-off-by: Joerg Roedel --- include/linux/iommu-debug-pagealloc.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/iommu-debug-pagealloc.h (limited to 'include') diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h new file mode 100644 index 000000000000..83e64d70bf6c --- /dev/null +++ b/include/linux/iommu-debug-pagealloc.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 - Google Inc + * Author: Mostafa Saleh + * IOMMU API debug page alloc sanitizer + */ + +#ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H +#define __LINUX_IOMMU_DEBUG_PAGEALLOC_H + +#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC + +extern struct page_ext_operations page_iommu_debug_ops; + +#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ + +#endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */ -- cgit v1.2.3 From ccc21213f013834b484cdcc738e282f963fcfc97 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:03 +0000 Subject: iommu: Add calls for IOMMU_DEBUG_PAGEALLOC Add calls for the new iommu debug config IOMMU_DEBUG_PAGEALLOC: - iommu_debug_init: Enable the debug mode if configured by the user. - iommu_debug_map: Track iommu pages mapped, using physical address. - iommu_debug_unmap_begin: Track start of iommu unmap operation, with IOVA and size. - iommu_debug_unmap_end: Track the end of unmap operation, passing the actual unmapped size versus the tracked one at unmap_begin. We have to do the unmap_begin/end as once pages are unmapped we lose the information of the physical address. This is racy, but the API is racy by construction as it uses refcounts and doesn't attempt to lock/synchronize with the IOMMU API as that will be costly, meaning that possibility of false negative exists. Reviewed-by: Samiullah Khawaja Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Mostafa Saleh Signed-off-by: Joerg Roedel --- include/linux/iommu-debug-pagealloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h index 83e64d70bf6c..a439d6815ca1 100644 --- a/include/linux/iommu-debug-pagealloc.h +++ b/include/linux/iommu-debug-pagealloc.h @@ -9,6 +9,7 @@ #define __LINUX_IOMMU_DEBUG_PAGEALLOC_H #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC +DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized); extern struct page_ext_operations page_iommu_debug_ops; -- cgit v1.2.3 From a8258ffed2efdf533bdc756141eeb7bc5301ad4f Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:05 +0000 Subject: iommu: debug-pagealloc: Check mapped/unmapped kernel memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now, as the page_ext holds count of IOMMU mappings, we can use it to assert that any page allocated/freed is indeed not in the IOMMU. The sanitizer doesn’t protect against mapping/unmapping during this period. However, that’s less harmful as the page is not used by the kernel. Reviewed-by: Samiullah Khawaja Reviewed-by: Lu Baolu Signed-off-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Joerg Roedel --- include/linux/iommu-debug-pagealloc.h | 14 ++++++++++++++ include/linux/mm.h | 5 +++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h index a439d6815ca1..46c3c1f70150 100644 --- a/include/linux/iommu-debug-pagealloc.h +++ b/include/linux/iommu-debug-pagealloc.h @@ -13,6 +13,20 @@ DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized); extern struct page_ext_operations page_iommu_debug_ops; +void __iommu_debug_check_unmapped(const struct page *page, int numpages); + +static inline void iommu_debug_check_unmapped(const struct page *page, int numpages) +{ + if (static_branch_unlikely(&iommu_debug_initialized)) + __iommu_debug_check_unmapped(page, numpages); +} + +#else +static inline void iommu_debug_check_unmapped(const struct page *page, + int numpages) +{ +} + #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ #endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f959d8ca4b4..32205d2a24b2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,6 +36,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4133,12 +4134,16 @@ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { + iommu_debug_check_unmapped(page, numpages); + if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { + iommu_debug_check_unmapped(page, numpages); + if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } -- cgit v1.2.3 From 7d8b06ecc45bd679dec58d2cc2bd86223d4e076d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:02 +0000 Subject: iommu/amd: Add support for hw_info for iommu capability query AMD IOMMU Extended Feature (EFR) and Extended Feature 2 (EFR2) registers specify features supported by each IOMMU hardware instance. The IOMMU driver checks each feature-specific bits before enabling each feature at run time. For IOMMUFD, the hypervisor passes the raw value of amd_iommu_efr and amd_iommu_efr2 to VMM via iommufd IOMMU_DEVICE_GET_HW_INFO ioctl. Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- include/uapi/linux/iommufd.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 2c41920b641d..3db37f6042a0 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -623,6 +623,32 @@ struct iommu_hw_info_tegra241_cmdqv { __u8 __reserved; }; +/** + * struct iommu_hw_info_amd - AMD IOMMU device info + * + * @efr : Value of AMD IOMMU Extended Feature Register (EFR) + * @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2) + * + * Please See description of these registers in the following sections of + * the AMD I/O Virtualization Technology (IOMMU) Specification. + * (https://docs.amd.com/v/u/en-US/48882_3.10_PUB) + * + * - MMIO Offset 0030h IOMMU Extended Feature Register + * - MMIO Offset 01A0h IOMMU Extended Feature 2 Register + * + * Note: The EFR and EFR2 are raw values reported by hardware. + * VMM is responsible to determine the appropriate flags to be exposed to + * the VM since cetertain features are not currently supported by the kernel + * for HW-vIOMMU. + * + * Current VMM-allowed list of feature flags are: + * - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax] + */ +struct iommu_hw_info_amd { + __aligned_u64 efr; + __aligned_u64 efr2; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware @@ -632,6 +658,7 @@ struct iommu_hw_info_tegra241_cmdqv { * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM * SMMUv3) info type + * @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, @@ -639,6 +666,7 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, + IOMMU_HW_INFO_TYPE_AMD = 4, }; /** -- cgit v1.2.3 From e05698c10d980ac0a0b57ed81ec9353b9e9533c6 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:06 +0000 Subject: iommufd: Introduce data struct for AMD nested domain allocation Introduce IOMMU_HWPT_DATA_AMD_GUEST data type for IOMMU guest page table, which is used for stage-1 in nested translation. The data structure contains information necessary for setting up the AMD HW-vIOMMU support. Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- include/uapi/linux/iommufd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 3db37f6042a0..1dafbc552d37 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -465,16 +465,27 @@ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; }; +/** + * struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data + * (IOMMU_HWPT_DATA_AMD_GUEST) + * @dte: Guest Device Table Entry (DTE) + */ +struct iommu_hwpt_amd_guest { + __aligned_u64 dte[4]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + * @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE = 0, IOMMU_HWPT_DATA_VTD_S1 = 1, IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, + IOMMU_HWPT_DATA_AMD_GUEST = 3, }; /** -- cgit v1.2.3 From d414b83dc5f90a6a9a656cd6fbb9378ddc824032 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 20 Jan 2026 09:19:25 +0000 Subject: mm/page_ext: Add page_ext_get_from_phys() The IOMMU code operates on physical addresses which can be outside of system RAM. Add a new function page_ext_get_from_phys() to abstract the logic of checking the address and returning the page_ext. Signed-off-by: Mostafa Saleh Acked-by: Vlastimil Babka Signed-off-by: Joerg Roedel --- include/linux/page_ext.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 76c817162d2f..61e876e255e8 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -93,6 +93,7 @@ static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) #endif extern struct page_ext *page_ext_get(const struct page *page); +extern struct page_ext *page_ext_from_phys(phys_addr_t phys); extern void page_ext_put(struct page_ext *page_ext); extern struct page_ext *page_ext_lookup(unsigned long pfn); @@ -215,6 +216,11 @@ static inline struct page_ext *page_ext_get(const struct page *page) return NULL; } +static inline struct page_ext *page_ext_from_phys(phys_addr_t phys) +{ + return NULL; +} + static inline void page_ext_put(struct page_ext *page_ext) { } -- cgit v1.2.3