From df180b1a4cc51011c5f8c52c7ec02ad2e42962de Mon Sep 17 00:00:00 2001 From: Alexander Grest Date: Mon, 8 Dec 2025 13:28:57 -0800 Subject: iommu/arm-smmu-v3: Improve CMDQ lock fairness and efficiency The SMMU CMDQ lock is highly contentious when there are multiple CPUs issuing commands and the queue is nearly full. The lock has the following states: - 0: Unlocked - >0: Shared lock held with count - INT_MIN+N: Exclusive lock held, where N is the # of shared waiters - INT_MIN: Exclusive lock held, no shared waiters When multiple CPUs are polling for space in the queue, they attempt to grab the exclusive lock to update the cons pointer from the hardware. If they fail to get the lock, they will spin until either the cons pointer is updated by another CPU. The current code allows the possibility of shared lock starvation if there is a constant stream of CPUs trying to grab the exclusive lock. This leads to severe latency issues and soft lockups. Consider the following scenario where CPU1's attempt to acquire the shared lock is starved by CPU2 and CPU0 contending for the exclusive lock. CPU0 (exclusive) | CPU1 (shared) | CPU2 (exclusive) | `cmdq->lock` -------------------------------------------------------------------------- trylock() //takes | | | 0 | shared_lock() | | INT_MIN | fetch_inc() | | INT_MIN | no return | | INT_MIN + 1 | spins // VAL >= 0 | | INT_MIN + 1 unlock() | spins... | | INT_MIN + 1 set_release(0) | spins... | | 0 see[NOTE] (done) | (sees 0) | trylock() // takes | 0 | *exits loop* | cmpxchg(0, INT_MIN) | 0 | | *cuts in* | INT_MIN | cmpxchg(0, 1) | | INT_MIN | fails // != 0 | | INT_MIN | spins // VAL >= 0 | | INT_MIN | *starved* | | INT_MIN [NOTE] The current code resets the exclusive lock to 0 regardless of the state of the lock. This causes two problems: 1. It opens the possibility of back-to-back exclusive locks and the downstream effect of starving shared lock. 2. The count of shared lock waiters are lost. To mitigate this, we release the exclusive lock by only clearing the sign bit while retaining the shared lock waiter count as a way to avoid starving the shared lock waiters. Also deleted cmpxchg loop while trying to acquire the shared lock as it is not needed. The waiters can see the positive lock count and proceed immediately after the exclusive lock is released. Exclusive lock is not starved in that submitters will try exclusive lock first when new spaces become available. Reviewed-by: Mostafa Saleh Reviewed-by: Nicolin Chen Signed-off-by: Alexander Grest Signed-off-by: Jacob Pan Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 31 +++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d16d35c78c06..7a6aea3b61c1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -487,20 +487,26 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) */ static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq) { - int val; - /* - * We can try to avoid the cmpxchg() loop by simply incrementing the - * lock counter. When held in exclusive state, the lock counter is set - * to INT_MIN so these increments won't hurt as the value will remain - * negative. + * When held in exclusive state, the lock counter is set to INT_MIN + * so these increments won't hurt as the value will remain negative. 
+ * The increment will also signal the exclusive locker that there are + * shared waiters. */ if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0) return; - do { - val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0); - } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val); + /* + * Someone else is holding the lock in exclusive state, so wait + * for them to finish. Since we already incremented the lock counter, + * no exclusive lock can be acquired until we finish. We don't need + * the return value since we only care that the exclusive lock is + * released (i.e. the lock counter is non-negative). + * Once the exclusive locker releases the lock, the sign bit will + * be cleared and our increment will make the lock counter positive, + * allowing us to proceed. + */ + atomic_cond_read_relaxed(&cmdq->lock, VAL > 0); } static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq) @@ -527,9 +533,14 @@ static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq) __ret; \ }) +/* + * Only clear the sign bit when releasing the exclusive lock this will + * allow any shared_lock() waiters to proceed without the possibility + * of entering the exclusive lock in a tight loop. + */ #define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \ ({ \ - atomic_set_release(&cmdq->lock, 0); \ + atomic_fetch_andnot_release(INT_MIN, &cmdq->lock); \ local_irq_restore(flags); \ }) -- cgit v1.2.3 From f91879fdf70bdf369906a20a8980284d3ef1558f Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Tue, 9 Dec 2025 10:53:23 +0530 Subject: iommu/arm-smmu-qcom: Add actlr settings for mdss on Qualcomm platforms Add ACTLR settings for missing MDSS devices on Qualcomm platforms. These are QoS settings and are specific to each SoC, hence the different values: e.g. some use shallow prefetch while others use no prefetch. Also, this prefetch feature is not implemented on all platforms; the entries added here cover only those where it is implemented, to the best of my knowledge.
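For reference, each client in this table is matched by its compatible string, with the ACTLR/QoS flags packed into the of_device_id .data field. A minimal sketch of the pattern is below; the compatible strings are hypothetical, only the flag macros (PREFETCH_SHALLOW, PREFETCH_DEFAULT, CPRE, CMTLB) come from the driver:

  static const struct of_device_id example_actlr_clients[] = {
	/* hypothetical display client using shallow prefetch */
	{ .compatible = "qcom,example-soc-mdss",
	  .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
	/* hypothetical client keeping the default prefetch level */
	{ .compatible = "qcom,example-soc-venus",
	  .data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
	{ }
  };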
Reviewed-by: Dmitry Baryshkov Reviewed-by: Konrad Dybcio Signed-off-by: Charan Teja Kalla Signed-off-by: Bibek Kumar Patro Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 573085349df3..456d5146831e 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -41,12 +41,38 @@ static const struct of_device_id qcom_smmu_actlr_client_of_match[] = { .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, { .compatible = "qcom,fastrpc", .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, + { .compatible = "qcom,qcm2290-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, { .compatible = "qcom,sc7280-mdss", .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, { .compatible = "qcom,sc7280-venus", .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sc8180x-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sc8280xp-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm6115-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm6125-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm6350-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm8150-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm8250-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm8350-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm8450-mdss", + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, { .compatible = "qcom,sm8550-mdss", .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, + { .compatible = "qcom,sm8650-mdss", + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, + { .compatible = "qcom,sm8750-mdss", + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, + { .compatible = "qcom,x1e80100-mdss", + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, { } }; -- cgit v1.2.3 From 2026159372bb6ff5e18e63272473eb4549b0779e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 2 Jan 2026 13:50:04 +0100 Subject: iommu/qcom: Simplify with scoped for each OF child loop Use scoped for-each loop when iterating over device nodes to make code a bit simpler. 
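A minimal sketch of the scoped pattern, for readers unfamiliar with it (the helper function name is made up; the macros are the real ones from linux/of.h): for_each_child_of_node_scoped() declares the iterator with an automatic cleanup, so of_node_put() runs when the loop variable goes out of scope, including on an early return. That is what allows the explicit of_node_put() on the early-exit path to be dropped below.

  static bool example_has_compatible_child(struct device_node *parent,
					   const char *compat)
  {
	for_each_child_of_node_scoped(parent, child) {
		if (of_device_is_compatible(child, compat))
			return true;	/* no manual of_node_put() needed */
	}
	return false;
  }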
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index f69d9276dc55..c98bed38c58a 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -761,14 +761,10 @@ static struct platform_driver qcom_iommu_ctx_driver = { static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu) { - struct device_node *child; - - for_each_child_of_node(qcom_iommu->dev->of_node, child) { + for_each_child_of_node_scoped(qcom_iommu->dev->of_node, child) { if (of_device_is_compatible(child, "qcom,msm-iommu-v1-sec") || - of_device_is_compatible(child, "qcom,msm-iommu-v2-sec")) { - of_node_put(child); + of_device_is_compatible(child, "qcom,msm-iommu-v2-sec")) return true; - } } return false; -- cgit v1.2.3 From 5ac66ed8417fa43f64edbabc5fbac18d2bca9437 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 2 Jan 2026 20:53:52 +0000 Subject: iommu/arm-smmu-v3: Remove IAS The driver only supports AArch64 page tables where OAS == IAS. Remove the extra IAS tracking for AArch32 as this feature was never implemented and that was creating BAD_STEs for SMMUv3 with stage-2 and OAS < 40. Further discussion on this in: https://lore.kernel.org/linux-iommu/20251130194506.593700-1-smostafa@google.com/ Reported-by: Tomasz Nowicki Fixes: f0c453dbcce7 ("iommu/arm-smmu: Ensure IAS is set correctly for AArch32-capable SMMUs") Signed-off-by: Mostafa Saleh Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 18 +++++------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 -- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7a6aea3b61c1..62bdc4d39101 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2562,7 +2562,7 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, ARM_SMMU_FEAT_VAX) ? 
52 : 48; pgtbl_cfg.ias = min_t(unsigned long, ias, VA_BITS); - pgtbl_cfg.oas = smmu->ias; + pgtbl_cfg.oas = smmu->oas; if (enable_dirty) pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD; fmt = ARM_64_LPAE_S1; @@ -2572,7 +2572,7 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, case ARM_SMMU_DOMAIN_S2: if (enable_dirty) return -EOPNOTSUPP; - pgtbl_cfg.ias = smmu->ias; + pgtbl_cfg.ias = smmu->oas; pgtbl_cfg.oas = smmu->oas; fmt = ARM_64_LPAE_S2; finalise_stage_fn = arm_smmu_domain_finalise_s2; @@ -4406,13 +4406,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } /* We only support the AArch64 table format at present */ - switch (FIELD_GET(IDR0_TTF, reg)) { - case IDR0_TTF_AARCH32_64: - smmu->ias = 40; - fallthrough; - case IDR0_TTF_AARCH64: - break; - default: + if (!(FIELD_GET(IDR0_TTF, reg) & IDR0_TTF_AARCH64)) { dev_err(smmu->dev, "AArch64 table format not supported!\n"); return -ENXIO; } @@ -4525,8 +4519,6 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) dev_warn(smmu->dev, "failed to set DMA mask for table walker\n"); - smmu->ias = max(smmu->ias, smmu->oas); - if ((smmu->features & ARM_SMMU_FEAT_TRANS_S1) && (smmu->features & ARM_SMMU_FEAT_TRANS_S2)) smmu->features |= ARM_SMMU_FEAT_NESTING; @@ -4536,8 +4528,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (arm_smmu_sva_supported(smmu)) smmu->features |= ARM_SMMU_FEAT_SVA; - dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n", - smmu->ias, smmu->oas, smmu->features); + dev_info(smmu->dev, "oas %lu-bit (features 0x%08x)\n", + smmu->oas, smmu->features); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index ae23aacc3840..0a5bb57dbdfe 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -43,7 +43,6 @@ struct arm_vsmmu; #define IDR0_COHACC (1 << 4) #define IDR0_TTF GENMASK(3, 2) #define IDR0_TTF_AARCH64 2 -#define IDR0_TTF_AARCH32_64 3 #define IDR0_S1P (1 << 1) #define IDR0_S2P (1 << 0) @@ -784,7 +783,6 @@ struct arm_smmu_device { int gerr_irq; int combined_irq; - unsigned long ias; /* IPA */ unsigned long oas; /* PA */ unsigned long pgsize_bitmap; -- cgit v1.2.3 From 5d5388b0e190b6decb964d3711473b7010bf1f6f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:16 -0800 Subject: iommu: Lock group->mutex in iommu_deferred_attach() The iommu_deferred_attach() function invokes __iommu_attach_device(), but doesn't hold the group->mutex like other __iommu_attach_device() callers. Though there is no pratical bug being triggered so far, it would be better to apply the same locking to this __iommu_attach_device(), since the IOMMU drivers nowaday are more aware of the group->mutex -- some of them use the iommu_group_mutex_assert() function that could be potentially in the path of an attach_dev callback function invoked by the __iommu_attach_device(). Worth mentioning that the iommu_deferred_attach() will soon need to check group->resetting_domain that must be locked also. Thus, grab the mutex to guard __iommu_attach_device() like other callers. 
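For readers unfamiliar with the scope-based locking used here: guard(mutex)(&m) from linux/cleanup.h acquires the mutex and releases it automatically when the enclosing scope ends, so every return path drops group->mutex without an explicit unlock. A simplified sketch of the resulting shape (illustrative only, not the exact patched function):

  int iommu_deferred_attach_sketch(struct device *dev, struct iommu_domain *domain)
  {
	/* fast path: nothing to do if the attach was not deferred */
	if (!dev->iommu || !dev->iommu->attach_deferred)
		return 0;

	guard(mutex)(&dev->iommu_group->mutex);	/* dropped on any return below */

	return __iommu_attach_device(domain, dev, NULL);
  }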
Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Reviewed-by: Samiullah Khawaja Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2ca990dfbb88..170e522b5bda 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2185,10 +2185,17 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { - if (dev->iommu && dev->iommu->attach_deferred) - return __iommu_attach_device(domain, dev, NULL); + /* + * This is called on the dma mapping fast path so avoid locking. This is + * racy, but we have an expectation that the driver will setup its DMAs + * inside probe while being single threaded to avoid racing. + */ + if (!dev->iommu || !dev->iommu->attach_deferred) + return 0; - return 0; + guard(mutex)(&dev->iommu_group->mutex); + + return __iommu_attach_device(domain, dev, NULL); } void iommu_detach_device(struct iommu_domain *domain, struct device *dev) -- cgit v1.2.3 From 4a73abb965b7546864957d2bde428f1e72bc3bb5 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:17 -0800 Subject: iommu: Tidy domain for iommu_setup_dma_ops() This function can only be called on the default_domain. Trivally pass it in. In all three existing cases, the default domain was just attached to the device. This avoids iommu_setup_dma_ops() calling iommu_get_domain_for_dev() that will be used by external callers. Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 4 +--- drivers/iommu/dma-iommu.h | 5 +++-- drivers/iommu/iommu.c | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index c92088855450..aeaf8fad985c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -2097,10 +2097,8 @@ void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, } EXPORT_SYMBOL_GPL(dma_iova_destroy); -void iommu_setup_dma_ops(struct device *dev) +void iommu_setup_dma_ops(struct device *dev, struct iommu_domain *domain) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - if (dev_is_pci(dev)) dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac; diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h index eca201c1f963..040d00252563 100644 --- a/drivers/iommu/dma-iommu.h +++ b/drivers/iommu/dma-iommu.h @@ -9,7 +9,7 @@ #ifdef CONFIG_IOMMU_DMA -void iommu_setup_dma_ops(struct device *dev); +void iommu_setup_dma_ops(struct device *dev, struct iommu_domain *domain); int iommu_get_dma_cookie(struct iommu_domain *domain); void iommu_put_dma_cookie(struct iommu_domain *domain); @@ -26,7 +26,8 @@ extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ -static inline void iommu_setup_dma_ops(struct device *dev) +static inline void iommu_setup_dma_ops(struct device *dev, + struct iommu_domain *domain) { } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 170e522b5bda..1e322f87b171 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -661,7 +661,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list } if (group->default_domain) - iommu_setup_dma_ops(dev); + 
iommu_setup_dma_ops(dev, group->default_domain); mutex_unlock(&group->mutex); @@ -1949,7 +1949,7 @@ static int bus_iommu_probe(const struct bus_type *bus) return ret; } for_each_group_device(group, gdev) - iommu_setup_dma_ops(gdev->dev); + iommu_setup_dma_ops(gdev->dev, group->default_domain); mutex_unlock(&group->mutex); /* @@ -3155,7 +3155,7 @@ static ssize_t iommu_group_store_type(struct iommu_group *group, /* Make sure dma_ops is appropriatley set */ for_each_group_device(group, gdev) - iommu_setup_dma_ops(gdev->dev); + iommu_setup_dma_ops(gdev->dev, group->default_domain); out_unlock: mutex_unlock(&group->mutex); -- cgit v1.2.3 From a75b2be249d60eff6015737f6c3e94935b541068 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:18 -0800 Subject: iommu: Add iommu_driver_get_domain_for_dev() helper There is a need to stage a resetting PCI device to temporarily the blocked domain and then attach back to its previously attached domain after reset. This can be simply done by keeping the "previously attached domain" in the iommu_group->domain pointer while adding an iommu_group->resetting_domain, which gives troubles to IOMMU drivers using the iommu_get_domain_for_dev() for a device's physical domain in order to program IOMMU hardware. And in such for-driver use cases, the iommu_group->mutex must be held, so it doesn't fit in external callers that don't hold the iommu_group->mutex. Introduce a new iommu_driver_get_domain_for_dev() helper, exclusively for driver use cases that hold the iommu_group->mutex, to separate from those external use cases. Add a lockdep_assert_not_held to the existing iommu_get_domain_for_dev() and highlight that in a kdoc. Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 +++-- drivers/iommu/iommu.c | 28 ++++++++++++++++++++++++++++ include/linux/iommu.h | 1 + 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d16d35c78c06..b8d2fef3ee6b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3125,7 +3125,8 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, struct arm_smmu_cd *cd, struct iommu_domain *old) { - struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); + struct iommu_domain *sid_domain = + iommu_driver_get_domain_for_dev(master->dev); struct arm_smmu_attach_state state = { .master = master, .ssid = pasid, @@ -3191,7 +3192,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, */ if (!arm_smmu_ssids_in_use(&master->cd_table)) { struct iommu_domain *sid_domain = - iommu_get_domain_for_dev(master->dev); + iommu_driver_get_domain_for_dev(master->dev); if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || sid_domain->type == IOMMU_DOMAIN_BLOCKED) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1e322f87b171..672597100e9a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2217,6 +2217,15 @@ out_unlock: } EXPORT_SYMBOL_GPL(iommu_detach_device); +/** + * iommu_get_domain_for_dev() - Return the DMA API domain pointer + * @dev: Device to query + * + * This function can be called within a driver bound to dev. The returned + * pointer is valid for the lifetime of the bound driver. 
+ * + * It should not be called by drivers with driver_managed_dma = true. + */ struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) { /* Caller must be a probed driver on dev */ @@ -2225,10 +2234,29 @@ struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) if (!group) return NULL; + lockdep_assert_not_held(&group->mutex); + return group->domain; } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); +/** + * iommu_driver_get_domain_for_dev() - Return the driver-level domain pointer + * @dev: Device to query + * + * This function can be called by an iommu driver that wants to get the physical + * domain within an iommu callback function where group->mutex is held. + */ +struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) +{ + struct iommu_group *group = dev->iommu_group; + + lockdep_assert_held(&group->mutex); + + return group->domain; +} +EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev); + /* * For IOMMU_DOMAIN_DMA implementations which already provide their own * guarantees that the group and its default domain are valid and correct. diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8c66284a91a8..ff097df318b9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -910,6 +910,7 @@ extern int iommu_attach_device(struct iommu_domain *domain, extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); +struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); -- cgit v1.2.3 From c279e83953d937470f8a6e69b69f62608714f13f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:19 -0800 Subject: iommu: Introduce pci_dev_reset_iommu_prepare/done() PCIe permits a device to ignore ATS invalidation TLPs while processing a reset. This creates a problem visible to the OS where an ATS invalidation command will time out. E.g. an SVA domain will have no coordination with a reset event and can racily issue ATS invalidations to a resetting device. The OS should do something to mitigate this as we do not want production systems to be reporting critical ATS failures, especially in a hypervisor environment. Broadly, OS could arrange to ignore the timeouts, block page table mutations to prevent invalidations, or disable and block ATS. The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and block ATS before initiating a Function Level Reset. It also mentions that other reset methods could have the same vulnerability as well. Provide a callback from the PCI subsystem that will enclose the reset and have the iommu core temporarily change all the attached RID/PASID domains group->blocking_domain so that the IOMMU hardware would fence any incoming ATS queries. And IOMMU drivers should also synchronously stop issuing new ATS invalidations and wait for all ATS invalidations to complete. This can avoid any ATS invaliation timeouts. However, if there is a domain attachment/replacement happening during an ongoing reset, ATS routines may be re-activated between the two function calls. So, introduce a new resetting_domain in the iommu_group structure to reject any concurrent attach_dev/set_dev_pasid call during a reset for a concern of compatibility failure. 
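The intended call pattern for the PCI reset paths is sketched below; do_device_reset() is a placeholder for whichever reset method is being wrapped, while the two IOMMU helpers are the ones introduced here:

  int example_reset_with_iommu_quiesce(struct pci_dev *pdev)
  {
	int ret;

	/* park the device's RID/PASID attachments on the blocking domain */
	ret = pci_dev_reset_iommu_prepare(pdev);
	if (ret)
		return ret;

	ret = do_device_reset(pdev);	/* placeholder for FLR, bus reset, ... */

	/* re-attach the domains retained in the iommu_group */
	pci_dev_reset_iommu_done(pdev);
	return ret;
  }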
Since this changes the behavior of an attach operation, update the uAPI accordingly. Note that there are two corner cases: 1. Devices in the same iommu_group Since an attachment is always per iommu_group, this means that any sibling devices in the iommu_group cannot change domain, to prevent race conditions. 2. An SR-IOV PF that is being reset while its VF is not In such case, the VF itself is already broken. So, there is no point in preventing PF from going through the iommu reset. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 13 ++++ include/uapi/linux/vfio.h | 4 ++ 3 files changed, 190 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 672597100e9a..0665dedd91b2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -61,6 +61,11 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; + /* + * During a group device reset, @resetting_domain points to the physical + * domain, while @domain points to the attached domain before the reset. + */ + struct iommu_domain *resetting_domain; struct iommu_domain *domain; struct list_head entry; unsigned int owner_cnt; @@ -2195,6 +2200,15 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) guard(mutex)(&dev->iommu_group->mutex); + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + * + * Note that this might fail the iommu_dma_map(). But there's nothing + * more we can do here. + */ + if (dev->iommu_group->resetting_domain) + return -EBUSY; return __iommu_attach_device(domain, dev, NULL); } @@ -2253,6 +2267,17 @@ struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) lockdep_assert_held(&group->mutex); + /* + * Driver handles the low-level __iommu_attach_device(), including the + * one invoked by pci_dev_reset_iommu_done() re-attaching the device to + * the cached group->domain. In this case, the driver must get the old + * domain from group->resetting_domain rather than group->domain. This + * prevents it from re-attaching the device from group->domain (old) to + * group->domain (new). + */ + if (group->resetting_domain) + return group->resetting_domain; + return group->domain; } EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev); @@ -2409,6 +2434,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, if (WARN_ON(!new_domain)) return -EINVAL; + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + */ + if (group->resetting_domain) + return -EBUSY; + /* * Changing the domain is done by calling attach_dev() on the new * domain. This switch does not have to be atomic and DMA can be @@ -3527,6 +3559,16 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, return -EINVAL; mutex_lock(&group->mutex); + + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. 
+ */ + if (group->resetting_domain) { + ret = -EBUSY; + goto out_unlock; + } + for_each_group_device(group, device) { /* * Skip PASID validation for devices without PASID support @@ -3610,6 +3652,16 @@ int iommu_replace_device_pasid(struct iommu_domain *domain, return -EINVAL; mutex_lock(&group->mutex); + + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + */ + if (group->resetting_domain) { + ret = -EBUSY; + goto out_unlock; + } + entry = iommu_make_pasid_array_entry(domain, handle); curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, XA_ZERO_ENTRY, GFP_KERNEL); @@ -3867,6 +3919,127 @@ err_unlock: } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); +/** + * pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset + * @pdev: PCI device that is going to enter a reset routine + * + * The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and block + * ATS before initiating a reset. This means that a PCIe device during the reset + * routine wants to block any IOMMU activity: translation and ATS invalidation. + * + * This function attaches the device's RID/PASID(s) the group->blocking_domain, + * setting the group->resetting_domain. This allows the IOMMU driver pausing any + * IOMMU activity while leaving the group->domain pointer intact. Later when the + * reset is finished, pci_dev_reset_iommu_done() can restore everything. + * + * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done() + * before/after the core-level reset routine, to unset the resetting_domain. + * + * Return: 0 on success or negative error code if the preparation failed. + * + * These two functions are designed to be used by PCI reset functions that would + * not invoke any racy iommu_release_device(), since PCI sysfs node gets removed + * before it notifies with a BUS_NOTIFY_REMOVED_DEVICE. When using them in other + * case, callers must ensure there will be no racy iommu_release_device() call, + * which otherwise would UAF the dev->iommu_group pointer. + */ +int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) +{ + struct iommu_group *group = pdev->dev.iommu_group; + unsigned long pasid; + void *entry; + int ret; + + if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev)) + return 0; + + guard(mutex)(&group->mutex); + + /* Re-entry is not allowed */ + if (WARN_ON(group->resetting_domain)) + return -EBUSY; + + ret = __iommu_group_alloc_blocking_domain(group); + if (ret) + return ret; + + /* Stage RID domain at blocking_domain while retaining group->domain */ + if (group->domain != group->blocking_domain) { + ret = __iommu_attach_device(group->blocking_domain, &pdev->dev, + group->domain); + if (ret) + return ret; + } + + /* + * Stage PASID domains at blocking_domain while retaining pasid_array. + * + * The pasid_array is mostly fenced by group->mutex, except one reader + * in iommu_attach_handle_get(), so it's safe to read without xa_lock. 
+ */ + xa_for_each_start(&group->pasid_array, pasid, entry, 1) + iommu_remove_dev_pasid(&pdev->dev, pasid, + pasid_array_entry_to_domain(entry)); + + group->resetting_domain = group->blocking_domain; + return ret; +} +EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); + +/** + * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done + * @pdev: PCI device that has finished a reset routine + * + * After a PCIe device finishes a reset routine, it wants to restore its IOMMU + * IOMMU activity, including new translation as well as cache invalidation, by + * re-attaching all RID/PASID of the device's back to the domains retained in + * the core-level structure. + * + * Caller must pair it with a successful pci_dev_reset_iommu_prepare(). + * + * Note that, although unlikely, there is a risk that re-attaching domains might + * fail due to some unexpected happening like OOM. + */ +void pci_dev_reset_iommu_done(struct pci_dev *pdev) +{ + struct iommu_group *group = pdev->dev.iommu_group; + unsigned long pasid; + void *entry; + + if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev)) + return; + + guard(mutex)(&group->mutex); + + /* pci_dev_reset_iommu_prepare() was bypassed for the device */ + if (!group->resetting_domain) + return; + + /* pci_dev_reset_iommu_prepare() was not successfully called */ + if (WARN_ON(!group->blocking_domain)) + return; + + /* Re-attach RID domain back to group->domain */ + if (group->domain != group->blocking_domain) { + WARN_ON(__iommu_attach_device(group->domain, &pdev->dev, + group->blocking_domain)); + } + + /* + * Re-attach PASID domains back to the domains retained in pasid_array. + * + * The pasid_array is mostly fenced by group->mutex, except one reader + * in iommu_attach_handle_get(), so it's safe to read without xa_lock. + */ + xa_for_each_start(&group->pasid_array, pasid, entry, 1) + WARN_ON(__iommu_set_group_pasid( + pasid_array_entry_to_domain(entry), group, pasid, + group->blocking_domain)); + + group->resetting_domain = NULL; +} +EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done); + #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ff097df318b9..54b8b48c762e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1188,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); + +/* PCI device reset functions */ +int pci_dev_reset_iommu_prepare(struct pci_dev *pdev); +void pci_dev_reset_iommu_done(struct pci_dev *pdev); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1511,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) } static inline void iommu_free_global_pasid(ioasid_t pasid) {} + +static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) +{ + return 0; +} + +static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev) +{ +} #endif /* CONFIG_IOMMU_API */ #ifdef CONFIG_IRQ_MSI_IOMMU diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ac2329f24141..bb7b89330d35 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd { * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. 
+ * + * When a device is resetting, -EBUSY will be returned to reject any concurrent + * attachment to the resetting device itself or any sibling device in the IOMMU + * group having the resetting device. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; -- cgit v1.2.3 From f5b16b802174fe2b67e2b6a27fa793b749981816 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:20 -0800 Subject: PCI: Suspend iommu function prior to resetting a device PCIe permits a device to ignore ATS invalidation TLPs while processing a reset. This creates a problem visible to the OS where an ATS invalidation command will time out: e.g. an SVA domain will have no coordination with a reset event and can racily issue ATS invalidations to a resetting device. The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and block ATS before initiating a Function Level Reset. It also mentions that other reset methods could have the same vulnerability as well. The IOMMU subsystem provides pci_dev_reset_iommu_prepare/done() callback helpers for this matter. Use them in all the existing reset functions. This will attach the device to its iommu_group->blocking_domain during the device reset, so as to allow IOMMU driver to: - invoke pci_disable_ats() and pci_enable_ats(), if necessary - wait for all ATS invalidations to complete - stop issuing new ATS invalidations - fence any incoming ATS queries Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/pci/pci-acpi.c | 13 ++++++++-- drivers/pci/pci.c | 65 ++++++++++++++++++++++++++++++++++++++++++++------ drivers/pci/quirks.c | 19 ++++++++++++++- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 9369377725fa..651d9b5561ff 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -971,6 +972,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev) int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { acpi_handle handle = ACPI_HANDLE(&dev->dev); + int ret; if (!handle || !acpi_has_method(handle, "_RST")) return -ENOTTY; @@ -978,12 +980,19 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) if (probe) return 0; + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) { pci_warn(dev, "ACPI _RST failed\n"); - return -ENOTTY; + ret = -ENOTTY; } - return 0; + pci_dev_reset_iommu_done(dev); + return ret; } bool acpi_pci_power_manageable(struct pci_dev *dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 13dbb405dc31..a0ba42ae7ee0 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -4330,13 +4332,22 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction); */ int pcie_flr(struct pci_dev *dev) { + int ret; + if (!pci_wait_for_pending_transaction(dev)) pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); + /* Have to call it after waiting for pending DMA transaction */ + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + 
pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); if (dev->imm_ready) - return 0; + goto done; /* * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within @@ -4345,7 +4356,10 @@ int pcie_flr(struct pci_dev *dev) */ msleep(100); - return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); + ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); +done: + pci_dev_reset_iommu_done(dev); + return ret; } EXPORT_SYMBOL_GPL(pcie_flr); @@ -4373,6 +4387,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr); static int pci_af_flr(struct pci_dev *dev, bool probe) { + int ret; int pos; u8 cap; @@ -4399,10 +4414,17 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) PCI_AF_STATUS_TP << 8)) pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n"); + /* Have to call it after waiting for pending DMA transaction */ + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); if (dev->imm_ready) - return 0; + goto done; /* * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006, @@ -4412,7 +4434,10 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) */ msleep(100); - return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); + ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); +done: + pci_dev_reset_iommu_done(dev); + return ret; } /** @@ -4433,6 +4458,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) static int pci_pm_reset(struct pci_dev *dev, bool probe) { u16 csr; + int ret; if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) return -ENOTTY; @@ -4447,6 +4473,12 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe) if (dev->current_state != PCI_D0) return -EINVAL; + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D3hot; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); @@ -4457,7 +4489,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe) pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); pci_dev_d3_sleep(dev); - return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); + ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); + pci_dev_reset_iommu_done(dev); + return ret; } /** @@ -4885,10 +4919,20 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe) return -ENOTTY; } + rc = pci_dev_reset_iommu_prepare(dev); + if (rc) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc); + return rc; + } + rc = pci_dev_reset_slot_function(dev, probe); if (rc != -ENOTTY) - return rc; - return pci_parent_bus_reset(dev, probe); + goto done; + + rc = pci_parent_bus_reset(dev, probe); +done: + pci_dev_reset_iommu_done(dev); + return rc; } static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) @@ -4912,6 +4956,12 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) if (rc) return -ENOTTY; + rc = pci_dev_reset_iommu_prepare(dev); + if (rc) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc); + return rc; + } + if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) { val = reg; } else { @@ -4926,6 +4976,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, reg); + pci_dev_reset_iommu_done(dev); return rc; } diff --git 
a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b9c252aa6fe0..c6b999045c70 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -21,6 +21,7 @@ #include #include /* isa_dma_bridge_buggy */ #include +#include #include #include #include @@ -4228,6 +4229,22 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { { 0 } }; +static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe, + const struct pci_dev_reset_methods *i) +{ + int ret; + + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + + ret = i->reset(dev, probe); + pci_dev_reset_iommu_done(dev); + return ret; +} + /* * These device-specific reset methods are here rather than in a driver * because when a host assigns a device to a guest VM, the host may need @@ -4242,7 +4259,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe) i->vendor == (u16)PCI_ANY_ID) && (i->device == dev->device || i->device == (u16)PCI_ANY_ID)) - return i->reset(dev, probe); + return __pci_dev_specific_reset(dev, probe, i); } return -ENOTTY; -- cgit v1.2.3 From 466ae6978a5b8c6022bd4537fbfd00e94bb07219 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:02 +0000 Subject: iommu: Add page_ext for IOMMU_DEBUG_PAGEALLOC Add a new config IOMMU_DEBUG_PAGEALLOC, which registers new data to page_ext. This config will be used by the IOMMU API to track pages mapped in the IOMMU to catch drivers trying to free kernel memory that they still map in their domains, causing all types of memory corruption. This behaviour is disabled by default and can be enabled using kernel cmdline iommu.debug_pagealloc. Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Pranjal Shrivastava Reviewed-by: Lu Baolu Signed-off-by: Mostafa Saleh Signed-off-by: Joerg Roedel --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++ drivers/iommu/Kconfig | 19 +++++++++++++++ drivers/iommu/Makefile | 1 + drivers/iommu/iommu-debug-pagealloc.c | 32 +++++++++++++++++++++++++ include/linux/iommu-debug-pagealloc.h | 17 +++++++++++++ mm/page_ext.c | 4 ++++ 6 files changed, 82 insertions(+) create mode 100644 drivers/iommu/iommu-debug-pagealloc.c create mode 100644 include/linux/iommu-debug-pagealloc.h diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..d484d9d8d0a4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2675,6 +2675,15 @@ Kernel parameters 1 - Bypass the IOMMU for DMA. unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. + iommu.debug_pagealloc= + [KNL,EARLY] When CONFIG_IOMMU_DEBUG_PAGEALLOC is set, this + parameter enables the feature at boot time. By default, it + is disabled and the system behaves the same way as a kernel + built without CONFIG_IOMMU_DEBUG_PAGEALLOC. + Format: { "0" | "1" } + 0 - Sanitizer disabled. + 1 - Sanitizer enabled, expect runtime overhead. + io7= [HW] IO7 for Marvel-based Alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 99095645134f..f86262b11416 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -384,6 +384,25 @@ config SPRD_IOMMU Say Y here if you want to use the multimedia devices listed above. 
+config IOMMU_DEBUG_PAGEALLOC + bool "Debug IOMMU mappings against page allocations" + depends on DEBUG_PAGEALLOC && IOMMU_API && PAGE_EXTENSION + help + This enables a consistency check between the kernel page allocator and + the IOMMU subsystem. It verifies that pages being allocated or freed + are not currently mapped in any IOMMU domain. + + This helps detect DMA use-after-free bugs where a driver frees a page + but forgets to unmap it from the IOMMU, potentially allowing a device + to overwrite memory that the kernel has repurposed. + + These checks are best-effort and may not detect all problems. + + Due to performance overhead, this feature is disabled by default. + You must enable "iommu.debug_pagealloc" from the kernel command + line to activate the runtime checks. + + If unsure, say N. endif # IOMMU_SUPPORT source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 8e8843316c4b..0275821f4ef9 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o obj-$(CONFIG_APPLE_DART) += apple-dart.o +obj-$(CONFIG_IOMMU_DEBUG_PAGEALLOC) += iommu-debug-pagealloc.o diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c new file mode 100644 index 000000000000..4022e9af7f27 --- /dev/null +++ b/drivers/iommu/iommu-debug-pagealloc.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 - Google Inc + * Author: Mostafa Saleh + * IOMMU API debug page alloc sanitizer + */ +#include +#include +#include +#include + +static bool needed; + +struct iommu_debug_metadata { + atomic_t ref; +}; + +static __init bool need_iommu_debug(void) +{ + return needed; +} + +struct page_ext_operations page_iommu_debug_ops = { + .size = sizeof(struct iommu_debug_metadata), + .need = need_iommu_debug, +}; + +static int __init iommu_debug_pagealloc(char *str) +{ + return kstrtobool(str, &needed); +} +early_param("iommu.debug_pagealloc", iommu_debug_pagealloc); diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h new file mode 100644 index 000000000000..83e64d70bf6c --- /dev/null +++ b/include/linux/iommu-debug-pagealloc.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 - Google Inc + * Author: Mostafa Saleh + * IOMMU API debug page alloc sanitizer + */ + +#ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H +#define __LINUX_IOMMU_DEBUG_PAGEALLOC_H + +#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC + +extern struct page_ext_operations page_iommu_debug_ops; + +#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ + +#endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */ diff --git a/mm/page_ext.c b/mm/page_ext.c index d7396a8970e5..297e4cd8ce90 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -11,6 +11,7 @@ #include #include #include +#include /* * struct page extension @@ -89,6 +90,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif +#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC + &page_iommu_debug_ops, +#endif }; unsigned long page_ext_size; -- cgit v1.2.3 From ccc21213f013834b484cdcc738e282f963fcfc97 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:03 +0000 Subject: iommu: Add calls for IOMMU_DEBUG_PAGEALLOC Add calls for the new iommu debug config IOMMU_DEBUG_PAGEALLOC: - iommu_debug_init: Enable the debug mode if configured by the user. 
- iommu_debug_map: Track iommu pages mapped, using physical address. - iommu_debug_unmap_begin: Track start of iommu unmap operation, with IOVA and size. - iommu_debug_unmap_end: Track the end of unmap operation, passing the actual unmapped size versus the tracked one at unmap_begin. We have to do the unmap_begin/end as once pages are unmapped we lose the information of the physical address. This is racy, but the API is racy by construction as it uses refcounts and doesn't attempt to lock/synchronize with the IOMMU API as that will be costly, meaning that possibility of false negative exists. Reviewed-by: Samiullah Khawaja Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Mostafa Saleh Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-debug-pagealloc.c | 28 +++++++++++++++++ drivers/iommu/iommu-priv.h | 58 +++++++++++++++++++++++++++++++++++ drivers/iommu/iommu.c | 11 +++++-- include/linux/iommu-debug-pagealloc.h | 1 + 4 files changed, 96 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c index 4022e9af7f27..1d343421da98 100644 --- a/drivers/iommu/iommu-debug-pagealloc.c +++ b/drivers/iommu/iommu-debug-pagealloc.c @@ -5,11 +5,15 @@ * IOMMU API debug page alloc sanitizer */ #include +#include #include #include #include +#include "iommu-priv.h" + static bool needed; +DEFINE_STATIC_KEY_FALSE(iommu_debug_initialized); struct iommu_debug_metadata { atomic_t ref; @@ -25,6 +29,30 @@ struct page_ext_operations page_iommu_debug_ops = { .need = need_iommu_debug, }; +void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size) +{ +} + +void __iommu_debug_unmap_begin(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ +} + +void __iommu_debug_unmap_end(struct iommu_domain *domain, + unsigned long iova, size_t size, + size_t unmapped) +{ +} + +void iommu_debug_init(void) +{ + if (!needed) + return; + + pr_info("iommu: Debugging page allocations, expect overhead or disable iommu.debug_pagealloc"); + static_branch_enable(&iommu_debug_initialized); +} + static int __init iommu_debug_pagealloc(char *str) { return kstrtobool(str, &needed); diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index c95394cd03a7..aaffad5854fc 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -5,6 +5,7 @@ #define __LINUX_IOMMU_PRIV_H #include +#include #include static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) @@ -65,4 +66,61 @@ static inline int iommufd_sw_msi(struct iommu_domain *domain, int iommu_replace_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_attach_handle *handle); + +#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC + +void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, + size_t size); +void __iommu_debug_unmap_begin(struct iommu_domain *domain, + unsigned long iova, size_t size); +void __iommu_debug_unmap_end(struct iommu_domain *domain, + unsigned long iova, size_t size, size_t unmapped); + +static inline void iommu_debug_map(struct iommu_domain *domain, + phys_addr_t phys, size_t size) +{ + if (static_branch_unlikely(&iommu_debug_initialized)) + __iommu_debug_map(domain, phys, size); +} + +static inline void iommu_debug_unmap_begin(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + if (static_branch_unlikely(&iommu_debug_initialized)) + __iommu_debug_unmap_begin(domain, iova, size); +} + +static inline void iommu_debug_unmap_end(struct iommu_domain *domain, 
+ unsigned long iova, size_t size, + size_t unmapped) +{ + if (static_branch_unlikely(&iommu_debug_initialized)) + __iommu_debug_unmap_end(domain, iova, size, unmapped); +} + +void iommu_debug_init(void); + +#else +static inline void iommu_debug_map(struct iommu_domain *domain, + phys_addr_t phys, size_t size) +{ +} + +static inline void iommu_debug_unmap_begin(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ +} + +static inline void iommu_debug_unmap_end(struct iommu_domain *domain, + unsigned long iova, size_t size, + size_t unmapped) +{ +} + +static inline void iommu_debug_init(void) +{ +} + +#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ + #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0665dedd91b2..585b13bcc8cf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -237,6 +237,8 @@ static int __init iommu_subsys_init(void) if (!nb) return -ENOMEM; + iommu_debug_init(); + for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { nb[i].notifier_call = iommu_bus_notifier; bus_register_notifier(iommu_buses[i], &nb[i]); @@ -2629,10 +2631,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, } /* unroll mapping in case something went wrong */ - if (ret) + if (ret) { iommu_unmap(domain, orig_iova, orig_size - size); - else + } else { trace_map(orig_iova, orig_paddr, orig_size); + iommu_debug_map(domain, orig_paddr, orig_size); + } return ret; } @@ -2694,6 +2698,8 @@ static size_t __iommu_unmap(struct iommu_domain *domain, pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); + iommu_debug_unmap_begin(domain, iova, size); + /* * Keep iterating until we either unmap 'size' bytes (or more) * or we hit an area that isn't mapped. @@ -2714,6 +2720,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, } trace_unmap(orig_iova, size, unmapped); + iommu_debug_unmap_end(domain, orig_iova, size, unmapped); return unmapped; } diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h index 83e64d70bf6c..a439d6815ca1 100644 --- a/include/linux/iommu-debug-pagealloc.h +++ b/include/linux/iommu-debug-pagealloc.h @@ -9,6 +9,7 @@ #define __LINUX_IOMMU_DEBUG_PAGEALLOC_H #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC +DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized); extern struct page_ext_operations page_iommu_debug_ops; -- cgit v1.2.3 From 7e84593795b3c05ca682b8aa74cceeceae636ec7 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:04 +0000 Subject: iommu: debug-pagealloc: Track IOMMU pages Using the new calls, use an atomic refcount to track how many times a page is mapped in any of the IOMMUs. For unmap we need to use iova_to_phys() to get the physical address of the pages. We use the smallest supported page size as the granularity of tracking per domain. This is important as it is possible to map pages and unmap them with larger sizes (as in map_sg()) cases. 
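A short worked example of that granularity rule (numbers are illustrative, not from the patch): if a domain advertises pgsize_bitmap = 0x40201000 (4K | 2M | 1G), the tracking granule is 1UL << __ffs(pgsize_bitmap) = 0x1000, i.e. 4K. Mapping one 2M block then increments 512 per-4K refcounts, and unmapping that IOVA range later, whatever page size the unmap uses, walks the same 512 granules and decrements them, so map and unmap stay balanced.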
Reviewed-by: Samiullah Khawaja Reviewed-by: Lu Baolu Signed-off-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-debug-pagealloc.c | 84 +++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c index 1d343421da98..9eb49e1230ee 100644 --- a/drivers/iommu/iommu-debug-pagealloc.c +++ b/drivers/iommu/iommu-debug-pagealloc.c @@ -29,19 +29,103 @@ struct page_ext_operations page_iommu_debug_ops = { .need = need_iommu_debug, }; +static struct page_ext *get_iommu_page_ext(phys_addr_t phys) +{ + struct page *page = phys_to_page(phys); + struct page_ext *page_ext = page_ext_get(page); + + return page_ext; +} + +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext) +{ + return page_ext_data(page_ext, &page_iommu_debug_ops); +} + +static void iommu_debug_inc_page(phys_addr_t phys) +{ + struct page_ext *page_ext = get_iommu_page_ext(phys); + struct iommu_debug_metadata *d = get_iommu_data(page_ext); + + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0); + page_ext_put(page_ext); +} + +static void iommu_debug_dec_page(phys_addr_t phys) +{ + struct page_ext *page_ext = get_iommu_page_ext(phys); + struct iommu_debug_metadata *d = get_iommu_data(page_ext); + + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0); + page_ext_put(page_ext); +} + +/* + * IOMMU page size doesn't have to match the CPU page size. So, we use + * the smallest IOMMU page size to refcount the pages in the vmemmap. + * That is important as both map and unmap has to use the same page size + * to update the refcount to avoid double counting the same page. + * And as we can't know from iommu_unmap() what was the original page size + * used for map, we just use the minimum supported one for both. + */ +static size_t iommu_debug_page_size(struct iommu_domain *domain) +{ + return 1UL << __ffs(domain->pgsize_bitmap); +} + void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size) { + size_t off, end; + size_t page_size = iommu_debug_page_size(domain); + + if (WARN_ON(!phys || check_add_overflow(phys, size, &end))) + return; + + for (off = 0 ; off < size ; off += page_size) { + if (!pfn_valid(__phys_to_pfn(phys + off))) + continue; + iommu_debug_inc_page(phys + off); + } +} + +static void __iommu_debug_update_iova(struct iommu_domain *domain, + unsigned long iova, size_t size, bool inc) +{ + size_t off, end; + size_t page_size = iommu_debug_page_size(domain); + + if (WARN_ON(check_add_overflow(iova, size, &end))) + return; + + for (off = 0 ; off < size ; off += page_size) { + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off); + + if (!phys || !pfn_valid(__phys_to_pfn(phys))) + continue; + + if (inc) + iommu_debug_inc_page(phys); + else + iommu_debug_dec_page(phys); + } } void __iommu_debug_unmap_begin(struct iommu_domain *domain, unsigned long iova, size_t size) { + __iommu_debug_update_iova(domain, iova, size, false); } void __iommu_debug_unmap_end(struct iommu_domain *domain, unsigned long iova, size_t size, size_t unmapped) { + if ((unmapped == size) || WARN_ON_ONCE(unmapped > size)) + return; + + /* If unmap failed, re-increment the refcount. 
*/ + __iommu_debug_update_iova(domain, iova + unmapped, + size - unmapped, true); } void iommu_debug_init(void) -- cgit v1.2.3 From a8258ffed2efdf533bdc756141eeb7bc5301ad4f Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 9 Jan 2026 17:18:05 +0000 Subject: iommu: debug-pagealloc: Check mapped/unmapped kernel memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now, as the page_ext holds count of IOMMU mappings, we can use it to assert that any page allocated/freed is indeed not in the IOMMU. The sanitizer doesn’t protect against mapping/unmapping during this period. However, that’s less harmful as the page is not used by the kernel. Reviewed-by: Samiullah Khawaja Reviewed-by: Lu Baolu Signed-off-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-debug-pagealloc.c | 23 +++++++++++++++++++++++ include/linux/iommu-debug-pagealloc.h | 14 ++++++++++++++ include/linux/mm.h | 5 +++++ 3 files changed, 42 insertions(+) diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c index 9eb49e1230ee..c080a38f45a4 100644 --- a/drivers/iommu/iommu-debug-pagealloc.c +++ b/drivers/iommu/iommu-debug-pagealloc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "iommu-priv.h" @@ -73,6 +74,28 @@ static size_t iommu_debug_page_size(struct iommu_domain *domain) return 1UL << __ffs(domain->pgsize_bitmap); } +static bool iommu_debug_page_count(const struct page *page) +{ + unsigned int ref; + struct page_ext *page_ext = page_ext_get(page); + struct iommu_debug_metadata *d = get_iommu_data(page_ext); + + ref = atomic_read(&d->ref); + page_ext_put(page_ext); + return ref != 0; +} + +void __iommu_debug_check_unmapped(const struct page *page, int numpages) +{ + while (numpages--) { + if (WARN_ON(iommu_debug_page_count(page))) { + pr_warn("iommu: Detected page leak!\n"); + dump_page_owner(page); + } + page++; + } +} + void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size) { size_t off, end; diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h index a439d6815ca1..46c3c1f70150 100644 --- a/include/linux/iommu-debug-pagealloc.h +++ b/include/linux/iommu-debug-pagealloc.h @@ -13,6 +13,20 @@ DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized); extern struct page_ext_operations page_iommu_debug_ops; +void __iommu_debug_check_unmapped(const struct page *page, int numpages); + +static inline void iommu_debug_check_unmapped(const struct page *page, int numpages) +{ + if (static_branch_unlikely(&iommu_debug_initialized)) + __iommu_debug_check_unmapped(page, numpages); +} + +#else +static inline void iommu_debug_check_unmapped(const struct page *page, + int numpages) +{ +} + #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ #endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f959d8ca4b4..32205d2a24b2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,6 +36,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4133,12 +4134,16 @@ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { + iommu_debug_check_unmapped(page, numpages); + if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { + 
iommu_debug_check_unmapped(page, numpages); + if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } -- cgit v1.2.3 From d2a0cac10597068567d336e85fa3cbdbe8ca62bf Mon Sep 17 00:00:00 2001 From: Ankit Soni Date: Mon, 1 Dec 2025 14:39:40 +0000 Subject: iommu/amd: move wait_on_sem() out of spinlock With iommu.strict=1, the existing completion wait path can cause soft lockups under stressed environment, as wait_on_sem() busy-waits under the spinlock with interrupts disabled. Move the completion wait in iommu_completion_wait() out of the spinlock. wait_on_sem() only polls the hardware-updated cmd_sem and does not require iommu->lock, so holding the lock during the busy wait unnecessarily increases contention and extends the time with interrupts disabled. Signed-off-by: Ankit Soni Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 5d45795c367a..858d1669fe6c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1185,7 +1185,12 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; - while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) { + /* + * cmd_sem holds a monotonically non-decreasing completion sequence + * number. + */ + while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 && + i < LOOP_TIMEOUT) { udelay(1); i += 1; } @@ -1437,14 +1442,13 @@ static int iommu_completion_wait(struct amd_iommu *iommu) raw_spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command_sync(iommu, &cmd, false); + raw_spin_unlock_irqrestore(&iommu->lock, flags); + if (ret) - goto out_unlock; + return ret; ret = wait_on_sem(iommu, data); -out_unlock: - raw_spin_unlock_irqrestore(&iommu->lock, flags); - return ret; } @@ -3121,13 +3125,18 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) raw_spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command_sync(iommu, &cmd, true); if (ret) - goto out; + goto out_err; ret = __iommu_queue_command_sync(iommu, &cmd2, false); if (ret) - goto out; + goto out_err; + raw_spin_unlock_irqrestore(&iommu->lock, flags); + wait_on_sem(iommu, data); -out: + return; + +out_err: raw_spin_unlock_irqrestore(&iommu->lock, flags); + return; } static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) -- cgit v1.2.3 From 16e3423fc755cf9ddcceefaf70869311046928e2 Mon Sep 17 00:00:00 2001 From: Can Peng Date: Tue, 9 Dec 2025 15:15:13 +0800 Subject: iommu: simplify list initialization in iommu_create_device_direct_mappings() Use LIST_HEAD() to declare and initialize the 'mappings' list head in iommu_create_device_direct_mappings() instead of separate declaration and INIT_LIST_HEAD(). This simplifies the code by combining declaration and initialization into a single idiomatic form, improving readability without changing functionality. Signed-off-by: Can Peng Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 585b13bcc8cf..4926a43118e6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1180,12 +1180,11 @@ static int iommu_create_device_direct_mappings(struct iommu_domain *domain, struct device *dev) { struct iommu_resv_region *entry; - struct list_head mappings; + LIST_HEAD(mappings); unsigned long pg_size; int ret = 0; pg_size = domain->pgsize_bitmap ? 
1UL << __ffs(domain->pgsize_bitmap) : 0; - INIT_LIST_HEAD(&mappings); if (WARN_ON_ONCE(iommu_is_dma_domain(domain) && !pg_size)) return -EINVAL; -- cgit v1.2.3 From 2e666595651ba02128b13e9029d5248cfc2fb702 Mon Sep 17 00:00:00 2001 From: Rakuram Eswaran Date: Mon, 22 Dec 2025 22:40:08 +0530 Subject: iommu/amd: Drop incorrect NULL check for iommu in alloc_irq_table() alloc_irq_table() contains a conditional check for a NULL iommu pointer when computing the NUMA node, but the function dereferences iommu in multiple places afterwards. All callers ensure that a valid iommu pointer is passed in, and a NULL iommu is not expected by the current callers. Remove the incorrect NULL check to make the assumptions consistent and address the Smatch warning. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202512191724.meqJENXe-lkp@intel.com/ Signed-off-by: Rakuram Eswaran Reviewed-by: Ankit Soni Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 858d1669fe6c..d7f457338de7 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3249,7 +3249,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, struct irq_remap_table *new_table = NULL; struct amd_iommu_pci_seg *pci_seg; unsigned long flags; - int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; + int nid = iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; u16 alias; spin_lock_irqsave(&iommu_table_lock, flags); -- cgit v1.2.3 From 7d8b06ecc45bd679dec58d2cc2bd86223d4e076d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:02 +0000 Subject: iommu/amd: Add support for hw_info for iommu capability query AMD IOMMU Extended Feature (EFR) and Extended Feature 2 (EFR2) registers specify features supported by each IOMMU hardware instance. The IOMMU driver checks each feature-specific bits before enabling each feature at run time. For IOMMUFD, the hypervisor passes the raw value of amd_iommu_efr and amd_iommu_efr2 to VMM via iommufd IOMMU_DEVICE_GET_HW_INFO ioctl. Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/Kconfig | 10 ++++++++++ drivers/iommu/amd/Makefile | 1 + drivers/iommu/amd/iommu.c | 2 ++ drivers/iommu/amd/iommufd.c | 31 +++++++++++++++++++++++++++++++ drivers/iommu/amd/iommufd.h | 15 +++++++++++++++ include/uapi/linux/iommufd.h | 28 ++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+) create mode 100644 drivers/iommu/amd/iommufd.c create mode 100644 drivers/iommu/amd/iommufd.h diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index f2acf471cb5d..588355ff7eb7 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -30,6 +30,16 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. +config AMD_IOMMU_IOMMUFD + bool "Enable IOMMUFD features for AMD IOMMU (EXPERIMENTAL)" + depends on IOMMUFD + depends on AMD_IOMMU + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. 
+ config AMD_IOMMU_DEBUGFS bool "Enable AMD IOMMU internals in DebugFS" depends on AMD_IOMMU && IOMMU_DEBUGFS diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 5412a563c697..41f053b49dce 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += iommu.o init.o quirks.o ppr.o pasid.o +obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d7f457338de7..d550a7e431ac 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -43,6 +43,7 @@ #include #include "amd_iommu.h" +#include "iommufd.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -3083,6 +3084,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, + .hw_info = amd_iommufd_hw_info, .blocked_domain = &blocked_domain, .release_domain = &blocked_domain, .identity_domain = &identity_domain.domain, diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c new file mode 100644 index 000000000000..72eaaa923d04 --- /dev/null +++ b/drivers/iommu/amd/iommufd.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include + +#include "iommufd.h" +#include "amd_iommu.h" +#include "amd_iommu_types.h" + +void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct iommu_hw_info_amd *hwinfo; + + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_AMD) + return ERR_PTR(-EOPNOTSUPP); + + hwinfo = kzalloc(sizeof(*hwinfo), GFP_KERNEL); + if (!hwinfo) + return ERR_PTR(-ENOMEM); + + *length = sizeof(*hwinfo); + *type = IOMMU_HW_INFO_TYPE_AMD; + + hwinfo->efr = amd_iommu_efr; + hwinfo->efr2 = amd_iommu_efr2; + + return hwinfo; +} diff --git a/drivers/iommu/amd/iommufd.h b/drivers/iommu/amd/iommufd.h new file mode 100644 index 000000000000..f880be80a30d --- /dev/null +++ b/drivers/iommu/amd/iommufd.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef AMD_IOMMUFD_H +#define AMD_IOMMUFD_H + +#if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD) +void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type); +#else +#define amd_iommufd_hw_info NULL +#endif /* CONFIG_AMD_IOMMU_IOMMUFD */ + +#endif /* AMD_IOMMUFD_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 2c41920b641d..3db37f6042a0 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -623,6 +623,32 @@ struct iommu_hw_info_tegra241_cmdqv { __u8 __reserved; }; +/** + * struct iommu_hw_info_amd - AMD IOMMU device info + * + * @efr : Value of AMD IOMMU Extended Feature Register (EFR) + * @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2) + * + * Please See description of these registers in the following sections of + * the AMD I/O Virtualization Technology (IOMMU) Specification. + * (https://docs.amd.com/v/u/en-US/48882_3.10_PUB) + * + * - MMIO Offset 0030h IOMMU Extended Feature Register + * - MMIO Offset 01A0h IOMMU Extended Feature 2 Register + * + * Note: The EFR and EFR2 are raw values reported by hardware. + * VMM is responsible to determine the appropriate flags to be exposed to + * the VM since cetertain features are not currently supported by the kernel + * for HW-vIOMMU. 
+ * + * Current VMM-allowed list of feature flags are: + * - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax] + */ +struct iommu_hw_info_amd { + __aligned_u64 efr; + __aligned_u64 efr2; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware @@ -632,6 +658,7 @@ struct iommu_hw_info_tegra241_cmdqv { * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM * SMMUv3) info type + * @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, @@ -639,6 +666,7 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, + IOMMU_HW_INFO_TYPE_AMD = 4, }; /** -- cgit v1.2.3 From 5335fc1657493f352fd90121fcb4e5ff2e2fc796 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:03 +0000 Subject: iommu/amd: Rename DEV_DOMID_MASK to DTE_DOMID_MASK Also change the define to use GENMASK_ULL instead. There is no functional change. Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/init.c | 2 +- drivers/iommu/amd/iommu.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 320733e7d8b4..14801d734684 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -358,7 +358,7 @@ #define DTE_FLAG_IOTLB BIT_ULL(32) #define DTE_FLAG_MASK (0x3ffULL << 32) -#define DEV_DOMID_MASK 0xffffULL +#define DTE_DOMID_MASK GENMASK_ULL(15, 0) #define DTE_GCR3_14_12 GENMASK_ULL(60, 58) #define DTE_GCR3_30_15 GENMASK_ULL(31, 16) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 384c90b4f90a..cfbc9ff105c3 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1179,7 +1179,7 @@ static bool __reuse_device_table(struct amd_iommu *iommu) for (devid = 0; devid <= pci_seg->last_bdf; devid++) { old_dev_tbl_entry = &pci_seg->old_dev_tbl_cpy[devid]; dte_v = FIELD_GET(DTE_FLAG_V, old_dev_tbl_entry->data[0]); - dom_id = FIELD_GET(DEV_DOMID_MASK, old_dev_tbl_entry->data[1]); + dom_id = FIELD_GET(DTE_DOMID_MASK, old_dev_tbl_entry->data[1]); if (!dte_v || !dom_id) continue; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d550a7e431ac..e3d9215244ea 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2120,7 +2120,7 @@ static void set_dte_entry(struct amd_iommu *iommu, if (dev_data->ats_enabled) new.data[1] |= DTE_FLAG_IOTLB; - old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK; + old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK; new.data[1] |= domid; /* -- cgit v1.2.3 From 11cfa782f01b2d988175011c19906180a62dd893 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:04 +0000 Subject: iommu/amd: Make amd_iommu_make_clear_dte() non-static inline This will be reused in a new nested.c file for nested translation. Also, remove unused function parameter ptr. 
Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 8 ++++++++ drivers/iommu/amd/iommu.c | 13 ++----------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index b742ef1adb35..8eb5e9857079 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -190,4 +190,12 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); +static inline void +amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *new) +{ + /* All existing DTE must have V bit set */ + new->data128[0] = DTE_FLAG_V; + new->data128[1] = 0; +} + #endif /* AMD_IOMMU_H */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e3d9215244ea..ae26fbe09023 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2015,14 +2015,6 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) return ret; } -static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr, - struct dev_table_entry *new) -{ - /* All existing DTE must have V bit set */ - new->data128[0] = DTE_FLAG_V; - new->data128[1] = 0; -} - /* * Note: * The old value for GCR3 table and GPT have been cleared from caller. @@ -2072,7 +2064,7 @@ static void set_dte_entry(struct amd_iommu *iommu, struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; struct pt_iommu_amdv1_hw_info pt_info; - make_clear_dte(dev_data, dte, &new); + amd_iommu_make_clear_dte(dev_data, &new); if (gcr3_info && gcr3_info->gcr3_tbl) domid = dev_data->gcr3_info.domid; @@ -2153,9 +2145,8 @@ static void set_dte_entry(struct amd_iommu *iommu, static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { struct dev_table_entry new = {}; - struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; - make_clear_dte(dev_data, dte, &new); + amd_iommu_make_clear_dte(dev_data, &new); update_dte256(iommu, dev_data, &new); } /* Update and flush DTE for the given device */ -- cgit v1.2.3 From 9b467a5af856dce9aba17bb6db3ed34ed79f5d90 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:05 +0000 Subject: iommu/amd: Introduce helper function amd_iommu_update_dte() Which includes the DTE update, clone_aliases, DTE flush and completion-wait commands, to avoid code duplication when reusing it to set up the DTE for nested translation. Also, make amd_iommu_update_dte() non-static so it can be reused in a new nested.c file for nested translation.
Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 4 ++++ drivers/iommu/amd/iommu.c | 24 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 8eb5e9857079..d97b9b6d76d3 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -190,6 +190,10 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); +void amd_iommu_update_dte(struct amd_iommu *iommu, + struct iommu_dev_data *dev_data, + struct dev_table_entry *new); + static inline void amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *new) { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index ae26fbe09023..2b7099e09852 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -76,6 +76,8 @@ static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, phys_addr_t top_paddr, unsigned int top_level); +static int device_flush_dte(struct iommu_dev_data *dev_data); + static void amd_iommu_change_top(struct pt_iommu *iommu_table, phys_addr_t top_paddr, unsigned int top_level); @@ -86,6 +88,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable); +static void clone_aliases(struct amd_iommu *iommu, struct device *dev); + +static int iommu_completion_wait(struct amd_iommu *iommu); + /**************************************************************************** * * Helper functions @@ -203,6 +209,16 @@ static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_da spin_unlock_irqrestore(&dev_data->dte_lock, flags); } +void amd_iommu_update_dte(struct amd_iommu *iommu, + struct iommu_dev_data *dev_data, + struct dev_table_entry *new) +{ + update_dte256(iommu, dev_data, new); + clone_aliases(iommu, dev_data->dev); + device_flush_dte(dev_data); + iommu_completion_wait(iommu); +} + static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, struct dev_table_entry *dte) { @@ -2127,7 +2143,7 @@ static void set_dte_entry(struct amd_iommu *iommu, set_dte_gcr3_table(iommu, dev_data, &new); - update_dte256(iommu, dev_data, &new); + amd_iommu_update_dte(iommu, dev_data, &new); /* * A kdump kernel might be replacing a domain ID that was copied from @@ -2147,7 +2163,7 @@ static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_ struct dev_table_entry new = {}; amd_iommu_make_clear_dte(dev_data, &new); - update_dte256(iommu, dev_data, &new); + amd_iommu_update_dte(iommu, dev_data, &new); } /* Update and flush DTE for the given device */ @@ -2159,10 +2175,6 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) set_dte_entry(iommu, dev_data, 0, 0); else clear_dte_entry(iommu, dev_data); - - clone_aliases(iommu, dev_data->dev); - device_flush_dte(dev_data); - iommu_completion_wait(iommu); } /* -- cgit v1.2.3 From e05698c10d980ac0a0b57ed81ec9353b9e9533c6 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:06 +0000 Subject: iommufd: Introduce data struct for AMD nested domain allocation Introduce IOMMU_HWPT_DATA_AMD_GUEST data type for IOMMU guest page table, which is used for stage-1 in nested translation. 
The data structure contains information necessary for setting up the AMD HW-vIOMMU support. Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- include/uapi/linux/iommufd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 3db37f6042a0..1dafbc552d37 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -465,16 +465,27 @@ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; }; +/** + * struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data + * (IOMMU_HWPT_DATA_AMD_GUEST) + * @dte: Guest Device Table Entry (DTE) + */ +struct iommu_hwpt_amd_guest { + __aligned_u64 dte[4]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + * @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE = 0, IOMMU_HWPT_DATA_VTD_S1 = 1, IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, + IOMMU_HWPT_DATA_AMD_GUEST = 3, }; /** -- cgit v1.2.3 From b2bb0573ddb2dcac7ebcd65708e172ce0a1de754 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:07 +0000 Subject: iommu/amd: Always enable GCR3TRPMode when supported. The GCR3TRPMode feature allows the DTE[GCR3TRP] field to be configured with GPA (instead of SPA). This simplifies the implementation, and is a pre-requisite for nested translation support. Therefore, always enable this feature if available. Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 2 ++ drivers/iommu/amd/init.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 14801d734684..d8753841cd1f 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -108,6 +108,7 @@ /* Extended Feature 2 Bits */ #define FEATURE_SEVSNPIO_SUP BIT_ULL(1) +#define FEATURE_GCR3TRPMODE BIT_ULL(3) #define FEATURE_SNPAVICSUP GENMASK_ULL(7, 5) #define FEATURE_SNPAVICSUP_GAM(x) \ (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1) @@ -186,6 +187,7 @@ #define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 +#define CONTROL_GCR3TRPMODE 58 #define CONTROL_IRTCACHEDIS 59 #define CONTROL_SNPAVIC_EN 61 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index cfbc9ff105c3..b1c344ed7dbd 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1122,6 +1122,14 @@ static void iommu_enable_gt(struct amd_iommu *iommu) return; iommu_feature_enable(iommu, CONTROL_GT_EN); + + /* + * This feature needs to be enabled prior to a call + * to iommu_snp_enable(). Since this function is called + * in early_enable_iommu(), it is safe to enable here. + */ + if (check_feature2(FEATURE_GCR3TRPMODE)) + iommu_feature_enable(iommu, CONTROL_GCR3TRPMODE); } /* sets a specific bit in the device table entry. 
*/ -- cgit v1.2.3 From b43a29def24f38a28085635e2be39566671976a8 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:08 +0000 Subject: iommu/amd: Add support for nest parent domain allocation To support nested translation, the nest parent domain is allocated with IOMMU_HWPT_ALLOC_NEST_PARENT flag, and stores information of the v1 page table for stage 2 (i.e. GPA->SPA). Also, only support nest parent domain on AMD system, which can support the Guest CR3 Table (GCR3TRPMode) feature. This feature is required in order to program DTE[GCR3 Table Root Pointer] with the GPA. Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2b7099e09852..a223387eec5c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2769,6 +2769,14 @@ static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, return &domain->domain; } +static inline bool is_nest_parent_supported(u32 flags) +{ + /* Only allow nest parent when these features are supported */ + return check_feature(FEATURE_GT) && + check_feature(FEATURE_GIOSUP) && + check_feature2(FEATURE_GCR3TRPMODE); +} + static struct iommu_domain * amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, const struct iommu_user_data *user_data) @@ -2776,16 +2784,28 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, { struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_PASID; + IOMMU_HWPT_ALLOC_PASID | + IOMMU_HWPT_ALLOC_NEST_PARENT; if ((flags & ~supported_flags) || user_data) return ERR_PTR(-EOPNOTSUPP); switch (flags & supported_flags) { case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: - /* Allocate domain with v1 page table for dirty tracking */ - if (!amd_iommu_hd_support(iommu)) + case IOMMU_HWPT_ALLOC_NEST_PARENT: + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: + /* + * Allocate domain with v1 page table for dirty tracking + * and/or Nest parent. + */ + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !amd_iommu_hd_support(iommu)) + break; + + if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && + !is_nest_parent_supported(flags)) break; + return amd_iommu_domain_alloc_paging_v1(dev, flags); case IOMMU_HWPT_ALLOC_PASID: /* Allocate domain with v2 page table if IOMMU supports PASID. */ -- cgit v1.2.3 From e113a72576d6056aa91925beaa7256533a808750 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:09 +0000 Subject: iommu/amd: Introduce struct amd_iommu_viommu Which stores reference to nested parent domain assigned during the call to struct iommu_ops.viommu_init(). Information in the nest parent is needed when setting up the nested translation. Note that the viommu initialization will be introduced in subsequent commit. 
Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 6 ++++++ drivers/iommu/amd/iommu.c | 2 ++ drivers/iommu/amd/iommufd.c | 16 ++++++++++++++++ drivers/iommu/amd/iommufd.h | 5 +++++ 4 files changed, 29 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index d8753841cd1f..d5b3393ab3a9 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -495,6 +496,11 @@ struct pdom_iommu_info { u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */ }; +struct amd_iommu_viommu { + struct iommufd_viommu core; + struct protection_domain *parent; /* nest parent domain for this viommu */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a223387eec5c..f6a6ec90248a 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3120,6 +3120,8 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .def_domain_type = amd_iommu_def_domain_type, .page_response = amd_iommu_page_response, + .get_viommu_size = amd_iommufd_get_viommu_size, + .viommu_init = amd_iommufd_viommu_init, }; #ifdef CONFIG_IRQ_REMAP diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c index 72eaaa923d04..eb6119bdcf12 100644 --- a/drivers/iommu/amd/iommufd.c +++ b/drivers/iommu/amd/iommufd.c @@ -29,3 +29,19 @@ void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type) return hwinfo; } + +size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type) +{ + return VIOMMU_STRUCT_SIZE(struct amd_iommu_viommu, core); +} + +int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct protection_domain *pdom = to_pdomain(parent); + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); + + aviommu->parent = pdom; + + return 0; +} diff --git a/drivers/iommu/amd/iommufd.h b/drivers/iommu/amd/iommufd.h index f880be80a30d..f05aad495b5b 100644 --- a/drivers/iommu/amd/iommufd.h +++ b/drivers/iommu/amd/iommufd.h @@ -8,8 +8,13 @@ #if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD) void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type); +size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type); +int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent, + const struct iommu_user_data *user_data); #else #define amd_iommufd_hw_info NULL +#define amd_iommufd_viommu_init NULL +#define amd_iommufd_get_viommu_size NULL #endif /* CONFIG_AMD_IOMMU_IOMMUFD */ #endif /* AMD_IOMMUFD_H */ -- cgit v1.2.3 From 774180a74abc89fd1389f51a6f93dbfcded365c2 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:10 +0000 Subject: iommu/amd: Add support for nested domain allocation The nested domain is allocated with IOMMU_DOMAIN_NESTED type to store stage-1 translation (i.e. GVA->GPA). This includes the GCR3 root pointer table along with guest page tables. The struct iommu_hwpt_amd_guest contains this information, and is passed from user-space as a parameter of the struct iommu_ops.domain_alloc_nested(). 
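For illustration, a VMM could supply the guest DTE through the generic iommufd allocation ioctl roughly as follows. This is only a sketch against the iommufd uAPI: the object IDs and DTE values are placeholders, and the choice of pt_id (the vIOMMU object) follows the generic iommufd rules for nested allocation rather than anything specific to this patch.

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	/* Sketch: allocate a nested HWPT carrying a guest DTE (placeholder values). */
	static int alloc_amd_nested_hwpt(int iommufd, uint32_t dev_id, uint32_t viommu_id,
					 const uint64_t gdte[4], uint32_t *out_hwpt_id)
	{
		struct iommu_hwpt_amd_guest data = {
			.dte = { gdte[0], gdte[1], gdte[2], gdte[3] },
		};
		struct iommu_hwpt_alloc cmd = {
			.size = sizeof(cmd),
			.dev_id = dev_id,
			.pt_id = viommu_id,	/* nested domains hang off the vIOMMU object */
			.data_type = IOMMU_HWPT_DATA_AMD_GUEST,
			.data_len = sizeof(data),
			.data_uptr = (uintptr_t)&data,
		};
		int ret = ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd);

		if (!ret)
			*out_hwpt_id = cmd.out_hwpt_id;
		return ret;
	}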
Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/Makefile | 2 +- drivers/iommu/amd/amd_iommu.h | 4 ++ drivers/iommu/amd/amd_iommu_types.h | 14 +++++ drivers/iommu/amd/nested.c | 110 ++++++++++++++++++++++++++++++++++++ 4 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/amd/nested.c diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 41f053b49dce..94b8ef2acb18 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += iommu.o init.o quirks.o ppr.o pasid.o -obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o +obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o nested.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index d97b9b6d76d3..aa29afe96e90 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -202,4 +202,8 @@ amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry new->data128[1] = 0; } +/* NESTED */ +struct iommu_domain * +amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); #endif /* AMD_IOMMU_H */ diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index d5b3393ab3a9..487ee6123de5 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -21,6 +21,8 @@ #include #include +#include + /* * Maximum number of IOMMUs supported */ @@ -353,6 +355,8 @@ #define DTE_FLAG_V BIT_ULL(0) #define DTE_FLAG_TV BIT_ULL(1) #define DTE_FLAG_HAD (3ULL << 7) +#define DTE_MODE_MASK GENMASK_ULL(11, 9) +#define DTE_HOST_TRP GENMASK_ULL(51, 12) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) #define DTE_GLX GENMASK_ULL(57, 56) @@ -501,6 +505,16 @@ struct amd_iommu_viommu { struct protection_domain *parent; /* nest parent domain for this viommu */ }; +/* + * Nested domain is specifically used for nested translation + */ +struct nested_domain { + struct iommu_domain domain; /* generic domain handle used by iommu core code */ + u16 gdom_id; /* domain ID from gDTE */ + struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */ + struct amd_iommu_viommu *viommu; /* AMD hw-viommu this nested domain belong to */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c new file mode 100644 index 000000000000..a8c0bb4dd733 --- /dev/null +++ b/drivers/iommu/amd/nested.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#define dev_fmt(fmt) "AMD-Vi: " fmt + +#include +#include + +#include "amd_iommu.h" + +static const struct iommu_domain_ops nested_domain_ops; + +static inline struct nested_domain *to_ndomain(struct iommu_domain *dom) +{ + return container_of(dom, struct nested_domain, domain); +} + +/* + * Validate guest DTE to make sure that configuration for host (v1) + * and guest (v2) page tables are valid when allocating nested domain. 
+ */ +static int validate_gdte_nested(struct iommu_hwpt_amd_guest *gdte) +{ + u32 gpt_level = FIELD_GET(DTE_GPT_LEVEL_MASK, gdte->dte[2]); + + /* Must be zero: Mode, Host-TPR */ + if (FIELD_GET(DTE_MODE_MASK, gdte->dte[0]) != 0 || + FIELD_GET(DTE_HOST_TRP, gdte->dte[0]) != 0) + return -EINVAL; + + /* GCR3 TRP must be non-zero if V, GV is set */ + if (FIELD_GET(DTE_FLAG_V, gdte->dte[0]) == 1 && + FIELD_GET(DTE_FLAG_GV, gdte->dte[0]) == 1 && + FIELD_GET(DTE_GCR3_14_12, gdte->dte[0]) == 0 && + FIELD_GET(DTE_GCR3_30_15, gdte->dte[1]) == 0 && + FIELD_GET(DTE_GCR3_51_31, gdte->dte[1]) == 0) + return -EINVAL; + + /* Valid Guest Paging Mode values are 0 and 1 */ + if (gpt_level != GUEST_PGTABLE_4_LEVEL && + gpt_level != GUEST_PGTABLE_5_LEVEL) + return -EINVAL; + + /* GLX = 3 is reserved */ + if (FIELD_GET(DTE_GLX, gdte->dte[0]) == 3) + return -EINVAL; + + /* + * We need to check host capability before setting + * the Guest Paging Mode + */ + if (gpt_level == GUEST_PGTABLE_5_LEVEL && + amd_iommu_gpt_level < PAGE_MODE_5_LEVEL) + return -EOPNOTSUPP; + + return 0; +} + +/* + * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested() + * during the call to struct iommu_ops.viommu_init(). + */ +struct iommu_domain * +amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + int ret; + struct nested_domain *ndom; + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); + + if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST) + return ERR_PTR(-EOPNOTSUPP); + + ndom = kzalloc(sizeof(*ndom), GFP_KERNEL); + if (!ndom) + return ERR_PTR(-ENOMEM); + + ret = iommu_copy_struct_from_user(&ndom->gdte, user_data, + IOMMU_HWPT_DATA_AMD_GUEST, + dte); + if (ret) + goto out_err; + + ret = validate_gdte_nested(&ndom->gdte); + if (ret) + goto out_err; + + ndom->gdom_id = FIELD_GET(DTE_DOMID_MASK, ndom->gdte.dte[1]); + ndom->domain.ops = &nested_domain_ops; + ndom->domain.type = IOMMU_DOMAIN_NESTED; + ndom->viommu = aviommu; + + return &ndom->domain; +out_err: + kfree(ndom); + return ERR_PTR(ret); +} + +static void nested_domain_free(struct iommu_domain *dom) +{ + struct nested_domain *ndom = to_ndomain(dom); + + kfree(ndom); +} + +static const struct iommu_domain_ops nested_domain_ops = { + .free = nested_domain_free, +}; -- cgit v1.2.3 From 757d2b1fdf5b7d6eead5963a49b5780617987ab8 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:11 +0000 Subject: iommu/amd: Introduce gDomID-to-hDomID Mapping and handle parent domain invalidation Each nested domain is assigned guest domain ID (gDomID), which guest OS programs into guest Device Table Entry (gDTE). For each gDomID, the driver assigns a corresponding host domain ID (hDomID), which will be programmed into the host Device Table Entry (hDTE). The hDomID is allocated during amd_iommu_alloc_domain_nested(), and free during nested_domain_free(). The gDomID-to-hDomID mapping info (struct guest_domain_mapping_info) is stored in a per-viommu xarray (struct amd_iommu_viommu.gdomid_array), which is indexed by gDomID. Note also that parent domain can be shared among struct iommufd_viommu. Therefore, when hypervisor invalidates the nest parent domain, the AMD IOMMU command INVALIDATE_IOMMU_PAGES must be issued for each hDomID in the gdomid_array. This is handled by the iommu_flush_pages_v1_hdom_ids(), where it iterates through struct protection_domain.viommu_list. 
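As a worked example (all IDs arbitrary): if two nested domains are created with gDomID 5 and one with gDomID 9 under the same vIOMMU, the per-vIOMMU xarray would hold

	gDomID   hDomID   users
	     5       42       2
	     9       43       1

and an invalidation of the shared nest parent then issues INVALIDATE_IOMMU_PAGES once for hDomID 42 and once for hDomID 43.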
Suggested-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 23 ++++++++ drivers/iommu/amd/iommu.c | 38 ++++++++++++ drivers/iommu/amd/iommufd.c | 31 ++++++++++ drivers/iommu/amd/nested.c | 111 ++++++++++++++++++++++++++++++++++++ 4 files changed, 203 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 487ee6123de5..4a98ac7dca0f 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -503,6 +503,22 @@ struct pdom_iommu_info { struct amd_iommu_viommu { struct iommufd_viommu core; struct protection_domain *parent; /* nest parent domain for this viommu */ + struct list_head pdom_list; /* For protection_domain->viommu_list */ + + /* + * Per-vIOMMU guest domain ID to host domain ID mapping. + * Indexed by guest domain ID. + */ + struct xarray gdomid_array; +}; + +/* + * Contains guest domain ID mapping info, + * which is stored in the struct xarray gdomid_array. + */ +struct guest_domain_mapping_info { + refcount_t users; + u32 hdom_id; /* Host domain ID */ }; /* @@ -511,6 +527,7 @@ struct amd_iommu_viommu { struct nested_domain { struct iommu_domain domain; /* generic domain handle used by iommu core code */ u16 gdom_id; /* domain ID from gDTE */ + struct guest_domain_mapping_info *gdom_info; struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */ struct amd_iommu_viommu *viommu; /* AMD hw-viommu this nested domain belong to */ }; @@ -535,6 +552,12 @@ struct protection_domain { struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ + + /* + * Store reference to list of vIOMMUs, which use this protection domain. + * This will be used to look up host domain ID when flushing this domain. + */ + struct list_head viommu_list; }; PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f6a6ec90248a..9d2c88aa5c5f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1543,6 +1543,32 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) iommu_completion_wait(iommu); } +static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size) +{ + int ret = 0; + struct amd_iommu_viommu *aviommu; + + list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) { + unsigned long i; + struct guest_domain_mapping_info *gdom_info; + struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, + struct amd_iommu, iommu); + + xa_lock(&aviommu->gdomid_array); + xa_for_each(&aviommu->gdomid_array, i, gdom_info) { + struct iommu_cmd cmd; + + pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__, + iommu->devid, gdom_info->hdom_id); + build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id, + IOMMU_NO_PASID, false); + ret |= iommu_queue_command(iommu, &cmd); + } + xa_unlock(&aviommu->gdomid_array); + } + return ret; +} + static void amd_iommu_flush_all(struct amd_iommu *iommu) { struct iommu_cmd cmd; @@ -1691,6 +1717,17 @@ static int domain_flush_pages_v1(struct protection_domain *pdom, ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); } + /* + * A domain w/ v1 table can be a nest parent, which can have + * multiple nested domains. Each nested domain has 1:1 mapping + * between gDomID and hDomID. 
Therefore, flush every hDomID + * associated to this nest parent domain. + * + * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested() + */ + if (!list_empty(&pdom->viommu_list)) + ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size); + return ret; } @@ -2508,6 +2545,7 @@ static void protection_domain_init(struct protection_domain *domain) spin_lock_init(&domain->lock); INIT_LIST_HEAD(&domain->dev_list); INIT_LIST_HEAD(&domain->dev_data_list); + INIT_LIST_HEAD(&domain->viommu_list); xa_init(&domain->iommu_array); } diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c index eb6119bdcf12..2e50633d9c72 100644 --- a/drivers/iommu/amd/iommufd.c +++ b/drivers/iommu/amd/iommufd.c @@ -9,6 +9,8 @@ #include "amd_iommu.h" #include "amd_iommu_types.h" +static const struct iommufd_viommu_ops amd_viommu_ops; + void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type) { struct iommu_hw_info_amd *hwinfo; @@ -38,10 +40,39 @@ size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type vi int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent, const struct iommu_user_data *user_data) { + unsigned long flags; struct protection_domain *pdom = to_pdomain(parent); struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); + xa_init_flags(&aviommu->gdomid_array, XA_FLAGS_ALLOC1); aviommu->parent = pdom; + viommu->ops = &amd_viommu_ops; + + spin_lock_irqsave(&pdom->lock, flags); + list_add(&aviommu->pdom_list, &pdom->viommu_list); + spin_unlock_irqrestore(&pdom->lock, flags); + return 0; } + +static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu) +{ + unsigned long flags; + struct amd_iommu *iommu = container_of(viommu->iommu_dev, struct amd_iommu, iommu); + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); + struct protection_domain *pdom = aviommu->parent; + + spin_lock_irqsave(&pdom->lock, flags); + list_del(&aviommu->pdom_list); + spin_unlock_irqrestore(&pdom->lock, flags); + xa_destroy(&aviommu->gdomid_array); +} + +/* + * See include/linux/iommufd.h + * struct iommufd_viommu_ops - vIOMMU specific operations + */ +static const struct iommufd_viommu_ops amd_viommu_ops = { + .destroy = amd_iommufd_viommu_destroy, +}; diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c index a8c0bb4dd733..8154a773eed8 100644 --- a/drivers/iommu/amd/nested.c +++ b/drivers/iommu/amd/nested.c @@ -6,6 +6,7 @@ #define dev_fmt(fmt) "AMD-Vi: " fmt #include +#include #include #include "amd_iommu.h" @@ -58,6 +59,33 @@ static int validate_gdte_nested(struct iommu_hwpt_amd_guest *gdte) return 0; } +static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index) +{ + struct guest_domain_mapping_info *elm, *res; + + elm = xa_load(xa, index); + if (elm) + return elm; + + xa_unlock(xa); + elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL); + xa_lock(xa); + if (!elm) + return ERR_PTR(-ENOMEM); + + res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) + res = ERR_PTR(xa_err(res)); + + if (res) { + kfree(elm); + return res; + } + + refcount_set(&elm->users, 0); + return elm; +} + /* * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested() * during the call to struct iommu_ops.viommu_init(). 
@@ -68,6 +96,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, { int ret; struct nested_domain *ndom; + struct guest_domain_mapping_info *gdom_info; struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST) @@ -92,7 +121,63 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, ndom->domain.type = IOMMU_DOMAIN_NESTED; ndom->viommu = aviommu; + /* + * Normally, when a guest has multiple pass-through devices, + * the IOMMU driver setup DTEs with the same stage-2 table and + * use the same host domain ID (hDomId). In case of nested translation, + * if the guest setup different stage-1 tables with same PASID, + * IOMMU would use the same TLB tag. This will results in TLB + * aliasing issue. + * + * The guest is assigning gDomIDs based on its own algorithm for managing + * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain + * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID + * to a single hDomID. This is done using an xarray in the vIOMMU to + * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES + * command must be issued for each hDomID in the xarray. + */ + xa_lock(&aviommu->gdomid_array); + + gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id); + if (IS_ERR(gdom_info)) { + xa_unlock(&aviommu->gdomid_array); + ret = PTR_ERR(gdom_info); + goto out_err; + } + + /* Check if gDomID exist */ + if (refcount_inc_not_zero(&gdom_info->users)) { + ndom->gdom_info = gdom_info; + xa_unlock(&aviommu->gdomid_array); + + pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n", + __func__, ndom->gdom_id, gdom_info->hdom_id); + + return &ndom->domain; + } + + /* The gDomID does not exist. We allocate new hdom_id */ + gdom_info->hdom_id = amd_iommu_pdom_id_alloc(); + if (gdom_info->hdom_id <= 0) { + __xa_cmpxchg(&aviommu->gdomid_array, + ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC); + xa_unlock(&aviommu->gdomid_array); + ret = -ENOSPC; + goto out_err_gdom_info; + } + + ndom->gdom_info = gdom_info; + refcount_set(&gdom_info->users, 1); + + xa_unlock(&aviommu->gdomid_array); + + pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n", + __func__, ndom->gdom_id, gdom_info->hdom_id); + return &ndom->domain; + +out_err_gdom_info: + kfree(gdom_info); out_err: kfree(ndom); return ERR_PTR(ret); @@ -100,8 +185,34 @@ out_err: static void nested_domain_free(struct iommu_domain *dom) { + struct guest_domain_mapping_info *curr; struct nested_domain *ndom = to_ndomain(dom); + struct amd_iommu_viommu *aviommu = ndom->viommu; + + xa_lock(&aviommu->gdomid_array); + + if (!refcount_dec_and_test(&ndom->gdom_info->users)) { + xa_unlock(&aviommu->gdomid_array); + return; + } + + /* + * The refcount for the gdom_id to hdom_id mapping is zero. + * It is now safe to remove the mapping. 
+ */ + curr = __xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id, + ndom->gdom_info, NULL, GFP_ATOMIC); + + xa_unlock(&aviommu->gdomid_array); + if (WARN_ON(!curr || xa_err(curr))) + return; + + /* success */ + pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n", + __func__, ndom->gdom_id, curr->hdom_id); + amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id); + kfree(curr); kfree(ndom); } -- cgit v1.2.3 From 4e1b09d90bdfc79b2a024121a1f3b5e41c80de81 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:12 +0000 Subject: iommu/amd: Refactor persistent DTE bits programming into amd_iommu_make_clear_dte() To help avoid duplicate logic when programing DTE for nested translation. Note that this commit changes behavior of when the IOMMU driver is switching domain during attach and the blocking domain, where DTE bit fields for interrupt pass-through (i.e. Lint0, Lint1, NMI, INIT, ExtInt) and System management message could be affected. These DTE bits are specified in the IVRS table for specific devices, and should be persistent. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 13 +++++++++++++ drivers/iommu/amd/iommu.c | 11 ----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index aa29afe96e90..00fc9c6073de 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -197,9 +197,22 @@ void amd_iommu_update_dte(struct amd_iommu *iommu, static inline void amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *new) { + struct dev_table_entry *initial_dte; + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); + /* All existing DTE must have V bit set */ new->data128[0] = DTE_FLAG_V; new->data128[1] = 0; + + /* + * Restore cached persistent DTE bits, which can be set by information + * in IVRS table. See set_dev_entry_from_acpi(). + */ + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); + if (initial_dte) { + new->data128[0] |= initial_dte->data128[0]; + new->data128[1] |= initial_dte->data128[1]; + } } /* NESTED */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 9d2c88aa5c5f..debc33cd4bea 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2110,7 +2110,6 @@ static void set_dte_entry(struct amd_iommu *iommu, { u16 domid; u32 old_domid; - struct dev_table_entry *initial_dte; struct dev_table_entry new = {}; struct protection_domain *domain = dev_data->domain; struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; @@ -2168,16 +2167,6 @@ static void set_dte_entry(struct amd_iommu *iommu, old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK; new.data[1] |= domid; - /* - * Restore cached persistent DTE bits, which can be set by information - * in IVRS table. See set_dev_entry_from_acpi(). 
- */ - initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); - if (initial_dte) { - new.data128[0] |= initial_dte->data128[0]; - new.data128[1] |= initial_dte->data128[1]; - } - set_dte_gcr3_table(iommu, dev_data, &new); amd_iommu_update_dte(iommu, dev_data, &new); -- cgit v1.2.3 From 93eee2a49c1b33b73921a915a1ec47845bd60551 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:13 +0000 Subject: iommu/amd: Refactor logic to program the host page table in DTE Introduce the amd_iommu_set_dte_v1() helper function to configure the IOMMU host (v1) page table in the DTE. This will be used later when attaching a nested domain. Also, remove the obsolete warning when SNP is enabled and the domain ID is zero, since this check is no longer applicable. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 4 ++++ drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/iommu.c | 150 ++++++++++++++++++------------------ 3 files changed, 82 insertions(+), 73 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 00fc9c6073de..02f10922f70b 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -190,6 +190,10 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); +void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data, + struct protection_domain *domain, u16 domid, + struct pt_iommu_amdv1_hw_info *pt_info, + struct dev_table_entry *new); void amd_iommu_update_dte(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, struct dev_table_entry *new); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 4a98ac7dca0f..cfcbad6c28ff 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -357,6 +357,7 @@ #define DTE_FLAG_HAD (3ULL << 7) #define DTE_MODE_MASK GENMASK_ULL(11, 9) #define DTE_HOST_TRP GENMASK_ULL(51, 12) +#define DTE_FLAG_PPR BIT_ULL(52) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) #define DTE_GLX GENMASK_ULL(57, 56) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index debc33cd4bea..285ae635c324 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2072,102 +2072,106 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) * Note: * The old value for GCR3 table and GPT have been cleared from caller. */ -static void set_dte_gcr3_table(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data, - struct dev_table_entry *target) +static void set_dte_gcr3_table(struct iommu_dev_data *dev_data, + struct dev_table_entry *new) { struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; - u64 gcr3; + u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); - if (!gcr3_info->gcr3_tbl) - return; - - pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n", - __func__, dev_data->devid, gcr3_info->glx, - (unsigned long long)gcr3_info->gcr3_tbl); - - gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); + new->data[0] |= DTE_FLAG_TV | + (dev_data->ppr ? DTE_FLAG_PPR : 0) | + (pdom_is_v2_pgtbl_mode(dev_data->domain) ?
DTE_FLAG_GIOV : 0) | + DTE_FLAG_GV | + FIELD_PREP(DTE_GLX, gcr3_info->glx) | + FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) | + DTE_FLAG_IR | DTE_FLAG_IW; - target->data[0] |= DTE_FLAG_GV | - FIELD_PREP(DTE_GLX, gcr3_info->glx) | - FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12); - if (pdom_is_v2_pgtbl_mode(dev_data->domain)) - target->data[0] |= DTE_FLAG_GIOV; - - target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | - FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); + new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid) | + FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | + (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) | + FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); /* Guest page table can only support 4 and 5 levels */ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) - target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); + new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); else - target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); + new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); +} + +void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data, + struct protection_domain *domain, u16 domid, + struct pt_iommu_amdv1_hw_info *pt_info, + struct dev_table_entry *new) +{ + u64 host_pt_root = __sme_set(pt_info->host_pt_root); + + /* Note Dirty tracking is used for v1 table only for now */ + new->data[0] |= DTE_FLAG_TV | + FIELD_PREP(DTE_MODE_MASK, pt_info->mode) | + (domain->dirty_tracking ? DTE_FLAG_HAD : 0) | + FIELD_PREP(DTE_HOST_TRP, host_pt_root >> 12) | + DTE_FLAG_IR | DTE_FLAG_IW; + + new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid) | + (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0); +} + +static void set_dte_v1(struct iommu_dev_data *dev_data, + struct protection_domain *domain, u16 domid, + phys_addr_t top_paddr, unsigned int top_level, + struct dev_table_entry *new) +{ + struct pt_iommu_amdv1_hw_info pt_info; + + /* + * When updating the IO pagetable, the new top and level + * are provided as parameters. For other operations i.e. + * device attach, retrieve the current pagetable info + * via the IOMMU PT API. + */ + if (top_paddr) { + pt_info.host_pt_root = top_paddr; + pt_info.mode = top_level + 1; + } else { + WARN_ON(top_paddr || top_level); + pt_iommu_amdv1_hw_info(&domain->amdv1, &pt_info); + } + + amd_iommu_set_dte_v1(dev_data, domain, domid, &pt_info, new); +} + +static void set_dte_passthrough(struct iommu_dev_data *dev_data, + struct protection_domain *domain, + struct dev_table_entry *new) +{ + new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW; + + new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) | + (dev_data->ats_enabled) ? DTE_FLAG_IOTLB : 0; } static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, phys_addr_t top_paddr, unsigned int top_level) { - u16 domid; u32 old_domid; struct dev_table_entry new = {}; struct protection_domain *domain = dev_data->domain; struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; - struct pt_iommu_amdv1_hw_info pt_info; amd_iommu_make_clear_dte(dev_data, &new); - if (gcr3_info && gcr3_info->gcr3_tbl) - domid = dev_data->gcr3_info.domid; - else { - domid = domain->id; - - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { - /* - * When updating the IO pagetable, the new top and level - * are provided as parameters. For other operations i.e. - * device attach, retrieve the current pagetable info - * via the IOMMU PT API. 
- */ - if (top_paddr) { - pt_info.host_pt_root = top_paddr; - pt_info.mode = top_level + 1; - } else { - WARN_ON(top_paddr || top_level); - pt_iommu_amdv1_hw_info(&domain->amdv1, - &pt_info); - } - - new.data[0] |= __sme_set(pt_info.host_pt_root) | - (pt_info.mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; - } - } - - new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; - - /* - * When SNP is enabled, we can only support TV=1 with non-zero domain ID. - * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in - * do_iommu_domain_alloc(). - */ - WARN_ON(amd_iommu_snp_en && (domid == 0)); - new.data[0] |= DTE_FLAG_TV; - - if (dev_data->ppr) - new.data[0] |= 1ULL << DEV_ENTRY_PPR; - - if (domain->dirty_tracking) - new.data[0] |= DTE_FLAG_HAD; - - if (dev_data->ats_enabled) - new.data[1] |= DTE_FLAG_IOTLB; - old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK; - new.data[1] |= domid; - - set_dte_gcr3_table(iommu, dev_data, &new); + if (gcr3_info->gcr3_tbl) + set_dte_gcr3_table(dev_data, &new); + else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) + set_dte_passthrough(dev_data, domain, &new); + else if ((domain->domain.type & __IOMMU_DOMAIN_PAGING) && + domain->pd_mode == PD_MODE_V1) + set_dte_v1(dev_data, domain, domain->id, top_paddr, top_level, &new); + else + WARN_ON(true); amd_iommu_update_dte(iommu, dev_data, &new); -- cgit v1.2.3 From 103f4e7c8544961c72dd9f68eed384a2f1194d03 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:14 +0000 Subject: iommu/amd: Add support for nested domain attach/detach Introduce set_dte_nested() to program guest translation settings in the host DTE when attaching the nested domain to a device. Reviewed-by: Nicolin Chen Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/nested.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c index 8154a773eed8..66cc36133c8b 100644 --- a/drivers/iommu/amd/nested.c +++ b/drivers/iommu/amd/nested.c @@ -183,6 +183,78 @@ out_err: return ERR_PTR(ret); } +static void set_dte_nested(struct amd_iommu *iommu, struct iommu_domain *dom, + struct iommu_dev_data *dev_data, struct dev_table_entry *new) +{ + struct protection_domain *parent; + struct nested_domain *ndom = to_ndomain(dom); + struct iommu_hwpt_amd_guest *gdte = &ndom->gdte; + struct pt_iommu_amdv1_hw_info pt_info; + + /* + * The nest parent domain is attached during the call to the + * struct iommu_ops.viommu_init(), which will be stored as part + * of the struct amd_iommu_viommu.parent. + */ + if (WARN_ON(!ndom->viommu || !ndom->viommu->parent)) + return; + + parent = ndom->viommu->parent; + amd_iommu_make_clear_dte(dev_data, new); + + /* Retrieve the current pagetable info via the IOMMU PT API. */ + pt_iommu_amdv1_hw_info(&parent->amdv1, &pt_info); + + /* + * Use domain ID from nested domain to program DTE. + * See amd_iommu_alloc_domain_nested().
+ */ + amd_iommu_set_dte_v1(dev_data, parent, ndom->gdom_info->hdom_id, + &pt_info, new); + + /* GV is required for nested page table */ + new->data[0] |= DTE_FLAG_GV; + + /* Guest PPR */ + new->data[0] |= gdte->dte[0] & DTE_FLAG_PPR; + + /* Guest translation stuff */ + new->data[0] |= gdte->dte[0] & (DTE_GLX | DTE_FLAG_GIOV); + + /* GCR3 table */ + new->data[0] |= gdte->dte[0] & DTE_GCR3_14_12; + new->data[1] |= gdte->dte[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31); + + /* Guest paging mode */ + new->data[2] |= gdte->dte[2] & DTE_GPT_LEVEL_MASK; +} + +static int nested_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) +{ + struct dev_table_entry new = {0}; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + int ret = 0; + + /* + * Needs to make sure PASID is not enabled + * for this attach path. + */ + if (WARN_ON(dev_data->pasid_enabled)) + return -EINVAL; + + mutex_lock(&dev_data->mutex); + + set_dte_nested(iommu, dom, dev_data, &new); + + amd_iommu_update_dte(iommu, dev_data, &new); + + mutex_unlock(&dev_data->mutex); + + return ret; +} + static void nested_domain_free(struct iommu_domain *dom) { struct guest_domain_mapping_info *curr; @@ -217,5 +289,6 @@ static void nested_domain_free(struct iommu_domain *dom) } static const struct iommu_domain_ops nested_domain_ops = { + .attach_dev = nested_attach_device, .free = nested_domain_free, }; -- cgit v1.2.3 From e2692c4eeaa4bd945b7bae156b4cac55d6a0c730 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 26 Jan 2026 03:19:20 +0800 Subject: iommupt: Do not set C-bit on MMIO backed PTEs AMD Secure Memory Encryption (SME) marks individual memory pages as encrypted by setting the C-bit in page table entries. According to the AMD APM,any pages corresponding to MMIO addresses must be configured with the C-bit clear. The current *_iommu_set_prot() implementation sets the C-bit on all PTEs in the IOMMU page tables. This is incorrect for PTEs backed by MMIO, and can break PCIe peer-to-peer communication when IOVA is used. Fix this by avoiding the C-bit for MMIO-backed mappings. For amdv2 IOMMU page tables, there is a usage scenario for GVA->GPA mappings, and for the trusted MMIO in the TEE-IO case, the C-bit will need to be added to GPA. However, SNP guests do not yet support vIOMMU, and the trusted MMIO support is not ready in upstream. Adding the C-bit for trusted MMIO can be considered once those features land. Fixes: 879ced2bab1b ("iommupt: Add the AMD IOMMU v1 page table format") Fixes: aef5de756ea8 ("iommupt: Add the x86 64 bit page table format") Suggested-by: Jason Gunthorpe Signed-off-by: Wei Wang Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/fmt/amdv1.h | 3 ++- drivers/iommu/generic_pt/fmt/x86_64.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h index aa8e1a8ec95f..3b2c41d9654d 100644 --- a/drivers/iommu/generic_pt/fmt/amdv1.h +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -354,7 +354,8 @@ static inline int amdv1pt_iommu_set_prot(struct pt_common *common, * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to * control this. For now if the tables use sme_set then so do the ptes. 
*/ - if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES) && + !(iommu_prot & IOMMU_MMIO)) pte = __sme_set(pte); attrs->descriptor_bits = pte; diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h index 210748d9d6e8..ed9a47cbb6e0 100644 --- a/drivers/iommu/generic_pt/fmt/x86_64.h +++ b/drivers/iommu/generic_pt/fmt/x86_64.h @@ -227,7 +227,8 @@ static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to * control this. For now if the tables use sme_set then so do the ptes. */ - if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) && + !(iommu_prot & IOMMU_MMIO)) pte = __sme_set(pte); attrs->descriptor_bits = pte; -- cgit v1.2.3 From 2e2f6b0ef8551bf3bd8255729d27e3ad9451e562 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Sun, 18 Jan 2026 10:08:08 +0000 Subject: rust: iommu: add io_pgtable abstraction This will be used by the Tyr driver to create and modify the page table of each address space on the GPU. Each time a mapping gets created or removed by userspace, Tyr will call into GPUVM, which will figure out which calls to map_pages and unmap_pages are required to map the data in question in the page table so that the GPU may access those pages when using that address space. The Rust type wraps the struct using a raw pointer rather than the usual Opaque+ARef approach because Opaque+ARef requires the target type to be refcounted. Signed-off-by: Asahi Lina Acked-by: Boris Brezillon Reviewed-by: Daniel Almeida Tested-by: Deborah Brouwer Co-developed-by: Alice Ryhl Signed-off-by: Alice Ryhl Reviewed-by: Gary Guo Reviewed-by: Danilo Krummrich [joro: Fixed up Rust import style] Signed-off-by: Joerg Roedel --- MAINTAINERS | 1 + rust/bindings/bindings_helper.h | 3 +- rust/kernel/iommu/mod.rs | 5 + rust/kernel/iommu/pgtable.rs | 279 ++++++++++++++++++++++++++++++++++++++++ rust/kernel/lib.rs | 1 + 5 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 rust/kernel/iommu/mod.rs create mode 100644 rust/kernel/iommu/pgtable.rs diff --git a/MAINTAINERS b/MAINTAINERS index 765ad2daa218..354257f69a01 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13241,6 +13241,7 @@ F: drivers/iommu/ F: include/linux/iommu.h F: include/linux/iova.h F: include/linux/of_iommu.h +F: rust/kernel/iommu/ IOMMUFD M: Jason Gunthorpe diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index a067038b4b42..1b05a5e4cfb4 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -56,9 +56,10 @@ #include #include #include -#include #include #include +#include +#include #include #include #include diff --git a/rust/kernel/iommu/mod.rs b/rust/kernel/iommu/mod.rs new file mode 100644 index 000000000000..1423d7b19b57 --- /dev/null +++ b/rust/kernel/iommu/mod.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust support related to IOMMU. + +pub mod pgtable; diff --git a/rust/kernel/iommu/pgtable.rs b/rust/kernel/iommu/pgtable.rs new file mode 100644 index 000000000000..6135ba141e48 --- /dev/null +++ b/rust/kernel/iommu/pgtable.rs @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! IOMMU page table management. +//! +//! 
C header: [`include/io-pgtable.h`](srctree/include/io-pgtable.h) + +use core::{ + marker::PhantomData, + ptr::NonNull, // +}; + +use crate::{ + alloc, + bindings, + device::{ + Bound, + Device // + }, + devres::Devres, + error::to_result, + io::PhysAddr, + prelude::*, // +}; + +use bindings::io_pgtable_fmt; + +/// Protection flags used with IOMMU mappings. +pub mod prot { + /// Read access. + pub const READ: u32 = bindings::IOMMU_READ; + /// Write access. + pub const WRITE: u32 = bindings::IOMMU_WRITE; + /// Request cache coherency. + pub const CACHE: u32 = bindings::IOMMU_CACHE; + /// Request no-execute permission. + pub const NOEXEC: u32 = bindings::IOMMU_NOEXEC; + /// MMIO peripheral mapping. + pub const MMIO: u32 = bindings::IOMMU_MMIO; + /// Privileged mapping. + pub const PRIVILEGED: u32 = bindings::IOMMU_PRIV; +} + +/// Represents a requested `io_pgtable` configuration. +pub struct Config { + /// Quirk bitmask (type-specific). + pub quirks: usize, + /// Valid page sizes, as a bitmask of powers of two. + pub pgsize_bitmap: usize, + /// Input address space size in bits. + pub ias: u32, + /// Output address space size in bits. + pub oas: u32, + /// IOMMU uses coherent accesses for page table walks. + pub coherent_walk: bool, +} + +/// An io page table using a specific format. +/// +/// # Invariants +/// +/// The pointer references a valid io page table. +pub struct IoPageTable { + ptr: NonNull, + _marker: PhantomData, +} + +// SAFETY: `struct io_pgtable_ops` is not restricted to a single thread. +unsafe impl Send for IoPageTable {} +// SAFETY: `struct io_pgtable_ops` may be accessed concurrently. +unsafe impl Sync for IoPageTable {} + +/// The format used by this page table. +pub trait IoPageTableFmt: 'static { + /// The value representing this format. + const FORMAT: io_pgtable_fmt; +} + +impl IoPageTable { + /// Create a new `IoPageTable` as a device resource. + #[inline] + pub fn new( + dev: &Device, + config: Config, + ) -> impl PinInit>, Error> + '_ { + // SAFETY: Devres ensures that the value is dropped during device unbind. + Devres::new(dev, unsafe { Self::new_raw(dev, config) }) + } + + /// Create a new `IoPageTable`. + /// + /// # Safety + /// + /// If successful, then the returned `IoPageTable` must be dropped before the device is + /// unbound. + #[inline] + pub unsafe fn new_raw(dev: &Device, config: Config) -> Result> { + let mut raw_cfg = bindings::io_pgtable_cfg { + quirks: config.quirks, + pgsize_bitmap: config.pgsize_bitmap, + ias: config.ias, + oas: config.oas, + coherent_walk: config.coherent_walk, + tlb: &raw const NOOP_FLUSH_OPS, + iommu_dev: dev.as_raw(), + // SAFETY: All zeroes is a valid value for `struct io_pgtable_cfg`. + ..unsafe { core::mem::zeroed() } + }; + + // SAFETY: + // * The raw_cfg pointer is valid for the duration of this call. + // * The provided `FLUSH_OPS` contains valid function pointers that accept a null pointer + // as cookie. + // * The caller ensures that the io pgtable does not outlive the device. + let ops = unsafe { + bindings::alloc_io_pgtable_ops(F::FORMAT, &mut raw_cfg, core::ptr::null_mut()) + }; + + // INVARIANT: We successfully created a valid page table. + Ok(IoPageTable { + ptr: NonNull::new(ops).ok_or(ENOMEM)?, + _marker: PhantomData, + }) + } + + /// Obtain a raw pointer to the underlying `struct io_pgtable_ops`. + #[inline] + pub fn raw_ops(&self) -> *mut bindings::io_pgtable_ops { + self.ptr.as_ptr() + } + + /// Obtain a raw pointer to the underlying `struct io_pgtable`. 
+ #[inline] + pub fn raw_pgtable(&self) -> *mut bindings::io_pgtable { + // SAFETY: The io_pgtable_ops of an io-pgtable is always the ops field of a io_pgtable. + unsafe { kernel::container_of!(self.raw_ops(), bindings::io_pgtable, ops) } + } + + /// Obtain a raw pointer to the underlying `struct io_pgtable_cfg`. + #[inline] + pub fn raw_cfg(&self) -> *mut bindings::io_pgtable_cfg { + // SAFETY: The `raw_pgtable()` method returns a valid pointer. + unsafe { &raw mut (*self.raw_pgtable()).cfg } + } + + /// Map a physically contiguous range of pages of the same size. + /// + /// Even if successful, this operation may not map the entire range. In that case, only a + /// prefix of the range is mapped, and the returned integer indicates its length in bytes. In + /// this case, the caller will usually call `map_pages` again for the remaining range. + /// + /// The returned [`Result`] indicates whether an error was encountered while mapping pages. + /// Note that this may return a non-zero length even if an error was encountered. The caller + /// will usually [unmap the relevant pages](Self::unmap_pages) on error. + /// + /// The caller must flush the TLB before using the pgtable to access the newly created mapping. + /// + /// # Safety + /// + /// * No other io-pgtable operation may access the range `iova .. iova+pgsize*pgcount` while + /// this `map_pages` operation executes. + /// * This page table must not contain any mapping that overlaps with the mapping created by + /// this call. + /// * If this page table is live, then the caller must ensure that it's okay to access the + /// physical address being mapped for the duration in which it is mapped. + #[inline] + pub unsafe fn map_pages( + &self, + iova: usize, + paddr: PhysAddr, + pgsize: usize, + pgcount: usize, + prot: u32, + flags: alloc::Flags, + ) -> (usize, Result) { + let mut mapped: usize = 0; + + // SAFETY: The `map_pages` function in `io_pgtable_ops` is never null. + let map_pages = unsafe { (*self.raw_ops()).map_pages.unwrap_unchecked() }; + + // SAFETY: The safety requirements of this method are sufficient to call `map_pages`. + let ret = to_result(unsafe { + (map_pages)( + self.raw_ops(), + iova, + paddr, + pgsize, + pgcount, + prot as i32, + flags.as_raw(), + &mut mapped, + ) + }); + + (mapped, ret) + } + + /// Unmap a range of virtually contiguous pages of the same size. + /// + /// This may not unmap the entire range, and returns the length of the unmapped prefix in + /// bytes. + /// + /// # Safety + /// + /// * No other io-pgtable operation may access the range `iova .. iova+pgsize*pgcount` while + /// this `unmap_pages` operation executes. + /// * This page table must contain one or more consecutive mappings starting at `iova` whose + /// total size is `pgcount * pgsize`. + #[inline] + #[must_use] + pub unsafe fn unmap_pages(&self, iova: usize, pgsize: usize, pgcount: usize) -> usize { + // SAFETY: The `unmap_pages` function in `io_pgtable_ops` is never null. + let unmap_pages = unsafe { (*self.raw_ops()).unmap_pages.unwrap_unchecked() }; + + // SAFETY: The safety requirements of this method are sufficient to call `unmap_pages`. + unsafe { (unmap_pages)(self.raw_ops(), iova, pgsize, pgcount, core::ptr::null_mut()) } + } +} + +// For the initial users of these rust bindings, the GPU FW is managing the IOTLB and performs all +// required invalidations using a range. There is no need for it get ARM style invalidation +// instructions from the page table code. 
+// +// Support for flushing the TLB with ARM style invalidation instructions may be added in the +// future. +static NOOP_FLUSH_OPS: bindings::iommu_flush_ops = bindings::iommu_flush_ops { + tlb_flush_all: Some(rust_tlb_flush_all_noop), + tlb_flush_walk: Some(rust_tlb_flush_walk_noop), + tlb_add_page: None, +}; + +#[no_mangle] +extern "C" fn rust_tlb_flush_all_noop(_cookie: *mut core::ffi::c_void) {} + +#[no_mangle] +extern "C" fn rust_tlb_flush_walk_noop( + _iova: usize, + _size: usize, + _granule: usize, + _cookie: *mut core::ffi::c_void, +) { +} + +impl Drop for IoPageTable { + fn drop(&mut self) { + // SAFETY: The caller of `Self::ttbr()` promised that the page table is not live when this + // destructor runs. + unsafe { bindings::free_io_pgtable_ops(self.raw_ops()) }; + } +} + +/// The `ARM_64_LPAE_S1` page table format. +pub enum ARM64LPAES1 {} + +impl IoPageTableFmt for ARM64LPAES1 { + const FORMAT: io_pgtable_fmt = bindings::io_pgtable_fmt_ARM_64_LPAE_S1 as io_pgtable_fmt; +} + +impl IoPageTable { + /// Access the `ttbr` field of the configuration. + /// + /// This is the physical address of the page table, which may be passed to the device that + /// needs to use it. + /// + /// # Safety + /// + /// The caller must ensure that the device stops using the page table before dropping it. + #[inline] + pub unsafe fn ttbr(&self) -> u64 { + // SAFETY: `arm_lpae_s1_cfg` is the right cfg type for `ARM64LPAES1`. + unsafe { (*self.raw_cfg()).__bindgen_anon_1.arm_lpae_s1_cfg.ttbr } + } + + /// Access the `mair` field of the configuration. + #[inline] + pub fn mair(&self) -> u64 { + // SAFETY: `arm_lpae_s1_cfg` is the right cfg type for `ARM64LPAES1`. + unsafe { (*self.raw_cfg()).__bindgen_anon_1.arm_lpae_s1_cfg.mair } + } +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index f812cf120042..e7fba6fa0f81 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -103,6 +103,7 @@ pub mod id_pool; pub mod init; pub mod io; pub mod ioctl; +pub mod iommu; pub mod iov; pub mod irq; pub mod jump_label; -- cgit v1.2.3 From c0a652a3d1970caa0023632ae3a4ea21991d2f1a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 20 Jan 2026 01:48:43 +0000 Subject: iommu/amd: Remove unused variable in amd_iommufd_viommu_destroy() This fixes warning reported by 0-DAY CI Kernel Test Service. Fixes: 757d2b1fdf5b ("iommu/amd: Introduce gDomID-to-hDomID Mapping and handle parent domain invalidation") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601190634.bl7Mjx5Q-lkp@intel.com/ Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommufd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c index 2e50633d9c72..ad627fd5ccc7 100644 --- a/drivers/iommu/amd/iommufd.c +++ b/drivers/iommu/amd/iommufd.c @@ -59,7 +59,6 @@ int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain * static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu) { unsigned long flags; - struct amd_iommu *iommu = container_of(viommu->iommu_dev, struct amd_iommu, iommu); struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); struct protection_domain *pdom = aviommu->parent; -- cgit v1.2.3 From d414b83dc5f90a6a9a656cd6fbb9378ddc824032 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 20 Jan 2026 09:19:25 +0000 Subject: mm/page_ext: Add page_ext_get_from_phys() The IOMMU code operates on physical addresses which can be outside of system RAM. 
Add a new function page_ext_get_from_phys() to abstract the logic of checking the address and returning the page_ext. Signed-off-by: Mostafa Saleh Acked-by: Vlastimil Babka Signed-off-by: Joerg Roedel --- include/linux/page_ext.h | 6 ++++++ mm/page_ext.c | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 76c817162d2f..61e876e255e8 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -93,6 +93,7 @@ static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) #endif extern struct page_ext *page_ext_get(const struct page *page); +extern struct page_ext *page_ext_from_phys(phys_addr_t phys); extern void page_ext_put(struct page_ext *page_ext); extern struct page_ext *page_ext_lookup(unsigned long pfn); @@ -215,6 +216,11 @@ static inline struct page_ext *page_ext_get(const struct page *page) return NULL; } +static inline struct page_ext *page_ext_from_phys(phys_addr_t phys) +{ + return NULL; +} + static inline void page_ext_put(struct page_ext *page_ext) { } diff --git a/mm/page_ext.c b/mm/page_ext.c index 297e4cd8ce90..e2e92bd27ebd 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -538,6 +538,29 @@ struct page_ext *page_ext_get(const struct page *page) return page_ext; } +/** + * page_ext_from_phys() - Get the page_ext structure for a physical address. + * @phys: The physical address to query. + * + * This function safely gets the `struct page_ext` associated with a given + * physical address. It performs validation to ensure the address corresponds + * to a valid, online struct page before attempting to access it. + * It returns NULL for MMIO, ZONE_DEVICE, holes and offline memory. + * + * Return: NULL if no page_ext exists for this physical address. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_from_phys(phys_addr_t phys) +{ + struct page *page = pfn_to_online_page(__phys_to_pfn(phys)); + + if (!page) + return NULL; + + return page_ext_get(page); +} + /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). -- cgit v1.2.3 From a7f1bc231b666aed85358d4940ca8c37a75639f7 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 20 Jan 2026 09:19:26 +0000 Subject: iommu: debug-pagealloc: Use page_ext_get_from_phys() Instead of calling pfn_valid() and then getting the page, call the newly added function page_ext_get_from_phys(), which would also check for MMIO and offline memory and return NULL in that case. 
Signed-off-by: Mostafa Saleh Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-debug-pagealloc.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c index c080a38f45a4..80164df5bab1 100644 --- a/drivers/iommu/iommu-debug-pagealloc.c +++ b/drivers/iommu/iommu-debug-pagealloc.c @@ -30,14 +30,6 @@ struct page_ext_operations page_iommu_debug_ops = { .need = need_iommu_debug, }; -static struct page_ext *get_iommu_page_ext(phys_addr_t phys) -{ - struct page *page = phys_to_page(phys); - struct page_ext *page_ext = page_ext_get(page); - - return page_ext; -} - static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext) { return page_ext_data(page_ext, &page_iommu_debug_ops); @@ -45,18 +37,26 @@ static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext) static void iommu_debug_inc_page(phys_addr_t phys) { - struct page_ext *page_ext = get_iommu_page_ext(phys); - struct iommu_debug_metadata *d = get_iommu_data(page_ext); + struct page_ext *page_ext = page_ext_from_phys(phys); + struct iommu_debug_metadata *d; + + if (!page_ext) + return; + d = get_iommu_data(page_ext); WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0); page_ext_put(page_ext); } static void iommu_debug_dec_page(phys_addr_t phys) { - struct page_ext *page_ext = get_iommu_page_ext(phys); - struct iommu_debug_metadata *d = get_iommu_data(page_ext); + struct page_ext *page_ext = page_ext_from_phys(phys); + struct iommu_debug_metadata *d; + + if (!page_ext) + return; + d = get_iommu_data(page_ext); WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0); page_ext_put(page_ext); } @@ -104,11 +104,8 @@ void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t siz if (WARN_ON(!phys || check_add_overflow(phys, size, &end))) return; - for (off = 0 ; off < size ; off += page_size) { - if (!pfn_valid(__phys_to_pfn(phys + off))) - continue; + for (off = 0 ; off < size ; off += page_size) iommu_debug_inc_page(phys + off); - } } static void __iommu_debug_update_iova(struct iommu_domain *domain, @@ -123,7 +120,7 @@ static void __iommu_debug_update_iova(struct iommu_domain *domain, for (off = 0 ; off < size ; off += page_size) { phys_addr_t phys = iommu_iova_to_phys(domain, iova + off); - if (!phys || !pfn_valid(__phys_to_pfn(phys))) + if (!phys) continue; if (inc) -- cgit v1.2.3 From 7222dd071b221dde38fea6d0798520572877ee2a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 21 Jan 2026 19:43:38 +0100 Subject: rust: iommu: fix Rust formatting The Rust kernel code should be kept `rustfmt`-clean [1]. Thus run the `rustfmt` target to fix the formatting issue. Link: https://rust-for-linux.com/contributing#submit-checklist-addendum [1] Fixes: 2e2f6b0ef855 ("rust: iommu: add io_pgtable abstraction") Signed-off-by: Miguel Ojeda Signed-off-by: Joerg Roedel --- rust/kernel/iommu/pgtable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/iommu/pgtable.rs b/rust/kernel/iommu/pgtable.rs index 6135ba141e48..916e1f509e62 100644 --- a/rust/kernel/iommu/pgtable.rs +++ b/rust/kernel/iommu/pgtable.rs @@ -14,7 +14,7 @@ use crate::{ bindings, device::{ Bound, - Device // + Device, // }, devres::Devres, error::to_result, -- cgit v1.2.3 From 12248a3862d50ef1a889153bb222d10d43e78c9d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 21 Jan 2026 19:43:39 +0100 Subject: rust: iommu: fix `srctree` link warning The Rust kernel code should be kept `rustdoc`-clean [1]. 
Our custom `srctree` link checker in the `rustdoc` target reports: warning: srctree/ link to include/io-pgtable.h does not exist Thus fix it. Link: https://rust-for-linux.com/contributing#submit-checklist-addendum [1] Fixes: 2e2f6b0ef855 ("rust: iommu: add io_pgtable abstraction") Signed-off-by: Miguel Ojeda Signed-off-by: Joerg Roedel --- rust/kernel/iommu/pgtable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/iommu/pgtable.rs b/rust/kernel/iommu/pgtable.rs index 916e1f509e62..c88e38fd938a 100644 --- a/rust/kernel/iommu/pgtable.rs +++ b/rust/kernel/iommu/pgtable.rs @@ -2,7 +2,7 @@ //! IOMMU page table management. //! -//! C header: [`include/io-pgtable.h`](srctree/include/io-pgtable.h) +//! C header: [`include/linux/io-pgtable.h`](srctree/include/linux/io-pgtable.h) use core::{ marker::PhantomData, -- cgit v1.2.3 From 42662d19839f34735b718129ea200e3734b07e50 Mon Sep 17 00:00:00 2001 From: Jinhui Guo Date: Thu, 22 Jan 2026 09:48:50 +0800 Subject: iommu/vt-d: Skip dev-iotlb flush for inaccessible PCIe device without scalable mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe endpoints with ATS enabled and passed through to userspace (e.g., QEMU, DPDK) can hard-lock the host when their link drops, either by surprise removal or by a link fault. Commit 4fc82cd907ac ("iommu/vt-d: Don't issue ATS Invalidation request when device is disconnected") adds pci_dev_is_disconnected() to devtlb_invalidation_with_pasid() so ATS invalidation is skipped only when the device is being safely removed, but it applies only when Intel IOMMU scalable mode is enabled. With scalable mode disabled or unsupported, a system hard-lock occurs when a PCIe endpoint's link drops because the Intel IOMMU waits indefinitely for an ATS invalidation that cannot complete. Call Trace: qi_submit_sync qi_flush_dev_iotlb __context_flush_dev_iotlb.part.0 domain_context_clear_one_cb pci_for_each_dma_alias device_block_translation blocking_domain_attach_dev iommu_deinit_device __iommu_group_remove_device iommu_release_device iommu_bus_notifier blocking_notifier_call_chain bus_notify device_del pci_remove_bus_device pci_stop_and_remove_bus_device pciehp_unconfigure_device pciehp_disable_slot pciehp_handle_presence_or_link_change pciehp_ist Commit 81e921fd3216 ("iommu/vt-d: Fix NULL domain on device release") adds intel_pasid_teardown_sm_context() to intel_iommu_release_device(), which calls qi_flush_dev_iotlb() and can also hard-lock the system when a PCIe endpoint's link drops. Call Trace: qi_submit_sync qi_flush_dev_iotlb __context_flush_dev_iotlb.part.0 intel_context_flush_no_pasid device_pasid_table_teardown pci_pasid_table_teardown pci_for_each_dma_alias intel_pasid_teardown_sm_context intel_iommu_release_device iommu_deinit_device __iommu_group_remove_device iommu_release_device iommu_bus_notifier blocking_notifier_call_chain bus_notify device_del pci_remove_bus_device pci_stop_and_remove_bus_device pciehp_unconfigure_device pciehp_disable_slot pciehp_handle_presence_or_link_change pciehp_ist Sometimes the endpoint loses connection without a link-down event (e.g., due to a link fault); killing the process (virsh destroy) then hard-locks the host. 
Call Trace: qi_submit_sync qi_flush_dev_iotlb __context_flush_dev_iotlb.part.0 domain_context_clear_one_cb pci_for_each_dma_alias device_block_translation blocking_domain_attach_dev __iommu_attach_device __iommu_device_set_domain __iommu_group_set_domain_internal iommu_detach_group vfio_iommu_type1_detach_group vfio_group_detach_container vfio_group_fops_release __fput pci_dev_is_disconnected() only covers safe-removal paths; pci_device_is_present() tests accessibility by reading vendor/device IDs and internally calls pci_dev_is_disconnected(). On a ConnectX-5 (8 GT/s, x2) this costs ~70 µs. Since __context_flush_dev_iotlb() is only called on {attach,release}_dev paths (not hot), add pci_device_is_present() there to skip inaccessible devices and avoid the hard-lock. Fixes: 37764b952e1b ("iommu/vt-d: Global devTLB flush when present context entry changed") Fixes: 81e921fd3216 ("iommu/vt-d: Fix NULL domain on device release") Cc: stable@vger.kernel.org Signed-off-by: Jinhui Guo Link: https://lore.kernel.org/r/20251211035946.2071-2-guojinhui.liam@bytedance.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 3e2255057079..3f6d78180d79 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -1102,6 +1102,14 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info) if (!info->ats_enabled) return; + /* + * Skip dev-IOTLB flush for inaccessible PCIe devices to prevent the + * Intel IOMMU from waiting indefinitely for an ATS invalidation that + * cannot complete. + */ + if (!pci_device_is_present(to_pci_dev(info->dev))) + return; + qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn), info->pfsid, info->ats_qdep, 0, MAX_AGAW_PFN_WIDTH); -- cgit v1.2.3 From 10e60d87813989e20eac1f3eda30b3bae461e7f9 Mon Sep 17 00:00:00 2001 From: Jinhui Guo Date: Thu, 22 Jan 2026 09:48:51 +0800 Subject: iommu/vt-d: Flush dev-IOTLB only when PCIe device is accessible in scalable mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4fc82cd907ac ("iommu/vt-d: Don't issue ATS Invalidation request when device is disconnected") relies on pci_dev_is_disconnected() to skip ATS invalidation for safely-removed devices, but it does not cover link-down caused by faults, which can still hard-lock the system. For example, if a VM fails to connect to the PCIe device, "virsh destroy" is executed to release resources and isolate the fault, but a hard-lockup occurs while releasing the group fd. Call Trace: qi_submit_sync qi_flush_dev_iotlb intel_pasid_tear_down_entry device_block_translation blocking_domain_attach_dev __iommu_attach_device __iommu_device_set_domain __iommu_group_set_domain_internal iommu_detach_group vfio_iommu_type1_detach_group vfio_group_detach_container vfio_group_fops_release __fput Although pci_device_is_present() is slower than pci_dev_is_disconnected(), it still takes only ~70 µs on a ConnectX-5 (8 GT/s, x2) and becomes even faster as PCIe speed and width increase. Besides, devtlb_invalidation_with_pasid() is called only in the paths below, which are far less frequent than memory map/unmap. 1. mm-struct release 2. {attach,release}_dev 3. set/remove PASID 4. 
dirty-tracking setup The gain in system stability far outweighs the negligible cost of using pci_device_is_present() instead of pci_dev_is_disconnected() to decide when to skip ATS invalidation, especially under GDR high-load conditions. Fixes: 4fc82cd907ac ("iommu/vt-d: Don't issue ATS Invalidation request when device is disconnected") Cc: stable@vger.kernel.org Signed-off-by: Jinhui Guo Link: https://lore.kernel.org/r/20251211035946.2071-3-guojinhui.liam@bytedance.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 3e2255057079..3f6d78180d79 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -218,7 +218,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, if (!info || !info->ats_enabled) return; - if (pci_dev_is_disconnected(to_pci_dev(dev))) + if (!pci_device_is_present(to_pci_dev(dev))) return; sid = PCI_DEVID(info->bus, info->devfn); -- cgit v1.2.3 From 22d169bdd2849fe6bd18c2643742e1c02be6451c Mon Sep 17 00:00:00 2001 From: Dmytro Maluka Date: Thu, 22 Jan 2026 09:48:52 +0800 Subject: iommu/vt-d: Flush cache for PASID table before using it When writing the address of a freshly allocated, zero-initialized PASID table to a PASID directory entry, do that after the CPU cache flush for this PASID table, not before it, to avoid a window in which the PASID table may already be in use by non-coherent IOMMU hardware while its contents in RAM are still random old data rather than zeros. Fixes: 194b3348bdbb ("iommu/vt-d: Fix PASID directory pointer coherency") Signed-off-by: Dmytro Maluka Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20251221123508.37495-1-dmaluka@chromium.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 99692f88b883..6379b211f12b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -153,6 +153,9 @@ retry: if (!entries) return NULL; + if (!ecap_coherent(info->iommu->ecap)) + clflush_cache_range(entries, VTD_PAGE_SIZE); + /* * The pasid directory table entry won't be freed after * allocation. No worry about the race with free and @@ -165,10 +168,8 @@ retry: iommu_free_pages(entries); goto retry; } - if (!ecap_coherent(info->iommu->ecap)) { - clflush_cache_range(entries, VTD_PAGE_SIZE); + if (!ecap_coherent(info->iommu->ecap)) clflush_cache_range(&dir[dir_index].val, sizeof(*dir)); - } } return &entries[index]; -- cgit v1.2.3 From 04b1b069f151e793767755f58b51670bff00cbc1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 22 Jan 2026 09:48:53 +0800 Subject: iommu/vt-d: Flush piotlb for SVM and Nested domain Besides the paging domains that use FS, SVM and Nested domains need to use the piotlb invalidation descriptor as well.
Fixes: b33125296b50 ("iommu/vt-d: Create unique domain ops for each stage") Cc: stable@vger.kernel.org Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20251223065824.6164-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/cache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index 265e7290256b..385ae5cfb30d 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -363,6 +363,13 @@ static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qi_batch_increment_index(iommu, batch); } +static bool intel_domain_use_piotlb(struct dmar_domain *domain) +{ + return domain->domain.type == IOMMU_DOMAIN_SVA || + domain->domain.type == IOMMU_DOMAIN_NESTED || + intel_domain_is_fs_paging(domain); +} + static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, unsigned long addr, unsigned long pages, unsigned long mask, int ih) @@ -370,7 +377,7 @@ static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag * struct intel_iommu *iommu = tag->iommu; u64 type = DMA_TLB_PSI_FLUSH; - if (intel_domain_is_fs_paging(domain)) { + if (intel_domain_use_piotlb(domain)) { qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, pages, ih, domain->qi_batch); return; -- cgit v1.2.3 From 75ed00055c059dedc47b5daaaa2f8a7a019138ff Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 22 Jan 2026 09:48:54 +0800 Subject: iommu/vt-d: Clear Present bit before tearing down PASID entry The Intel VT-d Scalable Mode PASID table entry consists of 512 bits (64 bytes). When tearing down an entry, the current implementation zeros the entire 64-byte structure immediately using multiple 64-bit writes. Since the IOMMU hardware may fetch these 64 bytes using multiple internal transactions (e.g., four 128-bit bursts), updating or zeroing the entire entry while it is active (P=1) risks a "torn" read. If a hardware fetch occurs simultaneously with the CPU zeroing the entry, the hardware could observe an inconsistent state, leading to unpredictable behavior or spurious faults. Follow the "Guidance to Software for Invalidations" in the VT-d spec (Section 6.5.3.3) by implementing the recommended ownership handshake: 1. Clear only the 'Present' (P) bit of the PASID entry. 2. Use a dma_wmb() to ensure the cleared bit is visible to hardware before proceeding. 3. Execute the required invalidation sequence (PASID cache, IOTLB, and Device-TLB flush) to ensure the hardware has released all cached references. 4. Only after the flushes are complete, zero out the remaining fields of the PASID entry. Also, add a dma_wmb() in pasid_set_present() to ensure that all other fields of the PASID entry are visible to the hardware before the Present bit is set. 
Fixes: 0bbeb01a4faf ("iommu/vt-d: Manage scalalble mode PASID tables") Signed-off-by: Lu Baolu Reviewed-by: Dmytro Maluka Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260120061816.2132558-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 6 +++++- drivers/iommu/intel/pasid.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 6379b211f12b..07e056b24605 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -273,7 +273,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, did = pasid_get_domain_id(pte); pgtt = pasid_pte_get_pgtt(pte); - intel_pasid_clear_entry(dev, pasid, fault_ignore); + pasid_clear_present(pte); spin_unlock(&iommu->lock); if (!ecap_coherent(iommu->ecap)) @@ -287,6 +287,10 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, pasid); + intel_pasid_clear_entry(dev, pasid, fault_ignore); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + if (!fault_ignore) intel_iommu_drain_pasid_prq(dev, pasid); } diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index b4c85242dc79..0b303bd0b0c1 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -234,9 +234,23 @@ static inline void pasid_set_wpe(struct pasid_entry *pe) */ static inline void pasid_set_present(struct pasid_entry *pe) { + dma_wmb(); pasid_set_bits(&pe->val[0], 1 << 0, 1); } +/* + * Clear the Present (P) bit (bit 0) of a scalable-mode PASID table entry. + * This initiates the transition of the entry's ownership from hardware + * to software. The caller is responsible for fulfilling the invalidation + * handshake recommended by the VT-d spec, Section 6.5.3.3 (Guidance to + * Software for Invalidations). + */ +static inline void pasid_clear_present(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 0, 0); + dma_wmb(); +} + /* * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID * entry. -- cgit v1.2.3 From c1e4f1dccbe9d7656d1c6872ebeadb5992d0aaa2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 22 Jan 2026 09:48:55 +0800 Subject: iommu/vt-d: Clear Present bit before tearing down context entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When tearing down a context entry, the current implementation zeros the entire 128-bit entry using multiple 64-bit writes. This creates a window where the hardware can fetch a "torn" entry — where some fields are already zeroed while the 'Present' bit is still set — leading to unpredictable behavior or spurious faults. While x86 provides strong write ordering, the compiler may reorder writes to the two 64-bit halves of the context entry. Even without compiler reordering, the hardware fetch is not guaranteed to be atomic with respect to multiple CPU writes. Align with the "Guidance to Software for Invalidations" in the VT-d spec (Section 6.5.3.3) by implementing the recommended ownership handshake: 1. Clear only the 'Present' (P) bit of the context entry first to signal the transition of ownership from hardware to software. 2. Use dma_wmb() to ensure the cleared bit is visible to the IOMMU. 3. 
Perform the required cache and context-cache invalidation to ensure hardware no longer has cached references to the entry. 4. Fully zero out the entry only after the invalidation is complete. Also, add a dma_wmb() to context_set_present() to ensure the entry is fully initialized before the 'Present' bit becomes visible. Fixes: ba39592764ed2 ("Intel IOMMU: Intel IOMMU driver") Reported-by: Dmytro Maluka Closes: https://lore.kernel.org/all/aTG7gc7I5wExai3S@google.com/ Signed-off-by: Lu Baolu Reviewed-by: Dmytro Maluka Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260120061816.2132558-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 +++- drivers/iommu/intel/iommu.h | 21 ++++++++++++++++++++- drivers/iommu/intel/pasid.c | 5 ++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 134302fbcd92..c66cc51f9e51 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1240,10 +1240,12 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 } did = context_domain_id(context); - context_clear_entry(context); + context_clear_present(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); intel_context_flush_no_pasid(info, context, did); + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); } int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 25c5e22096d4..599913fb65d5 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -900,7 +900,26 @@ static inline int pfn_level_offset(u64 pfn, int level) static inline void context_set_present(struct context_entry *context) { - context->lo |= 1; + u64 val; + + dma_wmb(); + val = READ_ONCE(context->lo) | 1; + WRITE_ONCE(context->lo, val); +} + +/* + * Clear the Present (P) bit (bit 0) of a context table entry. This initiates + * the transition of the entry's ownership from hardware to software. The + * caller is responsible for fulfilling the invalidation handshake recommended + * by the VT-d spec, Section 6.5.3.3 (Guidance to Software for Invalidations). 
+ */ +static inline void context_clear_present(struct context_entry *context) +{ + u64 val; + + val = READ_ONCE(context->lo) & GENMASK_ULL(63, 1); + WRITE_ONCE(context->lo, val); + dma_wmb(); } static inline void context_set_fault_enable(struct context_entry *context) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 07e056b24605..f5dfa9b9eb3e 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -1024,7 +1024,7 @@ static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn) } if (context_copied(iommu, bus, devfn)) { - context_clear_entry(context); + context_clear_present(context); __iommu_flush_cache(iommu, context, sizeof(*context)); /* @@ -1044,6 +1044,9 @@ static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + /* * At this point, the device is supposed to finish reset at * its driver probe stage, so no in-flight DMA will exist, -- cgit v1.2.3 From c3b1edea3791fa91ab7032faa90355913ad9451b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 22 Jan 2026 09:48:56 +0800 Subject: iommu/vt-d: Fix race condition during PASID entry replacement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel VT-d PASID table entry is 512 bits (64 bytes). When replacing an active PASID entry (e.g., during domain replacement), the current implementation calculates a new entry on the stack and copies it to the table using a single structure assignment. struct pasid_entry *pte, new_pte; pte = intel_pasid_get_entry(dev, pasid); pasid_pte_config_first_level(iommu, &new_pte, ...); *pte = new_pte; Because the hardware may fetch the 512-bit PASID entry in multiple 128-bit chunks, updating the entire entry while it is active (Present bit set) risks a "torn" read. In this scenario, the IOMMU hardware could observe an inconsistent state — partially new data and partially old data — leading to unpredictable behavior or spurious faults. Fix this by removing the unsafe "replace" helpers and following the "clear-then-update" flow, which ensures the Present bit is cleared and the required invalidation handshake is completed before the new configuration is applied. 
Fixes: 7543ee63e811 ("iommu/vt-d: Add pasid replace helpers") Signed-off-by: Lu Baolu Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260120061816.2132558-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 29 +++---- drivers/iommu/intel/nested.c | 9 +-- drivers/iommu/intel/pasid.c | 184 ------------------------------------------- drivers/iommu/intel/pasid.h | 14 ---- 4 files changed, 16 insertions(+), 220 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c66cc51f9e51..705828b06e32 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1252,12 +1252,10 @@ int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, u16 did, phys_addr_t fsptptr, int flags, struct iommu_domain *old) { - if (!old) - return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, - did, flags); - return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, - iommu_domain_did(old, iommu), - flags); + if (old) + intel_pasid_tear_down_entry(iommu, dev, pasid, false); + + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, did, flags); } static int domain_setup_second_level(struct intel_iommu *iommu, @@ -1265,23 +1263,20 @@ static int domain_setup_second_level(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { - if (!old) - return intel_pasid_setup_second_level(iommu, domain, - dev, pasid); - return intel_pasid_replace_second_level(iommu, domain, dev, - iommu_domain_did(old, iommu), - pasid); + if (old) + intel_pasid_tear_down_entry(iommu, dev, pasid, false); + + return intel_pasid_setup_second_level(iommu, domain, dev, pasid); } static int domain_setup_passthrough(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { - if (!old) - return intel_pasid_setup_pass_through(iommu, dev, pasid); - return intel_pasid_replace_pass_through(iommu, dev, - iommu_domain_did(old, iommu), - pasid); + if (old) + intel_pasid_tear_down_entry(iommu, dev, pasid, false); + + return intel_pasid_setup_pass_through(iommu, dev, pasid); } static int domain_setup_first_level(struct intel_iommu *iommu, diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index a3fb8c193ca6..e9a440e9c960 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -136,11 +136,10 @@ static int domain_setup_nested(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { - if (!old) - return intel_pasid_setup_nested(iommu, dev, pasid, domain); - return intel_pasid_replace_nested(iommu, dev, pasid, - iommu_domain_did(old, iommu), - domain); + if (old) + intel_pasid_tear_down_entry(iommu, dev, pasid, false); + + return intel_pasid_setup_nested(iommu, dev, pasid, domain); } static int intel_nested_set_dev_pasid(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index f5dfa9b9eb3e..b63a71904cfb 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -417,50 +417,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, return 0; } -int intel_pasid_replace_first_level(struct intel_iommu *iommu, - struct device *dev, phys_addr_t fsptptr, - u32 pasid, u16 did, u16 old_did, - int flags) -{ - struct pasid_entry *pte, new_pte; - - if (!ecap_flts(iommu->ecap)) { - pr_err("No first level translation support on %s\n", - iommu->name); 
- return -EINVAL; - } - - if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { - pr_err("No 5-level paging support for first-level on %s\n", - iommu->name); - return -EINVAL; - } - - pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags); - - spin_lock(&iommu->lock); - pte = intel_pasid_get_entry(dev, pasid); - if (!pte) { - spin_unlock(&iommu->lock); - return -ENODEV; - } - - if (!pasid_pte_is_present(pte)) { - spin_unlock(&iommu->lock); - return -EINVAL; - } - - WARN_ON(old_did != pasid_get_domain_id(pte)); - - *pte = new_pte; - spin_unlock(&iommu->lock); - - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); - intel_iommu_drain_pasid_prq(dev, pasid); - - return 0; -} - /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -527,51 +483,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return 0; } -int intel_pasid_replace_second_level(struct intel_iommu *iommu, - struct dmar_domain *domain, - struct device *dev, u16 old_did, - u32 pasid) -{ - struct pasid_entry *pte, new_pte; - u16 did; - - /* - * If hardware advertises no support for second level - * translation, return directly. - */ - if (!ecap_slts(iommu->ecap)) { - pr_err("No second level translation support on %s\n", - iommu->name); - return -EINVAL; - } - - did = domain_id_iommu(domain, iommu); - - pasid_pte_config_second_level(iommu, &new_pte, domain, did); - - spin_lock(&iommu->lock); - pte = intel_pasid_get_entry(dev, pasid); - if (!pte) { - spin_unlock(&iommu->lock); - return -ENODEV; - } - - if (!pasid_pte_is_present(pte)) { - spin_unlock(&iommu->lock); - return -EINVAL; - } - - WARN_ON(old_did != pasid_get_domain_id(pte)); - - *pte = new_pte; - spin_unlock(&iommu->lock); - - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); - intel_iommu_drain_pasid_prq(dev, pasid); - - return 0; -} - /* * Set up dirty tracking on a second only or nested translation type. */ @@ -684,38 +595,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } -int intel_pasid_replace_pass_through(struct intel_iommu *iommu, - struct device *dev, u16 old_did, - u32 pasid) -{ - struct pasid_entry *pte, new_pte; - u16 did = FLPT_DEFAULT_DID; - - pasid_pte_config_pass_through(iommu, &new_pte, did); - - spin_lock(&iommu->lock); - pte = intel_pasid_get_entry(dev, pasid); - if (!pte) { - spin_unlock(&iommu->lock); - return -ENODEV; - } - - if (!pasid_pte_is_present(pte)) { - spin_unlock(&iommu->lock); - return -EINVAL; - } - - WARN_ON(old_did != pasid_get_domain_id(pte)); - - *pte = new_pte; - spin_unlock(&iommu->lock); - - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); - intel_iommu_drain_pasid_prq(dev, pasid); - - return 0; -} - /* * Set the page snoop control for a pasid entry which has been set up. 
*/ @@ -849,69 +728,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return 0; } -int intel_pasid_replace_nested(struct intel_iommu *iommu, - struct device *dev, u32 pasid, - u16 old_did, struct dmar_domain *domain) -{ - struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; - struct dmar_domain *s2_domain = domain->s2_domain; - u16 did = domain_id_iommu(domain, iommu); - struct pasid_entry *pte, new_pte; - - /* Address width should match the address width supported by hardware */ - switch (s1_cfg->addr_width) { - case ADDR_WIDTH_4LEVEL: - break; - case ADDR_WIDTH_5LEVEL: - if (!cap_fl5lp_support(iommu->cap)) { - dev_err_ratelimited(dev, - "5-level paging not supported\n"); - return -EINVAL; - } - break; - default: - dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", - s1_cfg->addr_width); - return -EINVAL; - } - - if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { - pr_err_ratelimited("No supervisor request support on %s\n", - iommu->name); - return -EINVAL; - } - - if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { - pr_err_ratelimited("No extended access flag support on %s\n", - iommu->name); - return -EINVAL; - } - - pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); - - spin_lock(&iommu->lock); - pte = intel_pasid_get_entry(dev, pasid); - if (!pte) { - spin_unlock(&iommu->lock); - return -ENODEV; - } - - if (!pasid_pte_is_present(pte)) { - spin_unlock(&iommu->lock); - return -EINVAL; - } - - WARN_ON(old_did != pasid_get_domain_id(pte)); - - *pte = new_pte; - spin_unlock(&iommu->lock); - - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); - intel_iommu_drain_pasid_prq(dev, pasid); - - return 0; -} - /* * Interfaces to setup or teardown a pasid table to the scalable-mode * context table entry: diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 0b303bd0b0c1..c3c8c907983e 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -316,20 +316,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); -int intel_pasid_replace_first_level(struct intel_iommu *iommu, - struct device *dev, phys_addr_t fsptptr, - u32 pasid, u16 did, u16 old_did, int flags); -int intel_pasid_replace_second_level(struct intel_iommu *iommu, - struct dmar_domain *domain, - struct device *dev, u16 old_did, - u32 pasid); -int intel_pasid_replace_pass_through(struct intel_iommu *iommu, - struct device *dev, u16 old_did, - u32 pasid); -int intel_pasid_replace_nested(struct intel_iommu *iommu, - struct device *dev, u32 pasid, - u16 old_did, struct dmar_domain *domain); - void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); -- cgit v1.2.3 From 14e9a138dd020e234bbbecdbcf4d68eabfe3641e Mon Sep 17 00:00:00 2001 From: Bibek Kumar Patro Date: Tue, 20 Jan 2026 20:41:05 +0530 Subject: iommu/arm-smmu-qcom: Restore ACTLR settings for MDSS on sa8775p The ACTLR configuration for the sa8775p MDSS client was inadvertently dropped while reworking the commit f91879fdf70b ("iommu/arm-smmu-qcom: Add actlr settings for mdss on Qualcomm platforms"). Without this entry, the sa8775p MDSS block does not receive the intended default ACTLR configuration. Restore the missing compatible entry so that the platform receives the expected behavior. 
Fixes: f91879fdf70b ("iommu/arm-smmu-qcom: Add actlr settings for mdss on Qualcomm platforms") Signed-off-by: Bibek Kumar Patro Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 456d5146831e..718d102356d9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -43,6 +43,8 @@ static const struct of_device_id qcom_smmu_actlr_client_of_match[] = { .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, { .compatible = "qcom,qcm2290-mdss", .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sa8775p-mdss", + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, { .compatible = "qcom,sc7280-mdss", .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, { .compatible = "qcom,sc7280-venus", -- cgit v1.2.3 From eb20758f86723bb94d30747fca7b65ed2e6846b8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 13 Jan 2026 05:49:32 +0000 Subject: iommu/tegra241-cmdqv: Decouple driver from ACPI A platform device is created by acpi_create_platform_device() per CMDQV's adev. That means there is no point in going through _CRS of ACPI. Replace all the ACPI functions with standard platform functions. And drop all ACPI dependencies. This will make the driver compatible with DT also. Suggested-by: Robin Murphy Signed-off-by: Nicolin Chen Signed-off-by: Will Deacon --- drivers/iommu/arm/Kconfig | 1 - drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 84 ++++---------------------- 3 files changed, 14 insertions(+), 74 deletions(-) diff --git a/drivers/iommu/arm/Kconfig b/drivers/iommu/arm/Kconfig index ef42bbe07dbe..5fac08b89dee 100644 --- a/drivers/iommu/arm/Kconfig +++ b/drivers/iommu/arm/Kconfig @@ -121,7 +121,6 @@ config ARM_SMMU_V3_KUNIT_TEST config TEGRA241_CMDQV bool "NVIDIA Tegra241 CMDQ-V extension support for ARM SMMUv3" - depends on ACPI help Support for NVIDIA CMDQ-Virtualization extension for ARM SMMUv3. 
The CMDQ-V extension is similar to v3.3 ECMDQ for multi command queues diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 62bdc4d39101..322abd6be8dc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4545,10 +4545,11 @@ static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node, adev = acpi_dev_get_first_match_dev("NVDA200C", uid, -1); if (adev) { /* Tegra241 CMDQV driver is responsible for put_device() */ - smmu->impl_dev = &adev->dev; + smmu->impl_dev = get_device(acpi_get_first_physical_node(adev)); smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV; dev_info(smmu->dev, "found companion CMDQV device: %s\n", dev_name(smmu->impl_dev)); + acpi_dev_put(adev); } kfree(uid); } diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 378104cd395e..1fc03b72beb8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -3,17 +3,15 @@ #define dev_fmt(fmt) "tegra241_cmdqv: " fmt -#include #include #include #include #include #include #include +#include #include -#include - #include "arm-smmu-v3.h" /* CMDQV register page base and size defines */ @@ -854,69 +852,6 @@ static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { /* Probe Functions */ -static int tegra241_cmdqv_acpi_is_memory(struct acpi_resource *res, void *data) -{ - struct resource_win win; - - return !acpi_dev_resource_address_space(res, &win); -} - -static int tegra241_cmdqv_acpi_get_irqs(struct acpi_resource *ares, void *data) -{ - struct resource r; - int *irq = data; - - if (*irq <= 0 && acpi_dev_resource_interrupt(ares, 0, &r)) - *irq = r.start; - return 1; /* No need to add resource to the list */ -} - -static struct resource * -tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq) -{ - struct acpi_device *adev = to_acpi_device(dev); - struct list_head resource_list; - struct resource_entry *rentry; - struct resource *res = NULL; - int ret; - - INIT_LIST_HEAD(&resource_list); - ret = acpi_dev_get_resources(adev, &resource_list, - tegra241_cmdqv_acpi_is_memory, NULL); - if (ret < 0) { - dev_err(dev, "failed to get memory resource: %d\n", ret); - return NULL; - } - - rentry = list_first_entry_or_null(&resource_list, - struct resource_entry, node); - if (!rentry) { - dev_err(dev, "failed to get memory resource entry\n"); - goto free_list; - } - - /* Caller must free the res */ - res = kzalloc(sizeof(*res), GFP_KERNEL); - if (!res) - goto free_list; - - *res = *rentry->res; - - acpi_dev_free_resource_list(&resource_list); - - INIT_LIST_HEAD(&resource_list); - - if (irq) - ret = acpi_dev_get_resources(adev, &resource_list, - tegra241_cmdqv_acpi_get_irqs, irq); - if (ret < 0 || !irq || *irq <= 0) - dev_warn(dev, "no interrupt. 
errors will not be reported\n"); - -free_list: - acpi_dev_free_resource_list(&resource_list); - return res; -} - static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu) { struct tegra241_cmdqv *cmdqv = @@ -1042,18 +977,23 @@ iounmap: struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu) { + struct platform_device *pdev = to_platform_device(smmu->impl_dev); struct arm_smmu_device *new_smmu; - struct resource *res = NULL; + struct resource *res; int irq; - if (!smmu->dev->of_node) - res = tegra241_cmdqv_find_acpi_resource(smmu->impl_dev, &irq); - if (!res) + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "no memory resource found for CMDQV\n"); goto out_fallback; + } - new_smmu = __tegra241_cmdqv_probe(smmu, res, irq); - kfree(res); + irq = platform_get_irq_optional(pdev, 0); + if (irq <= 0) + dev_warn(&pdev->dev, + "no interrupt. errors will not be reported\n"); + new_smmu = __tegra241_cmdqv_probe(smmu, res, irq); if (new_smmu) return new_smmu; -- cgit v1.2.3 From ea69dc4e207b09fb9adb6dab1901739d64853090 Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Tue, 13 Jan 2026 05:49:33 +0000 Subject: iommu/arm-smmu-v3: Add device-tree support for CMDQV driver Add device tree support to the CMDQV driver to enable usage on Tegra264 SoCs. The implementation parses the nvidia,cmdqv phandle from the SMMU device tree node to associate each SMMU with its corresponding CMDQV instance based on compatible string. Reviewed-by: Nicolin Chen Signed-off-by: Ashish Mhetre Reviewed-by: Jon Hunter Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 322abd6be8dc..504d7880ce5a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4533,6 +4533,35 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) return 0; } +#ifdef CONFIG_TEGRA241_CMDQV +static void tegra_cmdqv_dt_probe(struct device_node *smmu_node, + struct arm_smmu_device *smmu) +{ + struct platform_device *pdev; + struct device_node *np; + + np = of_parse_phandle(smmu_node, "nvidia,cmdqv", 0); + if (!np) + return; + + /* Tegra241 CMDQV driver is responsible for put_device() */ + pdev = of_find_device_by_node(np); + of_node_put(np); + if (!pdev) + return; + + smmu->impl_dev = &pdev->dev; + smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV; + dev_dbg(smmu->dev, "found companion CMDQV device: %s\n", + dev_name(smmu->impl_dev)); +} +#else +static void tegra_cmdqv_dt_probe(struct device_node *smmu_node, + struct arm_smmu_device *smmu) +{ +} +#endif + #ifdef CONFIG_ACPI #ifdef CONFIG_TEGRA241_CMDQV static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node, @@ -4638,6 +4667,9 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev, if (of_dma_is_coherent(dev->of_node)) smmu->features |= ARM_SMMU_FEAT_COHERENCY; + if (of_device_is_compatible(dev->of_node, "nvidia,tegra264-smmu")) + tegra_cmdqv_dt_probe(dev->of_node, smmu); + return ret; } -- cgit v1.2.3 From 2781f2a930abb5d27f80b8afbabfa19684833b65 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 15 Jan 2026 10:23:28 -0800 Subject: iommu/arm-smmu-v3: Add update_safe bits to fix STE update sequence C_BAD_STE was observed when updating nested STE from an S1-bypass mode to an S1DSS-bypass mode. 
As both modes enabled S2, the used bit is slightly different than the normal S1-bypass and S1DSS-bypass modes. As a result, fields like MEV and EATS in S2's used list marked the word1 as a critical word that requested a STE.V=0. This breaks a hitless update. However, both MEV and EATS aren't critical in terms of STE update. One controls the merge of the events and the other controls the ATS that is managed by the driver at the same time via pci_enable_ats(). Add an arm_smmu_get_ste_update_safe() to allow STE update algorithm to relax those fields, avoiding the STE update breakages. After this change, entry_set has no caller checking its return value, so change it to void. Note that this change is required by both MEV and EATS fields, which were introduced in different kernel versions. So add get_update_safe() first. MEV and EATS will be added to arm_smmu_get_ste_update_safe() separately. Fixes: 1e8be08d1c91 ("iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED") Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe Reviewed-by: Shuai Xue Reviewed-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 31 +++++++++++++++++++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 +++++++++++++++------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 4 +++ 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index d2671bfd3798..b254a94b2003 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -38,13 +38,16 @@ enum arm_smmu_test_master_feat { static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry, const __le64 *used_bits, const __le64 *target, + const __le64 *safe, unsigned int length) { bool differs = false; unsigned int i; for (i = 0; i < length; i++) { - if ((entry[i] & used_bits[i]) != target[i]) + __le64 used = used_bits[i] & ~safe[i]; + + if ((entry[i] & used) != (target[i] & used)) differs = true; } return differs; @@ -56,12 +59,24 @@ arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer) struct arm_smmu_test_writer *test_writer = container_of(writer, struct arm_smmu_test_writer, writer); __le64 *entry_used_bits; + __le64 *safe_target; + __le64 *safe_init; entry_used_bits = kunit_kzalloc( test_writer->test, sizeof(*entry_used_bits) * NUM_ENTRY_QWORDS, GFP_KERNEL); KUNIT_ASSERT_NOT_NULL(test_writer->test, entry_used_bits); + safe_target = kunit_kzalloc(test_writer->test, + sizeof(*safe_target) * NUM_ENTRY_QWORDS, + GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_target); + + safe_init = kunit_kzalloc(test_writer->test, + sizeof(*safe_init) * NUM_ENTRY_QWORDS, + GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_init); + pr_debug("STE value is now set to: "); print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, test_writer->entry, @@ -79,14 +94,23 @@ arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer) * configuration. 
*/ writer->ops->get_used(test_writer->entry, entry_used_bits); + if (writer->ops->get_update_safe) + writer->ops->get_update_safe(test_writer->entry, + test_writer->init_entry, + safe_init); + if (writer->ops->get_update_safe) + writer->ops->get_update_safe(test_writer->entry, + test_writer->target_entry, + safe_target); KUNIT_EXPECT_FALSE( test_writer->test, arm_smmu_entry_differs_in_used_bits( test_writer->entry, entry_used_bits, - test_writer->init_entry, NUM_ENTRY_QWORDS) && + test_writer->init_entry, safe_init, + NUM_ENTRY_QWORDS) && arm_smmu_entry_differs_in_used_bits( test_writer->entry, entry_used_bits, - test_writer->target_entry, + test_writer->target_entry, safe_target, NUM_ENTRY_QWORDS)); } } @@ -106,6 +130,7 @@ arm_smmu_v3_test_debug_print_used_bits(struct arm_smmu_entry_writer *writer, static const struct arm_smmu_entry_writer_ops test_ste_ops = { .sync = arm_smmu_test_writer_record_syncs, .get_used = arm_smmu_get_ste_used, + .get_update_safe = arm_smmu_get_ste_update_safe, }; static const struct arm_smmu_entry_writer_ops test_cd_ops = { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 504d7880ce5a..361246c6e684 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1093,6 +1093,13 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_used); +VISIBLE_IF_KUNIT +void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target, + __le64 *safe_bits) +{ +} +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_update_safe); + /* * Figure out if we can do a hitless update of entry to become target. Returns a * bit mask where 1 indicates that qword needs to be set disruptively. @@ -1105,13 +1112,22 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer, { __le64 target_used[NUM_ENTRY_QWORDS] = {}; __le64 cur_used[NUM_ENTRY_QWORDS] = {}; + __le64 safe[NUM_ENTRY_QWORDS] = {}; u8 used_qword_diff = 0; unsigned int i; writer->ops->get_used(entry, cur_used); writer->ops->get_used(target, target_used); + if (writer->ops->get_update_safe) + writer->ops->get_update_safe(entry, target, safe); for (i = 0; i != NUM_ENTRY_QWORDS; i++) { + /* + * Safe is only used for bits that are used by both entries, + * otherwise it is sequenced according to the unused entry. + */ + safe[i] &= target_used[i] & cur_used[i]; + /* * Check that masks are up to date, the make functions are not * allowed to set a bit to 1 if the used function doesn't say it @@ -1120,6 +1136,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer, WARN_ON_ONCE(target[i] & ~target_used[i]); /* Bits can change because they are not currently being used */ + cur_used[i] &= ~safe[i]; unused_update[i] = (entry[i] & cur_used[i]) | (target[i] & ~cur_used[i]); /* @@ -1132,7 +1149,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer, return used_qword_diff; } -static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, +static void entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, const __le64 *target, unsigned int start, unsigned int len) { @@ -1148,7 +1165,6 @@ static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, if (changed) writer->ops->sync(writer); - return changed; } /* @@ -1218,12 +1234,9 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry, entry_set(writer, entry, target, 0, 1); } else { /* - * No inuse bit changed. 
Sanity check that all unused bits are 0 - * in the entry. The target was already sanity checked by - * compute_qword_diff(). + * No inuse bit changed, though safe bits may have changed. */ - WARN_ON_ONCE( - entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS)); + entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS); } } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_write_entry); @@ -1554,6 +1567,7 @@ static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer) static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = { .sync = arm_smmu_ste_writer_sync_entry, .get_used = arm_smmu_get_ste_used, + .get_update_safe = arm_smmu_get_ste_update_safe, }; static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 0a5bb57dbdfe..3c6d65d36164 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -898,6 +898,8 @@ struct arm_smmu_entry_writer { struct arm_smmu_entry_writer_ops { void (*get_used)(const __le64 *entry, __le64 *used); + void (*get_update_safe)(const __le64 *cur, const __le64 *target, + __le64 *safe_bits); void (*sync)(struct arm_smmu_entry_writer *writer); }; @@ -909,6 +911,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, #if IS_ENABLED(CONFIG_KUNIT) void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); +void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target, + __le64 *safe_bits); void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, const __le64 *target); void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits); -- cgit v1.2.3 From f3c1d372dbb8e5a86923f20db66deabef42bfc9d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 15 Jan 2026 10:23:29 -0800 Subject: iommu/arm-smmu-v3: Mark STE MEV safe when computing the update sequence Nested CD tables set the MEV bit to try to reduce multi-fault spamming on the hypervisor. Since MEV is in STE word 1 this causes a breaking update sequence that is not required and impacts real workloads. For the purposes of STE updates the value of MEV doesn't matter, if it is set/cleared early or late it just results in a change to the fault reports that must be supported by the kernel anyhow. The spec says: Note: Software must expect, and be able to deal with, coalesced fault records even when MEV == 0. So mark STE MEV safe when computing the update sequence, to avoid creating a breaking update. Fixes: da0c56520e88 ("iommu/arm-smmu-v3: Set MEV bit in nested STE for DoS mitigations") Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe Reviewed-by: Shuai Xue Reviewed-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 361246c6e684..3ab0f047c892 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1097,6 +1097,16 @@ VISIBLE_IF_KUNIT void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target, __le64 *safe_bits) { + /* + * MEV does not meaningfully impact the operation of the HW, it only + * changes how many fault events are generated, thus we can relax it + * when computing the ordering. 
The spec notes the device can act like + * MEV=1 anyhow: + * + * Note: Software must expect, and be able to deal with, coalesced + * fault records even when MEV == 0. + */ + safe_bits[1] |= cpu_to_le64(STRTAB_STE_1_MEV); } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_update_safe); -- cgit v1.2.3 From 7cad800485956a263318930613f8f4a084af8c70 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 15 Jan 2026 10:23:30 -0800 Subject: iommu/arm-smmu-v3: Mark EATS_TRANS safe when computing the update sequence If VM wants to toggle EATS_TRANS off at the same time as changing the CFG, hypervisor will see EATS change to 0 and insert a V=0 breaking update into the STE even though the VM did not ask for that. In bare metal, EATS_TRANS is ignored by CFG=ABORT/BYPASS, which is why this does not cause a problem until we have the nested case where CFG is always a variation of S2 trans that does use EATS_TRANS. Relax the rules for EATS_TRANS sequencing, we don't need it to be exact as the enclosing code will always disable ATS at the PCI device when changing EATS_TRANS. This ensures there are no ATS transactions that can race with an EATS_TRANS change so we don't need to carefully sequence these bits. Fixes: 1e8be08d1c91 ("iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED") Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe Reviewed-by: Shuai Xue Signed-off-by: Nicolin Chen Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3ab0f047c892..852379845359 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1097,6 +1097,32 @@ VISIBLE_IF_KUNIT void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target, __le64 *safe_bits) { + const __le64 eats_s1chk = + FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_S1CHK); + const __le64 eats_trans = + FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS); + + /* + * When an STE changes EATS_TRANS, the sequencing code in the attach + * logic already will have the PCI cap for ATS disabled. Thus at this + * moment we can expect that the device will not generate ATS queries + * and so we don't care about the sequencing of EATS. The purpose of + * EATS_TRANS is to protect the system from hostile untrusted devices + * that issue ATS when the PCI config space is disabled. However, if + * EATS_TRANS is being changed, then we must have already trusted the + * device as the EATS_TRANS security block is being disabled. + * + * Note: now the EATS_TRANS update is moved to the first entry_set(). + * Changing S2S and EATS might transiently result in S2S=1 and EATS=1 + * which is a bad STE (see "5.2 Stream Table Entry"). In such a case, + * we can't do a hitless update. Also, it should not be added to the + * safe bits with STRTAB_STE_1_EATS_S1CHK, because EATS=0b11 would be + * effectively an errant 0b00 configuration. 
+ */ + if (!((cur[1] | target[1]) & cpu_to_le64(eats_s1chk)) && + !((cur[2] | target[2]) & cpu_to_le64(STRTAB_STE_2_S2S))) + safe_bits[1] |= cpu_to_le64(eats_trans); + /* * MEV does not meaningfully impact the operation of the HW, it only * changes how many fault events are generated, thus we can relax it -- cgit v1.2.3 From a4f976edcb87a9daf5384f3e5e13f80e0e180aa6 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 15 Jan 2026 10:23:31 -0800 Subject: iommu/arm-smmu-v3-test: Add nested s1bypass/s1dssbypass coverage STE in a nested case requires both S1 and S2 fields. And this makes the use case different from the existing one. Add coverage for previously failed cases shifting between S2-only and S1+S2 STEs. Reviewed-by: Shuai Xue Reviewed-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index b254a94b2003..69c9ef441fc1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -33,8 +33,12 @@ static struct mm_struct sva_mm = { enum arm_smmu_test_master_feat { ARM_SMMU_MASTER_TEST_ATS = BIT(0), ARM_SMMU_MASTER_TEST_STALL = BIT(1), + ARM_SMMU_MASTER_TEST_NESTED = BIT(2), }; +static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, + enum arm_smmu_test_master_feat feat); + static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry, const __le64 *used_bits, const __le64 *target, @@ -210,6 +214,18 @@ static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, }; arm_smmu_make_cdtable_ste(ste, &master, ats_enabled, s1dss); + if (feat & ARM_SMMU_MASTER_TEST_NESTED) { + struct arm_smmu_ste s2ste; + int i; + + arm_smmu_test_make_s2_ste(&s2ste, + feat & ~ARM_SMMU_MASTER_TEST_NESTED); + ste->data[0] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_NESTED)); + ste->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV); + for (i = 2; i < NUM_ENTRY_QWORDS; i++) + ste->data[i] = s2ste.data[i]; + } } static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) @@ -567,6 +583,35 @@ static void arm_smmu_v3_write_ste_test_s2_to_s1_stall(struct kunit *test) NUM_EXPECTED_SYNCS(3)); } +static void +arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste( + &s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr, + ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED); + arm_smmu_test_make_s2_ste(&s2_ste, 0); + /* Expect an additional sync to unset ignored bits: EATS and MEV */ + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void +arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste( + &s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr, + ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED); + arm_smmu_test_make_s2_ste(&s2_ste, 0); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste, + NUM_EXPECTED_SYNCS(2)); +} + static void arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test) { struct arm_smmu_cd cd = {}; @@ -613,6 +658,8 @@ static struct kunit_case arm_smmu_v3_test_cases[] = { 
KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid),
 	KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2_stall),
 	KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1_stall),
+	KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass),
+	KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass),
 	KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear),
 	KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release),
 	{},
-- cgit v1.2.3

From a45dd34663025c75652b27e384e91c9c05ba1d80 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 14 Jan 2026 17:12:43 -0800
Subject: iommu/arm-smmu-v3: Do not set disable_ats unless vSTE is Translate

A vSTE may have three configuration types: Abort, Bypass, and Translate. An
Abort vSTE wouldn't enable ATS, but the other two might. It makes sense for
a Translate vSTE to rely on the guest vSTE.EATS field. For a Bypass vSTE, it
would end up with an S2-only physical STE, similar to an attachment to a
regular S2 domain.

However, the nested case always disables ATS following the Bypass vSTE, while
the regular S2 case always enables ATS so long as
arm_smmu_ats_supported(master) == true.

Note that ATS is needed for certain VM centric workloads and historically
non-vSMMU cases have relied on this automatic enablement. So, having the
nested case behave differently causes problems.

To fix that, add a condition to disable_ats, so that it might enable ATS for
a Bypass vSTE, aligning with the regular S2 case.

Fixes: f27298a82ba0 ("iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED")
Cc: stable@vger.kernel.org
Suggested-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Reviewed-by: Pranjal Shrivastava
Reviewed-by: Jason Gunthorpe
Signed-off-by: Will Deacon
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 93fdadd07431..823461a26659 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -177,7 +177,9 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
	 * config bit here base this off the EATS value in the STE. If the EATS
	 * is set then the VM must generate ATC flushes.
*/ - state.disable_ats = !nested_domain->enable_ats; + if (FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])) == + STRTAB_STE_0_CFG_S1_TRANS) + state.disable_ats = !nested_domain->enable_ats; ret = arm_smmu_attach_prepare(&state, domain); if (ret) { mutex_unlock(&arm_smmu_asid_lock); -- cgit v1.2.3 From 5b0530bb16ec5af3cc96fc25891d6a860fd37a8c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 22 Jan 2026 14:42:38 -0700 Subject: iommu/amd: Fix type of type parameter to amd_iommufd_hw_info() When building with -Wincompatible-function-pointer-types-strict, a warning designed to catch kernel control flow integrity (kCFI) issues at build time, there is an instance around amd_iommufd_hw_info(): drivers/iommu/amd/iommu.c:3141:13: error: incompatible function pointer types initializing 'void *(*)(struct device *, u32 *, enum iommu_hw_info_type *)' (aka 'void *(*)(struct device *, unsigned int *, enum iommu_hw_info_type *)') with an expression of type 'void *(struct device *, u32 *, u32 *)' (aka 'void *(struct device *, unsigned int *, unsigned int *)') [-Werror,-Wincompatible-function-pointer-types-strict] 3141 | .hw_info = amd_iommufd_hw_info, | ^~~~~~~~~~~~~~~~~~~ While 'u32 *' and 'enum iommu_hw_info_type *' are ABI compatible, hence no regular warning from -Wincompatible-function-pointer-types, the mismatch will trigger a kCFI violation when amd_iommufd_hw_info() is called indirectly. Update the type parameter of amd_iommufd_hw_info() to be 'enum iommu_hw_info_type *' to match the prototype in 'struct iommu_ops', clearing up the warning and kCFI violation. Fixes: 7d8b06ecc45b ("iommu/amd: Add support for hw_info for iommu capability query") Signed-off-by: Nathan Chancellor Reviewed-by: Vasant Hegde Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommufd.c | 2 +- drivers/iommu/amd/iommufd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c index ad627fd5ccc7..96ec6a4a760d 100644 --- a/drivers/iommu/amd/iommufd.c +++ b/drivers/iommu/amd/iommufd.c @@ -11,7 +11,7 @@ static const struct iommufd_viommu_ops amd_viommu_ops; -void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type) +void *amd_iommufd_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type) { struct iommu_hw_info_amd *hwinfo; diff --git a/drivers/iommu/amd/iommufd.h b/drivers/iommu/amd/iommufd.h index f05aad495b5b..62e9e1bebfbe 100644 --- a/drivers/iommu/amd/iommufd.h +++ b/drivers/iommu/amd/iommufd.h @@ -7,7 +7,7 @@ #define AMD_IOMMUFD_H #if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD) -void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type); +void *amd_iommufd_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type); size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type); int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent, const struct iommu_user_data *user_data); -- cgit v1.2.3 From 9e249c48412828e807afddc21527eb734dc9bd3d Mon Sep 17 00:00:00 2001 From: Ankit Soni Date: Thu, 22 Jan 2026 15:30:38 +0000 Subject: iommu/amd: serialize sequence allocation under concurrent TLB invalidations With concurrent TLB invalidations, completion wait randomly gets timed out because cmd_sem_val was incremented outside the IOMMU spinlock, allowing CMD_COMPL_WAIT commands to be queued out of sequence and breaking the ordering assumption in wait_on_sem(). 
Move the cmd_sem_val increment under iommu->lock so completion sequence allocation is serialized with command queuing. And remove the unnecessary return. Fixes: d2a0cac10597 ("iommu/amd: move wait_on_sem() out of spinlock") Tested-by: Srikanth Aithal Reported-by: Srikanth Aithal Signed-off-by: Ankit Soni Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/init.c | 2 +- drivers/iommu/amd/iommu.c | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index cfcbad6c28ff..c685d3771436 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -752,7 +752,7 @@ struct amd_iommu { u32 flags; volatile u64 *cmd_sem; - atomic64_t cmd_sem_val; + u64 cmd_sem_val; /* * Track physical address to directly use it in build_completion_wait() * and avoid adding any special checks and handling for kdump. diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index b1c344ed7dbd..02c0c64c5f6b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1885,7 +1885,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, iommu->pci_seg = pci_seg; raw_spin_lock_init(&iommu->lock); - atomic64_set(&iommu->cmd_sem_val, 0); + iommu->cmd_sem_val = 0; /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 285ae635c324..58be841d624e 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1439,6 +1439,12 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) return iommu_queue_command_sync(iommu, cmd, true); } +static u64 get_cmdsem_val(struct amd_iommu *iommu) +{ + lockdep_assert_held(&iommu->lock); + return ++iommu->cmd_sem_val; +} + /* * This function queues a completion wait command into the command * buffer of an IOMMU @@ -1453,11 +1459,11 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (!iommu->need_sync) return 0; - data = atomic64_inc_return(&iommu->cmd_sem_val); - build_completion_wait(&cmd, iommu, data); - raw_spin_lock_irqsave(&iommu->lock, flags); + data = get_cmdsem_val(iommu); + build_completion_wait(&cmd, iommu, data); + ret = __iommu_queue_command_sync(iommu, &cmd, false); raw_spin_unlock_irqrestore(&iommu->lock, flags); @@ -3177,10 +3183,11 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) return; build_inv_irt(&cmd, devid); - data = atomic64_inc_return(&iommu->cmd_sem_val); - build_completion_wait(&cmd2, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); + data = get_cmdsem_val(iommu); + build_completion_wait(&cmd2, iommu, data); + ret = __iommu_queue_command_sync(iommu, &cmd, true); if (ret) goto out_err; @@ -3194,7 +3201,6 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) out_err: raw_spin_unlock_irqrestore(&iommu->lock, flags); - return; } static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) -- cgit v1.2.3 From b48ca920613858b477f75946907e72c74570af05 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Tue, 3 Feb 2026 16:29:34 +0800 Subject: iommupt: Always add IOVA range to iotlb_gather in gather_range_pages() Add current (iova, len) to the iotlb gather, regardless of the setting of PT_FEAT_FLUSH_RANGE or PT_FEAT_FLUSH_RANGE_NO_GAPS. 
In gather_range_pages(), the current IOVA range is only added to iotlb_gather when PT_FEAT_FLUSH_RANGE is set. Yet a virtual IOMMU with NpCache uses only PT_FEAT_FLUSH_RANGE_NO_GAPS. In that case, iotlb_gather will stay empty (start=ULONG_MAX, end=0) after initialization, and the current (iova, len) will not be added to the iotlb_gather, causing subsequent iommu_iotlb_sync() to perform IOTLB invalidation with wrong parameters (e.g., amd_iommu_iotlb_sync() computes size from gather->end - gather->start + 1, leading to an invalid range). The disjoint check and sync for PT_FEAT_FLUSH_RANGE_NO_GAPS remain unchanged: when the new range is disjoint from the existing gather, we still sync first and then add the new range, so semantics for NO_GAPS are preserved. Fixes: 7c53f4238aa8 ("iommupt: Add unmap_pages op") Cc: stable@vger.kernel.org Reviewed-by: Jason Gunthorpe Signed-off-by: Yu Zhang Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 3327116a441c..6de22fa6bb34 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -58,10 +58,9 @@ static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, * Note that the sync frees the gather's free list, so we must * not have any pages on that list that are covered by iova/len */ - } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { - iommu_iotlb_gather_add_range(iotlb_gather, iova, len); } + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); iommu_pages_list_splice(free_list, &iotlb_gather->freelist); } -- cgit v1.2.3
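For readers unfamiliar with the iotlb_gather bookkeeping that the last fix depends on: the gather object starts out as an empty window (start = ULONG_MAX, end = 0) and only becomes meaningful once a range is added, which is why skipping iommu_iotlb_gather_add_range() under PT_FEAT_FLUSH_RANGE_NO_GAPS left drivers such as the AMD one computing an invalidation size from an untouched window. The following is a minimal, self-contained sketch of that accumulation; it mirrors the semantics described in the commit message rather than quoting the kernel helpers, so the names and exact arithmetic here are illustrative assumptions.

#include <limits.h>
#include <stdio.h>

/* Toy model of an IOTLB gather window: start/end bound all ranges added. */
struct toy_gather {
	unsigned long start;	/* ULONG_MAX means "nothing gathered yet" */
	unsigned long end;
};

static void toy_gather_add_range(struct toy_gather *g, unsigned long iova,
				 unsigned long len)
{
	unsigned long last = iova + len - 1;

	if (iova < g->start)
		g->start = iova;
	if (last > g->end)
		g->end = last;
}

int main(void)
{
	struct toy_gather g = { .start = ULONG_MAX, .end = 0 };

	/*
	 * Before the fix: no range is added under NO_GAPS, so a driver that
	 * computes "end - start + 1" sees a wrapped, meaningless size.
	 */
	printf("untouched window size: %lu\n", g.end - g.start + 1);

	/* After the fix: every unmapped range widens the window. */
	toy_gather_add_range(&g, 0x1000, 0x1000);
	toy_gather_add_range(&g, 0x4000, 0x2000);
	printf("gathered window: [0x%lx, 0x%lx]\n", g.start, g.end);
	return 0;
}

With that model in mind, the one-line change above simply guarantees the window is populated for both FLUSH_RANGE flavours before iommu_iotlb_sync() runs, while the NO_GAPS disjoint-range sync behaviour is left unchanged.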