diff options
| author | Nicolin Chen <nicolinc@nvidia.com> | 2025-12-15 13:42:20 -0800 |
|---|---|---|
| committer | Joerg Roedel <joerg.roedel@amd.com> | 2026-01-10 10:26:44 +0100 |
| commit | f5b16b802174fe2b67e2b6a27fa793b749981816 (patch) | |
| tree | fee6a41d87e57e2e2360a3b9e4a959add4edd930 | |
| parent | c279e83953d937470f8a6e69b69f62608714f13f (diff) | |
PCI: Suspend iommu function prior to resetting a device
PCIe permits a device to ignore ATS invalidation TLPs while processing a
reset. This creates a problem visible to the OS where an ATS invalidation
command will time out: e.g. an SVA domain will have no coordination with a
reset event and can racily issue ATS invalidations to a resetting device.
The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends that software disable
and block ATS before initiating a Function Level Reset. It also mentions that
other reset methods could have the same vulnerability.
The IOMMU subsystem provides pci_dev_reset_iommu_prepare/done() callback
helpers for this purpose. Use them in all the existing reset functions.
This will attach the device to its iommu_group->blocking_domain during the
device reset, so as to allow the IOMMU driver to:
- invoke pci_disable_ats() and pci_enable_ats(), if necessary
- wait for all ATS invalidations to complete
- stop issuing new ATS invalidations
- fence any incoming ATS queries
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Dheeraj Kumar Srivastava <dheerajkumar.srivastava@amd.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
| -rw-r--r-- | drivers/pci/pci-acpi.c | 13 | ||||
| -rw-r--r-- | drivers/pci/pci.c | 65 | ||||
| -rw-r--r-- | drivers/pci/quirks.c | 19 |
3 files changed, 87 insertions, 10 deletions
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 9369377725fa..651d9b5561ff 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <linux/init.h> +#include <linux/iommu.h> #include <linux/irqdomain.h> #include <linux/pci.h> #include <linux/msi.h> @@ -971,6 +972,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev) int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { acpi_handle handle = ACPI_HANDLE(&dev->dev); + int ret; if (!handle || !acpi_has_method(handle, "_RST")) return -ENOTTY; @@ -978,12 +980,19 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) if (probe) return 0; + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) { pci_warn(dev, "ACPI _RST failed\n"); - return -ENOTTY; + ret = -ENOTTY; } - return 0; + pci_dev_reset_iommu_done(dev); + return ret; } bool acpi_pci_power_manageable(struct pci_dev *dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 13dbb405dc31..a0ba42ae7ee0 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -13,6 +13,7 @@ #include <linux/delay.h> #include <linux/dmi.h> #include <linux/init.h> +#include <linux/iommu.h> #include <linux/msi.h> #include <linux/of.h> #include <linux/pci.h> @@ -25,6 +26,7 @@ #include <linux/logic_pio.h> #include <linux/device.h> #include <linux/pm_runtime.h> +#include <linux/pci-ats.h> #include <linux/pci_hotplug.h> #include <linux/vmalloc.h> #include <asm/dma.h> @@ -4330,13 +4332,22 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction); */ int pcie_flr(struct pci_dev *dev) { + int ret; + if (!pci_wait_for_pending_transaction(dev)) pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); + /* Have to call it after waiting for pending DMA transaction */ + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + 
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); if (dev->imm_ready) - return 0; + goto done; /* * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within @@ -4345,7 +4356,10 @@ int pcie_flr(struct pci_dev *dev) */ msleep(100); - return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); + ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); +done: + pci_dev_reset_iommu_done(dev); + return ret; } EXPORT_SYMBOL_GPL(pcie_flr); @@ -4373,6 +4387,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr); static int pci_af_flr(struct pci_dev *dev, bool probe) { + int ret; int pos; u8 cap; @@ -4399,10 +4414,17 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) PCI_AF_STATUS_TP << 8)) pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n"); + /* Have to call it after waiting for pending DMA transaction */ + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); if (dev->imm_ready) - return 0; + goto done; /* * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006, @@ -4412,7 +4434,10 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) */ msleep(100); - return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); + ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); +done: + pci_dev_reset_iommu_done(dev); + return ret; } /** @@ -4433,6 +4458,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe) static int pci_pm_reset(struct pci_dev *dev, bool probe) { u16 csr; + int ret; if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) return -ENOTTY; @@ -4447,6 +4473,12 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe) if (dev->current_state != PCI_D0) return -EINVAL; + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + 
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D3hot; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); @@ -4457,7 +4489,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe) pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); pci_dev_d3_sleep(dev); - return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); + ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); + pci_dev_reset_iommu_done(dev); + return ret; } /** @@ -4885,10 +4919,20 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe) return -ENOTTY; } + rc = pci_dev_reset_iommu_prepare(dev); + if (rc) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc); + return rc; + } + rc = pci_dev_reset_slot_function(dev, probe); if (rc != -ENOTTY) - return rc; - return pci_parent_bus_reset(dev, probe); + goto done; + + rc = pci_parent_bus_reset(dev, probe); +done: + pci_dev_reset_iommu_done(dev); + return rc; } static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) @@ -4912,6 +4956,12 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) if (rc) return -ENOTTY; + rc = pci_dev_reset_iommu_prepare(dev); + if (rc) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc); + return rc; + } + if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) { val = reg; } else { @@ -4926,6 +4976,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, reg); + pci_dev_reset_iommu_done(dev); return rc; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b9c252aa6fe0..c6b999045c70 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -21,6 +21,7 @@ #include <linux/pci.h> #include <linux/isa-dma.h> /* isa_dma_bridge_buggy */ #include <linux/init.h> +#include <linux/iommu.h> #include <linux/delay.h> #include <linux/acpi.h> #include <linux/dmi.h> @@ -4228,6 
+4229,22 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { { 0 } }; +static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe, + const struct pci_dev_reset_methods *i) +{ + int ret; + + ret = pci_dev_reset_iommu_prepare(dev); + if (ret) { + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); + return ret; + } + + ret = i->reset(dev, probe); + pci_dev_reset_iommu_done(dev); + return ret; +} + /* * These device-specific reset methods are here rather than in a driver * because when a host assigns a device to a guest VM, the host may need @@ -4242,7 +4259,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe) i->vendor == (u16)PCI_ANY_ID) && (i->device == dev->device || i->device == (u16)PCI_ANY_ID)) - return i->reset(dev, probe); + return __pci_dev_specific_reset(dev, probe, i); } return -ENOTTY; |
