From fcc1d0dabdb65ca069f77e5b76d3b20277be4a15 Mon Sep 17 00:00:00 2001
From: Radu Rendec
Date: Fri, 28 Nov 2025 16:20:53 -0500
Subject: genirq: Add interrupt redirection infrastructure

Add infrastructure to redirect interrupt handler execution to a different
CPU when the current CPU is not part of the interrupt's CPU affinity mask.

This is primarily aimed at (de)multiplexed interrupts, where the child
interrupt handler runs in the context of the parent interrupt handler, and
therefore CPU affinity control for the child interrupt is typically not
available.

With the new infrastructure, the child interrupt is allowed to freely
change its affinity setting, independently of the parent. If the interrupt
handler happens to be triggered on an "incompatible" CPU (a CPU that's not
part of the child interrupt's affinity mask), the handler is redirected
and runs in IRQ work context on a "compatible" CPU.

No functional change is being made to any existing irqchip driver, and
irqchip drivers must be explicitly modified to use the newly added
infrastructure to support interrupt redirection.

Originally-by: Thomas Gleixner
Signed-off-by: Radu Rendec
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/linux-pci/878qpg4o4t.ffs@tglx/
Link: https://patch.msgid.link/20251128212055.1409093-2-rrendec@redhat.com
---
 kernel/irq/chip.c    | 22 +++++++++++++-
 kernel/irq/irqdesc.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/irq/manage.c  | 15 +++++++--
 3 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 678f094d261a..433f1dd2b0ca 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1122,7 +1122,7 @@ void irq_cpu_offline(void)
 }
 #endif
 
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
 
 #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
 /**
@@ -1194,6 +1194,15 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
 
 #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
 
+#ifdef CONFIG_SMP
+void irq_chip_pre_redirect_parent(struct irq_data *data)
+{
+        data = data->parent_data;
+        data->chip->irq_pre_redirect(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_pre_redirect_parent);
+#endif
+
 /**
  * irq_chip_set_parent_state - set the state of a parent interrupt.
 *
@@ -1476,6 +1485,17 @@ void irq_chip_release_resources_parent(struct irq_data *data)
                data->chip->irq_release_resources(data);
 }
 EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+
+#ifdef CONFIG_SMP
+int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force)
+{
+        struct irq_redirect *redir = &irq_data_to_desc(data)->redirect;
+
+        WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
+        return IRQ_SET_MASK_OK;
+}
+EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
 #endif
 
 /**
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index f8e4e13dbe33..501a653d4153 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -78,8 +78,12 @@ static int alloc_masks(struct irq_desc *desc, int node)
        return 0;
 }
 
-static void desc_smp_init(struct irq_desc *desc, int node,
-                          const struct cpumask *affinity)
+static void irq_redirect_work(struct irq_work *work)
+{
+        handle_irq_desc(container_of(work, struct irq_desc, redirect.work));
+}
+
+static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity)
 {
        if (!affinity)
                affinity = irq_default_affinity;
@@ -91,6 +95,7 @@ static void desc_smp_init(struct irq_desc *desc, int node,
 #ifdef CONFIG_NUMA
        desc->irq_common_data.node = node;
 #endif
+        desc->redirect.work = IRQ_WORK_INIT_HARD(irq_redirect_work);
 }
 
 static void free_masks(struct irq_desc *desc)
@@ -766,6 +771,83 @@ int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq)
        WARN_ON_ONCE(!in_nmi());
        return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
 }
+
+#ifdef CONFIG_SMP
+static bool demux_redirect_remote(struct irq_desc *desc)
+{
+        guard(raw_spinlock)(&desc->lock);
+        const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+        unsigned int target_cpu = READ_ONCE(desc->redirect.target_cpu);
+
+        if (desc->irq_data.chip->irq_pre_redirect)
+                desc->irq_data.chip->irq_pre_redirect(&desc->irq_data);
+
+        /*
+         * If the interrupt handler is already running on a CPU that's
+         * included in the interrupt's affinity mask, redirection is not
+         * necessary.
+         */
+        if (cpumask_test_cpu(smp_processor_id(), m))
+                return false;
+
+        /*
+         * The desc->action check protects against IRQ shutdown:
+         * __free_irq() sets desc->action to NULL while holding desc->lock,
+         * which we also hold.
+         *
+         * Calling irq_work_queue_on() here is safe w.r.t. CPU unplugging:
+         * - takedown_cpu() schedules multi_cpu_stop() on all active CPUs,
+         *   including the one that's taken down.
+         * - multi_cpu_stop() acts like a barrier, which means all active
+         *   CPUs go through MULTI_STOP_DISABLE_IRQ and disable hard IRQs
+         *   *before* the dying CPU runs take_cpu_down() in MULTI_STOP_RUN.
+         * - Hard IRQs are re-enabled at the end of multi_cpu_stop(), *after*
+         *   the dying CPU has run take_cpu_down() in MULTI_STOP_RUN.
+         * - Since we run in hard IRQ context, we run either before or after
+         *   take_cpu_down() but never concurrently.
+         * - If we run before take_cpu_down(), the dying CPU hasn't been
+         *   marked offline yet (it's marked via take_cpu_down() ->
+         *   __cpu_disable()), so the WARN in irq_work_queue_on() can't occur.
+         * - Furthermore, the work item we queue will be flushed later via
+         *   take_cpu_down() -> cpuhp_invoke_callback_range_nofail() ->
+         *   smpcfd_dying_cpu() -> irq_work_run().
+         * - If we run after take_cpu_down(), target_cpu has already been
+         *   updated via take_cpu_down() -> __cpu_disable(), which eventually
+         *   calls irq_do_set_affinity() during IRQ migration.
+         *   So, target_cpu no longer points to the dying CPU in this case.
+         */
+        if (desc->action)
+                irq_work_queue_on(&desc->redirect.work, target_cpu);
+
+        return true;
+}
+#else /* CONFIG_SMP */
+static bool demux_redirect_remote(struct irq_desc *desc)
+{
+        return false;
+}
+#endif
+
+/**
+ * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt
+ *                                   of a demultiplexing domain.
+ * @domain:     The domain where to perform the lookup
+ * @hwirq:      The hardware interrupt number to convert to a logical one
+ *
+ * Returns:     True on success, or false if lookup has failed
+ */
+bool generic_handle_demux_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq)
+{
+        struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);
+
+        if (unlikely(!desc))
+                return false;
+
+        if (demux_redirect_remote(desc))
+                return true;
+
+        return !handle_irq_desc(desc);
+}
+EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq);
+
 #endif
 
 /* Dynamic interrupt handling */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8b1b4c8a4f54..acb4c3de69c6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -35,6 +35,16 @@ static int __init setup_forced_irqthreads(char *arg)
 early_param("threadirqs", setup_forced_irqthreads);
 #endif
 
+#ifdef CONFIG_SMP
+static inline void synchronize_irqwork(struct irq_desc *desc)
+{
+        /* Synchronize pending or on the fly redirect work */
+        irq_work_sync(&desc->redirect.work);
+}
+#else
+static inline void synchronize_irqwork(struct irq_desc *desc) { }
+#endif
+
 static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
 
 static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
@@ -107,7 +117,9 @@ EXPORT_SYMBOL(synchronize_hardirq);
 
 static void __synchronize_irq(struct irq_desc *desc)
 {
+        synchronize_irqwork(desc);
        __synchronize_hardirq(desc, true);
+
        /*
         * We made sure that no hardirq handler is running. Now verify that no
         * threaded handlers are active.
@@ -217,8 +229,7 @@ static inline void irq_validate_effective_affinity(struct irq_data *data) { }
 
 static DEFINE_PER_CPU(struct cpumask, __tmp_mask);
 
-int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
-                        bool force)
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
        struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
        struct irq_desc *desc = irq_data_to_desc(data);
-- cgit v1.2.3
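
[To see how the pieces of this patch fit together, here is a minimal sketch of a
demultiplexing irqchip driver wired up to the new infrastructure. Everything
prefixed my_/MY_ is hypothetical (register layout, driver state), and the
struct irq_redirect and irq_pre_redirect declarations land in the include/linux
headers, which are outside this kernel-directory diff. Only
generic_handle_demux_domain_irq() and irq_chip_redirect_set_affinity() are the
interfaces added above.]

#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/irqchip/chained_irq.h>

#define MY_DEMUX_NR_IRQS 32
#define MY_DEMUX_STATUS  0x00           /* hypothetical status register */

struct my_demux {
        struct irq_domain *domain;
        void __iomem *base;
};

static void my_demux_parent_handler(struct irq_desc *desc)
{
        struct my_demux *mux = irq_desc_get_handler_data(desc);
        struct irq_chip *chip = irq_desc_get_chip(desc);
        unsigned long pending;
        unsigned int hwirq;

        chained_irq_enter(chip, desc);

        pending = readl(mux->base + MY_DEMUX_STATUS);

        for_each_set_bit(hwirq, &pending, MY_DEMUX_NR_IRQS) {
                /*
                 * Runs the child handler in place when the current CPU is
                 * in the child's affinity mask; otherwise queues it as IRQ
                 * work on the child's redirection target CPU.
                 */
                generic_handle_demux_domain_irq(mux->domain, hwirq);
        }

        chained_irq_exit(chip, desc);
}

static struct irq_chip my_demux_child_chip = {
        .name             = "my-demux",
        /* Core-provided callback: records cpumask_first() as target CPU */
        .irq_set_affinity = irq_chip_redirect_set_affinity,
};

[With this shape, writing /proc/irq/<N>/smp_affinity for a child interrupt only
updates redirect.target_cpu and the effective mask; the parent interrupt's
affinity is untouched.]
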
From df439718afaf23b5aa7b5711b6c14e87b5836cae Mon Sep 17 00:00:00 2001
From: Radu Rendec
Date: Mon, 12 Jan 2026 16:14:02 -0500
Subject: genirq: Update effective affinity for redirected interrupts

For redirected interrupts, irq_chip_redirect_set_affinity() does not
update the effective affinity mask, which then triggers the warning in
irq_validate_effective_affinity(). Also, because the effective affinity
mask is empty, the cpumask_test_cpu(smp_processor_id(), m) condition in
demux_redirect_remote() is always false, and the interrupt is always
redirected, even if it's already running on the target CPU.

Set the effective affinity mask to be the same as the requested affinity
mask. It's worth noting that irq_do_set_affinity() filters out offline
CPUs before calling chip->irq_set_affinity() (unless `force` is set), so
the mask passed to irq_chip_redirect_set_affinity() is already filtered.

The solution is not ideal because it may lie about the effective affinity
of the demultiplexed ("child") interrupt. If the requested affinity mask
includes multiple CPUs, the effective affinity is, in reality, the union
of two sets: the intersection between the requested mask and the
demultiplexing ("parent") interrupt's effective affinity mask (where the
handler can run in place), and the first CPU in the requested mask (where
redirected work is queued).

Accurately describing the effective affinity of the demultiplexed
interrupt is not trivial because it requires keeping track of the
demultiplexing interrupt's effective affinity. That is tricky in the
context of CPU hot(un)plugging, where interrupt migration ordering is not
guaranteed. The solution in the initial version of the patch being fixed
here, which stored the first CPU of the demultiplexing interrupt's
effective affinity in the `target_cpu` field, has its own drawbacks and
limitations.

Fixes: fcc1d0dabdb6 ("genirq: Add interrupt redirection infrastructure")
Reported-by: Jon Hunter
Closes: https://lore.kernel.org/all/44509520-f29b-4b8a-8986-5eae3e022eb7@nvidia.com/
Signed-off-by: Radu Rendec
Signed-off-by: Thomas Gleixner
Tested-by: Jon Hunter
Link: https://patch.msgid.link/20260112211402.2927336-1-rrendec@redhat.com
---
 kernel/irq/chip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 433f1dd2b0ca..35bc17bc369e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1493,6 +1493,8 @@ int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *
        struct irq_redirect *redir = &irq_data_to_desc(data)->redirect;
 
        WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
+        irq_data_update_effective_affinity(data, dest);
+
        return IRQ_SET_MASK_OK;
 }
 EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
-- cgit v1.2.3
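
[To make the "lie" described above concrete, the true effective affinity could
be computed as in this sketch. It is not part of the patch; the function and
parameter names are illustrative.]

#include <linux/cpumask.h>

static void child_true_effective_affinity(struct cpumask *eff,
                                          const struct cpumask *requested,
                                          const struct cpumask *parent_eff)
{
        /* CPUs where the child handler can run in place, under the parent */
        cpumask_and(eff, requested, parent_eff);
        /* ...plus the CPU that redirected IRQ work is queued on */
        cpumask_set_cpu(cpumask_first(requested), eff);
}

[The patch instead reports the whole requested mask, which is a superset of
this result and avoids tracking the parent's effective affinity across CPU
hotplug.]
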
From 37f9d5026cd78fbe80a124edbbadab382b26545f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 27 Jan 2026 22:30:16 +0100
Subject: genirq/redirect: Prevent writing MSI message on affinity change

The interrupts which are handled by the redirection infrastructure provide
an irq_set_affinity() callback, which solely determines the target CPU for
redirection via irq_work and updates the effective affinity mask.

Contrary to regular MSI interrupts, this affinity setting does not change
the underlying interrupt message, as the message is only created at setup
time to deliver to the demultiplexing interrupt. Therefore the message
write in msi_domain_set_affinity() is a pointless exercise.

In principle the write is harmless, but a Tegra system exposes a full
system hang during suspend due to that write. It's unclear why the check
for the PCI device state PCI_D0 in pci_msi_domain_write_msg(), which
prevents the actual hardware access if a device is in a powered-down
state, fails on this particular system, but that's a different problem
which needs to be investigated by the Tegra experts.

The irq_set_affinity() callback can advise msi_domain_set_affinity() not
to write the MSI message by returning IRQ_SET_MASK_OK_DONE instead of
IRQ_SET_MASK_OK. Do exactly that.

Just to make it clear again: This is not a correctness issue of the
redirection code, as returning IRQ_SET_MASK_OK in that context is
completely correct. From the core code point of view this is solely an
optimization to avoid a redundant hardware write.

As a byproduct it papers over the underlying problem on the Tegra
platform, which fails to put the PCIe device[s] out of PCI_D0 despite the
fact that the devices and busses have been shut down. The redirect
infrastructure just unearthed the underlying issue, which is likely to
surface in quite a few other code paths that use the PCI_D0 check to
prevent hardware access to powered-down devices.

This patch therefore carries neither a 'Fixes:' nor a 'Closes:' tag, as
the underlying problem, which is outside the scope of the interrupt code,
is still unresolved.

Reported-by: Jon Hunter
Signed-off-by: Thomas Gleixner
Tested-by: Jon Hunter
Link: https://lore.kernel.org/all/4e5b349c-6599-4871-9e3b-e10352ae0ca0@nvidia.com
Link: https://patch.msgid.link/87tsw6aglz.ffs@tglx
---
 kernel/irq/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 35bc17bc369e..ccdc47a7069d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1495,7 +1495,7 @@ int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *
        WRITE_ONCE(redir->target_cpu, cpumask_first(dest));
        irq_data_update_effective_affinity(data, dest);
 
-       return IRQ_SET_MASK_OK;
+       return IRQ_SET_MASK_OK_DONE;
 }
 EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity);
 #endif
-- cgit v1.2.3
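
[For reference, the mechanism relied on here is the return-value check in
msi_domain_set_affinity(). The following is a simplified sketch of that
function's logic, written from memory of kernel/irq/msi.c rather than copied
from this tree, so treat the exact body as an approximation.]

int msi_domain_set_affinity(struct irq_data *irq_data,
                            const struct cpumask *mask, bool force)
{
        struct irq_data *parent = irq_data->parent_data;
        struct msi_msg msg[2] = { [1] = { }, };
        int ret;

        ret = parent->chip->irq_set_affinity(parent, mask, force);
        if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
                /*
                 * IRQ_SET_MASK_OK lands here and rewrites the MSI message.
                 * IRQ_SET_MASK_OK_DONE skips the compose and write entirely,
                 * which is what the redirect callback now relies on.
                 */
                BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
                msi_check_level(irq_data->domain, msg);
                irq_chip_write_msi_msg(irq_data, msg);
        }

        return ret;
}
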