From 530a8ba71b4c3b7fcee323dd997f4bab1be1a6ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 14:35:35 -0700 Subject: KVM: Bound the number of dirty ring entries in a single reset at INT_MAX Cap the number of ring entries that are reset in a single ioctl to INT_MAX to ensure userspace isn't confused by a wrap into negative space, and so that, in a truly pathological scenario, KVM doesn't miss a TLB flush due to the count wrapping to zero. While the size of the ring is fixed at 0x10000 entries and KVM (currently) supports at most 4096, userspace is allowed to harvest entries from the ring while the reset is in-progress, i.e. it's possible for the ring to always have harvested entries. Opportunistically return an actual error code from the helper so that a future fix to handle pending signals can gracefully return -EINTR. Drop the function comment now that the return code is a stanard 0/-errno (and because a future commit will add a proper lockdep assertion). Opportunistically drop a similarly stale comment for kvm_dirty_ring_push(). Cc: Peter Xu Cc: Yan Zhao Cc: Maxim Levitsky Cc: Binbin Wu Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking") Reviewed-by: James Houghton Reviewed-by: Binbin Wu Reviewed-by: Yan Zhao Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20250516213540.2546077-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_dirty_ring.h | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index da4d9b5f58f1..eb10d87adf7d 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -49,9 +49,10 @@ static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *r } static inline int kvm_dirty_ring_reset(struct kvm *kvm, - struct kvm_dirty_ring *ring) + struct kvm_dirty_ring *ring, + int *nr_entries_reset) { - return 0; + return -ENOENT; } static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, @@ -77,17 +78,8 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm); int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size); - -/* - * called with kvm->slots_lock held, returns the number of - * processed pages. - */ -int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring); - -/* - * returns =0: successfully pushed - * <0: unable to push, need to wait - */ +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring, + int *nr_entries_reset); void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset); bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From cd4178d19420359554e3da6fd77ecfd0f58067ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Jun 2025 16:51:47 -0700 Subject: KVM: arm64: WARN if unmapping a vLPI fails in any path When unmapping a vLPI, WARN if nullifying vCPU affinity fails, not just if failure occurs when freeing an ITE. If undoing vCPU affinity fails, then odds are very good that vLPI state tracking has has gotten out of whack, i.e. that KVM and the GIC disagree on the state of an IRQ/vLPI. At best, inconsistent state means there is a lurking bug/flaw somewhere. At worst, the inconsistency could eventually be fatal to the host, e.g. if an ITS command fails because KVM's view of things doesn't match reality/hardware. Note, only the call from kvm_arch_irq_bypass_del_producer() by way of kvm_vgic_v4_unset_forwarding() doesn't already WARN. Common KVM's kvm_irq_routing_update() WARNs if kvm_arch_update_irqfd_routing() fails. For that path, if its_unmap_vlpi() fails in kvm_vgic_v4_unset_forwarding(), the only possible causes are that the GIC doesn't have a v4 ITS (from its_irq_set_vcpu_affinity()): /* Need a v4 ITS */ if (!is_v4(its_dev->its)) return -EINVAL; guard(raw_spinlock)(&its_dev->event_map.vlpi_lock); /* Unmap request? */ if (!info) return its_vlpi_unmap(d); or that KVM has gotten out of sync with the GIC/ITS (from its_vlpi_unmap()): if (!its_dev->event_map.vm || !irqd_is_forwarded_to_vcpu(d)) return -EINVAL; All of the above failure scenarios are warnable offences, as they should never occur absent a kernel/KVM bug. Acked-by: Oliver Upton Link: https://lore.kernel.org/all/aFWY2LTVIxz5rfhh@google.com Signed-off-by: Sean Christopherson --- include/linux/irqchip/arm-gic-v4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 7f1f11a5e4e4..0b0887099fd7 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); -int its_unmap_vlpi(int irq); +void its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); int its_prop_update_vsgi(int irq, u8 priority, bool group); -- cgit v1.2.3 From 2b521d86ee80a436a92445b8206d38d75aeb39ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:29 -0700 Subject: irqbypass: Take ownership of producer/consumer token tracking Move ownership of IRQ bypass token tracking into irqbypass.ko, and explicitly require callers to pass an eventfd_ctx structure instead of a completely opaque token. Relying on producers and consumers to set the token appropriately is error prone, and hiding the fact that the token must be an eventfd_ctx pointer (for all intents and purposes) unnecessarily obfuscates the code and makes it more brittle. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 9bdb2a781841..1b57d15ac4cf 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -10,6 +10,7 @@ #include +struct eventfd_ctx; struct irq_bypass_consumer; /* @@ -18,20 +19,20 @@ struct irq_bypass_consumer; * The IRQ bypass manager is a simple set of lists and callbacks that allows * IRQ producers (ex. physical interrupt sources) to be matched to IRQ * consumers (ex. virtualization hardware that allows IRQ bypass or offload) - * via a shared token (ex. eventfd_ctx). Producers and consumers register - * independently. When a token match is found, the optional @stop callback - * will be called for each participant. The pair will then be connected via - * the @add_* callbacks, and finally the optional @start callback will allow - * any final coordination. When either participant is unregistered, the - * process is repeated using the @del_* callbacks in place of the @add_* - * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings - * are not supported. + * via a shared eventfd_ctx. Producers and consumers register independently. + * When a producer and consumer are paired, i.e. an eventfd match is found, the + * optional @stop callback will be called for each participant. The pair will + * then be connected via the @add_* callbacks, and finally the optional @start + * callback will allow any final coordination. When either participant is + * unregistered, the process is repeated using the @del_* callbacks in place of + * the @add_* callbacks. eventfds must be unique per producer/consumer, 1:N + * pairings are not supported. */ /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer (non-NULL) + * @eventfd: eventfd context used to match producers and consumers * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -44,7 +45,7 @@ struct irq_bypass_consumer; */ struct irq_bypass_producer { struct list_head node; - void *token; + struct eventfd_ctx *eventfd; int irq; int (*add_consumer)(struct irq_bypass_producer *, struct irq_bypass_consumer *); @@ -57,7 +58,7 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer (non-NULL) + * @eventfd: eventfd context used to match producers and consumers * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) @@ -70,7 +71,7 @@ struct irq_bypass_producer { */ struct irq_bypass_consumer { struct list_head node; - void *token; + struct eventfd_ctx *eventfd; int (*add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); void (*del_producer)(struct irq_bypass_consumer *, @@ -79,9 +80,11 @@ struct irq_bypass_consumer { void (*start)(struct irq_bypass_consumer *); }; -int irq_bypass_register_producer(struct irq_bypass_producer *); -void irq_bypass_unregister_producer(struct irq_bypass_producer *); -int irq_bypass_register_consumer(struct irq_bypass_consumer *); -void irq_bypass_unregister_consumer(struct irq_bypass_consumer *); +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd); +void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd); +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer); #endif /* IRQBYPASS_H */ -- cgit v1.2.3 From add57f493e0893ac0fb4acbdc441918d3e800f10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:30 -0700 Subject: irqbypass: Explicitly track producer and consumer bindings Explicitly track IRQ bypass producer:consumer bindings. This will allow making removal an O(1) operation; searching through the list to find information that is trivially tracked (and useful for debug) is wasteful. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 1b57d15ac4cf..b28197c87483 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -29,10 +29,13 @@ struct irq_bypass_consumer; * pairings are not supported. */ +struct irq_bypass_consumer; + /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers + * @consumer: The connected consumer (NULL if no connection) * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -46,6 +49,7 @@ struct irq_bypass_consumer; struct irq_bypass_producer { struct list_head node; struct eventfd_ctx *eventfd; + struct irq_bypass_consumer *consumer; int irq; int (*add_consumer)(struct irq_bypass_producer *, struct irq_bypass_consumer *); @@ -59,6 +63,7 @@ struct irq_bypass_producer { * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers + * @producer: The connected producer (NULL if no connection) * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) @@ -72,6 +77,8 @@ struct irq_bypass_producer { struct irq_bypass_consumer { struct list_head node; struct eventfd_ctx *eventfd; + struct irq_bypass_producer *producer; + int (*add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); void (*del_producer)(struct irq_bypass_consumer *, -- cgit v1.2.3 From 8394b32faecd9c63b3c436e78e62519e9548e530 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:33 -0700 Subject: irqbypass: Use xarray to track producers and consumers Track IRQ bypass producers and consumers using an xarray to avoid the O(2n) insertion time associated with walking a list to check for duplicate entries, and to search for an partner. At low (tens or few hundreds) total producer/consumer counts, using a list is faster due to the need to allocate backing storage for xarray. But as count creeps into the thousands, xarray wins easily, and can provide several orders of magnitude better latency at high counts. E.g. hundreds of nanoseconds vs. hundreds of milliseconds. Cc: Oliver Upton Cc: David Matlack Cc: Like Xu Cc: Binbin Wu Reported-by: Yong He Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217379 Link: https://lore.kernel.org/all/20230801115646.33990-1-likexu@tencent.com Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-8-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index b28197c87483..cd64fcaa88fe 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -33,7 +33,6 @@ struct irq_bypass_consumer; /** * struct irq_bypass_producer - IRQ bypass producer definition - * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers * @consumer: The connected consumer (NULL if no connection) * @irq: Linux IRQ number for the producer device @@ -47,7 +46,6 @@ struct irq_bypass_consumer; * for a physical device assigned to a VM. */ struct irq_bypass_producer { - struct list_head node; struct eventfd_ctx *eventfd; struct irq_bypass_consumer *consumer; int irq; @@ -61,7 +59,6 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition - * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers * @producer: The connected producer (NULL if no connection) * @add_producer: Connect the IRQ consumer to an IRQ producer @@ -75,7 +72,6 @@ struct irq_bypass_producer { * portions of the interrupt handling to the VM. */ struct irq_bypass_consumer { - struct list_head node; struct eventfd_ctx *eventfd; struct irq_bypass_producer *producer; -- cgit v1.2.3 From 23b54381cee2928e8b5622e654ca4516f30d2f1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:34 -0700 Subject: irqbypass: Require producers to pass in Linux IRQ number during registration Pass in the Linux IRQ associated with an IRQ bypass producer instead of relying on the caller to set the field prior to registration, as there's no benefit to relying on callers to do the right thing. Take care to set producer->irq before __connect(), as KVM expects the IRQ to be valid as soon as a connection is possible. Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20250516230734.2564775-9-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index cd64fcaa88fe..ede1fa938152 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -84,7 +84,7 @@ struct irq_bypass_consumer { }; int irq_bypass_register_producer(struct irq_bypass_producer *producer, - struct eventfd_ctx *eventfd); + struct eventfd_ctx *eventfd, int irq); void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, struct eventfd_ctx *eventfd); -- cgit v1.2.3 From e295d2e7fbe69ddec772c951c466dfbfc1c96818 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:40 -0700 Subject: KVM: x86: Trigger I/O APIC route rescan in kvm_arch_irq_routing_update() Trigger the I/O APIC route rescan that's performed for a split IRQ chip after userspace updates IRQ routes in kvm_arch_irq_routing_update(), i.e. before dropping kvm->irq_lock. Calling kvm_make_all_cpus_request() under a mutex is perfectly safe, and the smp_wmb()+smp_mb__after_atomic() pair in __kvm_make_request()+kvm_check_request() ensures the new routing is visible to vCPUs prior to the request being visible to vCPUs. In all likelihood, commit b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") somewhat arbitrarily made the request outside of irq_lock to avoid holding irq_lock any longer than is strictly necessary. And then commit abdb080f7ac8 ("kvm/irqchip: kvm_arch_irq_routing_update renaming split") took the easy route of adding another arch hook instead of risking a functional change. Note, the call to synchronize_srcu_expedited() does NOT provide ordering guarantees with respect to vCPUs scanning the new routing; as above, the request infrastructure provides the necessary ordering. I.e. there's no need to wait for kvm_scan_ioapic_routes() to complete if it's actively running, because regardless of whether it grabs the old or new table, the vCPU will have another KVM_REQ_SCAN_IOAPIC pending, i.e. will rescan again and see the new mappings. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bde4fb5c6aa..9461517b4e62 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1024,14 +1024,10 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); -void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } -static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ -} #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP -- cgit v1.2.3 From 77a74b8ff41ae620f5a5d727d596b670b7b9994e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:48 -0700 Subject: KVM: x86: Move kvm_{request,free}_irq_source_id() to i8254.c (PIT) Move kvm_{request,free}_irq_source_id() to i8254.c, i.e. the dedicated PIT emulation file, in anticipation of removing them entirely in favor of hardcoding the PIT's "requested" source ID (the source ID can only ever be '2', and the request can never fail). No functional change intended. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-10-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9461517b4e62..cba8fc4529e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1784,8 +1784,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); -int kvm_request_irq_source_id(struct kvm *kvm); -void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* -- cgit v1.2.3 From 61423c413a746fd5fe5b0d865ea722e11b01105e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:49 -0700 Subject: KVM: x86: Hardcode the PIT IRQ source ID to '2' Hardcode the PIT's source IRQ ID to '2' instead of "finding" that bit 2 is always the first available bit in irq_sources_bitmap. Bits 0 and 1 are set/reserved by kvm_arch_init_vm(), i.e. long before kvm_create_pit() can be invoked, and KVM allows at most one in-kernel PIT instance, i.e. it's impossible for the PIT to find a different free bit (there are no other users of kvm_request_irq_source_id(). Delete the now-defunct irq_sources_bitmap and all its associated code. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-11-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cba8fc4529e8..4ff5ea29e343 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 +#define KVM_PIT_IRQ_SOURCE_ID 2 extern struct mutex kvm_lock; extern struct list_head vm_list; -- cgit v1.2.3 From 628a27731e3f36de7ddce226f7e09ee70e40ed66 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:53 -0700 Subject: KVM: x86: Add CONFIG_KVM_IOAPIC to allow disabling in-kernel I/O APIC Add a Kconfig to allow building KVM without support for emulating a I/O APIC, PIC, and PIT, which is desirable for deployments that effectively don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to create an in-kernel I/O APIC. E.g. compiling out support eliminates a few thousand lines of guest-facing code and gives security folks warm fuzzies. As a bonus, wrapping relevant paths with CONFIG_KVM_IOAPIC #ifdefs makes it much easier for readers to understand which bits and pieces exist specifically for fully in-kernel IRQ chips. Opportunistically convert all two in-kernel uses of __KVM_HAVE_IOAPIC to CONFIG_KVM_IOAPIC, e.g. rather than add a second #ifdef to generate a stub for kvm_arch_post_irq_routing_update(). Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-15-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ff5ea29e343..3b5575d0b574 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1023,7 +1023,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_KVM_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) -- cgit v1.2.3 From cb210737675ef4c1ad88721e84558eeb2f199312 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:06 -0700 Subject: KVM: Pass new routing entries and irqfd when updating IRTEs When updating IRTEs in response to a GSI routing or IRQ bypass change, pass the new/current routing information along with the associated irqfd. This will allow KVM x86 to harden, simplify, and deduplicate its code. Since adding/removing a bypass producer is now conveniently protected with irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(), use the routing information cached in the irqfd instead of looking up the information in the current GSI routing tables. Opportunistically convert an existing printk() to pr_info() and put its string onto a single line (old code that strictly adhered to 80 chars). Link: https://lore.kernel.org/r/20250611224604.313496-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b5575d0b574..a4160c1c0c6b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2401,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) +struct kvm_kernel_irqfd; + bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); @@ -2408,8 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ -- cgit v1.2.3 From 05c5e23657e1d61c271c2f4a3a21d4d630b18a9b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:07 -0700 Subject: KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure Track the IRTEs that are posting to an SVM vCPU via the associated irqfd structure and GSI routing instead of dynamically allocating a separate data structure. In addition to eliminating an atomic allocation, this will allow hoisting much of the IRTE update logic to common x86. Cc: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-6-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_irqfd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 8ad43692e3bb..6510a48e62aa 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -59,6 +59,9 @@ struct kvm_kernel_irqfd { struct work_struct shutdown; struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; + + struct list_head vcpu_list; + void *irq_bypass_data; }; #endif /* __LINUX_KVM_IRQFD_H */ -- cgit v1.2.3 From 0a917e9d4b7070fafcbf7a8ec32d2aa444b4e757 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:08 -0700 Subject: KVM: SVM: Delete IRTE link from previous vCPU before setting new IRTE Delete the previous per-vCPU IRTE link prior to modifying the IRTE. If forcing the IRTE back to remapped mode fails, the IRQ is already broken; keeping stale metadata won't change that, and the IOMMU should be sufficiently paranoid to sanitize the IRTE when the IRQ is freed and reallocated. This will allow hoisting the vCPU tracking to common x86, which in turn will allow most of the IRTE update code to be deduplicated. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-7-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_irqfd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 6510a48e62aa..361c07f4466d 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -60,6 +60,7 @@ struct kvm_kernel_irqfd { struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; + struct kvm_vcpu *irq_bypass_vcpu; struct list_head vcpu_list; void *irq_bypass_data; }; -- cgit v1.2.3 From 1da19c5ce0533796179d9e1b55e64bf78478c4c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:09 -0700 Subject: iommu/amd: KVM: SVM: Delete now-unused cached/previous GA tag fields Delete the amd_ir_data.prev_ga_tag field now that all usage is superfluous. Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-8-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/amd-iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 062fbd4c9b77..1f9b13d803c5 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -19,8 +19,8 @@ struct amd_iommu; */ struct amd_iommu_pi_data { u32 ga_tag; - u32 prev_ga_tag; u64 base; + bool is_guest_mode; struct vcpu_data *vcpu_data; void *ir_data; -- cgit v1.2.3 From c4cdbaf9d81c8387da8b9b91567d4b0eb1b8a549 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:24 -0700 Subject: iommu/amd: KVM: SVM: Use pi_desc_addr to derive ga_root_ptr Use vcpu_data.pi_desc_addr instead of amd_iommu_pi_data.base to get the GA root pointer. KVM is the only source of amd_iommu_pi_data.base, and KVM's one and only path for writing amd_iommu_pi_data.base computes the exact same value for vcpu_data.pi_desc_addr and amd_iommu_pi_data.base, and fills amd_iommu_pi_data.base if and only if vcpu_data.pi_desc_addr is valid, i.e. amd_iommu_pi_data.base is fully redundant. Cc: Maxim Levitsky Reviewed-by: Joao Martins Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-23-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/amd-iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 1f9b13d803c5..deeefc92a5cf 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -19,8 +19,6 @@ struct amd_iommu; */ struct amd_iommu_pi_data { u32 ga_tag; - u64 base; - bool is_guest_mode; struct vcpu_data *vcpu_data; void *ir_data; -- cgit v1.2.3 From 53527ea1b70224d16d29edbd5c850456469f00ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:34 -0700 Subject: iommu: KVM: Split "struct vcpu_data" into separate AMD vs. Intel structs Split the vcpu_data structure that serves as a handoff from KVM to IOMMU drivers into vendor specific structures. Overloading a single structure makes the code hard to read and maintain, is *very* misleading as it suggests that mixing vendors is actually supported, and bastardizing Intel's posted interrupt descriptor address when AMD's IOMMU already has its own structure is quite unnecessary. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-33-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/amd-iommu.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index deeefc92a5cf..99b4fa9a0296 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -12,18 +12,6 @@ struct amd_iommu; -/* - * This is mainly used to communicate information back-and-forth - * between SVM and IOMMU for setting up and tearing down posted - * interrupt - */ -struct amd_iommu_pi_data { - u32 ga_tag; - bool is_guest_mode; - struct vcpu_data *vcpu_data; - void *ir_data; -}; - #ifdef CONFIG_AMD_IOMMU struct task_struct; -- cgit v1.2.3 From b33252b9d17238a4a9fa5a29af6e6a2922a6c2b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:35 -0700 Subject: KVM: Don't WARN if updating IRQ bypass route fails Don't bother WARNing if updating an IRTE route fails now that vendor code provides much more precise WARNs. The generic WARN doesn't provide enough information to actually debug the problem, and has obviously done nothing to surface the myriad bugs in KVM x86's implementation. Drop all of the associated return code plumbing that existed just so that common KVM could WARN. Link: https://lore.kernel.org/r/20250611224604.313496-34-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a4160c1c0c6b..8e74ac0f90b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2410,9 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new); +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ -- cgit v1.2.3 From 77bb184ab880171a1cedfbed9ab05977e6ae2258 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:36 -0700 Subject: KVM: Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing() Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing(). Calling arch code to know whether or not to call arch code is absurd. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-35-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8e74ac0f90b1..fb9ec06aa807 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2413,8 +2413,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new); -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, - struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS -- cgit v1.2.3 From 08d9ccdd1a5c75d7aca7ac3af56f723d780dd6ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:43 -0700 Subject: iommu/amd: KVM: SVM: Infer IsRun from validity of pCPU destination Infer whether or not a vCPU should be marked running from the validity of the pCPU on which it is running. amd_iommu_update_ga() already skips the IRTE update if the pCPU is invalid, i.e. passing %true for is_run with an invalid pCPU would be a blatant and egregrious KVM bug. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-42-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/amd-iommu.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 99b4fa9a0296..fe0e16ffe0e5 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -30,8 +30,7 @@ static inline void amd_iommu_detect(void) { } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); -extern int -amd_iommu_update_ga(int cpu, bool is_run, void *data); +extern int amd_iommu_update_ga(int cpu, void *data); extern int amd_iommu_activate_guest_mode(void *data); extern int amd_iommu_deactivate_guest_mode(void *data); @@ -44,8 +43,7 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } -static inline int -amd_iommu_update_ga(int cpu, bool is_run, void *data) +static inline int amd_iommu_update_ga(int cpu, void *data) { return 0; } -- cgit v1.2.3 From f965255dc5033387ac7858787c6c792fd789ac34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:45 -0700 Subject: iommu/amd: KVM: SVM: Set pCPU info in IRTE when setting vCPU affinity Now that setting vCPU affinity is guarded with ir_list_lock, i.e. now that avic_physical_id_entry can be safely accessed, set the pCPU info straight-away when setting vCPU affinity. Putting the IRTE into posted mode, and then immediately updating the IRTE a second time if the target vCPU is running is wasteful and confusing. This also fixes a flaw where a posted IRQ that arrives between putting the IRTE into guest_mode and setting the correct destination could cause the IOMMU to ring the doorbell on the wrong pCPU. Link: https://lore.kernel.org/r/20250611224604.313496-44-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/amd-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index fe0e16ffe0e5..c9f2df0c4596 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -32,7 +32,7 @@ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); extern int amd_iommu_update_ga(int cpu, void *data); -extern int amd_iommu_activate_guest_mode(void *data); +extern int amd_iommu_activate_guest_mode(void *data, int cpu); extern int amd_iommu_deactivate_guest_mode(void *data); #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ @@ -48,7 +48,7 @@ static inline int amd_iommu_update_ga(int cpu, void *data) return 0; } -static inline int amd_iommu_activate_guest_mode(void *data) +static inline int amd_iommu_activate_guest_mode(void *data, int cpu) { return 0; } -- cgit v1.2.3 From b9e53f9ff4a88f01b22524878c9a381a6c5f65ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:03 -0700 Subject: iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts Add plumbing to the AMD IOMMU driver to allow KVM to control whether or not an IRTE is configured to generate GA log interrupts. KVM only needs a notification if the target vCPU is blocking, so the vCPU can be awakened. If a vCPU is preempted or exits to userspace, KVM clears is_run, but will set the vCPU back to running when userspace does KVM_RUN and/or the vCPU task is scheduled back in, i.e. KVM doesn't need a notification. Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes from the KVM changes insofar as possible. Opportunistically swap the ordering of parameters for amd_iommu_update_ga() so that the match amd_iommu_activate_guest_mode(). Note, as of this writing, the AMD IOMMU manual doesn't list GALogIntr as a non-cached field, but per AMD hardware architects, it's not cached and can be safely updated without an invalidation. Link: https://lore.kernel.org/all/b29b8c22-2fd4-4b5e-b755-9198874157c7@amd.com Cc: Vasant Hegde Cc: Joao Martins Link: https://lore.kernel.org/r/20250611224604.313496-62-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/amd-iommu.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index c9f2df0c4596..8cced632ecd0 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -30,9 +30,8 @@ static inline void amd_iommu_detect(void) { } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); -extern int amd_iommu_update_ga(int cpu, void *data); - -extern int amd_iommu_activate_guest_mode(void *data, int cpu); +extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr); +extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr); extern int amd_iommu_deactivate_guest_mode(void *data); #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ @@ -43,12 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } -static inline int amd_iommu_update_ga(int cpu, void *data) +static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) { return 0; } -static inline int amd_iommu_activate_guest_mode(void *data, int cpu) +static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) { return 0; } -- cgit v1.2.3 From 283ed5001d6852f85c09ed2522331b2b197ba937 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:11 -0700 Subject: KVM: Use a local struct to do the initial vfs_poll() on an irqfd Use a function-local struct for the poll_table passed to vfs_poll(), as nothing in the vfs_poll() callchain grabs a long-term reference to the structure, i.e. its lifetime doesn't need to be tied to the irqfd. Using a local structure will also allow propagating failures out of the polling callback without further polluting kvm_kernel_irqfd. Opportunstically rename irqfd_ptable_queue_proc() to kvm_irqfd_register() to capture what it actually does. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_irqfd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 361c07f4466d..ef8c134ded8a 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -55,7 +55,6 @@ struct kvm_kernel_irqfd { /* Used for setup/shutdown */ struct eventfd_ctx *eventfd; struct list_head list; - poll_table pt; struct work_struct shutdown; struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; -- cgit v1.2.3 From 0d09582b3a607436fd91d6ce813048a048ecbf10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:18 -0700 Subject: sched/wait: Add a waitqueue helper for fully exclusive priority waiters Add a waitqueue helper to add a priority waiter that requires exclusive wakeups, i.e. that requires that it be the _only_ priority waiter. The API will be used by KVM to ensure that at most one of KVM's irqfds is bound to a single eventfd (across the entire kernel). Open code the helper instead of using __add_wait_queue() so that the common path doesn't need to "handle" impossible failures. Cc: K Prateek Nayak Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-9-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/wait.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 965a19809c7e..09855d819418 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) -- cgit v1.2.3 From bbc13ae593e0ea47357ff6e4740c533c16c2ae1e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 18:17:56 -0700 Subject: VFIO: KVM: x86: Drop kvm_arch_{start,end}_assignment() Drop kvm_arch_{start,end}_assignment() and all associated code now that KVM x86 no longer consumes assigned_device_count. Tracking whether or not a VFIO-assigned device is formally associated with a VM is fundamentally flawed, as such an association is optional for general usage, i.e. is prone to false negatives. E.g. prior to commit 2edd9cb79fb3 ("kvm: detect assigned device via irqbypass manager"), device passthrough via VFIO would fail to enable IRQ bypass if userspace omitted the formal VFIO<=>KVM binding. And device drivers that *need* the VFIO<=>KVM connection, e.g. KVM-GT, shouldn't be relying on generic x86 tracking infrastructure. Cc: Jim Mattson Link: https://lore.kernel.org/r/20250523011756.3243624-6-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fb9ec06aa807..15656b7fba6c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1690,24 +1690,6 @@ static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) return false; } #endif -#ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE -void kvm_arch_start_assignment(struct kvm *kvm); -void kvm_arch_end_assignment(struct kvm *kvm); -bool kvm_arch_has_assigned_device(struct kvm *kvm); -#else -static inline void kvm_arch_start_assignment(struct kvm *kvm) -{ -} - -static inline void kvm_arch_end_assignment(struct kvm *kvm) -{ -} - -static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) -{ - return false; -} -#endif static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 7ec80fb3f025825e860b433685fb801d6de34bf3 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:10 +0200 Subject: irqchip/gic-v5: Add GICv5 PPI support The GICv5 CPU interface implements support for PE-Private Peripheral Interrupts (PPI), that are handled (enabled/prioritized/delivered) entirely within the CPU interface hardware. To enable PPI interrupts, implement the baseline GICv5 host kernel driver infrastructure required to handle interrupts on a GICv5 system. Add the exception handling code path and definitions for GICv5 instructions. Add GICv5 PPI handling code as a specific IRQ domain to: - Set-up PPI priority - Manage PPI configuration and state - Manage IRQ flow handler - IRQs allocation/free - Hook-up a PPI specific IRQchip to provide the relevant methods PPI IRQ priority is chosen as the minimum allowed priority by the system design (after probing the number of priority bits implemented by the CPU interface). Co-developed-by: Sascha Bischoff Signed-off-by: Sascha Bischoff Co-developed-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Will Deacon Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-20-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/irqchip/arm-gic-v5.h (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h new file mode 100644 index 000000000000..b08ec0308a9b --- /dev/null +++ b/include/linux/irqchip/arm-gic-v5.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 ARM Limited, All Rights Reserved. + */ +#ifndef __LINUX_IRQCHIP_ARM_GIC_V5_H +#define __LINUX_IRQCHIP_ARM_GIC_V5_H + +#include + +/* + * INTID handling + */ +#define GICV5_HWIRQ_ID GENMASK(23, 0) +#define GICV5_HWIRQ_TYPE GENMASK(31, 29) +#define GICV5_HWIRQ_INTID GENMASK_ULL(31, 0) + +#define GICV5_HWIRQ_TYPE_PPI UL(0x1) + +#endif -- cgit v1.2.3 From 5cb1b6dab2def316671ea2565291a86ad58b884c Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:11 +0200 Subject: irqchip/gic-v5: Add GICv5 IRS/SPI support The GICv5 Interrupt Routing Service (IRS) component implements interrupt management and routing in the GICv5 architecture. A GICv5 system comprises one or more IRSes, that together handle the interrupt routing and state for the system. An IRS supports Shared Peripheral Interrupts (SPIs), that are interrupt sources directly connected to the IRS; they do not rely on memory for storage. The number of supported SPIs is fixed for a given implementation and can be probed through IRS IDR registers. SPI interrupt state and routing are managed through GICv5 instructions. Each core (PE in GICv5 terms) in a GICv5 system is identified with an Interrupt AFFinity ID (IAFFID). An IRS manages a set of cores that are connected to it. Firmware provides a topology description that the driver uses to detect to which IRS a CPU (ie an IAFFID) is associated with. Use probeable information and firmware description to initialize the IRSes and implement GICv5 IRS SPIs support through an SPI-specific IRQ domain. The GICv5 IRS driver: - Probes IRSes in the system to detect SPI ranges - Associates an IRS with a set of cores connected to it - Adds an IRQchip structure for SPI handling SPIs priority is set to a value corresponding to the lowest permissible priority in the system (taking into account the implemented priority bits of the IRS and CPU interface). Since all IRQs are set to the same priority value, the value itself does not matter as long as it is a valid one. Co-developed-by: Sascha Bischoff Signed-off-by: Sascha Bischoff Co-developed-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Will Deacon Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-21-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 140 +++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index b08ec0308a9b..1064a69ab33f 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -5,6 +5,8 @@ #ifndef __LINUX_IRQCHIP_ARM_GIC_V5_H #define __LINUX_IRQCHIP_ARM_GIC_V5_H +#include + #include /* @@ -15,5 +17,143 @@ #define GICV5_HWIRQ_INTID GENMASK_ULL(31, 0) #define GICV5_HWIRQ_TYPE_PPI UL(0x1) +#define GICV5_HWIRQ_TYPE_SPI UL(0x3) + +/* + * Tables attributes + */ +#define GICV5_NO_READ_ALLOC 0b0 +#define GICV5_READ_ALLOC 0b1 +#define GICV5_NO_WRITE_ALLOC 0b0 +#define GICV5_WRITE_ALLOC 0b1 + +#define GICV5_NON_CACHE 0b00 +#define GICV5_WB_CACHE 0b01 +#define GICV5_WT_CACHE 0b10 + +#define GICV5_NON_SHARE 0b00 +#define GICV5_OUTER_SHARE 0b10 +#define GICV5_INNER_SHARE 0b11 + +/* + * IRS registers + */ +#define GICV5_IRS_IDR1 0x0004 +#define GICV5_IRS_IDR2 0x0008 +#define GICV5_IRS_IDR5 0x0014 +#define GICV5_IRS_IDR6 0x0018 +#define GICV5_IRS_IDR7 0x001c +#define GICV5_IRS_CR0 0x0080 +#define GICV5_IRS_CR1 0x0084 +#define GICV5_IRS_SPI_SELR 0x0108 +#define GICV5_IRS_SPI_CFGR 0x0114 +#define GICV5_IRS_SPI_STATUSR 0x0118 +#define GICV5_IRS_PE_SELR 0x0140 +#define GICV5_IRS_PE_STATUSR 0x0144 +#define GICV5_IRS_PE_CR0 0x0148 + +#define GICV5_IRS_IDR1_PRIORITY_BITS GENMASK(22, 20) +#define GICV5_IRS_IDR1_IAFFID_BITS GENMASK(19, 16) + +#define GICV5_IRS_IDR1_PRIORITY_BITS_1BITS 0b000 +#define GICV5_IRS_IDR1_PRIORITY_BITS_2BITS 0b001 +#define GICV5_IRS_IDR1_PRIORITY_BITS_3BITS 0b010 +#define GICV5_IRS_IDR1_PRIORITY_BITS_4BITS 0b011 +#define GICV5_IRS_IDR1_PRIORITY_BITS_5BITS 0b100 + +#define GICV5_IRS_IDR2_ISTMD_SZ GENMASK(19, 15) +#define GICV5_IRS_IDR2_ISTMD BIT(14) +#define GICV5_IRS_IDR2_IST_L2SZ GENMASK(13, 11) +#define GICV5_IRS_IDR2_IST_LEVELS BIT(10) +#define GICV5_IRS_IDR2_MIN_LPI_ID_BITS GENMASK(9, 6) +#define GICV5_IRS_IDR2_LPI BIT(5) +#define GICV5_IRS_IDR2_ID_BITS GENMASK(4, 0) + +#define GICV5_IRS_IDR5_SPI_RANGE GENMASK(24, 0) +#define GICV5_IRS_IDR6_SPI_IRS_RANGE GENMASK(24, 0) +#define GICV5_IRS_IDR7_SPI_BASE GENMASK(23, 0) +#define GICV5_IRS_CR0_IDLE BIT(1) +#define GICV5_IRS_CR0_IRSEN BIT(0) + +#define GICV5_IRS_CR1_VPED_WA BIT(15) +#define GICV5_IRS_CR1_VPED_RA BIT(14) +#define GICV5_IRS_CR1_VMD_WA BIT(13) +#define GICV5_IRS_CR1_VMD_RA BIT(12) +#define GICV5_IRS_CR1_VPET_WA BIT(11) +#define GICV5_IRS_CR1_VPET_RA BIT(10) +#define GICV5_IRS_CR1_VMT_WA BIT(9) +#define GICV5_IRS_CR1_VMT_RA BIT(8) +#define GICV5_IRS_CR1_IST_WA BIT(7) +#define GICV5_IRS_CR1_IST_RA BIT(6) +#define GICV5_IRS_CR1_IC GENMASK(5, 4) +#define GICV5_IRS_CR1_OC GENMASK(3, 2) +#define GICV5_IRS_CR1_SH GENMASK(1, 0) + +#define GICV5_IRS_SPI_STATUSR_V BIT(1) +#define GICV5_IRS_SPI_STATUSR_IDLE BIT(0) + +#define GICV5_IRS_SPI_SELR_ID GENMASK(23, 0) + +#define GICV5_IRS_SPI_CFGR_TM BIT(0) + +#define GICV5_IRS_PE_SELR_IAFFID GENMASK(15, 0) + +#define GICV5_IRS_PE_STATUSR_V BIT(1) +#define GICV5_IRS_PE_STATUSR_IDLE BIT(0) + +#define GICV5_IRS_PE_CR0_DPS BIT(0) + +/* + * Global Data structures and functions + */ +struct gicv5_chip_data { + struct fwnode_handle *fwnode; + struct irq_domain *ppi_domain; + struct irq_domain *spi_domain; + u32 global_spi_count; + u8 cpuif_pri_bits; + u8 irs_pri_bits; +}; + +extern struct gicv5_chip_data gicv5_global_data __read_mostly; + +struct gicv5_irs_chip_data { + struct list_head entry; + struct fwnode_handle *fwnode; + void __iomem *irs_base; + u32 flags; + u32 spi_min; + u32 spi_range; + raw_spinlock_t spi_config_lock; +}; + +static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset, + const char *reg_s, u32 mask, + u32 *val) +{ + void __iomem *reg = addr + offset; + u32 tmp; + int ret; + + ret = readl_poll_timeout_atomic(reg, tmp, tmp & mask, 1, 10 * USEC_PER_MSEC); + if (unlikely(ret == -ETIMEDOUT)) { + pr_err_ratelimited("%s timeout...\n", reg_s); + return ret; + } + + if (val) + *val = tmp; + + return 0; +} + +#define gicv5_wait_for_op_atomic(base, reg, mask, val) \ + gicv5_wait_for_op_s_atomic(base, reg, #reg, mask, val) +int gicv5_irs_of_probe(struct device_node *parent); +void gicv5_irs_remove(void); +int gicv5_irs_register_cpu(int cpuid); +int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid); +struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id); +int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type); #endif -- cgit v1.2.3 From 0f0101325876859eb463792d4df3fd22b6539d29 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:12 +0200 Subject: irqchip/gic-v5: Add GICv5 LPI/IPI support An IRS supports Logical Peripheral Interrupts (LPIs) and implement Linux IPIs on top of it. LPIs are used for interrupt signals that are translated by a GICv5 ITS (Interrupt Translation Service) but also for software generated IRQs - namely interrupts that are not driven by a HW signal, ie IPIs. LPIs rely on memory storage for interrupt routing and state. LPIs state and routing information is kept in the Interrupt State Table (IST). IRSes provide support for 1- or 2-level IST tables configured to support a maximum number of interrupts that depend on the OS configuration and the HW capabilities. On systems that provide 2-level IST support, always allow the maximum number of LPIs; On systems with only 1-level support, limit the number of LPIs to 2^12 to prevent wasting memory (presumably a system that supports a 1-level only IST is not expecting a large number of interrupts). On a 2-level IST system, L2 entries are allocated on demand. The IST table memory is allocated using the kmalloc() interface; the allocation required may be smaller than a page and must be made up of contiguous physical pages if larger than a page. On systems where the IRS is not cache-coherent with the CPUs, cache mainteinance operations are executed to clean and invalidate the allocated memory to the point of coherency making it visible to the IRS components. On GICv5 systems, IPIs are implemented using LPIs. Add an LPI IRQ domain and implement an IPI-specific IRQ domain created as a child/subdomain of the LPI domain to allocate the required number of LPIs needed to implement the IPIs. IPIs are backed by LPIs, add LPIs allocation/de-allocation functions. The LPI INTID namespace is managed using an IDA to alloc/free LPI INTIDs. Associate an IPI irqchip with IPI IRQ descriptors to provide core code with the irqchip.ipi_send_single() method required to raise an IPI. Co-developed-by: Sascha Bischoff Signed-off-by: Sascha Bischoff Co-developed-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Will Deacon Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-22-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 63 +++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 1064a69ab33f..680eed794a35 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -7,8 +7,12 @@ #include +#include +#include #include +#define GICV5_IPIS_PER_CPU MAX_IPI + /* * INTID handling */ @@ -17,6 +21,7 @@ #define GICV5_HWIRQ_INTID GENMASK_ULL(31, 0) #define GICV5_HWIRQ_TYPE_PPI UL(0x1) +#define GICV5_HWIRQ_TYPE_LPI UL(0x2) #define GICV5_HWIRQ_TYPE_SPI UL(0x3) /* @@ -36,7 +41,7 @@ #define GICV5_INNER_SHARE 0b11 /* - * IRS registers + * IRS registers and tables structures */ #define GICV5_IRS_IDR1 0x0004 #define GICV5_IRS_IDR2 0x0008 @@ -51,6 +56,10 @@ #define GICV5_IRS_PE_SELR 0x0140 #define GICV5_IRS_PE_STATUSR 0x0144 #define GICV5_IRS_PE_CR0 0x0148 +#define GICV5_IRS_IST_BASER 0x0180 +#define GICV5_IRS_IST_CFGR 0x0190 +#define GICV5_IRS_IST_STATUSR 0x0194 +#define GICV5_IRS_MAP_L2_ISTR 0x01c0 #define GICV5_IRS_IDR1_PRIORITY_BITS GENMASK(22, 20) #define GICV5_IRS_IDR1_IAFFID_BITS GENMASK(19, 16) @@ -72,6 +81,11 @@ #define GICV5_IRS_IDR5_SPI_RANGE GENMASK(24, 0) #define GICV5_IRS_IDR6_SPI_IRS_RANGE GENMASK(24, 0) #define GICV5_IRS_IDR7_SPI_BASE GENMASK(23, 0) + +#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r) FIELD_GET(BIT(11), (r)) +#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r) FIELD_GET(BIT(12), (r)) +#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r) FIELD_GET(BIT(13), (r)) + #define GICV5_IRS_CR0_IDLE BIT(1) #define GICV5_IRS_CR0_IRSEN BIT(0) @@ -103,6 +117,33 @@ #define GICV5_IRS_PE_CR0_DPS BIT(0) +#define GICV5_IRS_IST_STATUSR_IDLE BIT(0) + +#define GICV5_IRS_IST_CFGR_STRUCTURE BIT(16) +#define GICV5_IRS_IST_CFGR_ISTSZ GENMASK(8, 7) +#define GICV5_IRS_IST_CFGR_L2SZ GENMASK(6, 5) +#define GICV5_IRS_IST_CFGR_LPI_ID_BITS GENMASK(4, 0) + +#define GICV5_IRS_IST_CFGR_STRUCTURE_LINEAR 0b0 +#define GICV5_IRS_IST_CFGR_STRUCTURE_TWO_LEVEL 0b1 + +#define GICV5_IRS_IST_CFGR_ISTSZ_4 0b00 +#define GICV5_IRS_IST_CFGR_ISTSZ_8 0b01 +#define GICV5_IRS_IST_CFGR_ISTSZ_16 0b10 + +#define GICV5_IRS_IST_CFGR_L2SZ_4K 0b00 +#define GICV5_IRS_IST_CFGR_L2SZ_16K 0b01 +#define GICV5_IRS_IST_CFGR_L2SZ_64K 0b10 + +#define GICV5_IRS_IST_BASER_ADDR_MASK GENMASK_ULL(55, 6) +#define GICV5_IRS_IST_BASER_VALID BIT_ULL(0) + +#define GICV5_IRS_MAP_L2_ISTR_ID GENMASK(23, 0) + +#define GICV5_ISTL1E_VALID BIT_ULL(0) + +#define GICV5_ISTL1E_L2_ADDR_MASK GENMASK_ULL(55, 12) + /* * Global Data structures and functions */ @@ -110,9 +151,18 @@ struct gicv5_chip_data { struct fwnode_handle *fwnode; struct irq_domain *ppi_domain; struct irq_domain *spi_domain; + struct irq_domain *lpi_domain; + struct irq_domain *ipi_domain; u32 global_spi_count; u8 cpuif_pri_bits; + u8 cpuif_id_bits; u8 irs_pri_bits; + struct { + __le64 *l1ist_addr; + u32 l2_size; + u8 l2_bits; + bool l2; + } ist; }; extern struct gicv5_chip_data gicv5_global_data __read_mostly; @@ -150,10 +200,21 @@ static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset, #define gicv5_wait_for_op_atomic(base, reg, mask, val) \ gicv5_wait_for_op_s_atomic(base, reg, #reg, mask, val) +void __init gicv5_init_lpi_domain(void); +void __init gicv5_free_lpi_domain(void); + int gicv5_irs_of_probe(struct device_node *parent); void gicv5_irs_remove(void); +int gicv5_irs_enable(void); int gicv5_irs_register_cpu(int cpuid); int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid); struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id); int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type); +int gicv5_irs_iste_alloc(u32 lpi); + +void gicv5_init_lpis(u32 max); +void gicv5_deinit_lpis(void); + +int gicv5_alloc_lpi(void); +void gicv5_free_lpi(u32 lpi); #endif -- cgit v1.2.3 From 31fd3becb920e0b31b99cf202ace637f75dd7e78 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:14 +0200 Subject: of/irq: Add of_msi_xlate() helper function Add an of_msi_xlate() helper that maps a device ID and returns the device node of the MSI controller the device ID is mapped to. Required by core functions that need an MSI controller device node pointer at the same time as a mapped device ID, of_msi_map_id() is not sufficient for that purpose. Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Rob Herring Cc: Marc Zyngier Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-24-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/of_irq.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 6337ad4e5fe8..a480063c9cb1 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -54,6 +54,7 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, u32 bus_token); extern void of_msi_configure(struct device *dev, const struct device_node *np); +extern u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in); u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else static inline void of_irq_init(const struct of_device_id *matches) @@ -100,6 +101,10 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } +static inline u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in) +{ + return id_in; +} static inline u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in) { -- cgit v1.2.3 From cd0ec59affb1f31347170bbafadb9c5cc716bca9 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:15 +0200 Subject: PCI/MSI: Add pci_msi_map_rid_ctlr_node() helper function IRQchip drivers need a PCI/MSI function to map a RID to a MSI controller deviceID namespace and at the same time retrieve the struct device_node pointer of the MSI controller the RID is mapped to. Add pci_msi_map_rid_ctlr_node() to achieve this purpose. Cc Bjorn Helgaas Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Thomas Gleixner Cc: Marc Zyngier Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-25-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/msi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 6863540f4b71..a418e2695b05 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -705,6 +705,7 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); +u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); #else /* CONFIG_PCI_MSI */ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) -- cgit v1.2.3 From 8b65db1e93a227ad9cc1b67cb221b06869f0b35f Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:17 +0200 Subject: irqchip/msi-lib: Add IRQ_DOMAIN_FLAG_FWNODE_PARENT handling In some irqchip implementations the fwnode representing the IRQdomain and the MSI controller fwnode do not match; in particular the IRQdomain fwnode is the MSI controller fwnode parent. To support selecting such IRQ domains, add a flag in core IRQ domain code that explicitly tells the MSI lib to use the parent fwnode while carrying out IRQ domain selection. Update the msi-lib select callback with the resulting logic. Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Thomas Gleixner Cc: Marc Zyngier Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-27-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 7387d183029b..25c7cbeed393 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -212,6 +212,9 @@ enum { /* Address and data pair is mutable when irq_set_affinity() */ IRQ_DOMAIN_FLAG_MSI_IMMUTABLE = (1 << 11), + /* IRQ domain requires parent fwnode matching */ + IRQ_DOMAIN_FLAG_FWNODE_PARENT = (1 << 12), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the -- cgit v1.2.3 From 57d72196dfc8502b7e376ecdffb11c4f8766f26d Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:18 +0200 Subject: irqchip/gic-v5: Add GICv5 ITS support The GICv5 architecture implements Interrupt Translation Service (ITS) components in order to translate events coming from peripherals into interrupt events delivered to the connected IRSes. Events (ie MSI memory writes to ITS translate frame), are translated by the ITS using tables kept in memory. ITS translation tables for peripherals is kept in memory storage (device table [DT] and Interrupt Translation Table [ITT]) that is allocated by the driver on boot. Both tables can be 1- or 2-level; the structure is chosen by the driver after probing the ITS HW parameters and checking the allowed table splits and supported {device/event}_IDbits. DT table entries are allocated on demand (ie when a device is probed); the DT table is sized using the number of supported deviceID bits in that that's a system design decision (ie the number of deviceID bits implemented should reflect the number of devices expected in a system) therefore it makes sense to allocate a DT table that can cater for the maximum number of devices. DT and ITT tables are allocated using the kmalloc interface; the allocation size may be smaller than a page or larger, and must provide contiguous memory pages. LPIs INTIDs backing the device events are allocated one-by-one and only upon Linux IRQ allocation; this to avoid preallocating a large number of LPIs to cover the HW device MSI vector size whereas few MSI entries are actually enabled by a device. ITS cacheability/shareability attributes are programmed according to the provided firmware ITS description. The GICv5 partially reuses the GICv3 ITS MSI parent infrastructure and adds functions required to retrieve the ITS translate frame addresses out of msi-map and msi-parent properties to implement the GICv5 ITS MSI parent callbacks. Co-developed-by: Sascha Bischoff Signed-off-by: Sascha Bischoff Co-developed-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Thomas Gleixner Cc: Marc Zyngier Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-28-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 157 +++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 680eed794a35..07b952549bfa 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -50,6 +50,8 @@ #define GICV5_IRS_IDR7 0x001c #define GICV5_IRS_CR0 0x0080 #define GICV5_IRS_CR1 0x0084 +#define GICV5_IRS_SYNCR 0x00c0 +#define GICV5_IRS_SYNC_STATUSR 0x00c4 #define GICV5_IRS_SPI_SELR 0x0108 #define GICV5_IRS_SPI_CFGR 0x0114 #define GICV5_IRS_SPI_STATUSR 0x0118 @@ -103,6 +105,10 @@ #define GICV5_IRS_CR1_OC GENMASK(3, 2) #define GICV5_IRS_CR1_SH GENMASK(1, 0) +#define GICV5_IRS_SYNCR_SYNC BIT(31) + +#define GICV5_IRS_SYNC_STATUSR_IDLE BIT(0) + #define GICV5_IRS_SPI_STATUSR_V BIT(1) #define GICV5_IRS_SPI_STATUSR_IDLE BIT(0) @@ -144,6 +150,104 @@ #define GICV5_ISTL1E_L2_ADDR_MASK GENMASK_ULL(55, 12) +/* + * ITS registers and tables structures + */ +#define GICV5_ITS_IDR1 0x0004 +#define GICV5_ITS_IDR2 0x0008 +#define GICV5_ITS_CR0 0x0080 +#define GICV5_ITS_CR1 0x0084 +#define GICV5_ITS_DT_BASER 0x00c0 +#define GICV5_ITS_DT_CFGR 0x00d0 +#define GICV5_ITS_DIDR 0x0100 +#define GICV5_ITS_EIDR 0x0108 +#define GICV5_ITS_INV_EVENTR 0x010c +#define GICV5_ITS_INV_DEVICER 0x0110 +#define GICV5_ITS_STATUSR 0x0120 +#define GICV5_ITS_SYNCR 0x0140 +#define GICV5_ITS_SYNC_STATUSR 0x0148 + +#define GICV5_ITS_IDR1_L2SZ GENMASK(10, 8) +#define GICV5_ITS_IDR1_ITT_LEVELS BIT(7) +#define GICV5_ITS_IDR1_DT_LEVELS BIT(6) +#define GICV5_ITS_IDR1_DEVICEID_BITS GENMASK(5, 0) + +#define GICV5_ITS_IDR1_L2SZ_SUPPORT_4KB(r) FIELD_GET(BIT(8), (r)) +#define GICV5_ITS_IDR1_L2SZ_SUPPORT_16KB(r) FIELD_GET(BIT(9), (r)) +#define GICV5_ITS_IDR1_L2SZ_SUPPORT_64KB(r) FIELD_GET(BIT(10), (r)) + +#define GICV5_ITS_IDR2_XDMN_EVENTs GENMASK(6, 5) +#define GICV5_ITS_IDR2_EVENTID_BITS GENMASK(4, 0) + +#define GICV5_ITS_CR0_IDLE BIT(1) +#define GICV5_ITS_CR0_ITSEN BIT(0) + +#define GICV5_ITS_CR1_ITT_RA BIT(7) +#define GICV5_ITS_CR1_DT_RA BIT(6) +#define GICV5_ITS_CR1_IC GENMASK(5, 4) +#define GICV5_ITS_CR1_OC GENMASK(3, 2) +#define GICV5_ITS_CR1_SH GENMASK(1, 0) + +#define GICV5_ITS_DT_CFGR_STRUCTURE BIT(16) +#define GICV5_ITS_DT_CFGR_L2SZ GENMASK(7, 6) +#define GICV5_ITS_DT_CFGR_DEVICEID_BITS GENMASK(5, 0) + +#define GICV5_ITS_DT_BASER_ADDR_MASK GENMASK_ULL(55, 3) + +#define GICV5_ITS_INV_DEVICER_I BIT(31) +#define GICV5_ITS_INV_DEVICER_EVENTID_BITS GENMASK(5, 1) +#define GICV5_ITS_INV_DEVICER_L1 BIT(0) + +#define GICV5_ITS_DIDR_DEVICEID GENMASK_ULL(31, 0) + +#define GICV5_ITS_EIDR_EVENTID GENMASK(15, 0) + +#define GICV5_ITS_INV_EVENTR_I BIT(31) +#define GICV5_ITS_INV_EVENTR_ITT_L2SZ GENMASK(2, 1) +#define GICV5_ITS_INV_EVENTR_L1 BIT(0) + +#define GICV5_ITS_STATUSR_IDLE BIT(0) + +#define GICV5_ITS_SYNCR_SYNC BIT_ULL(63) +#define GICV5_ITS_SYNCR_SYNCALL BIT_ULL(32) +#define GICV5_ITS_SYNCR_DEVICEID GENMASK_ULL(31, 0) + +#define GICV5_ITS_SYNC_STATUSR_IDLE BIT(0) + +#define GICV5_DTL1E_VALID BIT_ULL(0) +/* Note that there is no shift for the address by design */ +#define GICV5_DTL1E_L2_ADDR_MASK GENMASK_ULL(55, 3) +#define GICV5_DTL1E_SPAN GENMASK_ULL(63, 60) + +#define GICV5_DTL2E_VALID BIT_ULL(0) +#define GICV5_DTL2E_ITT_L2SZ GENMASK_ULL(2, 1) +/* Note that there is no shift for the address by design */ +#define GICV5_DTL2E_ITT_ADDR_MASK GENMASK_ULL(55, 3) +#define GICV5_DTL2E_ITT_DSWE BIT_ULL(57) +#define GICV5_DTL2E_ITT_STRUCTURE BIT_ULL(58) +#define GICV5_DTL2E_EVENT_ID_BITS GENMASK_ULL(63, 59) + +#define GICV5_ITTL1E_VALID BIT_ULL(0) +/* Note that there is no shift for the address by design */ +#define GICV5_ITTL1E_L2_ADDR_MASK GENMASK_ULL(55, 3) +#define GICV5_ITTL1E_SPAN GENMASK_ULL(63, 60) + +#define GICV5_ITTL2E_LPI_ID GENMASK_ULL(23, 0) +#define GICV5_ITTL2E_DAC GENMASK_ULL(29, 28) +#define GICV5_ITTL2E_VIRTUAL BIT_ULL(30) +#define GICV5_ITTL2E_VALID BIT_ULL(31) +#define GICV5_ITTL2E_VM_ID GENMASK_ULL(47, 32) + +#define GICV5_ITS_DT_ITT_CFGR_L2SZ_4k 0b00 +#define GICV5_ITS_DT_ITT_CFGR_L2SZ_16k 0b01 +#define GICV5_ITS_DT_ITT_CFGR_L2SZ_64k 0b10 + +#define GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR 0 +#define GICV5_ITS_DT_ITT_CFGR_STRUCTURE_TWO_LEVEL 1 + +#define GICV5_ITS_HWIRQ_DEVICE_ID GENMASK_ULL(31, 0) +#define GICV5_ITS_HWIRQ_EVENT_ID GENMASK_ULL(63, 32) + /* * Global Data structures and functions */ @@ -197,24 +301,77 @@ static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset, return 0; } +static inline int gicv5_wait_for_op_s(void __iomem *addr, u32 offset, + const char *reg_s, u32 mask) +{ + void __iomem *reg = addr + offset; + u32 val; + int ret; + + ret = readl_poll_timeout(reg, val, val & mask, 1, 10 * USEC_PER_MSEC); + if (unlikely(ret == -ETIMEDOUT)) { + pr_err_ratelimited("%s timeout...\n", reg_s); + return ret; + } + + return 0; +} + #define gicv5_wait_for_op_atomic(base, reg, mask, val) \ gicv5_wait_for_op_s_atomic(base, reg, #reg, mask, val) +#define gicv5_wait_for_op(base, reg, mask) \ + gicv5_wait_for_op_s(base, reg, #reg, mask) + void __init gicv5_init_lpi_domain(void); void __init gicv5_free_lpi_domain(void); int gicv5_irs_of_probe(struct device_node *parent); void gicv5_irs_remove(void); int gicv5_irs_enable(void); +void gicv5_irs_its_probe(void); int gicv5_irs_register_cpu(int cpuid); int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid); struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id); int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type); int gicv5_irs_iste_alloc(u32 lpi); +void gicv5_irs_syncr(void); + +struct gicv5_its_devtab_cfg { + union { + struct { + __le64 *devtab; + } linear; + struct { + __le64 *l1devtab; + __le64 **l2ptrs; + } l2; + }; + u32 cfgr; +}; + +struct gicv5_its_itt_cfg { + union { + struct { + __le64 *itt; + unsigned int num_ents; + } linear; + struct { + __le64 *l1itt; + __le64 **l2ptrs; + unsigned int num_l1_ents; + u8 l2sz; + } l2; + }; + u8 event_id_bits; + bool l2itt; +}; void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); int gicv5_alloc_lpi(void); void gicv5_free_lpi(u32 lpi); + +void __init gicv5_its_of_probe(struct device_node *parent); #endif -- cgit v1.2.3 From 695949d8b16f11f2f172d8d0c7ccc1ae09ed6cb7 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 3 Jul 2025 12:25:19 +0200 Subject: irqchip/gic-v5: Add GICv5 IWB support The GICv5 architecture implements the Interrupt Wire Bridge (IWB) in order to support wired interrupts that cannot be connected directly to an IRS and instead uses the ITS to translate a wire event into an IRQ signal. Add the wired-to-MSI IWB driver to manage IWB wired interrupts. An IWB is connected to an ITS and it has its own deviceID for all interrupt wires that it manages; the IWB input wire number must be exposed to the ITS as an eventID with a 1:1 mapping. This eventID is not programmable and therefore requires a new msi_alloc_info_t flag to make sure the ITS driver does not allocate an eventid for the wire but rather it uses the msi_alloc_info_t.hwirq number to gather the ITS eventID. Co-developed-by: Sascha Bischoff Signed-off-by: Sascha Bischoff Co-developed-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Cc: Thomas Gleixner Cc: Marc Zyngier Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-29-12e71f1b3528@kernel.org Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v5.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 07b952549bfa..68ddcdb1cec5 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -248,6 +248,23 @@ #define GICV5_ITS_HWIRQ_DEVICE_ID GENMASK_ULL(31, 0) #define GICV5_ITS_HWIRQ_EVENT_ID GENMASK_ULL(63, 32) +/* + * IWB registers + */ +#define GICV5_IWB_IDR0 0x0000 +#define GICV5_IWB_CR0 0x0080 +#define GICV5_IWB_WENABLE_STATUSR 0x00c0 +#define GICV5_IWB_WENABLER 0x2000 +#define GICV5_IWB_WTMR 0x4000 + +#define GICV5_IWB_IDR0_INT_DOMS GENMASK(14, 11) +#define GICV5_IWB_IDR0_IW_RANGE GENMASK(10, 0) + +#define GICV5_IWB_CR0_IDLE BIT(1) +#define GICV5_IWB_CR0_IWBEN BIT(0) + +#define GICV5_IWB_WENABLE_STATUSR_IDLE BIT(0) + /* * Global Data structures and functions */ -- cgit v1.2.3 From 1ec38ce3d024bebdff2a9ffb526e4d198605204d Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Fri, 27 Jun 2025 10:09:01 +0000 Subject: irqchip/gic-v5: Populate struct gic_kvm_info Populate the gic_kvm_info struct based on support for FEAT_GCIE_LEGACY. The struct is used by KVM to probe for a compatible GIC. Co-authored-by: Timothy Hayes Signed-off-by: Timothy Hayes Signed-off-by: Sascha Bischoff Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20250627100847.1022515-3-sascha.bischoff@arm.com Signed-off-by: Oliver Upton --- include/linux/irqchip/arm-vgic-info.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index a75b2c7de69d..ca1713fac6e3 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -15,6 +15,8 @@ enum gic_type { GIC_V2, /* Full GICv3, optionally with v2 compat */ GIC_V3, + /* Full GICv5, optionally with v3 compat */ + GIC_V5, }; struct gic_kvm_info { @@ -34,6 +36,8 @@ struct gic_kvm_info { bool has_v4_1; /* Deactivation impared, subpar stuff */ bool no_hw_deactivation; + /* v3 compat support (GICv5 hosts, only) */ + bool has_gcie_v3_compat; }; #ifdef CONFIG_KVM -- cgit v1.2.3 From ee4a2e08c10188f02fe3fb36b6beddc3c2fdb287 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Jul 2025 19:27:41 +1000 Subject: entry: Add arch_in_rcu_eqs() All architectures have an interruptible RCU extended quiescent state (EQS) as part of their idle sequences, where interrupts can occur without RCU watching. Entry code must account for this and wake RCU as necessary; the common entry code deals with this in irqentry_enter() by treating any interrupt from an idle thread as potentially having occurred within an EQS and waking RCU for the duration of the interrupt via rcu_irq_enter() .. rcu_irq_exit(). Some architectures may have other interruptible EQSs which require similar treatment. For example, on s390 it is necessary to enable interrupts around guest entry in the middle of a period where core KVM code has entered an EQS. So that architectures can wake RCU in these cases, this patch adds a new arch_in_rcu_eqs() hook to the common entry code which is checked in addition to the existing is_idle_thread() check, with RCU woken if either returns true. A default implementation is provided which always returns false, which suffices for most architectures. As no architectures currently implement arch_in_rcu_eqs(), there should be no functional change as a result of this patch alone. A subsequent patch will add an s390 implementation to fix a latent bug with missing RCU wakeups. [ajd@linux.ibm.com: rebase, fix commit message] Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Claudio Imbrenda Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Janosch Frank Reviewed-by: Christian Borntraeger Signed-off-by: Andrew Donnellan Acked-by: Peter Zijlstra (Intel) Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20250708092742.104309-2-ajd@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20250708092742.104309-2-ajd@linux.ibm.com> --- include/linux/entry-common.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index f94f3fdf15fc..3bf99cbad8a3 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -86,6 +86,22 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs); static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} #endif +/** + * arch_in_rcu_eqs - Architecture specific check for RCU extended quiescent + * states. + * + * Returns: true if the CPU is potentially in an RCU EQS, false otherwise. + * + * Architectures only need to define this if threads other than the idle thread + * may have an interruptible EQS. This does not need to handle idle threads. It + * is safe to over-estimate at the cost of redundant RCU management work. + * + * Invoked from irqentry_enter() + */ +#ifndef arch_in_rcu_eqs +static __always_inline bool arch_in_rcu_eqs(void) { return false; } +#endif + /** * enter_from_user_mode - Establish state when coming from user mode * -- cgit v1.2.3