diff options
Diffstat (limited to 'drivers/xen')
-rw-r--r-- | drivers/xen/Makefile | 4 | ||||
-rw-r--r-- | drivers/xen/balloon.c | 375 | ||||
-rw-r--r-- | drivers/xen/events.c | 826 | ||||
-rw-r--r-- | drivers/xen/gntdev.c | 44 | ||||
-rw-r--r-- | drivers/xen/grant-table.c | 31 | ||||
-rw-r--r-- | drivers/xen/manage.c | 164 | ||||
-rw-r--r-- | drivers/xen/platform-pci.c | 3 | ||||
-rw-r--r-- | drivers/xen/xen-balloon.c | 256 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_probe.c | 12 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_probe.h | 3 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_probe_frontend.c | 11 |
11 files changed, 990 insertions, 739 deletions
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 9585a1da52c6..f420f1ff7f13 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,4 @@ -obj-y += grant-table.o features.o events.o manage.o +obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -7,7 +7,7 @@ CFLAGS_features.o := $(nostackp) obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o -obj-$(CONFIG_XEN_BALLOON) += balloon.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43f9f02c7db0..043af8ad6b60 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -1,6 +1,4 @@ /****************************************************************************** - * balloon.c - * * Xen balloon driver - enables returning/claiming memory to/from Xen. * * Copyright (c) 2003, B Dragovic @@ -33,7 +31,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> @@ -42,13 +39,11 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/sysdev.h> #include <linux/gfp.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/e820.h> @@ -58,35 +53,29 @@ #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> -#include <xen/xenbus.h> +#include <xen/balloon.h> #include <xen/features.h> #include <xen/page.h> -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) - -#define BALLOON_CLASS_NAME "xen_memory" +/* + * balloon_process() state: + * + * BP_DONE: done or nothing to do, + * BP_EAGAIN: error, go to sleep, + * BP_ECANCELED: error, balloon operation canceled. + */ -struct balloon_stats { - /* We aim for 'current allocation' == 'target allocation'. */ - unsigned long current_pages; - unsigned long target_pages; - /* - * Drivers may alter the memory reservation independently, but they - * must inform the balloon driver so we avoid hitting the hard limit. - */ - unsigned long driver_pages; - /* Number of pages in high- and low-memory balloons. */ - unsigned long balloon_low; - unsigned long balloon_high; +enum bp_state { + BP_DONE, + BP_EAGAIN, + BP_ECANCELED }; -static DEFINE_MUTEX(balloon_mutex); - -static struct sys_device balloon_sysdev; -static int register_balloon(struct sys_device *sysdev); +static DEFINE_MUTEX(balloon_mutex); -static struct balloon_stats balloon_stats; +struct balloon_stats balloon_stats; +EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; @@ -104,8 +93,7 @@ static LIST_HEAD(ballooned_pages); /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); -static DECLARE_WORK(balloon_worker, balloon_process); -static struct timer_list balloon_timer; +static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ @@ -140,14 +128,17 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(bool prefer_highmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - page = list_entry(ballooned_pages.next, struct page, lru); + if (prefer_highmem) + page = list_entry(ballooned_pages.prev, struct page, lru); + else + page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); if (PageHighMem(page)) { @@ -177,9 +168,29 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static void balloon_alarm(unsigned long unused) +static enum bp_state update_schedule(enum bp_state state) { - schedule_work(&balloon_worker); + if (state == BP_DONE) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_DONE; + } + + ++balloon_stats.retry_count; + + if (balloon_stats.max_retry_count != RETRY_UNLIMITED && + balloon_stats.retry_count > balloon_stats.max_retry_count) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_ECANCELED; + } + + balloon_stats.schedule_delay <<= 1; + + if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) + balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; + + return BP_EAGAIN; } static unsigned long current_target(void) @@ -194,11 +205,11 @@ static unsigned long current_target(void) return target; } -static int increase_reservation(unsigned long nr_pages) +static enum bp_state increase_reservation(unsigned long nr_pages) { + int rc; unsigned long pfn, i; struct page *page; - long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, @@ -210,7 +221,10 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { - BUG_ON(page == NULL); + if (!page) { + nr_pages = i; + break; + } frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } @@ -218,11 +232,11 @@ static int increase_reservation(unsigned long nr_pages) set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < 0) - goto out; + if (rc <= 0) + return BP_EAGAIN; for (i = 0; i < rc; i++) { - page = balloon_retrieve(); + page = balloon_retrieve(false); BUG_ON(page == NULL); pfn = page_to_pfn(page); @@ -232,7 +246,7 @@ static int increase_reservation(unsigned long nr_pages) set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ - if (pfn < max_low_pfn) { + if (!xen_hvm_domain() && pfn < max_low_pfn) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -249,15 +263,14 @@ static int increase_reservation(unsigned long nr_pages) balloon_stats.current_pages += rc; - out: - return rc < 0 ? rc : rc != nr_pages; + return BP_DONE; } -static int decrease_reservation(unsigned long nr_pages) +static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { + enum bp_state state = BP_DONE; unsigned long pfn, i; struct page *page; - int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -269,9 +282,9 @@ static int decrease_reservation(unsigned long nr_pages) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(GFP_BALLOON)) == NULL) { + if ((page = alloc_page(gfp)) == NULL) { nr_pages = i; - need_sleep = 1; + state = BP_EAGAIN; break; } @@ -280,7 +293,7 @@ static int decrease_reservation(unsigned long nr_pages) scrub_page(page); - if (!PageHighMem(page)) { + if (!xen_hvm_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), __pte_ma(0), 0); @@ -296,7 +309,7 @@ static int decrease_reservation(unsigned long nr_pages) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(pfn_to_page(pfn)); } @@ -307,7 +320,7 @@ static int decrease_reservation(unsigned long nr_pages) balloon_stats.current_pages -= nr_pages; - return need_sleep; + return state; } /* @@ -318,99 +331,125 @@ static int decrease_reservation(unsigned long nr_pages) */ static void balloon_process(struct work_struct *work) { - int need_sleep = 0; + enum bp_state state = BP_DONE; long credit; mutex_lock(&balloon_mutex); do { credit = current_target() - balloon_stats.current_pages; + if (credit > 0) - need_sleep = (increase_reservation(credit) != 0); + state = increase_reservation(credit); + if (credit < 0) - need_sleep = (decrease_reservation(-credit) != 0); + state = decrease_reservation(-credit, GFP_BALLOON); + + state = update_schedule(state); #ifndef CONFIG_PREEMPT if (need_resched()) schedule(); #endif - } while ((credit != 0) && !need_sleep); + } while (credit && state == BP_DONE); /* Schedule more work if there is some still to be done. */ - if (current_target() != balloon_stats.current_pages) - mod_timer(&balloon_timer, jiffies + HZ); + if (state == BP_EAGAIN) + schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); mutex_unlock(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ -static void balloon_set_new_target(unsigned long target) +void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_work(&balloon_worker); + schedule_delayed_work(&balloon_worker, 0); } +EXPORT_SYMBOL_GPL(balloon_set_new_target); -static struct xenbus_watch target_watch = -{ - .node = "memory/target" -}; - -/* React to a change in the target key */ -static void watch_target(struct xenbus_watch *watch, - const char **vec, unsigned int len) +/** + * alloc_xenballooned_pages - get pages that have been ballooned out + * @nr_pages: Number of pages to get + * @pages: pages returned + * @return 0 on success, error otherwise + */ +int alloc_xenballooned_pages(int nr_pages, struct page** pages) { - unsigned long long new_target; - int err; - - err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); - if (err != 1) { - /* This is ok (for domain0 at least) - so just return */ - return; + int pgno = 0; + struct page* page; + mutex_lock(&balloon_mutex); + while (pgno < nr_pages) { + page = balloon_retrieve(true); + if (page) { + pages[pgno++] = page; + } else { + enum bp_state st; + st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (st != BP_DONE) + goto out_undo; + } } - - /* The given memory/target value is in KiB, so it needs converting to - * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. - */ - balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); + mutex_unlock(&balloon_mutex); + return 0; + out_undo: + while (pgno) + balloon_append(pages[--pgno]); + /* Free the memory back to the kernel soon */ + schedule_delayed_work(&balloon_worker, 0); + mutex_unlock(&balloon_mutex); + return -ENOMEM; } +EXPORT_SYMBOL(alloc_xenballooned_pages); -static int balloon_init_watcher(struct notifier_block *notifier, - unsigned long event, - void *data) +/** + * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void free_xenballooned_pages(int nr_pages, struct page** pages) { - int err; + int i; - err = register_xenbus_watch(&target_watch); - if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + mutex_lock(&balloon_mutex); - return NOTIFY_DONE; -} + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + balloon_append(pages[i]); + } + + /* The balloon may be too large now. Shrink it if needed. */ + if (current_target() != balloon_stats.current_pages) + schedule_delayed_work(&balloon_worker, 0); -static struct notifier_block xenstore_notifier; + mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL(free_xenballooned_pages); static int __init balloon_init(void) { - unsigned long pfn, extra_pfn_end; + unsigned long pfn, nr_pages, extra_pfn_end; struct page *page; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; - pr_info("xen_balloon: Initialising balloon driver.\n"); + pr_info("xen/balloon: Initialising balloon driver.\n"); - balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); + if (xen_pv_domain()) + nr_pages = xen_start_info->nr_pages; + else + nr_pages = max_pfn; + balloon_stats.current_pages = min(nr_pages, max_pfn); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; - balloon_stats.driver_pages = 0UL; - - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; - register_balloon(&balloon_sysdev); + balloon_stats.schedule_delay = 1; + balloon_stats.max_schedule_delay = 32; + balloon_stats.retry_count = 1; + balloon_stats.max_retry_count = RETRY_UNLIMITED; /* * Initialise the balloon with excess memory space. We need @@ -432,153 +471,9 @@ static int __init balloon_init(void) __balloon_append(page); } - target_watch.callback = watch_target; - xenstore_notifier.notifier_call = balloon_init_watcher; - - register_xenstore_notifier(&xenstore_notifier); - return 0; } subsys_initcall(balloon_init); -static void balloon_exit(void) -{ - /* XXX - release balloon here */ - return; -} - -module_exit(balloon_exit); - -#define BALLOON_SHOW(name, format, args...) \ - static ssize_t show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } \ - static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) - -BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); -BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); -BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); - -static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); -} - -static ssize_t store_target_kb(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, - show_target_kb, store_target_kb); - - -static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", - (unsigned long long)balloon_stats.target_pages - << PAGE_SHIFT); -} - -static ssize_t store_target(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = memparse(buf, &endchar); - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, - show_target, store_target); - - -static struct sysdev_attribute *balloon_attrs[] = { - &attr_target_kb, - &attr_target, -}; - -static struct attribute *balloon_info_attrs[] = { - &attr_current_kb.attr, - &attr_low_kb.attr, - &attr_high_kb.attr, - &attr_driver_kb.attr, - NULL -}; - -static struct attribute_group balloon_info_group = { - .name = "info", - .attrs = balloon_info_attrs, -}; - -static struct sysdev_class balloon_sysdev_class = { - .name = BALLOON_CLASS_NAME, -}; - -static int register_balloon(struct sys_device *sysdev) -{ - int i, error; - - error = sysdev_class_register(&balloon_sysdev_class); - if (error) - return error; - - sysdev->id = 0; - sysdev->cls = &balloon_sysdev_class; - - error = sysdev_register(sysdev); - if (error) { - sysdev_class_unregister(&balloon_sysdev_class); - return error; - } - - for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { - error = sysdev_create_file(sysdev, balloon_attrs[i]); - if (error) - goto fail; - } - - error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); - if (error) - goto fail; - - return 0; - - fail: - while (--i >= 0) - sysdev_remove_file(sysdev, balloon_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&balloon_sysdev_class); - return error; -} - MODULE_LICENSE("GPL"); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 74681478100a..35e02a10110b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -5,7 +5,7 @@ * domain gets 1024 event channels, but NR_IRQ is not that large, we * must dynamically map irqs<->event channels. The event channels * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is recieved, it is mapped to an irq and sent + * chip. When an event is received, it is mapped to an irq and sent * through the normal interrupt processing path. * * There are four kinds of events which can be mapped to an event @@ -56,6 +56,8 @@ */ static DEFINE_SPINLOCK(irq_mapping_update_lock); +static LIST_HEAD(xen_irq_list_head); + /* IRQ <-> VIRQ mapping. */ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; @@ -85,7 +87,9 @@ enum xen_irq_type { */ struct irq_info { + struct list_head list; enum xen_irq_type type; /* type */ + unsigned irq; unsigned short evtchn; /* event channel */ unsigned short cpu; /* cpu bound */ @@ -97,29 +101,17 @@ struct irq_info unsigned short gsi; unsigned char vector; unsigned char flags; + uint16_t domid; } pirq; } u; }; #define PIRQ_NEEDS_EOI (1 << 0) #define PIRQ_SHAREABLE (1 << 1) -static struct irq_info *irq_info; -static int *pirq_to_irq; - static int *evtchn_to_irq; -struct cpu_evtchn_s { - unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; -}; - -static __initdata struct cpu_evtchn_s init_evtchn_mask = { - .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, -}; -static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; -static inline unsigned long *cpu_evtchn_mask(int cpu) -{ - return cpu_evtchn_mask_p[cpu].bits; -} +static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], + cpu_evtchn_mask); /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -128,46 +120,88 @@ static struct irq_chip xen_dynamic_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; -/* Constructor for packed IRQ information. */ -static struct irq_info mk_unbound_info(void) +/* Get info for IRQ */ +static struct irq_info *info_for_irq(unsigned irq) { - return (struct irq_info) { .type = IRQT_UNBOUND }; + return irq_get_handler_data(irq); } -static struct irq_info mk_evtchn_info(unsigned short evtchn) +/* Constructors for packed IRQ information. */ +static void xen_irq_info_common_init(struct irq_info *info, + unsigned irq, + enum xen_irq_type type, + unsigned short evtchn, + unsigned short cpu) { - return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn, - .cpu = 0 }; + + BUG_ON(info->type != IRQT_UNBOUND && info->type != type); + + info->type = type; + info->irq = irq; + info->evtchn = evtchn; + info->cpu = cpu; + + evtchn_to_irq[evtchn] = irq; } -static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi) +static void xen_irq_info_evtchn_init(unsigned irq, + unsigned short evtchn) { - return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn, - .cpu = 0, .u.ipi = ipi }; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); } -static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) +static void xen_irq_info_ipi_init(unsigned cpu, + unsigned irq, + unsigned short evtchn, + enum ipi_vector ipi) { - return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn, - .cpu = 0, .u.virq = virq }; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); + + info->u.ipi = ipi; + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; } -static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq, - unsigned short gsi, unsigned short vector) +static void xen_irq_info_virq_init(unsigned cpu, + unsigned irq, + unsigned short evtchn, + unsigned short virq) { - return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, - .cpu = 0, - .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } }; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); + + info->u.virq = virq; + + per_cpu(virq_to_irq, cpu)[virq] = irq; } -/* - * Accessors for packed IRQ information. - */ -static struct irq_info *info_for_irq(unsigned irq) +static void xen_irq_info_pirq_init(unsigned irq, + unsigned short evtchn, + unsigned short pirq, + unsigned short gsi, + unsigned short vector, + uint16_t domid, + unsigned char flags) { - return &irq_info[irq]; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); + + info->u.pirq.pirq = pirq; + info->u.pirq.gsi = gsi; + info->u.pirq.vector = vector; + info->u.pirq.domid = domid; + info->u.pirq.flags = flags; } +/* + * Accessors for packed IRQ information. + */ static unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) @@ -212,26 +246,6 @@ static unsigned pirq_from_irq(unsigned irq) return info->u.pirq.pirq; } -static unsigned gsi_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_PIRQ); - - return info->u.pirq.gsi; -} - -static unsigned vector_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_PIRQ); - - return info->u.pirq.vector; -} - static enum xen_irq_type type_from_irq(unsigned irq) { return info_for_irq(irq)->type; @@ -267,7 +281,7 @@ static inline unsigned long active_evtchns(unsigned int cpu, unsigned int idx) { return (sh->evtchn_pending[idx] & - cpu_evtchn_mask(cpu)[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & ~sh->evtchn_mask[idx]); } @@ -277,31 +291,31 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); + cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); #endif - clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); - set_bit(chn, cpu_evtchn_mask(cpu)); + clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); + set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); - irq_info[irq].cpu = cpu; + info_for_irq(irq)->cpu = cpu; } static void init_evtchn_cpu_bindings(void) { int i; #ifdef CONFIG_SMP - struct irq_desc *desc; + struct irq_info *info; /* By default all event channels notify CPU#0. */ - for_each_irq_desc(i, desc) { - cpumask_copy(desc->affinity, cpumask_of(0)); + list_for_each_entry(info, &xen_irq_list_head, list) { + struct irq_desc *desc = irq_to_desc(info->irq); + cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); } #endif for_each_possible_cpu(i) - memset(cpu_evtchn_mask(i), - (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s)); - + memset(per_cpu(cpu_evtchn_mask, i), + (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); } static inline void clear_evtchn(int port) @@ -376,81 +390,90 @@ static void unmask_evtchn(int port) put_cpu(); } -static int get_nr_hw_irqs(void) +static void xen_irq_init(unsigned irq) { - int ret = 1; + struct irq_info *info; + struct irq_desc *desc = irq_to_desc(irq); -#ifdef CONFIG_X86_IO_APIC - ret = get_nr_irqs_gsi(); +#ifdef CONFIG_SMP + /* By default all event channels notify CPU#0. */ + cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); #endif - return ret; + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info == NULL) + panic("Unable to allocate metadata for IRQ%d\n", irq); + + info->type = IRQT_UNBOUND; + + irq_set_handler_data(irq, info); + + list_add_tail(&info->list, &xen_irq_list_head); } -static int find_unbound_pirq(int type) +static int __must_check xen_allocate_irq_dynamic(void) { - int rc, i; - struct physdev_get_free_pirq op_get_free_pirq; - op_get_free_pirq.type = type; + int first = 0; + int irq; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - if (!rc) - return op_get_free_pirq.pirq; +#ifdef CONFIG_X86_IO_APIC + /* + * For an HVM guest or domain 0 which see "real" (emulated or + * actual respectively) GSIs we allocate dynamic IRQs + * e.g. those corresponding to event channels or MSIs + * etc. from the range above those "real" GSIs to avoid + * collisions. + */ + if (xen_initial_domain() || xen_hvm_domain()) + first = get_nr_irqs_gsi(); +#endif - for (i = 0; i < nr_irqs; i++) { - if (pirq_to_irq[i] < 0) - return i; - } - return -1; + irq = irq_alloc_desc_from(first, -1); + + xen_irq_init(irq); + + return irq; } -static int find_unbound_irq(void) +static int __must_check xen_allocate_irq_gsi(unsigned gsi) { - struct irq_data *data; - int irq, res; - int bottom = get_nr_hw_irqs(); - int top = nr_irqs-1; - - if (bottom == nr_irqs) - goto no_irqs; + int irq; - /* This loop starts from the top of IRQ space and goes down. - * We need this b/c if we have a PCI device in a Xen PV guest - * we do not have an IO-APIC (though the backend might have them) - * mapped in. To not have a collision of physical IRQs with the Xen - * event channels start at the top of the IRQ space for virtual IRQs. + /* + * A PV guest has no concept of a GSI (since it has no ACPI + * nor access to/knowledge of the physical APICs). Therefore + * all IRQs are dynamically allocated from the entire IRQ + * space. */ - for (irq = top; irq > bottom; irq--) { - data = irq_get_irq_data(irq); - /* only 15->0 have init'd desc; handle irq > 16 */ - if (!data) - break; - if (data->chip == &no_irq_chip) - break; - if (data->chip != &xen_dynamic_chip) - continue; - if (irq_info[irq].type == IRQT_UNBOUND) - return irq; - } + if (xen_pv_domain() && !xen_initial_domain()) + return xen_allocate_irq_dynamic(); - if (irq == bottom) - goto no_irqs; - - res = irq_alloc_desc_at(irq, -1); + /* Legacy IRQ descriptors are already allocated by the arch. */ + if (gsi < NR_IRQS_LEGACY) + irq = gsi; + else + irq = irq_alloc_desc_at(gsi, -1); - if (WARN_ON(res != irq)) - return -1; + xen_irq_init(irq); return irq; - -no_irqs: - panic("No available IRQ to bind to: increase nr_irqs!\n"); } -static bool identity_mapped_irq(unsigned irq) +static void xen_free_irq(unsigned irq) { - /* identity map all the hardware irqs */ - return irq < get_nr_hw_irqs(); + struct irq_info *info = irq_get_handler_data(irq); + + list_del(&info->list); + + irq_set_handler_data(irq, NULL); + + kfree(info); + + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq < NR_IRQS_LEGACY) + return; + + irq_free_desc(irq); } static void pirq_unmask_notify(int irq) @@ -486,7 +509,7 @@ static bool probing_irq(int irq) return desc && desc->action == NULL; } -static unsigned int startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(unsigned int irq) { struct evtchn_bind_pirq bind_pirq; struct irq_info *info = info_for_irq(irq); @@ -524,9 +547,15 @@ out: return 0; } -static void shutdown_pirq(unsigned int irq) +static unsigned int startup_pirq(struct irq_data *data) +{ + return __startup_pirq(data->irq); +} + +static void shutdown_pirq(struct irq_data *data) { struct evtchn_close close; + unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); int evtchn = evtchn_from_irq(irq); @@ -546,20 +575,20 @@ static void shutdown_pirq(unsigned int irq) info->evtchn = 0; } -static void enable_pirq(unsigned int irq) +static void enable_pirq(struct irq_data *data) { - startup_pirq(irq); + startup_pirq(data); } -static void disable_pirq(unsigned int irq) +static void disable_pirq(struct irq_data *data) { } -static void ack_pirq(unsigned int irq) +static void ack_pirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); - move_native_irq(irq); + irq_move_irq(data); if (VALID_EVTCHN(evtchn)) { mask_evtchn(evtchn); @@ -567,70 +596,41 @@ static void ack_pirq(unsigned int irq) } } -static void end_pirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - struct irq_desc *desc = irq_to_desc(irq); - - if (WARN_ON(!desc)) - return; - - if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == - (IRQ_DISABLED|IRQ_PENDING)) { - shutdown_pirq(irq); - } else if (VALID_EVTCHN(evtchn)) { - unmask_evtchn(evtchn); - pirq_unmask_notify(irq); - } -} - static int find_irq_by_gsi(unsigned gsi) { - int irq; - - for (irq = 0; irq < nr_irqs; irq++) { - struct irq_info *info = info_for_irq(irq); + struct irq_info *info; - if (info == NULL || info->type != IRQT_PIRQ) + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) continue; - if (gsi_from_irq(irq) == gsi) - return irq; + if (info->u.pirq.gsi == gsi) + return info->irq; } return -1; } -int xen_allocate_pirq(unsigned gsi, int shareable, char *name) +int xen_allocate_pirq_gsi(unsigned gsi) { - return xen_map_pirq_gsi(gsi, gsi, shareable, name); + return gsi; } -/* xen_map_pirq_gsi might allocate irqs from the top down, as a - * consequence don't assume that the irq number returned has a low value - * or can be used as a pirq number unless you know otherwise. - * - * One notable exception is when xen_map_pirq_gsi is called passing an - * hardware gsi as argument, in that case the irq number returned - * matches the gsi number passed as second argument. +/* + * Do not make any assumptions regarding the relationship between the + * IRQ number returned here and the Xen pirq argument. * * Note: We don't assign an event channel until the irq actually started * up. Return an existing irq if we've already got one for the gsi. */ -int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) +int xen_bind_pirq_gsi_to_irq(unsigned gsi, + unsigned pirq, int shareable, char *name) { - int irq = 0; + int irq = -1; struct physdev_irq irq_op; spin_lock(&irq_mapping_update_lock); - if ((pirq > nr_irqs) || (gsi > nr_irqs)) { - printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n", - pirq > nr_irqs ? "pirq" :"", - gsi > nr_irqs ? "gsi" : ""); - goto out; - } - irq = find_irq_by_gsi(gsi); if (irq != -1) { printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", @@ -638,17 +638,12 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) goto out; /* XXX need refcount? */ } - /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore - * we are using the !xen_initial_domain() to drop in the function.*/ - if (identity_mapped_irq(gsi) || (!xen_initial_domain() && - xen_pv_domain())) { - irq = gsi; - irq_alloc_desc_at(irq, -1); - } else - irq = find_unbound_irq(); + irq = xen_allocate_irq_gsi(gsi); + if (irq < 0) + goto out; - set_irq_chip_and_handler_name(irq, &xen_pirq_chip, - handle_level_irq, name); + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, + name); irq_op.irq = irq; irq_op.vector = 0; @@ -658,14 +653,13 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) * this in the priv domain. */ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - irq_free_desc(irq); + xen_free_irq(irq); irq = -ENOSPC; goto out; } - irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector); - irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0; - pirq_to_irq[pirq] = irq; + xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, + shareable ? PIRQ_SHAREABLE : 0); out: spin_unlock(&irq_mapping_update_lock); @@ -674,87 +668,46 @@ out: } #ifdef CONFIG_PCI_MSI -#include <linux/msi.h> -#include "../pci/msi.h" - -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) { - spin_lock(&irq_mapping_update_lock); - - if (alloc & XEN_ALLOC_IRQ) { - *irq = find_unbound_irq(); - if (*irq == -1) - goto out; - } - - if (alloc & XEN_ALLOC_PIRQ) { - *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); - if (*pirq == -1) - goto out; - } + int rc; + struct physdev_get_free_pirq op_get_free_pirq; - set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, - handle_level_irq, name); + op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); - pirq_to_irq[*pirq] = *irq; + WARN_ONCE(rc == -ENOSYS, + "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); -out: - spin_unlock(&irq_mapping_update_lock); + return rc ? -1 : op_get_free_pirq.pirq; } -int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, int vector, const char *name, + domid_t domid) { - int irq = -1; - struct physdev_map_pirq map_irq; - int rc; - int pos; - u32 table_offset, bir; - - memset(&map_irq, 0, sizeof(map_irq)); - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_MSI; - map_irq.index = -1; - map_irq.pirq = -1; - map_irq.bus = dev->bus->number; - map_irq.devfn = dev->devfn; - - if (type == PCI_CAP_ID_MSIX) { - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - - pci_read_config_dword(dev, msix_table_offset_reg(pos), - &table_offset); - bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); - - map_irq.table_base = pci_resource_start(dev, bir); - map_irq.entry_nr = msidesc->msi_attrib.entry_nr; - } + int irq, ret; spin_lock(&irq_mapping_update_lock); - irq = find_unbound_irq(); - + irq = xen_allocate_irq_dynamic(); if (irq == -1) goto out; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - printk(KERN_WARNING "xen map irq failed %d\n", rc); - - irq_free_desc(irq); - - irq = -1; - goto out; - } - irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); - - set_irq_chip_and_handler_name(irq, &xen_pirq_chip, - handle_level_irq, - (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, + name); + xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); + ret = irq_set_msi_desc(irq, msidesc); + if (ret < 0) + goto error_irq; out: spin_unlock(&irq_mapping_update_lock); return irq; +error_irq: + spin_unlock(&irq_mapping_update_lock); + xen_free_irq(irq); + return -1; } #endif @@ -773,38 +726,56 @@ int xen_destroy_irq(int irq) if (xen_initial_domain()) { unmap_irq.pirq = info->u.pirq.pirq; - unmap_irq.domid = DOMID_SELF; + unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); - if (rc) { + /* If another domain quits without making the pci_disable_msix + * call, the Xen hypervisor takes care of freeing the PIRQs + * (free_domain_pirqs). + */ + if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) + printk(KERN_INFO "domain %d does not have %d anymore\n", + info->u.pirq.domid, info->u.pirq.pirq); + else if (rc) { printk(KERN_WARNING "unmap irq failed %d\n", rc); goto out; } - pirq_to_irq[info->u.pirq.pirq] = -1; } - irq_info[irq] = mk_unbound_info(); - irq_free_desc(irq); + xen_free_irq(irq); out: spin_unlock(&irq_mapping_update_lock); return rc; } -int xen_vector_from_irq(unsigned irq) +int xen_irq_from_pirq(unsigned pirq) { - return vector_from_irq(irq); -} + int irq; -int xen_gsi_from_irq(unsigned irq) -{ - return gsi_from_irq(irq); + struct irq_info *info; + + spin_lock(&irq_mapping_update_lock); + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info == NULL || info->type != IRQT_PIRQ) + continue; + irq = info->irq; + if (info->u.pirq.pirq == pirq) + goto out; + } + irq = -1; +out: + spin_unlock(&irq_mapping_update_lock); + + return irq; } -int xen_irq_from_pirq(unsigned pirq) + +int xen_pirq_from_irq(unsigned irq) { - return pirq_to_irq[pirq]; + return pirq_from_irq(irq); } - +EXPORT_SYMBOL_GPL(xen_pirq_from_irq); int bind_evtchn_to_irq(unsigned int evtchn) { int irq; @@ -814,15 +785,17 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq = evtchn_to_irq[evtchn]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); + if (irq == -1) + goto out; - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_fasteoi_irq, "event"); - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_evtchn_info(evtchn); + xen_irq_info_evtchn_init(irq, evtchn); } +out: spin_unlock(&irq_mapping_update_lock); return irq; @@ -839,11 +812,11 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) irq = per_cpu(ipi_to_irq, cpu)[ipi]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); if (irq < 0) goto out; - set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(irq, &xen_percpu_chip, handle_percpu_irq, "ipi"); bind_ipi.vcpu = cpu; @@ -852,9 +825,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_ipi_info(evtchn, ipi); - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } @@ -864,6 +835,21 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; } +static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); +} + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { @@ -875,9 +861,11 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); + if (irq == -1) + goto out; - set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(irq, &xen_percpu_chip, handle_percpu_irq, "virq"); bind_virq.virq = virq; @@ -887,14 +875,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) BUG(); evtchn = bind_virq.port; - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_virq_info(evtchn, virq); - - per_cpu(virq_to_irq, cpu)[virq] = irq; + xen_irq_info_virq_init(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } +out: spin_unlock(&irq_mapping_update_lock); return irq; @@ -931,11 +917,9 @@ static void unbind_from_irq(unsigned int irq) evtchn_to_irq[evtchn] = -1; } - if (irq_info[irq].type != IRQT_UNBOUND) { - irq_info[irq] = mk_unbound_info(); + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - irq_free_desc(irq); - } + xen_free_irq(irq); spin_unlock(&irq_mapping_update_lock); } @@ -945,10 +929,11 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, unsigned long irqflags, const char *devname, void *dev_id) { - unsigned int irq; - int retval; + int irq, retval; irq = bind_evtchn_to_irq(evtchn); + if (irq < 0) + return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -959,14 +944,38 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, } EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { - unsigned int irq; - int retval; + int irq, retval; irq = bind_virq_to_irq(virq, cpu); + if (irq < 0) + return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -990,7 +999,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; - irqflags |= IRQF_NO_SUSPEND; + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -1018,7 +1027,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) { struct shared_info *sh = HYPERVISOR_shared_info; int cpu = smp_processor_id(); - unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); + unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); int i; unsigned long flags; static DEFINE_SPINLOCK(debug_lock); @@ -1096,6 +1105,13 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) } static DEFINE_PER_CPU(unsigned, xed_nesting_count); +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~0UL) << i)) /* * Search the CPUs pending events bitmasks. For each one found, map @@ -1108,6 +1124,9 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count); */ static void __xen_evtchn_do_upcall(void) { + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; int cpu = get_cpu(); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); @@ -1126,17 +1145,57 @@ static void __xen_evtchn_do_upcall(void) wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); + unsigned long words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = __ffs(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + if (word_idx == start_word_idx) { + /* We scan the starting word in two parts */ + if (i == 0) + /* 1st time: start in the middle */ + bit_idx = start_bit_idx; + else + /* 2nd time: mask bits done already */ + bit_idx &= (1UL << start_bit_idx) - 1; + } - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; + do { + unsigned long bits; + int port, irq; struct irq_desc *desc; + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = __ffs(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_LONG) + bit_idx; + irq = evtchn_to_irq[port]; + mask_evtchn(port); clear_evtchn(port); @@ -1145,7 +1204,21 @@ static void __xen_evtchn_do_upcall(void) if (desc) generic_handle_irq_desc(irq, desc); } - } + + bit_idx = (bit_idx + 1) % BITS_PER_LONG; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_LONG); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_LONG; } BUG_ON(!irqs_disabled()); @@ -1195,8 +1268,7 @@ void rebind_evtchn_irq(int evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_evtchn_info(evtchn); + xen_irq_info_evtchn_init(irq, evtchn); spin_unlock(&irq_mapping_update_lock); @@ -1213,10 +1285,14 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); - /* events delivered via platform PCI interrupts are always - * routed to vcpu 0 */ - if (!VALID_EVTCHN(evtchn) || - (xen_hvm_domain() && !xen_have_vector_callback)) + if (!VALID_EVTCHN(evtchn)) + return -1; + + /* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ + if (xen_hvm_domain() && !xen_have_vector_callback) return -1; /* Send future instances of this interrupt to other vcpu. */ @@ -1234,11 +1310,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) return 0; } -static int set_affinity_irq(unsigned irq, const struct cpumask *dest) +static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, + bool force) { unsigned tcpu = cpumask_first(dest); - return rebind_irq_to_cpu(irq, tcpu); + return rebind_irq_to_cpu(data->irq, tcpu); } int resend_irq_on_evtchn(unsigned int irq) @@ -1257,35 +1334,35 @@ int resend_irq_on_evtchn(unsigned int irq) return 1; } -static void enable_dynirq(unsigned int irq) +static void enable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static void disable_dynirq(unsigned int irq) +static void disable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); if (VALID_EVTCHN(evtchn)) mask_evtchn(evtchn); } -static void ack_dynirq(unsigned int irq) +static void ack_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); - move_masked_irq(irq); + irq_move_masked_irq(data); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static int retrigger_dynirq(unsigned int irq) +static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); struct shared_info *sh = HYPERVISOR_shared_info; int ret = 0; @@ -1302,19 +1379,22 @@ static int retrigger_dynirq(unsigned int irq) return ret; } -static void restore_cpu_pirqs(void) +static void restore_pirqs(void) { int pirq, rc, irq, gsi; struct physdev_map_pirq map_irq; + struct irq_info *info; - for (pirq = 0; pirq < nr_irqs; pirq++) { - irq = pirq_to_irq[pirq]; - if (irq == -1) + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) continue; + pirq = info->u.pirq.pirq; + gsi = info->u.pirq.gsi; + irq = info->irq; + /* save/restore of PT devices doesn't work, so at this point the * only devices present are GSI based emulated devices */ - gsi = gsi_from_irq(irq); if (!gsi) continue; @@ -1327,14 +1407,13 @@ static void restore_cpu_pirqs(void) if (rc) { printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", gsi, irq, pirq, rc); - irq_info[irq] = mk_unbound_info(); - pirq_to_irq[pirq] = -1; + xen_free_irq(irq); continue; } printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - startup_pirq(irq); + __startup_pirq(irq); } } @@ -1358,8 +1437,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_virq_info(evtchn, virq); + xen_irq_info_virq_init(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1383,8 +1461,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_ipi_info(evtchn, ipi); + xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1442,10 +1519,22 @@ void xen_poll_irq(int irq) xen_poll_irq_timeout(irq, 0 /* no timeout */); } +/* Check whether the IRQ line is shared with other guests. */ +int xen_test_irq_shared(int irq) +{ + struct irq_info *info = info_for_irq(irq); + struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq }; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !(irq_status.flags & XENIRQSTAT_shared); +} +EXPORT_SYMBOL_GPL(xen_test_irq_shared); + void xen_irq_resume(void) { - unsigned int cpu, irq, evtchn; - struct irq_desc *desc; + unsigned int cpu, evtchn; + struct irq_info *info; init_evtchn_cpu_bindings(); @@ -1454,8 +1543,8 @@ void xen_irq_resume(void) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. */ - for (irq = 0; irq < nr_irqs; irq++) - irq_info[irq].evtchn = 0; /* zap event-channel binding */ + list_for_each_entry(info, &xen_irq_list_head, list) + info->evtchn = 0; /* zap event-channel binding */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) evtchn_to_irq[evtchn] = -1; @@ -1465,66 +1554,48 @@ void xen_irq_resume(void) restore_cpu_ipis(cpu); } - /* - * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These - * are not handled by the IRQ core. - */ - for_each_irq_desc(irq, desc) { - if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) - continue; - if (desc->status & IRQ_DISABLED) - continue; - - evtchn = evtchn_from_irq(irq); - if (evtchn == -1) - continue; - - unmask_evtchn(evtchn); - } - - restore_cpu_pirqs(); + restore_pirqs(); } static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", + .name = "xen-dyn", - .disable = disable_dynirq, - .mask = disable_dynirq, - .unmask = enable_dynirq, + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, - .eoi = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, + .irq_eoi = ack_dynirq, + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, }; static struct irq_chip xen_pirq_chip __read_mostly = { - .name = "xen-pirq", + .name = "xen-pirq", - .startup = startup_pirq, - .shutdown = shutdown_pirq, + .irq_startup = startup_pirq, + .irq_shutdown = shutdown_pirq, - .enable = enable_pirq, - .unmask = enable_pirq, + .irq_enable = enable_pirq, + .irq_unmask = enable_pirq, - .disable = disable_pirq, - .mask = disable_pirq, + .irq_disable = disable_pirq, + .irq_mask = disable_pirq, - .ack = ack_pirq, - .end = end_pirq, + .irq_ack = ack_pirq, - .set_affinity = set_affinity_irq, + .irq_set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, + .irq_retrigger = retrigger_dynirq, }; static struct irq_chip xen_percpu_chip __read_mostly = { - .name = "xen-percpu", + .name = "xen-percpu", - .disable = disable_dynirq, - .mask = disable_dynirq, - .unmask = enable_dynirq, + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, - .ack = ack_dynirq, + .irq_ack = ack_dynirq, }; int xen_set_callback_via(uint64_t via) @@ -1569,17 +1640,6 @@ void __init xen_init_IRQ(void) { int i; - cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), - GFP_KERNEL); - irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); - - /* We are using nr_irqs as the maximum number of pirq available but - * that number is actually chosen by Xen and we don't know exactly - * what it is. Be careful choosing high pirq numbers. */ - pirq_to_irq = kcalloc(nr_irqs, sizeof(*pirq_to_irq), GFP_KERNEL); - for (i = 0; i < nr_irqs; i++) - pirq_to_irq[i] = -1; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); for (i = 0; i < NR_EVENT_CHANNELS; i++) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 687761f65e5c..f914b26cf0c2 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -36,6 +36,7 @@ #include <xen/xen.h> #include <xen/grant_table.h> +#include <xen/balloon.h> #include <xen/gntdev.h> #include <xen/events.h> #include <asm/xen/hypervisor.h> @@ -122,10 +123,10 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) NULL == add->pages) goto err; + if (alloc_xenballooned_pages(count, add->pages)) + goto err; + for (i = 0; i < count; i++) { - add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (add->pages[i] == NULL) - goto err; add->map_ops[i].handle = -1; add->unmap_ops[i].handle = -1; } @@ -137,11 +138,6 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) return add; err: - if (add->pages) - for (i = 0; i < count; i++) { - if (add->pages[i]) - __free_page(add->pages[i]); - } kfree(add->pages); kfree(add->grants); kfree(add->map_ops); @@ -184,8 +180,6 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, static void gntdev_put_map(struct grant_map *map) { - int i; - if (!map) return; @@ -202,29 +196,7 @@ static void gntdev_put_map(struct grant_map *map) if (!use_ptemod) unmap_grant_pages(map, 0, map->count); - for (i = 0; i < map->count; i++) { - uint32_t check, *tmp; - if (!map->pages[i]) - continue; - /* XXX When unmapping in an HVM domain, Xen will - * sometimes end up mapping the GFN to an invalid MFN. - * In this case, writes will be discarded and reads will - * return all 0xFF bytes. Leak these unusable GFNs - * until Xen supports fixing their p2m mapping. - * - * Confirmed present in Xen 4.1-RC3 with HVM source - */ - tmp = kmap(map->pages[i]); - *tmp = 0xdeaddead; - mb(); - check = *tmp; - kunmap(map->pages[i]); - if (check == 0xdeaddead) - __free_page(map->pages[i]); - else - pr_debug("Discard page %d=%ld\n", i, - page_to_pfn(map->pages[i])); - } + free_xenballooned_pages(map->count, map->pages); } kfree(map->pages); kfree(map->grants); @@ -301,7 +273,7 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) map->vma->vm_start + map->notify.addr; err = copy_to_user(tmp, &err, 1); if (err) - return err; + return -EFAULT; map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } else if (pgno >= offset && pgno < offset + pages) { uint8_t *tmp = kmap(map->pages[pgno]); @@ -702,7 +674,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (map->flags) { if ((vma->vm_flags & VM_WRITE) && (map->flags & GNTMAP_readonly)) - return -EINVAL; + goto out_unlock_put; } else { map->flags = GNTMAP_host_map; if (!(vma->vm_flags & VM_WRITE)) @@ -740,6 +712,8 @@ unlock_out: spin_unlock(&priv->lock); return err; +out_unlock_put: + spin_unlock(&priv->lock); out_put_map: if (use_ptemod) map->vma = NULL; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3745a318defc..fd725cde6ad1 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -466,13 +466,30 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (map_ops[i].status) continue; - /* m2p override only supported for GNTMAP_contains_pte mappings */ - if (!(map_ops[i].flags & GNTMAP_contains_pte)) - continue; - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - ret = m2p_add_override(mfn, pages[i]); + mfn = pte_mfn(*pte); + } else { + /* If you really wanted to do this: + * mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + * + * The reason we do not implement it is b/c on the + * unmap path (gnttab_unmap_refs) we have no means of + * checking whether the page is !GNTMAP_contains_pte. + * + * That is without some extra data-structure to carry + * the struct page, bool clear_pte, and list_head next + * tuples and deal with allocation/delallocation, etc. + * + * The users of this API set the GNTMAP_contains_pte + * flag so lets just return not supported until it + * becomes neccessary to implement. + */ + return -EOPNOTSUPP; + } + ret = m2p_add_override(mfn, pages[i], + map_ops[i].flags & GNTMAP_contains_pte); if (ret) return ret; } @@ -494,7 +511,7 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, return ret; for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i]); + ret = m2p_remove_override(pages[i], true /* clear the PTE */); if (ret) return ret; } diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index db8c4c4ac880..a2eee574784e 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -8,6 +8,7 @@ #include <linux/sysrq.h> #include <linux/stop_machine.h> #include <linux/freezer.h> +#include <linux/syscore_ops.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -34,63 +35,73 @@ enum shutdown_state { /* Ignore multiple shutdown requests. */ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; -#ifdef CONFIG_PM_SLEEP -static int xen_hvm_suspend(void *data) -{ - struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; - int *cancelled = data; - - BUG_ON(!irqs_disabled()); - - *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); +struct suspend_info { + int cancelled; + unsigned long arg; /* extra hypercall argument */ + void (*pre)(void); + void (*post)(int cancelled); +}; - xen_hvm_post_suspend(*cancelled); +static void xen_hvm_post_suspend(int cancelled) +{ + xen_arch_hvm_post_suspend(cancelled); gnttab_resume(); +} - if (!*cancelled) { - xen_irq_resume(); - xen_console_resume(); - xen_timer_resume(); - } +static void xen_pre_suspend(void) +{ + xen_mm_pin_all(); + gnttab_suspend(); + xen_arch_pre_suspend(); +} - return 0; +static void xen_post_suspend(int cancelled) +{ + xen_arch_post_suspend(cancelled); + gnttab_resume(); + xen_mm_unpin_all(); } +#ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) { + struct suspend_info *si = data; int err; - int *cancelled = data; BUG_ON(!irqs_disabled()); - err = sysdev_suspend(PMSG_SUSPEND); + err = sysdev_suspend(PMSG_FREEZE); + if (!err) { + err = syscore_suspend(); + if (err) + sysdev_resume(); + } if (err) { - printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", + printk(KERN_ERR "xen_suspend: system core suspend failed: %d\n", err); return err; } - xen_mm_pin_all(); - gnttab_suspend(); - xen_pre_suspend(); + if (si->pre) + si->pre(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + si->cancelled = HYPERVISOR_suspend(si->arg); - xen_post_suspend(*cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + if (si->post) + si->post(si->cancelled); - if (!*cancelled) { + if (!si->cancelled) { xen_irq_resume(); xen_console_resume(); xen_timer_resume(); } + syscore_resume(); sysdev_resume(); return 0; @@ -99,7 +110,7 @@ static int xen_suspend(void *data) static void do_suspend(void) { int err; - int cancelled = 1; + struct suspend_info si; shutting_down = SHUTDOWN_SUSPEND; @@ -114,7 +125,7 @@ static void do_suspend(void) } #endif - err = dpm_suspend_start(PMSG_SUSPEND); + err = dpm_suspend_start(PMSG_FREEZE); if (err) { printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); goto out_thaw; @@ -123,32 +134,41 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_SUSPEND); + err = dpm_suspend_noirq(PMSG_FREEZE); if (err) { printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); goto out_resume; } - if (xen_hvm_domain()) - err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); - else - err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + si.cancelled = 1; - dpm_resume_noirq(PMSG_RESUME); + if (xen_hvm_domain()) { + si.arg = 0UL; + si.pre = NULL; + si.post = &xen_hvm_post_suspend; + } else { + si.arg = virt_to_mfn(xen_start_info); + si.pre = &xen_pre_suspend; + si.post = &xen_post_suspend; + } + + err = stop_machine(xen_suspend, &si, cpumask_of(0)); + + dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); - cancelled = 1; + si.cancelled = 1; } out_resume: - if (!cancelled) { + if (!si.cancelled) { xen_arch_resume(); xs_resume(); } else xs_suspend_cancel(); - dpm_resume_end(PMSG_RESUME); + dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); /* Make sure timer events get retriggered on all CPUs */ clock_was_set(); @@ -160,7 +180,24 @@ out: #endif shutting_down = SHUTDOWN_INVALID; } -#endif /* CONFIG_PM_SLEEP */ +#endif /* CONFIG_HIBERNATE_CALLBACKS */ + +struct shutdown_handler { + const char *command; + void (*cb)(void); +}; + +static void do_poweroff(void) +{ + shutting_down = SHUTDOWN_POWEROFF; + orderly_poweroff(false); +} + +static void do_reboot(void) +{ + shutting_down = SHUTDOWN_POWEROFF; /* ? */ + ctrl_alt_del(); +} static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) @@ -168,6 +205,16 @@ static void shutdown_handler(struct xenbus_watch *watch, char *str; struct xenbus_transaction xbt; int err; + static struct shutdown_handler handlers[] = { + { "poweroff", do_poweroff }, + { "halt", do_poweroff }, + { "reboot", do_reboot }, +#ifdef CONFIG_HIBERNATE_CALLBACKS + { "suspend", do_suspend }, +#endif + {NULL, NULL}, + }; + static struct shutdown_handler *handler; if (shutting_down != SHUTDOWN_INVALID) return; @@ -184,7 +231,14 @@ static void shutdown_handler(struct xenbus_watch *watch, return; } - xenbus_write(xbt, "control", "shutdown", ""); + for (handler = &handlers[0]; handler->command; handler++) { + if (strcmp(str, handler->command) == 0) + break; + } + + /* Only acknowledge commands which we are prepared to handle. */ + if (handler->cb) + xenbus_write(xbt, "control", "shutdown", ""); err = xenbus_transaction_end(xbt, 0); if (err == -EAGAIN) { @@ -192,17 +246,8 @@ static void shutdown_handler(struct xenbus_watch *watch, goto again; } - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); - } else if (strcmp(str, "reboot") == 0) { - shutting_down = SHUTDOWN_POWEROFF; /* ? */ - ctrl_alt_del(); -#ifdef CONFIG_PM_SLEEP - } else if (strcmp(str, "suspend") == 0) { - do_suspend(); -#endif + if (handler->cb) { + handler->cb(); } else { printk(KERN_INFO "Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; @@ -281,27 +326,18 @@ static int shutdown_event(struct notifier_block *notifier, return NOTIFY_DONE; } -static int __init __setup_shutdown_event(void) -{ - /* Delay initialization in the PV on HVM case */ - if (xen_hvm_domain()) - return 0; - - if (!xen_pv_domain()) - return -ENODEV; - - return xen_setup_shutdown_event(); -} - int xen_setup_shutdown_event(void) { static struct notifier_block xenstore_notifier = { .notifier_call = shutdown_event }; + + if (!xen_domain()) + return -ENODEV; register_xenstore_notifier(&xenstore_notifier); return 0; } EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(__setup_shutdown_event); +subsys_initcall(xen_setup_shutdown_event); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index afbe041f42c5..319dd0a94d51 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, if (ret) goto out; xenbus_probe(NULL); - ret = xen_setup_shutdown_event(); - if (ret) - goto out; return 0; out: diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c new file mode 100644 index 000000000000..a4ff225ee868 --- /dev/null +++ b/drivers/xen/xen-balloon.c @@ -0,0 +1,256 @@ +/****************************************************************************** + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/balloon.h> +#include <xen/xenbus.h> +#include <xen/features.h> +#include <xen/page.h> + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +#define BALLOON_CLASS_NAME "xen_memory" + +static struct sys_device balloon_sysdev; + +static int register_balloon(struct sys_device *sysdev); + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} + +static int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printk(KERN_ERR "Failed to set balloon watcher\n"); + + return NOTIFY_DONE; +} + +static struct notifier_block xenstore_notifier; + +static int __init balloon_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("xen-balloon: Initialising balloon driver.\n"); + + register_balloon(&balloon_sysdev); + + target_watch.callback = watch_target; + xenstore_notifier.notifier_call = balloon_init_watcher; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(balloon_init); + +static void balloon_exit(void) +{ + /* XXX - release balloon here */ + return; +} + +module_exit(balloon_exit); + +#define BALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); + +static SYSDEV_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); +static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +static SYSDEV_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); +static SYSDEV_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); + +static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + + +static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages + << PAGE_SHIFT); +} + +static ssize_t store_target(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = memparse(buf, &endchar); + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, + show_target, store_target); + + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, + &attr_target, + &attr_schedule_delay.attr, + &attr_max_schedule_delay.attr, + &attr_retry_count.attr, + &attr_max_retry_count.attr +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs +}; + +static struct sysdev_class balloon_sysdev_class = { + .name = BALLOON_CLASS_NAME +}; + +static int register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index baa65e7fbbc7..739769551e33 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -577,7 +577,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) } EXPORT_SYMBOL_GPL(xenbus_dev_changed); -int xenbus_dev_suspend(struct device *dev, pm_message_t state) +int xenbus_dev_suspend(struct device *dev) { int err = 0; struct xenbus_driver *drv; @@ -590,7 +590,7 @@ int xenbus_dev_suspend(struct device *dev, pm_message_t state) return 0; drv = to_xenbus_driver(dev->driver); if (drv->suspend) - err = drv->suspend(xdev, state); + err = drv->suspend(xdev); if (err) printk(KERN_WARNING "xenbus: suspend %s failed: %i\n", dev_name(dev), err); @@ -642,6 +642,14 @@ int xenbus_dev_resume(struct device *dev) } EXPORT_SYMBOL_GPL(xenbus_dev_resume); +int xenbus_dev_cancel(struct device *dev) +{ + /* Do nothing */ + DPRINTK("cancel"); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_dev_cancel); + /* A flag to determine if xenstored is 'ready' (i.e. has started) */ int xenstored_ready = 0; diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 24665812316a..888b9900ca08 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -64,8 +64,9 @@ extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); extern void xenbus_dev_shutdown(struct device *_dev); -extern int xenbus_dev_suspend(struct device *dev, pm_message_t state); +extern int xenbus_dev_suspend(struct device *dev); extern int xenbus_dev_resume(struct device *dev); +extern int xenbus_dev_cancel(struct device *dev); extern void xenbus_otherend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len, diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 5bcc2d6cf129..b6a2690c9d49 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -85,6 +85,14 @@ static struct device_attribute xenbus_frontend_dev_attrs[] = { __ATTR_NULL }; +static const struct dev_pm_ops xenbus_pm_ops = { + .suspend = xenbus_dev_suspend, + .resume = xenbus_dev_resume, + .freeze = xenbus_dev_suspend, + .thaw = xenbus_dev_cancel, + .restore = xenbus_dev_resume, +}; + static struct xen_bus_type xenbus_frontend = { .root = "device", .levels = 2, /* device/type/<id> */ @@ -100,8 +108,7 @@ static struct xen_bus_type xenbus_frontend = { .shutdown = xenbus_dev_shutdown, .dev_attrs = xenbus_frontend_dev_attrs, - .suspend = xenbus_dev_suspend, - .resume = xenbus_dev_resume, + .pm = &xenbus_pm_ops, }, }; |