diff options
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 207 |
1 files changed, 183 insertions, 24 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dbbfcd082513..deff4b3eb972 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/sort.h> #include <linux/bsearch.h> +#include <linux/kthread.h> #include <asm/processor.h> #include <asm/io.h> @@ -92,7 +93,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_SPINLOCK(kvm_lock); +DEFINE_MUTEX(kvm_lock); static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); @@ -141,10 +142,30 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, { } +bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) +{ + /* + * The metadata used by is_zone_device_page() to determine whether or + * not a page is ZONE_DEVICE is guaranteed to be valid if and only if + * the device has been pinned, e.g. by get_user_pages(). WARN if the + * page_count() is zero to help detect bad usage of this helper. + */ + if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) + return false; + + return is_zone_device_page(pfn_to_page(pfn)); +} + bool kvm_is_reserved_pfn(kvm_pfn_t pfn) { + /* + * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting + * perspective they are "normal" pages, albeit with slightly different + * usage rules. + */ if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); + return PageReserved(pfn_to_page(pfn)) && + !kvm_is_zone_device_pfn(pfn); return true; } @@ -596,8 +617,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) stat_data->kvm = kvm; stat_data->offset = p->offset; + stat_data->mode = p->mode ? p->mode : 0644; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - if (!debugfs_create_file(p->name, 0644, + if (!debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, stat_data, stat_fops_per_vm[p->kind])) @@ -606,6 +628,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) return 0; } +/* + * Called after the VM is otherwise initialized, but just before adding it to + * the vm_list. + */ +int __weak kvm_arch_post_init_vm(struct kvm *kvm) +{ + return 0; +} + +/* + * Called just after removing the VM from the vm_list, but before doing any + * other destruction. + */ +void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ +} + static struct kvm *kvm_create_vm(unsigned long type) { int r, i; @@ -660,22 +699,31 @@ static struct kvm *kvm_create_vm(unsigned long type) rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); if (!kvm->buses[i]) - goto out_err; + goto out_err_no_mmu_notifier; } r = kvm_init_mmu_notifier(kvm); if (r) + goto out_err_no_mmu_notifier; + + r = kvm_arch_post_init_vm(kvm); + if (r) goto out_err; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); preempt_notifier_inc(); return kvm; out_err: +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + if (kvm->mmu_notifier.ops) + mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); +#endif +out_err_no_mmu_notifier: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); @@ -715,9 +763,11 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); + kvm_arch_pre_destroy_vm(kvm); + kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); @@ -1700,7 +1750,7 @@ static void kvm_release_pfn_dirty(kvm_pfn_t pfn) void kvm_set_pfn_dirty(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) { + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) @@ -1711,7 +1761,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); @@ -2314,6 +2364,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #endif } +/* + * Unlike kvm_arch_vcpu_runnable, this function is called outside + * a vcpu_load/vcpu_put pair. However, for most architectures + * kvm_arch_vcpu_runnable does not require vcpu_load. + */ +bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + +static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_dy_runnable(vcpu)) + return true; + +#ifdef CONFIG_KVM_ASYNC_PF + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; +#endif + + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -2343,7 +2416,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) + if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) continue; if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) continue; @@ -3690,7 +3763,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + if (simple_attr_open(inode, file, get, + stat_data->mode & S_IWUGO ? set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -3804,13 +3879,13 @@ static int vm_stat_get(void *_offset, u64 *val) u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3823,12 +3898,12 @@ static int vm_stat_clear(void *_offset, u64 val) if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vm_stat_clear_per_vm((void *)&stat_tmp, 0); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3843,13 +3918,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3862,12 +3937,12 @@ static int vcpu_stat_clear(void *_offset, u64 val) if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -3888,7 +3963,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (!kvm_dev.this_device || !kvm) return; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); if (type == KVM_EVENT_CREATE_VM) { kvm_createvm_count++; kvm_active_vms++; @@ -3897,7 +3972,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } created = kvm_createvm_count; active = kvm_active_vms; - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) @@ -3914,7 +3989,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } add_uevent_var(env, "PID=%d", kvm->userspace_pid); - if (kvm->debugfs_dentry) { + if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { @@ -3941,7 +4016,8 @@ static int kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - if (!debugfs_create_file(p->name, 0644, kvm_debugfs_dir, + int mode = p->mode ? p->mode : 0644; + if (!debugfs_create_file(p->name, mode, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind])) goto out_dir; @@ -4128,3 +4204,86 @@ void kvm_exit(void) kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); + +struct kvm_vm_worker_thread_context { + struct kvm *kvm; + struct task_struct *parent; + struct completion init_done; + kvm_vm_thread_fn_t thread_fn; + uintptr_t data; + int err; +}; + +static int kvm_vm_worker_thread(void *context) +{ + /* + * The init_context is allocated on the stack of the parent thread, so + * we have to locally copy anything that is needed beyond initialization + */ + struct kvm_vm_worker_thread_context *init_context = context; + struct kvm *kvm = init_context->kvm; + kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; + uintptr_t data = init_context->data; + int err; + + err = kthread_park(current); + /* kthread_park(current) is never supposed to return an error */ + WARN_ON(err != 0); + if (err) + goto init_complete; + + err = cgroup_attach_task_all(init_context->parent, current); + if (err) { + kvm_err("%s: cgroup_attach_task_all failed with err %d\n", + __func__, err); + goto init_complete; + } + + set_user_nice(current, task_nice(init_context->parent)); + +init_complete: + init_context->err = err; + complete(&init_context->init_done); + init_context = NULL; + + if (err) + return err; + + /* Wait to be woken up by the spawner before proceeding. */ + kthread_parkme(); + + if (!kthread_should_stop()) + err = thread_fn(kvm, data); + + return err; +} + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr) +{ + struct kvm_vm_worker_thread_context init_context = {}; + struct task_struct *thread; + + *thread_ptr = NULL; + init_context.kvm = kvm; + init_context.parent = current; + init_context.thread_fn = thread_fn; + init_context.data = data; + init_completion(&init_context.init_done); + + thread = kthread_run(kvm_vm_worker_thread, &init_context, + "%s-%d", name, task_pid_nr(current)); + if (IS_ERR(thread)) + return PTR_ERR(thread); + + /* kthread_run is never supposed to return NULL */ + WARN_ON(thread == NULL); + + wait_for_completion(&init_context.init_done); + + if (!init_context.err) + *thread_ptr = thread; + + return init_context.err; +} |