diff options
-rw-r--r-- | fs/userfaultfd.c | 65 |
1 files changed, 43 insertions, 22 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1f2ddaaf3c03..0877222dfa47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -52,6 +52,10 @@ struct userfaultfd_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; + /* + * Only relevant when queued in fault_wqh and only used by the + * read operation to avoid reading the same userfault twice. + */ bool pending; struct userfaultfd_ctx *ctx; }; @@ -71,9 +75,6 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, uwq = container_of(wq, struct userfaultfd_wait_queue, wq); ret = 0; - /* don't wake the pending ones to avoid reads to block */ - if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released)) - goto out; /* len == 0 means wake all */ start = range->start; len = range->len; @@ -196,12 +197,14 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; + int ret; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + ret = VM_FAULT_SIGBUS; ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) - return VM_FAULT_SIGBUS; + goto out; BUG_ON(ctx->mm != mm); @@ -214,7 +217,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * caller of handle_userfault to release the mmap_sem. */ if (unlikely(ACCESS_ONCE(ctx->released))) - return VM_FAULT_SIGBUS; + goto out; /* * Check that we can return VM_FAULT_RETRY. @@ -240,15 +243,16 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, dump_stack(); } #endif - return VM_FAULT_SIGBUS; + goto out; } /* * Handle nowait, not much to do other than tell it to retry * and wait. */ + ret = VM_FAULT_RETRY; if (flags & FAULT_FLAG_RETRY_NOWAIT) - return VM_FAULT_RETRY; + goto out; /* take the reference before dropping the mmap_sem */ userfaultfd_ctx_get(ctx); @@ -268,21 +272,23 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * through poll/read(). 
*/ __add_wait_queue(&ctx->fault_wqh, &uwq.wq); - for (;;) { - set_current_state(TASK_KILLABLE); - if (!uwq.pending || ACCESS_ONCE(ctx->released) || - fatal_signal_pending(current)) - break; - spin_unlock(&ctx->fault_wqh.lock); + set_current_state(TASK_KILLABLE); + spin_unlock(&ctx->fault_wqh.lock); + if (likely(!ACCESS_ONCE(ctx->released) && + !fatal_signal_pending(current))) { wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); + ret |= VM_FAULT_MAJOR; + } + __set_current_state(TASK_RUNNING); + /* see finish_wait() comment for why list_empty_careful() */ + if (!list_empty_careful(&uwq.wq.task_list)) { spin_lock(&ctx->fault_wqh.lock); + list_del_init(&uwq.wq.task_list); + spin_unlock(&ctx->fault_wqh.lock); } - __remove_wait_queue(&ctx->fault_wqh, &uwq.wq); - __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->fault_wqh.lock); /* * ctx may go away after this if the userfault pseudo fd is @@ -290,7 +296,8 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, */ userfaultfd_ctx_put(ctx); - return VM_FAULT_RETRY; +out: + return ret; } static int userfaultfd_release(struct inode *inode, struct file *file) @@ -404,6 +411,12 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) case UFFD_STATE_WAIT_API: return POLLERR; case UFFD_STATE_RUNNING: + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return POLLERR; spin_lock(&ctx->fault_wqh.lock); ret = find_userfault(ctx, NULL); spin_unlock(&ctx->fault_wqh.lock); @@ -834,11 +847,19 @@ out: } /* - * This is mostly needed to re-wakeup those userfaults that were still - * pending when userland wake them up the first time. We don't wake - * the pending one to avoid blocking reads to block, or non blocking - * read to return -EAGAIN, if used with POLLIN, to avoid userland - * doubts on why POLLIN wasn't reliable. 
+ * userfaultfd_wake is needed in case a userfault is in flight by the + * time a UFFDIO_COPY (or other ioctl variants) completes. The page + * may well get mapped and the page fault if repeated wouldn't lead + * to a userfault anymore, but before scheduling in TASK_KILLABLE mode + * handle_userfault() doesn't recheck the pagetables and it doesn't + * serialize against UFFDIO_COPY (or other ioctl variants). Ultimately + * the knowledge of which pages are mapped is left to userland who is + * responsible for handling the race between read() userfaults and + * background UFFDIO_COPY (or other ioctl variants), if done by + * separate concurrent threads. + * + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wake up userfaults in batches. */ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, unsigned long arg) |