From af73e4d9506d3b797509f3c030e7dcd554f7d9c4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 7 May 2013 16:18:13 -0700 Subject: hugetlbfs: fix mmap failure in unaligned size request The current kernel returns -EINVAL unless a given mmap length is "almost" hugepage aligned. This is because in sys_mmap_pgoff() the given length is passed to vm_mmap_pgoff() as it is without being aligned with hugepage boundary. This is a regression introduced in commit 40716e29243d ("hugetlbfs: fix alignment of huge page requests"), where alignment code is pushed into hugetlb_file_setup() and the variable len in caller side is not changed. To fix this, this patch partially reverts that commit, and adds alignment code in caller side. And it also introduces hstate_sizelog() in order to get proper hstate to specified hugepage size. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=56881 [akpm@linux-foundation.org: fix warning when CONFIG_HUGETLB_PAGE=n] Signed-off-by: Naoya Horiguchi Signed-off-by: Johannes Weiner Reported-by: Cc: Steven Truelove Cc: Jianguo Wu Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 523464e62849..a3f868ae3fd4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void) static int get_hstate_idx(int page_size_log) { - struct hstate *h; + struct hstate *h = hstate_sizelog(page_size_log); - if (!page_size_log) - return default_hstate_idx; - h = size_to_hstate(1 << page_size_log); if (!h) return -1; return h - hstates; @@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = { .d_dname = hugetlb_dname }; -struct file *hugetlb_file_setup(const char *name, unsigned long addr, - size_t size, vm_flags_t acctflag, - struct user_struct **user, +/* + * Note that size should be aligned to proper hugepage size in caller side, + * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. + */ +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, int creat_flags, int page_size_log) { struct file *file = ERR_PTR(-ENOMEM); @@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, struct path path; struct super_block *sb; struct qstr quick_string; - struct hstate *hstate; - unsigned long num_pages; int hstate_idx; hstate_idx = get_hstate_idx(page_size_log); @@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, if (!inode) goto out_dentry; - hstate = hstate_inode(inode); - size += addr & ~huge_page_mask(hstate); - num_pages = ALIGN(size, huge_page_size(hstate)) >> - huge_page_shift(hstate); file = ERR_PTR(-ENOMEM); - if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(path.dentry, inode); -- cgit v1.2.3 From 41003a7bcfed1255032e1e7c7b487e505b22e298 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 7 May 2013 16:18:25 -0700 Subject: aio: remove retry-based AIO This removes the retry-based AIO infrastructure now that nothing in tree is using it. We want to remove retry-based AIO because it is fundemantally unsafe. It retries IO submission from a kernel thread that has only assumed the mm of the submitting task. All other task_struct references in the IO submission path will see the kernel thread, not the submitting task. This design flaw means that nothing of any meaningful complexity can use retry-based AIO. This removes all the code and data associated with the retry machinery. The most significant benefit of this is the removal of the locking around the unused run list in the submission path. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kent Overstreet Signed-off-by: Zach Brown Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 351 +++++------------------------------------------------ fs/ocfs2/dlmglue.c | 2 +- fs/read_write.c | 34 +----- 3 files changed, 31 insertions(+), 356 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 351afe7ac78e..6e095a95a7c6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -54,11 +54,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; -static struct workqueue_struct *aio_wq; - -static void aio_kick_handler(struct work_struct *); -static void aio_queue_work(struct kioctx *); - /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. @@ -68,9 +63,6 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ - BUG_ON(!aio_wq); - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); return 0; @@ -86,7 +78,6 @@ static void aio_free_ring(struct kioctx *ctx) put_page(info->ring_pages[i]); if (info->mmap_size) { - BUG_ON(ctx->mm != current->mm); vm_munmap(info->mmap_base, info->mmap_size); } @@ -101,6 +92,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; @@ -126,23 +118,22 @@ static int aio_setup_ring(struct kioctx *ctx) info->mmap_size = nr_pages * PAGE_SIZE; dprintk("attempting mmap of %lu bytes\n", info->mmap_size); - down_write(&ctx->mm->mmap_sem); + down_write(&mm->mmap_sem); info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); if (IS_ERR((void *)info->mmap_base)) { - up_write(&ctx->mm->mmap_sem); + up_write(&mm->mmap_sem); info->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } dprintk("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, nr_pages, + info->nr_pages = get_user_pages(current, mm, info->mmap_base, nr_pages, 1, 0, info->ring_pages, NULL); - up_write(&ctx->mm->mmap_sem); + up_write(&mm->mmap_sem); if (unlikely(info->nr_pages != nr_pages)) { aio_free_ring(ctx); @@ -206,10 +197,7 @@ static void __put_ioctx(struct kioctx *ctx) unsigned nr_events = ctx->max_reqs; BUG_ON(ctx->reqs_active); - cancel_delayed_work_sync(&ctx->wq); aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; if (nr_events) { spin_lock(&aio_nr_lock); BUG_ON(aio_nr - nr_events > aio_nr); @@ -237,7 +225,7 @@ static inline void put_ioctx(struct kioctx *kioctx) */ static struct kioctx *ioctx_alloc(unsigned nr_events) { - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; @@ -256,8 +244,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-ENOMEM); ctx->max_reqs = nr_events; - mm = ctx->mm = current->mm; - atomic_inc(&mm->mm_count); atomic_set(&ctx->users, 2); spin_lock_init(&ctx->ctx_lock); @@ -265,8 +251,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - INIT_LIST_HEAD(&ctx->run_list); - INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); if (aio_setup_ring(ctx) < 0) goto out_freectx; @@ -287,14 +271,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) spin_unlock(&mm->ioctx_lock); dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + ctx, ctx->user_id, mm, ctx->ring_info.nr); return ctx; out_cleanup: err = -EAGAIN; aio_free_ring(ctx); out_freectx: - mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); dprintk("aio: error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -391,8 +374,6 @@ void exit_aio(struct mm_struct *mm) * as indicator that it needs to unmap the area, * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. - * That way we get all munmap done to current->mm - - * all other callers have ctx->mm == current->mm. */ ctx->ring_info.mmap_size = 0; put_ioctx(ctx); @@ -426,7 +407,6 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) req->ki_dtor = NULL; req->private = NULL; req->ki_iovec = NULL; - INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; return req; @@ -611,281 +591,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) return ret; } -/* - * Queue up a kiocb to be retried. Assumes that the kiocb - * has already been marked as kicked, and places it on - * the retry run list for the corresponding ioctx, if it - * isn't already queued. Returns 1 if it actually queued - * the kiocb (to tell the caller to activate the work - * queue to process it), or 0, if it found that it was - * already queued. - */ -static inline int __queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - - assert_spin_locked(&ctx->ctx_lock); - - if (list_empty(&iocb->ki_run_list)) { - list_add_tail(&iocb->ki_run_list, - &ctx->run_list); - return 1; - } - return 0; -} - -/* aio_run_iocb - * This is the core aio execution routine. It is - * invoked both for initial i/o submission and - * subsequent retries via the aio_kick_handler. - * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reacquired - * as needed during processing. - * - * Calls the iocb retry method (already setup for the - * iocb on initial submission) for operation specific - * handling, but takes care of most of common retry - * execution details for a given iocb. The retry method - * needs to be non-blocking as far as possible, to avoid - * holding up other iocbs waiting to be serviced by the - * retry kernel thread. - * - * The trickier parts in this code have to do with - * ensuring that only one retry instance is in progress - * for a given iocb at any time. Providing that guarantee - * simplifies the coding of individual aio operations as - * it avoids various potential races. - */ -static ssize_t aio_run_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - ssize_t (*retry)(struct kiocb *); - ssize_t ret; - - if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); - return 0; - } - - /* - * We don't want the next retry iteration for this - * operation to start until this one has returned and - * updated the iocb state. However, wait_queue functions - * can trigger a kick_iocb from interrupt context in the - * meantime, indicating that data is available for the next - * iteration. We want to remember that and enable the - * next retry iteration _after_ we are through with - * this one. - * - * So, in order to be able to register a "kick", but - * prevent it from being queued now, we clear the kick - * flag, but make the kick code *think* that the iocb is - * still on the run list until we are actually done. - * When we are done with this iteration, we check if - * the iocb was kicked in the meantime and if so, queue - * it up afresh. - */ - - kiocbClearKicked(iocb); - - /* - * This is so that aio_complete knows it doesn't need to - * pull the iocb off the run list (We can't just call - * INIT_LIST_HEAD because we don't want a kick_iocb to - * queue this on the run list yet) - */ - iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; - spin_unlock_irq(&ctx->ctx_lock); - - /* Quit retrying if the i/o has been cancelled */ - if (kiocbIsCancelled(iocb)) { - ret = -EINTR; - aio_complete(iocb, ret, 0); - /* must not access the iocb after this */ - goto out; - } - - /* - * Now we are all set to call the retry method in async - * context. - */ - ret = retry(iocb); - - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. - */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(iocb, ret, 0); - } -out: - spin_lock_irq(&ctx->ctx_lock); - - if (-EIOCBRETRY == ret) { - /* - * OK, now that we are done with this iteration - * and know that there is more left to go, - * this is where we let go so that a subsequent - * "kick" can start the next iteration - */ - - /* will make __queue_kicked_iocb succeed from here on */ - INIT_LIST_HEAD(&iocb->ki_run_list); - /* we must queue the next iteration ourselves, if it - * has already been kicked */ - if (kiocbIsKicked(iocb)) { - __queue_kicked_iocb(iocb); - - /* - * __queue_kicked_iocb will always return 1 here, because - * iocb->ki_run_list is empty at this point so it should - * be safe to unconditionally queue the context into the - * work queue. - */ - aio_queue_work(ctx); - } - } - return ret; -} - -/* - * __aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static int __aio_run_iocbs(struct kioctx *ctx) -{ - struct kiocb *iocb; - struct list_head run_list; - - assert_spin_locked(&ctx->ctx_lock); - - list_replace_init(&ctx->run_list, &run_list); - while (!list_empty(&run_list)) { - iocb = list_entry(run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - /* - * Hold an extra reference while retrying i/o. - */ - iocb->ki_users++; /* grab extra reference */ - aio_run_iocb(iocb); - __aio_put_req(ctx, iocb); - } - if (!list_empty(&ctx->run_list)) - return 1; - return 0; -} - -static void aio_queue_work(struct kioctx * ctx) -{ - unsigned long timeout; - /* - * if someone is waiting, get the work started right - * away, otherwise, use a longer delay - */ - smp_mb(); - if (waitqueue_active(&ctx->wait)) - timeout = 1; - else - timeout = HZ/10; - queue_delayed_work(aio_wq, &ctx->wq, timeout); -} - -/* - * aio_run_all_iocbs: - * Process all pending retries queued on the ioctx - * run list, and keep running them until the list - * stays empty. - * Assumes it is operating within the aio issuer's mm context. - */ -static inline void aio_run_all_iocbs(struct kioctx *ctx) -{ - spin_lock_irq(&ctx->ctx_lock); - while (__aio_run_iocbs(ctx)) - ; - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * aio_kick_handler: - * Work queue handler triggered to process pending - * retries on an ioctx. Takes on the aio issuer's - * mm context before running the iocbs, so that - * copy_xxx_user operates on the issuer's address - * space. - * Run on aiod's context. - */ -static void aio_kick_handler(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, wq.work); - mm_segment_t oldfs = get_fs(); - struct mm_struct *mm; - int requeue; - - set_fs(USER_DS); - use_mm(ctx->mm); - spin_lock_irq(&ctx->ctx_lock); - requeue =__aio_run_iocbs(ctx); - mm = ctx->mm; - spin_unlock_irq(&ctx->ctx_lock); - unuse_mm(mm); - set_fs(oldfs); - /* - * we're in a worker thread already; no point using non-zero delay - */ - if (requeue) - queue_delayed_work(aio_wq, &ctx->wq, 0); -} - - -/* - * Called by kick_iocb to queue the kiocb for retry - * and if required activate the aio work queue to process - * it - */ -static void try_queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - unsigned long flags; - int run = 0; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - /* set this inside the lock so that we can't race with aio_run_iocb() - * testing it and putting the iocb on the run list under the lock */ - if (!kiocbTryKick(iocb)) - run = __queue_kicked_iocb(iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (run) - aio_queue_work(ctx); -} - -/* - * kick_iocb: - * Called typically from a wait queue callback context - * to trigger a retry of the iocb. - * The retry is usually executed by aio workqueue - * threads (See aio_kick_handler). - */ -void kick_iocb(struct kiocb *iocb) -{ - /* sync iocbs are easy: they can only ever be executing from a - * single context. */ - if (is_sync_kiocb(iocb)) { - kiocbSetKicked(iocb); - wake_up_process(iocb->ki_obj.tsk); - return; - } - - try_queue_kicked_iocb(iocb); -} -EXPORT_SYMBOL(kick_iocb); - /* aio_complete * Called when the io request on the given iocb is complete. * Returns true if this is the last user of the request. The @@ -926,9 +631,6 @@ int aio_complete(struct kiocb *iocb, long res, long res2) */ spin_lock_irqsave(&ctx->ctx_lock, flags); - if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) - list_del_init(&iocb->ki_run_list); - /* * cancelled requests don't get events, userland was given one * when the event got cancelled. @@ -1083,13 +785,11 @@ static int read_events(struct kioctx *ctx, int i = 0; struct io_event ent; struct aio_timeout to; - int retry = 0; /* needed to zero any padding within an entry (there shouldn't be * any, but C is fun! */ memset(&ent, 0, sizeof(ent)); -retry: ret = 0; while (likely(i < nr)) { ret = aio_read_evt(ctx, &ent); @@ -1119,13 +819,6 @@ retry: /* End fast path */ - /* racey check, but it gets redone */ - if (!retry && unlikely(!list_empty(&ctx->run_list))) { - retry = 1; - aio_run_all_iocbs(ctx); - goto retry; - } - init_timeout(&to); if (timeout) { struct timespec ts; @@ -1349,7 +1042,7 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. */ if (opcode == IOCB_CMD_PWRITEV - && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; @@ -1591,18 +1284,28 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, * don't see ctx->dead set here, io_destroy() waits for our IO to * finish. */ - if (ctx->dead) { - spin_unlock_irq(&ctx->ctx_lock); + if (ctx->dead) ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); + if (ret) goto out_put_req; + + if (unlikely(kiocbIsCancelled(req))) + ret = -EINTR; + else + ret = req->ki_retry(req); + + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); } - aio_run_iocb(req); - if (!list_empty(&ctx->run_list)) { - /* drain the run list */ - while (__aio_run_iocbs(ctx)) - ; - } - spin_unlock_irq(&ctx->ctx_lock); aio_put_req(req); /* drop extra ref to req */ return 0; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags, subclass, _RET_IP_); if (status < 0) { - if (status != -EAGAIN && status != -EIOCBRETRY) + if (status != -EAGAIN) mlog_errno(status); goto bail; } diff --git a/fs/read_write.c b/fs/read_write.c index 90ba3b350e50..bce289afd21a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -329,16 +329,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; @@ -350,13 +340,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -406,13 +390,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -592,13 +570,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; -- cgit v1.2.3 From 2d68449e86168744513ca4f13477f081ce167130 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:29 -0700 Subject: aio: kill return value of aio_complete() Nothing used the return value, and it probably wasn't possible to use it safely for the locked versions (aio_complete(), aio_put_req()). Just kill it. Signed-off-by: Kent Overstreet Acked-by: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 6e095a95a7c6..8b43d6bf8c50 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -531,7 +531,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) /* __aio_put_req * Returns true if this put was the last user of the request. */ -static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +static void __aio_put_req(struct kioctx *ctx, struct kiocb *req) { dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", req, atomic_long_read(&req->ki_filp->f_count)); @@ -541,7 +541,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_users--; BUG_ON(req->ki_users < 0); if (likely(req->ki_users)) - return 0; + return; list_del(&req->ki_list); /* remove from active_reqs */ req->ki_cancel = NULL; req->ki_retry = NULL; @@ -549,21 +549,18 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) fput(req->ki_filp); req->ki_filp = NULL; really_put_req(ctx, req); - return 1; } /* aio_put_req * Returns true if this put was the last user of the kiocb, * false if the request is still in use. */ -int aio_put_req(struct kiocb *req) +void aio_put_req(struct kiocb *req) { struct kioctx *ctx = req->ki_ctx; - int ret; spin_lock_irq(&ctx->ctx_lock); - ret = __aio_put_req(ctx, req); + __aio_put_req(ctx, req); spin_unlock_irq(&ctx->ctx_lock); - return ret; } EXPORT_SYMBOL(aio_put_req); @@ -593,10 +590,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) /* aio_complete * Called when the io request on the given iocb is complete. - * Returns true if this is the last user of the request. The - * only other user of the request can be the cancellation code. */ -int aio_complete(struct kiocb *iocb, long res, long res2) +void aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring_info *info; @@ -604,7 +599,6 @@ int aio_complete(struct kiocb *iocb, long res, long res2) struct io_event *event; unsigned long flags; unsigned long tail; - int ret; /* * Special case handling for sync iocbs: @@ -618,7 +612,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) iocb->ki_user_data = res; iocb->ki_users = 0; wake_up_process(iocb->ki_obj.tsk); - return 1; + return; } info = &ctx->ring_info; @@ -677,7 +671,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) put_rq: /* everything turned out well, dispose of the aiocb. */ - ret = __aio_put_req(ctx, iocb); + __aio_put_req(ctx, iocb); /* * We have to order our ring_info tail store above and test @@ -691,7 +685,6 @@ put_rq: wake_up(&ctx->wait); spin_unlock_irqrestore(&ctx->ctx_lock, flags); - return ret; } EXPORT_SYMBOL(aio_complete); -- cgit v1.2.3 From 906b973cf0ae8f10a6e8182845729b2ecc0da95a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:31 -0700 Subject: aio: add kiocb_cancel() Minor refactoring, to get rid of some duplicated code [akpm@linux-foundation.org: fix warning] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 79 +++++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 43 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 8b43d6bf8c50..dbfcd67003ef 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -220,6 +220,29 @@ static inline void put_ioctx(struct kioctx *kioctx) __put_ioctx(kioctx); } +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, + struct io_event *res) +{ + int (*cancel)(struct kiocb *, struct io_event *); + int ret = -EINVAL; + + cancel = kiocb->ki_cancel; + kiocbSetCancelled(kiocb); + if (cancel) { + kiocb->ki_users++; + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, res); + + spin_lock_irq(&ctx->ctx_lock); + } + + return ret; +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -290,25 +313,19 @@ out_freectx: */ static void kill_ctx(struct kioctx *ctx) { - int (*cancel)(struct kiocb *, struct io_event *); struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); struct io_event res; + struct kiocb *req; spin_lock_irq(&ctx->ctx_lock); ctx->dead = 1; while (!list_empty(&ctx->active_reqs)) { - struct list_head *pos = ctx->active_reqs.next; - struct kiocb *iocb = list_kiocb(pos); - list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; - kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); } if (!ctx->reqs_active) @@ -1416,7 +1433,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1431,32 +1448,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb && kiocb->ki_cancel) { - cancel = kiocb->ki_cancel; - kiocb->ki_users ++; - kiocbSetCancelled(kiocb); - } else - cancel = NULL; + if (kiocb) + ret = kiocb_cancel(ctx, kiocb, &res); + else + ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); - if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. - */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } - } else - ret = -EINVAL; + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &res, sizeof(res))) + ret = -EFAULT; + } put_ioctx(ctx); -- cgit v1.2.3 From 4e179bca6718693148c7445c236bc3e0e0013ffd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:33 -0700 Subject: aio: move private stuff out of aio.h Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index dbfcd67003ef..670cb8b84345 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -45,6 +45,67 @@ #define dprintk(x...) do { ; } while (0) #endif +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 +struct aio_ring_info { + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + spinlock_t ring_lock; + long nr_pages; + + unsigned nr, tail; + + struct page *internal_pages[AIO_RING_PAGES]; +}; + +static inline unsigned aio_ring_avail(struct aio_ring_info *info, + struct aio_ring *ring) +{ + return (ring->head + info->nr - 1 - ring->tail) % info->nr; +} + +struct kioctx { + atomic_t users; + int dead; + + /* This needs improving */ + unsigned long user_id; + struct hlist_node list; + + wait_queue_head_t wait; + + spinlock_t ctx_lock; + + int reqs_active; + struct list_head active_reqs; /* used for cancellation */ + + /* sys_io_setup currently limits this to an unsigned int */ + unsigned max_reqs; + + struct aio_ring_info ring_info; + + struct rcu_head rcu_head; +}; + /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ -- cgit v1.2.3 From caf4167aa73bef20f5ce994b776ecb2e44226e5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:35 -0700 Subject: aio: dprintk() -> pr_debug() Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 57 ++++++++++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 670cb8b84345..1574cb2a9eac 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -8,6 +8,8 @@ * * See ../COPYING for licensing terms. */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include #include #include @@ -18,8 +20,6 @@ #include #include -#define DEBUG 0 - #include #include #include @@ -39,12 +39,6 @@ #include #include -#if DEBUG > 1 -#define dprintk printk -#else -#define dprintk(x...) do { ; } while (0) -#endif - #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 @@ -124,7 +118,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); return 0; } @@ -178,7 +172,7 @@ static int aio_setup_ring(struct kioctx *ctx) } info->mmap_size = nr_pages * PAGE_SIZE; - dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + pr_debug("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&mm->mmap_sem); info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, PROT_READ|PROT_WRITE, @@ -191,7 +185,7 @@ static int aio_setup_ring(struct kioctx *ctx) return -EAGAIN; } - dprintk("mmap address: 0x%08lx\n", info->mmap_base); + pr_debug("mmap address: 0x%08lx\n", info->mmap_base); info->nr_pages = get_user_pages(current, mm, info->mmap_base, nr_pages, 1, 0, info->ring_pages, NULL); up_write(&mm->mmap_sem); @@ -265,7 +259,7 @@ static void __put_ioctx(struct kioctx *ctx) aio_nr -= nr_events; spin_unlock(&aio_nr_lock); } - pr_debug("__put_ioctx: freeing %p\n", ctx); + pr_debug("freeing %p\n", ctx); call_rcu(&ctx->rcu_head, ctx_rcu_free); } @@ -354,7 +348,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); spin_unlock(&mm->ioctx_lock); - dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->ring_info.nr); return ctx; @@ -363,7 +357,7 @@ out_cleanup: aio_free_ring(ctx); out_freectx: kmem_cache_free(kioctx_cachep, ctx); - dprintk("aio: error allocating ioctx %d\n", err); + pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } @@ -611,8 +605,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) */ static void __aio_put_req(struct kioctx *ctx, struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); + pr_debug("(%p): f_count=%ld\n", + req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); @@ -722,9 +716,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2) event->res = res; event->res2 = res2; - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + pr_debug("%p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + res, res2); /* after flagging the request as done, we * must never even look at it again @@ -780,9 +774,7 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) int ret = 0; ring = kmap_atomic(info->ring_pages[0]); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); + pr_debug("h%u t%u m%u\n", ring->head, ring->tail, ring->nr); if (ring->head == ring->tail) goto out; @@ -802,9 +794,8 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) spin_unlock(&info->ring_lock); out: - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); kunmap_atomic(ring); + pr_debug("%d h%u t%u\n", ret, ring->head, ring->tail); return ret; } @@ -867,13 +858,13 @@ static int read_events(struct kioctx *ctx, if (unlikely(ret <= 0)) break; - dprintk("read event: %Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); + pr_debug("%Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); /* Could we split the check in two? */ ret = -EFAULT; if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); + pr_debug("lost an event due to EFAULT.\n"); break; } ret = 0; @@ -936,7 +927,7 @@ static int read_events(struct kioctx *ctx, ret = -EFAULT; if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); + pr_debug("lost an event due to EFAULT.\n"); break; } @@ -967,7 +958,7 @@ static void io_destroy(struct kioctx *ioctx) hlist_del_rcu(&ioctx->list); spin_unlock(&mm->ioctx_lock); - dprintk("aio_release(%p)\n", ioctx); + pr_debug("(%p)\n", ioctx); if (likely(!was_dead)) put_ioctx(ioctx); /* twice for the list */ @@ -1264,7 +1255,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) kiocb->ki_retry = aio_fsync; break; default: - dprintk("EINVAL: io_submit: no operation provided\n"); + pr_debug("EINVAL: no operation provided\n"); ret = -EINVAL; } @@ -1284,7 +1275,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { - pr_debug("EINVAL: io_submit: reserve field set\n"); + pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1325,7 +1316,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ret = put_user(req->ki_key, &user_iocb->aio_key); if (unlikely(ret)) { - dprintk("EFAULT: aio_key\n"); + pr_debug("EFAULT: aio_key\n"); goto out_put_req; } @@ -1407,7 +1398,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { - pr_debug("EINVAL: io_submit: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } -- cgit v1.2.3 From 1d98ebfccc15aeea87a7c48d50d7343e1ce8daae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:37 -0700 Subject: aio: do fget() after aio_get_req() aio_get_req() will fail if we have the maximum number of requests outstanding, which depending on the application may not be uncommon. So avoid doing an unnecessary fget(). Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 1574cb2a9eac..ffa8b8d2cb8e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -587,6 +587,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) { assert_spin_locked(&ctx->ctx_lock); + if (req->ki_filp) + fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) @@ -605,9 +607,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) */ static void __aio_put_req(struct kioctx *ctx, struct kiocb *req) { - pr_debug("(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); - assert_spin_locked(&ctx->ctx_lock); req->ki_users--; @@ -618,8 +617,6 @@ static void __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - fput(req->ki_filp); - req->ki_filp = NULL; really_put_req(ctx, req); } @@ -1270,7 +1267,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { struct kiocb *req; - struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ @@ -1289,16 +1285,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - file = fget(iocb->aio_fildes); - if (unlikely(!file)) - return -EBADF; - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ - if (unlikely(!req)) { - fput(file); + if (unlikely(!req)) return -EAGAIN; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) { + ret = -EBADF; + goto out_put_req; } - req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an -- cgit v1.2.3 From 11599ebac4a249ab3c8b9a535c21db7a51458c0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:39 -0700 Subject: aio: make aio_put_req() lockless Freeing a kiocb needed to touch the kioctx for three things: * Pull it off the reqs_active list * Decrementing reqs_active * Issuing a wakeup, if the kioctx was in the process of being freed. This patch moves these to aio_complete(), for a couple reasons: * aio_complete() already has to issue the wakeup, so if we drop the kioctx refcount before aio_complete does its wakeup we don't have to do it twice. * aio_complete currently has to take the kioctx lock, so it makes sense for it to pull the kiocb off the reqs_active list too. * A later patch is going to change reqs_active to include unreaped completions - this will mean allocating a kiocb doesn't have to look at the ringbuffer. So taking the decrement of reqs_active out of kiocb_free() is useful prep work for that patch. This doesn't really affect cancellation, since existing (usb) code that implements a cancel function still calls aio_complete() - we just have to make sure that aio_complete does the necessary teardown for cancelled kiocbs. It does affect code paths where we free kiocbs that were never submitted; they need to decrement reqs_active and pull the kiocb off the reqs_active list. This occurs in two places: kiocb_batch_free(), which is going away in a later patch, and the error path in io_submit_one. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Acked-by: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 86 +++++++++++++++++++++++++--------------------------------------- 1 file changed, 34 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index ffa8b8d2cb8e..f877417f3c42 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -89,7 +89,7 @@ struct kioctx { spinlock_t ctx_lock; - int reqs_active; + atomic_t reqs_active; struct list_head active_reqs; /* used for cancellation */ /* sys_io_setup currently limits this to an unsigned int */ @@ -250,7 +250,7 @@ static void ctx_rcu_free(struct rcu_head *head) static void __put_ioctx(struct kioctx *ctx) { unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); + BUG_ON(atomic_read(&ctx->reqs_active)); aio_free_ring(ctx); if (nr_events) { @@ -284,7 +284,7 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, cancel = kiocb->ki_cancel; kiocbSetCancelled(kiocb); if (cancel) { - kiocb->ki_users++; + atomic_inc(&kiocb->ki_users); spin_unlock_irq(&ctx->ctx_lock); memset(res, 0, sizeof(*res)); @@ -383,12 +383,12 @@ static void kill_ctx(struct kioctx *ctx) kiocb_cancel(ctx, req, &res); } - if (!ctx->reqs_active) + if (!atomic_read(&ctx->reqs_active)) goto out; add_wait_queue(&ctx->wait, &wait); set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (ctx->reqs_active) { + while (atomic_read(&ctx->reqs_active)) { spin_unlock_irq(&ctx->ctx_lock); io_schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); @@ -406,9 +406,9 @@ out: */ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { - while (iocb->ki_users) { + while (atomic_read(&iocb->ki_users)) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!iocb->ki_users) + if (!atomic_read(&iocb->ki_users)) break; io_schedule(); } @@ -438,7 +438,7 @@ void exit_aio(struct mm_struct *mm) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", atomic_read(&ctx->users), ctx->dead, - ctx->reqs_active); + atomic_read(&ctx->reqs_active)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -453,11 +453,11 @@ void exit_aio(struct mm_struct *mm) } /* aio_get_req - * Allocate a slot for an aio request. Increments the users count + * Allocate a slot for an aio request. Increments the ki_users count * of the kioctx so that the kioctx stays around until all requests are * complete. Returns NULL if no requests are free. * - * Returns with kiocb->users set to 2. The io submit code path holds + * Returns with kiocb->ki_users set to 2. The io submit code path holds * an extra reference while submitting the i/o. * This prevents races between the aio code path referencing the * req (after submitting it) and aio_complete() freeing the req. @@ -471,7 +471,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) return NULL; req->ki_flags = 0; - req->ki_users = 2; + atomic_set(&req->ki_users, 2); req->ki_key = 0; req->ki_ctx = ctx; req->ki_cancel = NULL; @@ -512,9 +512,9 @@ static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) list_del(&req->ki_batch); list_del(&req->ki_list); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; + atomic_dec(&ctx->reqs_active); } - if (unlikely(!ctx->reqs_active && ctx->dead)) + if (unlikely(!atomic_read(&ctx->reqs_active) && ctx->dead)) wake_up_all(&ctx->wait); spin_unlock_irq(&ctx->ctx_lock); } @@ -545,7 +545,8 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) spin_lock_irq(&ctx->ctx_lock); ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + avail = aio_ring_avail(&ctx->ring_info, ring) - + atomic_read(&ctx->reqs_active); BUG_ON(avail < 0); if (avail < allocated) { /* Trim back the number of requests. */ @@ -560,7 +561,7 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) batch->count -= allocated; list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); - ctx->reqs_active++; + atomic_inc(&ctx->reqs_active); } kunmap_atomic(ring); @@ -583,10 +584,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx, return req; } -static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +static void kiocb_free(struct kiocb *req) { - assert_spin_locked(&ctx->ctx_lock); - if (req->ki_filp) fput(req->ki_filp); if (req->ki_eventfd != NULL) @@ -596,40 +595,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) if (req->ki_iovec != &req->ki_inline_vec) kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); } -/* __aio_put_req - * Returns true if this put was the last user of the request. - */ -static void __aio_put_req(struct kioctx *ctx, struct kiocb *req) -{ - assert_spin_locked(&ctx->ctx_lock); - - req->ki_users--; - BUG_ON(req->ki_users < 0); - if (likely(req->ki_users)) - return; - list_del(&req->ki_list); /* remove from active_reqs */ - req->ki_cancel = NULL; - req->ki_retry = NULL; - - really_put_req(ctx, req); -} - -/* aio_put_req - * Returns true if this put was the last user of the kiocb, - * false if the request is still in use. - */ void aio_put_req(struct kiocb *req) { - struct kioctx *ctx = req->ki_ctx; - spin_lock_irq(&ctx->ctx_lock); - __aio_put_req(ctx, req); - spin_unlock_irq(&ctx->ctx_lock); + if (atomic_dec_and_test(&req->ki_users)) + kiocb_free(req); } EXPORT_SYMBOL(aio_put_req); @@ -677,9 +648,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(iocb->ki_users != 1); + BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - iocb->ki_users = 0; + atomic_set(&iocb->ki_users, 0); wake_up_process(iocb->ki_obj.tsk); return; } @@ -694,6 +665,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); /* remove from active_reqs */ + /* * cancelled requests don't get events, userland was given one * when the event got cancelled. @@ -740,7 +713,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) put_rq: /* everything turned out well, dispose of the aiocb. */ - __aio_put_req(ctx, iocb); + aio_put_req(iocb); + atomic_dec(&ctx->reqs_active); /* * We have to order our ring_info tail store above and test @@ -905,7 +879,7 @@ static int read_events(struct kioctx *ctx, break; /* Try to only show up in io wait if there are ops * in flight */ - if (ctx->reqs_active) + if (atomic_read(&ctx->reqs_active)) io_schedule(); else schedule(); @@ -1369,6 +1343,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: + spin_lock_irq(&ctx->ctx_lock); + list_del(&req->ki_list); + spin_unlock_irq(&ctx->ctx_lock); + + atomic_dec(&ctx->reqs_active); + if (unlikely(!atomic_read(&ctx->reqs_active) && ctx->dead)) + wake_up_all(&ctx->wait); + aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; -- cgit v1.2.3 From 36f5588905c10a8c4568a210d601fe8c3c27e0f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:41 -0700 Subject: aio: refcounting cleanup The usage of ctx->dead was fubar - it makes no sense to explicitly check it all over the place, especially when we're already using RCU. Now, ctx->dead only indicates whether we've dropped the initial refcount. The new teardown sequence is: set ctx->dead hlist_del_rcu(); synchronize_rcu(); Now we know no system calls can take a new ref, and it's safe to drop the initial ref: put_ioctx(); We also need to ensure there are no more outstanding kiocbs. This was done incorrectly - it was being done in kill_ctx(), and before dropping the initial refcount. At this point, other syscalls may still be submitting kiocbs! Now, we cancel and wait for outstanding kiocbs in free_ioctx(), after kioctx->users has dropped to 0 and we know no more iocbs could be submitted. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 272 ++++++++++++++++++++++++++++----------------------------------- 1 file changed, 119 insertions(+), 153 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index f877417f3c42..96f55bf207ed 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -79,7 +79,7 @@ static inline unsigned aio_ring_avail(struct aio_ring_info *info, struct kioctx { atomic_t users; - int dead; + atomic_t dead; /* This needs improving */ unsigned long user_id; @@ -98,6 +98,7 @@ struct kioctx { struct aio_ring_info ring_info; struct rcu_head rcu_head; + struct work_struct rcu_work; }; /*------ sysctl variables----*/ @@ -237,44 +238,6 @@ static int aio_setup_ring(struct kioctx *ctx) kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ } while(0) -static void ctx_rcu_free(struct rcu_head *head) -{ - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - kmem_cache_free(kioctx_cachep, ctx); -} - -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. - */ -static void __put_ioctx(struct kioctx *ctx) -{ - unsigned nr_events = ctx->max_reqs; - BUG_ON(atomic_read(&ctx->reqs_active)); - - aio_free_ring(ctx); - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } - pr_debug("freeing %p\n", ctx); - call_rcu(&ctx->rcu_head, ctx_rcu_free); -} - -static inline int try_get_ioctx(struct kioctx *kioctx) -{ - return atomic_inc_not_zero(&kioctx->users); -} - -static inline void put_ioctx(struct kioctx *kioctx) -{ - BUG_ON(atomic_read(&kioctx->users) <= 0); - if (unlikely(atomic_dec_and_test(&kioctx->users))) - __put_ioctx(kioctx); -} - static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, struct io_event *res) { @@ -298,6 +261,61 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, return ret; } +static void free_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + kmem_cache_free(kioctx_cachep, ctx); +} + +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. + */ +static void free_ioctx(struct kioctx *ctx) +{ + struct io_event res; + struct kiocb *req; + + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); + } + + spin_unlock_irq(&ctx->ctx_lock); + + wait_event(ctx->wait, !atomic_read(&ctx->reqs_active)); + + aio_free_ring(ctx); + + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + pr_debug("freeing %p\n", ctx); + + /* + * Here the call_rcu() is between the wait_event() for reqs_active to + * hit 0, and freeing the ioctx. + * + * aio_complete() decrements reqs_active, but it has to touch the ioctx + * after to issue a wakeup so we use rcu. + */ + call_rcu(&ctx->rcu_head, free_ioctx_rcu); +} + +static void put_ioctx(struct kioctx *ctx) +{ + if (unlikely(atomic_dec_and_test(&ctx->users))) + free_ioctx(ctx); +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -324,6 +342,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; atomic_set(&ctx->users, 2); + atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); @@ -361,44 +380,43 @@ out_freectx: return ERR_PTR(err); } -/* kill_ctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage - * the rapid destruction of the kioctx. - */ -static void kill_ctx(struct kioctx *ctx) +static void kill_ioctx_work(struct work_struct *work) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - struct io_event res; - struct kiocb *req; + struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - spin_lock_irq(&ctx->ctx_lock); - ctx->dead = 1; - while (!list_empty(&ctx->active_reqs)) { - req = list_first_entry(&ctx->active_reqs, - struct kiocb, ki_list); + wake_up_all(&ctx->wait); + put_ioctx(ctx); +} - list_del_init(&req->ki_list); - kiocb_cancel(ctx, req, &res); - } +static void kill_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - if (!atomic_read(&ctx->reqs_active)) - goto out; + INIT_WORK(&ctx->rcu_work, kill_ioctx_work); + schedule_work(&ctx->rcu_work); +} - add_wait_queue(&ctx->wait, &wait); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (atomic_read(&ctx->reqs_active)) { - spin_unlock_irq(&ctx->ctx_lock); - io_schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&ctx->ctx_lock); - } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void kill_ioctx(struct kioctx *ctx) +{ + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); + /* Between hlist_del_rcu() and dropping the initial ref */ + synchronize_rcu(); -out: - spin_unlock_irq(&ctx->ctx_lock); + /* + * We can't punt to workqueue here because put_ioctx() -> + * free_ioctx() will unmap the ringbuffer, and that has to be + * done in the original process's context. kill_ioctx_rcu/work() + * exist for exit_aio(), as in that path free_ioctx() won't do + * the unmap. + */ + kill_ioctx_work(&ctx->rcu_work); + } } /* wait_on_sync_kiocb: @@ -417,27 +435,25 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) } EXPORT_SYMBOL(wait_on_sync_kiocb); -/* exit_aio: called when the last user of mm goes away. At this point, - * there is no way for any new requests to be submited or any of the - * io_* syscalls to be called on the context. However, there may be - * outstanding requests which hold references to the context; as they - * go away, they will call put_ioctx and release any pinned memory - * associated with the request (held via struct page * references). +/* + * exit_aio: called when the last user of mm goes away. At this point, there is + * no way for any new requests to be submited or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx *ctx; + struct hlist_node *n; - while (!hlist_empty(&mm->ioctx_list)) { - ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); - hlist_del_rcu(&ctx->list); - - kill_ctx(ctx); - + hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), ctx->dead, + atomic_read(&ctx->users), + atomic_read(&ctx->dead), atomic_read(&ctx->reqs_active)); /* * We don't need to bother with munmap() here - @@ -448,7 +464,11 @@ void exit_aio(struct mm_struct *mm) * place that uses ->mmap_size, so it's safe. */ ctx->ring_info.mmap_size = 0; - put_ioctx(ctx); + + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + } } } @@ -514,8 +534,6 @@ static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) kmem_cache_free(kiocb_cachep, req); atomic_dec(&ctx->reqs_active); } - if (unlikely(!atomic_read(&ctx->reqs_active) && ctx->dead)) - wake_up_all(&ctx->wait); spin_unlock_irq(&ctx->ctx_lock); } @@ -612,13 +630,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - /* - * RCU protects us against accessing freed memory but - * we have to be careful not to get a reference when the - * reference count already dropped to 0 (ctx->dead test - * is unreliable because of races). - */ - if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + if (ctx->user_id == ctx_id) { + atomic_inc(&ctx->users); ret = ctx; break; } @@ -657,12 +670,15 @@ void aio_complete(struct kiocb *iocb, long res, long res2) info = &ctx->ring_info; - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->ctx_lock to prevent other code from messing with the tail + * pointer since we might be called from irq context. + * + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after decrementing reqs_active. */ + rcu_read_lock(); spin_lock_irqsave(&ctx->ctx_lock, flags); list_del(&iocb->ki_list); /* remove from active_reqs */ @@ -728,6 +744,7 @@ put_rq: wake_up(&ctx->wait); spin_unlock_irqrestore(&ctx->ctx_lock, flags); + rcu_read_unlock(); } EXPORT_SYMBOL(aio_complete); @@ -871,7 +888,7 @@ static int read_events(struct kioctx *ctx, break; if (min_nr <= i) break; - if (unlikely(ctx->dead)) { + if (unlikely(atomic_read(&ctx->dead))) { ret = -EINVAL; break; } @@ -914,35 +931,6 @@ out: return i ? i : ret; } -/* Take an ioctx and remove it from the list of ioctx's. Protects - * against races with itself via ->dead. - */ -static void io_destroy(struct kioctx *ioctx) -{ - struct mm_struct *mm = current->mm; - int was_dead; - - /* delete the entry from the list is someone else hasn't already */ - spin_lock(&mm->ioctx_lock); - was_dead = ioctx->dead; - ioctx->dead = 1; - hlist_del_rcu(&ioctx->list); - spin_unlock(&mm->ioctx_lock); - - pr_debug("(%p)\n", ioctx); - if (likely(!was_dead)) - put_ioctx(ioctx); /* twice for the list */ - - kill_ctx(ioctx); - - /* - * Wake up any waiters. The setting of ctx->dead must be seen - * by other CPUs at this point. Right now, we rely on the - * locking done by the above calls to ensure this consistency. - */ - wake_up_all(&ioctx->wait); -} - /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and @@ -978,7 +966,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); } @@ -996,7 +984,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); return 0; } @@ -1300,25 +1288,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ret = aio_setup_iocb(req, compat); - if (ret) - goto out_put_req; - - spin_lock_irq(&ctx->ctx_lock); - /* - * We could have raced with io_destroy() and are currently holding a - * reference to ctx which should be destroyed. We cannot submit IO - * since ctx gets freed as soon as io_submit() puts its reference. The - * check here is reliable: io_destroy() sets ctx->dead before waiting - * for outstanding IO and the barrier between these two is realized by - * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we - * increment ctx->reqs_active before checking for ctx->dead and the - * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we - * don't see ctx->dead set here, io_destroy() waits for our IO to - * finish. - */ - if (ctx->dead) - ret = -EINVAL; - spin_unlock_irq(&ctx->ctx_lock); if (ret) goto out_put_req; @@ -1348,9 +1317,6 @@ out_put_req: spin_unlock_irq(&ctx->ctx_lock); atomic_dec(&ctx->reqs_active); - if (unlikely(!atomic_read(&ctx->reqs_active) && ctx->dead)) - wake_up_all(&ctx->wait); - aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; -- cgit v1.2.3 From a31ad380bed817aa25f8830ad23e1a0480fef797 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:45 -0700 Subject: aio: make aio_read_evt() more efficient, convert to hrtimers Previously, aio_read_event() pulled a single completion off the ringbuffer at a time, locking and unlocking each time. Change it to pull off as many events as it can at a time, and copy them directly to userspace. This also fixes a bug where if copying the event to userspace failed, we'd lose the event. Also convert it to wait_event_interruptible_hrtimeout(), which simplifies it quite a bit. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 240 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 90 insertions(+), 150 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 96f55bf207ed..3f348a45e23b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -63,7 +63,7 @@ struct aio_ring_info { unsigned long mmap_size; struct page **ring_pages; - spinlock_t ring_lock; + struct mutex ring_lock; long nr_pages; unsigned nr, tail; @@ -344,7 +344,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->users, 2); atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); + mutex_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); @@ -748,187 +748,127 @@ put_rq: } EXPORT_SYMBOL(aio_complete); -/* aio_read_evt - * Pull an event off of the ioctx's event ring. Returns the number of - * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. - * TODO: make the ringbuffer user mmap()able (requires FIXME). +/* aio_read_events + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched */ -static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring_info *info = &ctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + unsigned head, pos; + long ret = 0; + int copy_ret; + + mutex_lock(&info->ring_lock); ring = kmap_atomic(info->ring_pages[0]); - pr_debug("h%u t%u m%u\n", ring->head, ring->tail, ring->nr); + head = ring->head; + kunmap_atomic(ring); + + pr_debug("h%u t%u m%u\n", head, info->tail, info->nr); - if (ring->head == ring->tail) + if (head == info->tail) goto out; - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp); + while (ret < nr) { + long avail; + struct io_event *ev; + struct page *page; + + avail = (head <= info->tail ? info->tail : info->nr) - head; + if (head == info->tail) + break; + + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); + + pos = head + AIO_EVENTS_OFFSET; + page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; + + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, + sizeof(*ev) * avail); + kunmap(page); + + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; + } + + ret += avail; + head += avail; + head %= info->nr; } - spin_unlock(&info->ring_lock); -out: + ring = kmap_atomic(info->ring_pages[0]); + ring->head = head; kunmap_atomic(ring); - pr_debug("%d h%u t%u\n", ret, ring->head, ring->tail); + + pr_debug("%li h%u t%u\n", ret, head, info->tail); +out: + mutex_unlock(&info->ring_lock); + return ret; } -struct aio_timeout { - struct timer_list timer; - int timed_out; - struct task_struct *p; -}; - -static void timeout_func(unsigned long data) +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) { - struct aio_timeout *to = (struct aio_timeout *)data; + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); - to->timed_out = 1; - wake_up_process(to->p); -} + if (ret > 0) + *i += ret; -static inline void init_timeout(struct aio_timeout *to) -{ - setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); - to->timed_out = 0; - to->p = current; -} + if (unlikely(atomic_read(&ctx->dead))) + ret = -EINVAL; -static inline void set_timeout(long start_jiffies, struct aio_timeout *to, - const struct timespec *ts) -{ - to->timer.expires = start_jiffies + timespec_to_jiffies(ts); - if (time_after(to->timer.expires, jiffies)) - add_timer(&to->timer); - else - to->timed_out = 1; -} + if (!*i) + *i = ret; -static inline void clear_timeout(struct aio_timeout *to) -{ - del_singleshot_timer_sync(&to->timer); + return ret < 0 || *i >= min_nr; } -static int read_events(struct kioctx *ctx, - long min_nr, long nr, +static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, struct timespec __user *timeout) { - long start_jiffies = jiffies; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - int ret; - int i = 0; - struct io_event ent; - struct aio_timeout to; - - /* needed to zero any padding within an entry (there shouldn't be - * any, but C is fun! - */ - memset(&ent, 0, sizeof(ent)); - ret = 0; - while (likely(i < nr)) { - ret = aio_read_evt(ctx, &ent); - if (unlikely(ret <= 0)) - break; - - pr_debug("%Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); - - /* Could we split the check in two? */ - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - pr_debug("lost an event due to EFAULT.\n"); - break; - } - ret = 0; - - /* Good, event copied to userland, update counts. */ - event ++; - i ++; - } - - if (min_nr <= i) - return i; - if (ret) - return ret; - - /* End fast path */ + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; - init_timeout(&to); if (timeout) { struct timespec ts; - ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - goto out; + return -EFAULT; - set_timeout(start_jiffies, &to, &ts); + until = timespec_to_ktime(ts); } - while (likely(i < nr)) { - add_wait_queue_exclusive(&ctx->wait, &wait); - do { - set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); - if (ret) - break; - if (min_nr <= i) - break; - if (unlikely(atomic_read(&ctx->dead))) { - ret = -EINVAL; - break; - } - if (to.timed_out) /* Only check after read evt */ - break; - /* Try to only show up in io wait if there are ops - * in flight */ - if (atomic_read(&ctx->reqs_active)) - io_schedule(); - else - schedule(); - if (signal_pending(tsk)) { - ret = -EINTR; - break; - } - /*ret = aio_read_evt(ctx, &ent);*/ - } while (1) ; - - set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); - - if (unlikely(ret <= 0)) - break; - - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - pr_debug("lost an event due to EFAULT.\n"); - break; - } + /* + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. So in practice we should be ok, but it's + * something to be aware of when touching this code. + */ + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), until); - /* Good, event copied to userland, update counts. */ - event ++; - i ++; - } + if (!ret && signal_pending(current)) + ret = -EINTR; - if (timeout) - clear_timeout(&to); -out: - destroy_timer_on_stack(&to.timer); - return i ? i : ret; + return ret; } /* sys_io_setup: -- cgit v1.2.3 From 21b40200cfe961b1428a529c63c33b1f1e1b4738 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:47 -0700 Subject: aio: use flush_dcache_page() This wasn't causing problems before because it's not needed on x86, but it is needed on other architectures. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 45 +++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3f348a45e23b..aea060d8c1e8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -211,33 +211,15 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); + flush_dcache_page(info->ring_pages[0]); return 0; } - -/* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(). Release the pointer with put_aio_ring_event(); - */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) - -#define put_aio_ring_event(event) do { \ - struct io_event *__event = (event); \ - (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ -} while(0) - static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, struct io_event *res) { @@ -649,9 +631,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring_info *info; struct aio_ring *ring; - struct io_event *event; + struct io_event *ev_page, *event; unsigned long flags; - unsigned long tail; + unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -690,19 +672,24 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (kiocbIsCancelled(iocb)) goto put_rq; - ring = kmap_atomic(info->ring_pages[0]); - tail = info->tail; - event = aio_ring_event(info, tail); + pos = tail + AIO_EVENTS_OFFSET; + if (++tail >= info->nr) tail = 0; + ev_page = kmap_atomic(info->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; - pr_debug("%p[%lu]: %p: %p %Lx %lx %lx\n", + kunmap_atomic(ev_page); + flush_dcache_page(info->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, res, res2); @@ -712,12 +699,13 @@ void aio_complete(struct kiocb *iocb, long res, long res2) smp_wmb(); /* make event visible before updating tail */ info->tail = tail; - ring->tail = tail; - put_aio_ring_event(event); + ring = kmap_atomic(info->ring_pages[0]); + ring->tail = tail; kunmap_atomic(ring); + flush_dcache_page(info->ring_pages[0]); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an @@ -807,6 +795,7 @@ static long aio_read_events_ring(struct kioctx *ctx, ring = kmap_atomic(info->ring_pages[0]); ring->head = head; kunmap_atomic(ring); + flush_dcache_page(info->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, info->tail); out: -- cgit v1.2.3 From 0460fef2a9215680f7f85415b57731b7e0fdf673 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:49 -0700 Subject: aio: use cancellation list lazily Cancelling kiocbs requires adding them to a per kioctx linked list, which is one of the few things we need to take the kioctx lock for in the fast path. But most kiocbs can't be cancelled - so if we just do this lazily, we can avoid quite a bit of locking overhead. While we're at it, instead of using a flag bit switch to using ki_cancel itself to indicate that a kiocb has been cancelled/completed. This lets us get rid of ki_flags entirely. [akpm@linux-foundation.org: remove buggy BUG()] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 106 +++++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index aea060d8c1e8..3428e9ae2f1d 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -97,6 +97,8 @@ struct kioctx { struct aio_ring_info ring_info; + spinlock_t completion_lock; + struct rcu_head rcu_head; struct work_struct rcu_work; }; @@ -220,25 +222,51 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +{ + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, struct io_event *res) { - int (*cancel)(struct kiocb *, struct io_event *); + kiocb_cancel_fn *old, *cancel; int ret = -EINVAL; - cancel = kiocb->ki_cancel; - kiocbSetCancelled(kiocb); - if (cancel) { - atomic_inc(&kiocb->ki_users); - spin_unlock_irq(&ctx->ctx_lock); + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return ret; - memset(res, 0, sizeof(*res)); - res->obj = (u64)(unsigned long)kiocb->ki_obj.user; - res->data = kiocb->ki_user_data; - ret = cancel(kiocb, res); + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); - spin_lock_irq(&ctx->ctx_lock); - } + atomic_inc(&kiocb->ki_users); + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, res); + + spin_lock_irq(&ctx->ctx_lock); return ret; } @@ -326,6 +354,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->users, 2); atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); + spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); @@ -468,20 +497,12 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) return NULL; - req->ki_flags = 0; atomic_set(&req->ki_users, 2); - req->ki_key = 0; req->ki_ctx = ctx; - req->ki_cancel = NULL; - req->ki_retry = NULL; - req->ki_dtor = NULL; - req->private = NULL; - req->ki_iovec = NULL; - req->ki_eventfd = NULL; return req; } @@ -512,7 +533,6 @@ static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) spin_lock_irq(&ctx->ctx_lock); list_for_each_entry_safe(req, n, &batch->head, ki_batch) { list_del(&req->ki_batch); - list_del(&req->ki_list); kmem_cache_free(kiocb_cachep, req); atomic_dec(&ctx->reqs_active); } @@ -559,10 +579,7 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) } batch->count -= allocated; - list_for_each_entry(req, &batch->head, ki_batch) { - list_add(&req->ki_list, &ctx->active_reqs); - atomic_inc(&ctx->reqs_active); - } + atomic_add(allocated, &ctx->reqs_active); kunmap_atomic(ring); spin_unlock_irq(&ctx->ctx_lock); @@ -653,25 +670,34 @@ void aio_complete(struct kiocb *iocb, long res, long res2) info = &ctx->ring_info; /* - * Add a completion event to the ring buffer. Must be done holding - * ctx->ctx_lock to prevent other code from messing with the tail - * pointer since we might be called from irq context. - * * Take rcu_read_lock() in case the kioctx is being destroyed, as we * need to issue a wakeup after decrementing reqs_active. */ rcu_read_lock(); - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_del(&iocb->ki_list); /* remove from active_reqs */ + if (iocb->ki_list.next) { + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* * cancelled requests don't get events, userland was given one * when the event got cancelled. */ - if (kiocbIsCancelled(iocb)) + if (unlikely(xchg(&iocb->ki_cancel, + KIOCB_CANCELLED) == KIOCB_CANCELLED)) goto put_rq; + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->ctx_lock to prevent other code from messing with the tail + * pointer since we might be called from irq context. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + tail = info->tail; pos = tail + AIO_EVENTS_OFFSET; @@ -705,6 +731,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) kunmap_atomic(ring); flush_dcache_page(info->ring_pages[0]); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + pr_debug("added to ring %p at [%u]\n", iocb, tail); /* @@ -731,7 +759,6 @@ put_rq: if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); rcu_read_unlock(); } EXPORT_SYMBOL(aio_complete); @@ -1216,15 +1243,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_opcode = iocb->aio_lio_opcode; ret = aio_setup_iocb(req, compat); - if (ret) goto out_put_req; - if (unlikely(kiocbIsCancelled(req))) - ret = -EINTR; - else - ret = req->ki_retry(req); - + ret = req->ki_retry(req); if (ret != -EIOCBQUEUED) { /* * There's no easy way to restart the syscall since other AIO's @@ -1241,10 +1263,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: - spin_lock_irq(&ctx->ctx_lock); - list_del(&req->ki_list); - spin_unlock_irq(&ctx->ctx_lock); - atomic_dec(&ctx->reqs_active); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ -- cgit v1.2.3 From 3e845ce01a391d7c5d59ff2f28db5381bf02fa27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:51 -0700 Subject: aio: change reqs_active to include unreaped completions The aio code tries really hard to avoid having to deal with the completion ringbuffer overflowing. To do that, it has to keep track of the number of outstanding kiocbs, and the number of completions currently in the ringbuffer - and it's got to check that every time we allocate a kiocb. Ouch. But - we can improve this quite a bit if we just change reqs_active to mean "number of outstanding requests and unreaped completions" - that means kiocb allocation doesn't have to look at the ringbuffer, which is a fairly significant win. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3428e9ae2f1d..d3bff60b5fe6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -71,12 +71,6 @@ struct aio_ring_info { struct page *internal_pages[AIO_RING_PAGES]; }; -static inline unsigned aio_ring_avail(struct aio_ring_info *info, - struct aio_ring *ring) -{ - return (ring->head + info->nr - 1 - ring->tail) % info->nr; -} - struct kioctx { atomic_t users; atomic_t dead; @@ -92,7 +86,13 @@ struct kioctx { atomic_t reqs_active; struct list_head active_reqs; /* used for cancellation */ - /* sys_io_setup currently limits this to an unsigned int */ + /* + * This is what userspace passed to io_setup(), it's not used for + * anything but counting against the global max_reqs quota. + * + * The real limit is ring->nr - 1, which will be larger (see + * aio_setup_ring()) + */ unsigned max_reqs; struct aio_ring_info ring_info; @@ -284,8 +284,11 @@ static void free_ioctx_rcu(struct rcu_head *head) */ static void free_ioctx(struct kioctx *ctx) { + struct aio_ring_info *info = &ctx->ring_info; + struct aio_ring *ring; struct io_event res; struct kiocb *req; + unsigned head, avail; spin_lock_irq(&ctx->ctx_lock); @@ -299,7 +302,21 @@ static void free_ioctx(struct kioctx *ctx) spin_unlock_irq(&ctx->ctx_lock); - wait_event(ctx->wait, !atomic_read(&ctx->reqs_active)); + ring = kmap_atomic(info->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + while (atomic_read(&ctx->reqs_active) > 0) { + wait_event(ctx->wait, head != info->tail); + + avail = (head <= info->tail ? info->tail : info->nr) - head; + + atomic_sub(avail, &ctx->reqs_active); + head += avail; + head %= info->nr; + } + + WARN_ON(atomic_read(&ctx->reqs_active) < 0); aio_free_ring(ctx); @@ -548,7 +565,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) unsigned short allocated, to_alloc; long avail; struct kiocb *req, *n; - struct aio_ring *ring; to_alloc = min(batch->count, KIOCB_BATCH_SIZE); for (allocated = 0; allocated < to_alloc; allocated++) { @@ -563,10 +579,8 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) goto out; spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - avail = aio_ring_avail(&ctx->ring_info, ring) - - atomic_read(&ctx->reqs_active); + avail = ctx->ring_info.nr - atomic_read(&ctx->reqs_active) - 1; BUG_ON(avail < 0); if (avail < allocated) { /* Trim back the number of requests. */ @@ -581,7 +595,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) batch->count -= allocated; atomic_add(allocated, &ctx->reqs_active); - kunmap_atomic(ring); spin_unlock_irq(&ctx->ctx_lock); out: @@ -688,8 +701,11 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * when the event got cancelled. */ if (unlikely(xchg(&iocb->ki_cancel, - KIOCB_CANCELLED) == KIOCB_CANCELLED)) + KIOCB_CANCELLED) == KIOCB_CANCELLED)) { + atomic_dec(&ctx->reqs_active); + /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; + } /* * Add a completion event to the ring buffer. Must be done holding @@ -746,7 +762,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) put_rq: /* everything turned out well, dispose of the aiocb. */ aio_put_req(iocb); - atomic_dec(&ctx->reqs_active); /* * We have to order our ring_info tail store above and test @@ -825,6 +840,8 @@ static long aio_read_events_ring(struct kioctx *ctx, flush_dcache_page(info->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, info->tail); + + atomic_sub(ret, &ctx->reqs_active); out: mutex_unlock(&info->ring_lock); -- cgit v1.2.3 From a1c8eae75ea3b0168fca93788db1b5aef2424921 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:53 -0700 Subject: aio: kill batch allocation Previously, allocating a kiocb required touching quite a few global (well, per kioctx) cachelines... so batching up allocation to amortize those was worthwhile. But we've gotten rid of some of those, and in another couple of patches kiocb allocation won't require writing to any shared cachelines, so that means we can just rip this code out. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 116 +++++++++------------------------------------------------------ 1 file changed, 15 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index d3bff60b5fe6..263ebce940c0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -510,108 +510,27 @@ void exit_aio(struct mm_struct *mm) * This prevents races between the aio code path referencing the * req (after submitting it) and aio_complete() freeing the req. */ -static struct kiocb *__aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req = NULL; + struct kiocb *req; + + if (atomic_read(&ctx->reqs_active) >= ctx->ring_info.nr) + return NULL; + + if (atomic_inc_return(&ctx->reqs_active) > ctx->ring_info.nr - 1) + goto out_put; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) - return NULL; + goto out_put; atomic_set(&req->ki_users, 2); req->ki_ctx = ctx; return req; -} - -/* - * struct kiocb's are allocated in batches to reduce the number of - * times the ctx lock is acquired and released. - */ -#define KIOCB_BATCH_SIZE 32L -struct kiocb_batch { - struct list_head head; - long count; /* number of requests left to allocate */ -}; - -static void kiocb_batch_init(struct kiocb_batch *batch, long total) -{ - INIT_LIST_HEAD(&batch->head); - batch->count = total; -} - -static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) -{ - struct kiocb *req, *n; - - if (list_empty(&batch->head)) - return; - - spin_lock_irq(&ctx->ctx_lock); - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - atomic_dec(&ctx->reqs_active); - } - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * Allocate a batch of kiocbs. This avoids taking and dropping the - * context lock a lot during setup. - */ -static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) -{ - unsigned short allocated, to_alloc; - long avail; - struct kiocb *req, *n; - - to_alloc = min(batch->count, KIOCB_BATCH_SIZE); - for (allocated = 0; allocated < to_alloc; allocated++) { - req = __aio_get_req(ctx); - if (!req) - /* allocation failed, go with what we've got */ - break; - list_add(&req->ki_batch, &batch->head); - } - - if (allocated == 0) - goto out; - - spin_lock_irq(&ctx->ctx_lock); - - avail = ctx->ring_info.nr - atomic_read(&ctx->reqs_active) - 1; - BUG_ON(avail < 0); - if (avail < allocated) { - /* Trim back the number of requests. */ - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - if (--allocated <= avail) - break; - } - } - - batch->count -= allocated; - atomic_add(allocated, &ctx->reqs_active); - - spin_unlock_irq(&ctx->ctx_lock); - -out: - return allocated; -} - -static inline struct kiocb *aio_get_req(struct kioctx *ctx, - struct kiocb_batch *batch) -{ - struct kiocb *req; - - if (list_empty(&batch->head)) - if (kiocb_batch_refill(ctx, batch) == 0) - return NULL; - req = list_first_entry(&batch->head, struct kiocb, ki_batch); - list_del(&req->ki_batch); - return req; +out_put: + atomic_dec(&ctx->reqs_active); + return NULL; } static void kiocb_free(struct kiocb *req) @@ -1198,8 +1117,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct kiocb_batch *batch, - bool compat) + struct iocb *iocb, bool compat) { struct kiocb *req; ssize_t ret; @@ -1220,7 +1138,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ + req = aio_get_req(ctx); /* returns with 2 references to req */ if (unlikely(!req)) return -EAGAIN; @@ -1293,7 +1211,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, long ret = 0; int i = 0; struct blk_plug plug; - struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1310,8 +1227,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } - kiocb_batch_init(&batch, nr); - blk_start_plug(&plug); /* @@ -1332,13 +1247,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, compat); if (ret) break; } blk_finish_plug(&plug); - kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } -- cgit v1.2.3 From 58c85dc20a2c5ba0e53740c5dd1945e658da0e1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:55 -0700 Subject: aio: kill struct aio_ring_info struct aio_ring_info was kind of odd, the only place it's used is where it's embedded in struct kioctx - there's no real need for it. The next patch rearranges struct kioctx and puts various things on their own cachelines - getting rid of struct aio_ring_info now makes that reordering a bit clearer. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 155 ++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 74 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 263ebce940c0..16239fbd6458 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -58,18 +58,6 @@ struct aio_ring { }; /* 128 bytes + ring size */ #define AIO_RING_PAGES 8 -struct aio_ring_info { - unsigned long mmap_base; - unsigned long mmap_size; - - struct page **ring_pages; - struct mutex ring_lock; - long nr_pages; - - unsigned nr, tail; - - struct page *internal_pages[AIO_RING_PAGES]; -}; struct kioctx { atomic_t users; @@ -90,14 +78,30 @@ struct kioctx { * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. * - * The real limit is ring->nr - 1, which will be larger (see + * The real limit is nr_events - 1, which will be larger (see * aio_setup_ring()) */ unsigned max_reqs; - struct aio_ring_info ring_info; + /* Size of ringbuffer, in units of struct io_event */ + unsigned nr_events; - spinlock_t completion_lock; + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct { + struct mutex ring_lock; + } ____cacheline_aligned; + + struct { + unsigned tail; + spinlock_t completion_lock; + } ____cacheline_aligned; + + struct page *internal_pages[AIO_RING_PAGES]; struct rcu_head rcu_head; struct work_struct rcu_work; @@ -129,26 +133,21 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; long i; - for (i=0; inr_pages; i++) - put_page(info->ring_pages[i]); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); - if (info->mmap_size) { - vm_munmap(info->mmap_base, info->mmap_size); - } + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (info->ring_pages && info->ring_pages != info->internal_pages) - kfree(info->ring_pages); - info->ring_pages = NULL; - info->nr = 0; + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + kfree(ctx->ring_pages); } static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; - struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; unsigned long size, populate; @@ -166,45 +165,44 @@ static int aio_setup_ring(struct kioctx *ctx) nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; - info->ring_pages = info->internal_pages; + ctx->nr_events = 0; + ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!info->ring_pages) + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) return -ENOMEM; } - info->mmap_size = nr_pages * PAGE_SIZE; - pr_debug("attempting mmap of %lu bytes\n", info->mmap_size); + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); down_write(&mm->mmap_sem); - info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, - &populate); - if (IS_ERR((void *)info->mmap_base)) { + ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + if (IS_ERR((void *)ctx->mmap_base)) { up_write(&mm->mmap_sem); - info->mmap_size = 0; + ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } - pr_debug("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, mm, info->mmap_base, nr_pages, - 1, 0, info->ring_pages, NULL); + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, + 1, 0, ctx->ring_pages, NULL); up_write(&mm->mmap_sem); - if (unlikely(info->nr_pages != nr_pages)) { + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, populate); + mm_populate(ctx->mmap_base, populate); - ctx->user_id = info->mmap_base; + ctx->user_id = ctx->mmap_base; + ctx->nr_events = nr_events; /* trusted copy */ - info->nr = nr_events; /* trusted copy */ - - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -213,7 +211,7 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); - flush_dcache_page(info->ring_pages[0]); + flush_dcache_page(ctx->ring_pages[0]); return 0; } @@ -284,7 +282,6 @@ static void free_ioctx_rcu(struct rcu_head *head) */ static void free_ioctx(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; struct aio_ring *ring; struct io_event res; struct kiocb *req; @@ -302,18 +299,18 @@ static void free_ioctx(struct kioctx *ctx) spin_unlock_irq(&ctx->ctx_lock); - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; kunmap_atomic(ring); while (atomic_read(&ctx->reqs_active) > 0) { - wait_event(ctx->wait, head != info->tail); + wait_event(ctx->wait, head != ctx->tail); - avail = (head <= info->tail ? info->tail : info->nr) - head; + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; atomic_sub(avail, &ctx->reqs_active); head += avail; - head %= info->nr; + head %= ctx->nr_events; } WARN_ON(atomic_read(&ctx->reqs_active) < 0); @@ -372,7 +369,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); - mutex_init(&ctx->ring_info.ring_lock); + mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); @@ -396,7 +393,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) spin_unlock(&mm->ioctx_lock); pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, ctx->user_id, mm, ctx->ring_info.nr); + ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: @@ -491,7 +488,7 @@ void exit_aio(struct mm_struct *mm) * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. */ - ctx->ring_info.mmap_size = 0; + ctx->mmap_size = 0; if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); @@ -514,10 +511,10 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_read(&ctx->reqs_active) >= ctx->ring_info.nr) + if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) return NULL; - if (atomic_inc_return(&ctx->reqs_active) > ctx->ring_info.nr - 1) + if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) goto out_put; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); @@ -578,7 +575,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) void aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned long flags; @@ -599,8 +595,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) return; } - info = &ctx->ring_info; - /* * Take rcu_read_lock() in case the kioctx is being destroyed, as we * need to issue a wakeup after decrementing reqs_active. @@ -633,13 +627,13 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ spin_lock_irqsave(&ctx->completion_lock, flags); - tail = info->tail; + tail = ctx->tail; pos = tail + AIO_EVENTS_OFFSET; - if (++tail >= info->nr) + if (++tail >= ctx->nr_events) tail = 0; - ev_page = kmap_atomic(info->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; event->obj = (u64)(unsigned long)iocb->ki_obj.user; @@ -648,7 +642,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) event->res2 = res2; kunmap_atomic(ev_page); - flush_dcache_page(info->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, @@ -659,12 +653,12 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ - info->tail = tail; + ctx->tail = tail; - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->tail = tail; kunmap_atomic(ring); - flush_dcache_page(info->ring_pages[0]); + flush_dcache_page(ctx->ring_pages[0]); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -704,21 +698,20 @@ EXPORT_SYMBOL(aio_complete); static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ctx->ring_info; struct aio_ring *ring; unsigned head, pos; long ret = 0; int copy_ret; - mutex_lock(&info->ring_lock); + mutex_lock(&ctx->ring_lock); - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; kunmap_atomic(ring); - pr_debug("h%u t%u m%u\n", head, info->tail, info->nr); + pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); - if (head == info->tail) + if (head == ctx->tail) goto out; while (ret < nr) { @@ -726,8 +719,8 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event *ev; struct page *page; - avail = (head <= info->tail ? info->tail : info->nr) - head; - if (head == info->tail) + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + if (head == ctx->tail) break; avail = min(avail, nr - ret); @@ -735,7 +728,7 @@ static long aio_read_events_ring(struct kioctx *ctx, ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); pos = head + AIO_EVENTS_OFFSET; - page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; ev = kmap(page); @@ -750,19 +743,19 @@ static long aio_read_events_ring(struct kioctx *ctx, ret += avail; head += avail; - head %= info->nr; + head %= ctx->nr_events; } - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->head = head; kunmap_atomic(ring); - flush_dcache_page(info->ring_pages[0]); + flush_dcache_page(ctx->ring_pages[0]); - pr_debug("%li h%u t%u\n", ret, head, info->tail); + pr_debug("%li h%u t%u\n", ret, head, ctx->tail); atomic_sub(ret, &ctx->reqs_active); out: - mutex_unlock(&info->ring_lock); + mutex_unlock(&ctx->ring_lock); return ret; } -- cgit v1.2.3 From 4e23bcaeb9e8df234e47840ac2c757ab79a0b572 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:18:56 -0700 Subject: aio: give shared kioctx fields their own cachelines [akpm@linux-foundation.org: make reqs_active __cacheline_aligned_in_smp] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 16239fbd6458..986ff305a856 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -67,13 +67,6 @@ struct kioctx { unsigned long user_id; struct hlist_node list; - wait_queue_head_t wait; - - spinlock_t ctx_lock; - - atomic_t reqs_active; - struct list_head active_reqs; /* used for cancellation */ - /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. @@ -92,19 +85,29 @@ struct kioctx { struct page **ring_pages; long nr_pages; + struct rcu_head rcu_head; + struct work_struct rcu_work; + + struct { + atomic_t reqs_active; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + struct { struct mutex ring_lock; - } ____cacheline_aligned; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; struct { unsigned tail; spinlock_t completion_lock; - } ____cacheline_aligned; + } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; - - struct rcu_head rcu_head; - struct work_struct rcu_work; }; /*------ sysctl variables----*/ -- cgit v1.2.3 From 8a6608907cf165b3ae658c9de2efe6af4be68bff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:19:10 -0700 Subject: aio: kill ki_key ki_key wasn't actually used for anything previously - it was always 0. Drop it to trim struct kiocb a bit. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 986ff305a856..5d7dad365f5f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1159,7 +1159,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } } - ret = put_user(req->ki_key, &user_iocb->aio_key); + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); goto out_put_req; @@ -1281,10 +1281,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, assert_spin_locked(&ctx->ctx_lock); + if (key != KIOCB_KEY) + return NULL; + /* TODO: use a hash or array, this sucks. */ list_for_each(pos, &ctx->active_reqs) { struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) + if (kiocb->ki_obj.user == iocb) return kiocb; } return NULL; -- cgit v1.2.3 From 41ef4eb8eef8d06bc1399e7b00c940d771554711 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:19:11 -0700 Subject: aio: kill ki_retry Thanks to Zach Brown's work to rip out the retry infrastructure, we don't need this anymore - ki_retry was only called right after the kiocb was initialized. This also refactors and trims some duplicated code, as well as cleaning up the refcounting/error handling a bit. [akpm@linux-foundation.org: use fmode_t in aio_run_iocb()] [akpm@linux-foundation.org: fix file_start_write/file_end_write tests] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 224 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 85 insertions(+), 139 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 5d7dad365f5f..c5b1a8c10411 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -903,30 +903,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) BUG_ON(ret > 0 && iocb->ki_left == 0); } -static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - ssize_t (*rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); ssize_t ret = 0; - unsigned short opcode; - - if ((iocb->ki_opcode == IOCB_CMD_PREADV) || - (iocb->ki_opcode == IOCB_CMD_PREAD)) { - rw_op = file->f_op->aio_read; - opcode = IOCB_CMD_PREADV; - } else { - rw_op = file->f_op->aio_write; - opcode = IOCB_CMD_PWRITEV; - } /* This matches the pread()/pwrite() logic */ if (iocb->ki_pos < 0) return -EINVAL; - if (opcode == IOCB_CMD_PWRITEV) + if (rw == WRITE) file_start_write(file); do { ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], @@ -938,9 +929,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* retry all partial writes. retry partial reads as long as its a * regular file. */ } while (ret > 0 && iocb->ki_left > 0 && - (opcode == IOCB_CMD_PWRITEV || + (rw == WRITE || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (opcode == IOCB_CMD_PWRITEV) + if (rw == WRITE) file_end_write(file); /* This means we must have transferred all that we could */ @@ -950,7 +941,7 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. */ - if (opcode == IOCB_CMD_PWRITEV + if (rw == WRITE && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; @@ -958,73 +949,41 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) return ret; } -static ssize_t aio_fdsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 1); - return ret; -} - -static ssize_t aio_fsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 0); - return ret; -} - -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; + kiocb->ki_nr_segs = kiocb->ki_nbytes; + #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(type, + ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); else #endif - ret = rw_copy_check_uvector(type, + ret = rw_copy_check_uvector(rw, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); if (ret < 0) - goto out; - - ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); - if (ret < 0) - goto out; + return ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; - kiocb->ki_cur_seg = 0; - /* ki_nbytes/left now reflect bytes instead of segs */ + /* ki_nbytes now reflect bytes instead of segs */ kiocb->ki_nbytes = ret; - kiocb->ki_left = ret; - - ret = 0; -out: - return ret; + return 0; } -static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) { - int bytes; - - bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); - if (bytes < 0) - return bytes; + if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + return -EFAULT; kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; kiocb->ki_nr_segs = 1; - kiocb->ki_cur_seg = 0; return 0; } @@ -1033,81 +992,82 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, bool compat) { - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + aio_rw_op *rw_op; - switch (kiocb->ki_opcode) { + switch (req->ki_opcode) { case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(READ, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(WRITE, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; - break; case IOCB_CMD_PREADV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = aio_setup_vectored_rw(READ, kiocb, compat); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; + mode = FMODE_READ; + rw = READ; + rw_op = file->f_op->aio_read; + goto rw_common; + + case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITEV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + mode = FMODE_WRITE; + rw = WRITE; + rw_op = file->f_op->aio_write; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!rw_op) + return -EINVAL; + + ret = (req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(rw, req, compat) + : aio_setup_single_vector(rw, req); if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; + return ret; + + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (ret < 0) + return ret; + + req->ki_nbytes = ret; + req->ki_left = ret; + + ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fdsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); break; + case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); break; + default: pr_debug("EINVAL: no operation provided\n"); - ret = -EINVAL; + return -EINVAL; } - if (!kiocb->ki_retry) - return ret; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } return 0; } @@ -1134,7 +1094,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx); if (unlikely(!req)) return -EAGAIN; @@ -1173,26 +1133,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req, compat); + ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - ret = req->ki_retry(req); - if (ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. - */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || - ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(req, ret, 0); - } - aio_put_req(req); /* drop extra ref to req */ return 0; - out_put_req: atomic_dec(&ctx->reqs_active); aio_put_req(req); /* drop extra ref to req */ -- cgit v1.2.3 From a27bb332c04cec8c4afd7912df0dc7890db27560 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 May 2013 16:19:08 -0700 Subject: aio: don't include aio.h in sched.h Faster kernel compiles by way of fewer unnecessary includes. [akpm@linux-foundation.org: fix fallout] [akpm@linux-foundation.org: fix build] Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_addr.c | 1 + fs/afs/write.c | 1 + fs/bio.c | 1 + fs/block_dev.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 1 + fs/ceph/file.c | 1 + fs/compat.c | 1 + fs/direct-io.c | 1 + fs/ecryptfs/file.c | 1 + fs/ext2/inode.c | 1 + fs/ext3/inode.c | 1 + fs/ext4/file.c | 1 + fs/ext4/indirect.c | 1 + fs/ext4/inode.c | 1 + fs/ext4/page-io.c | 1 + fs/f2fs/data.c | 1 + fs/fat/inode.c | 1 + fs/fuse/cuse.c | 1 + fs/fuse/dev.c | 1 + fs/fuse/file.c | 1 + fs/gfs2/aops.c | 1 + fs/gfs2/file.c | 1 + fs/hfs/inode.c | 1 + fs/hfsplus/inode.c | 1 + fs/jfs/inode.c | 1 + fs/nilfs2/inode.c | 2 +- fs/ntfs/file.c | 1 + fs/ntfs/inode.c | 1 + fs/ocfs2/aops.h | 2 ++ fs/ocfs2/inode.h | 2 -- fs/pipe.c | 1 + fs/read_write.c | 1 + fs/reiserfs/inode.c | 1 + fs/ubifs/file.c | 1 + fs/udf/inode.c | 1 + fs/xfs/xfs_aops.c | 1 + fs/xfs/xfs_file.c | 1 + 38 files changed, 38 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..055562c580b4 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, diff --git a/fs/bio.c b/fs/bio.c index b96fc6ce4855..954d73124b41 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/block_dev.c b/fs/block_dev.c index ce08de7467a3..95ff88b54e98 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb8b7a0e28a6..bc4d54c465a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09c58a35b429..898da0a01e04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d70830c66833..656e16907430 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" diff --git a/fs/compat.c b/fs/compat.c index 93f7d021b716..fc3b55dce184 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/fs/direct-io.c b/fs/direct-io.c index cfb816dc6d9f..51d16e067d68 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include #include #include +#include /* * How many user pages to map in one call to get_user_pages(). This determines diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" /** diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xip.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d706dbfa6220..23c712825640 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..4959e29573b6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include "ext4.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 98be6f697463..b8d5d351e24f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include #include "ext4_jbd2.h" #include "truncate.h" #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 793d44b84d7f..0723774bdfb5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5929cd0baa20..19599bded62a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a201125..d0ed4ba4b61b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4ff901632b26..dfce656ddb33 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 6f96a8def147..06b5e086ab3a 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9bfd1a3214e6..a45c19093eb4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@ #include #include #include +#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d15c6f21c17f..82f7ee581245 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,6 +15,7 @@ #include #include #include +#include static const struct file_operations fuse_direct_io_file_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9883694f1e7c..0bad69ed6336 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 716e1aafb2e2..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7faaa964968e..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77554b61d124..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf02f5530713..689fb608648e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da4b81e6f76..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,6 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H +#include + handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..621fc73bf23d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode, int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); -int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); struct buffer_head *ocfs2_bread(struct inode *inode, int block, int *err, int reada); diff --git a/fs/pipe.c b/fs/pipe.c index a029a14bacf1..d2c45e14e6d8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index bce289afd21a..03430008704e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,6 +50,7 @@ */ #include "ubifs.h" +#include #include #include #include diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3244c988d379..2b2691b73428 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include #include #include #include diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 054d60c0ac57..a5f2042aec8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" +#include #include #include #include -- cgit v1.2.3