From 7bf7f352194252e6f05981d44fb8cb55668606cd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:45 +1100 Subject: xfs: fix broken error handling in xfs_vm_writepage When we shut down the filesystem, it might first be detected in writeback when we are allocating a inode size transaction. This happens after we have moved all the pages into the writeback state and unlocked them. Unfortunately, if we fail to set up the transaction we then abort writeback and try to invalidate the current page. This then triggers are BUG() in block_invalidatepage() because we are trying to invalidate an unlocked page. Fixing this is a bit of a chicken and egg problem - we can't allocate the transaction until we've clustered all the pages into the IO and we know the size of it (i.e. whether the last block of the IO is beyond the current EOF or not). However, we don't want to hold pages locked for long periods of time, especially while we lock other pages to cluster them into the write. To fix this, we need to make a clear delineation in writeback where errors can only be handled by IO completion processing. That is, once we have marked a page for writeback and unlocked it, we have to report errors via IO completion because we've already started the IO. We may not have submitted any IO, but we've changed the page state to indicate that it is under IO so we must now use the IO completion path to report errors. To do this, add an error field to xfs_submit_ioend() to pass it the error that occurred during the building on the ioend chain. When this is non-zero, mark each ioend with the error and call xfs_finish_ioend() directly rather than building bios. This will immediately push the ioends through completion processing with the error that has occurred. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..e57e2daa357c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +512,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1078,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1108,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; -- cgit v1.2.3 From 4bc1ea6b8ddd4f2bd78944fbe5a1042ac14b1f5f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:56 +1100 Subject: xfs: remove xfs_flush_pages It is a complex wrapper around VFS functions, but there are VFS functions that provide exactly the same functionality. Call the VFS functions directly and remove the unnecessary indirection and complexity. We don't need to care about clearing the XFS_ITRUNCATED flag, as that is done during .writepages. Hence is cleared by the VFS writeback path if there is anything to write back during the flush. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e57e2daa357c..71361da1f77c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1641,7 +1641,7 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } -- cgit v1.2.3 From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:00 +1100 Subject: xfs: fix direct IO nested transaction deadlock. The direct IO path can do a nested transaction reservation when writing past the EOF. The first transaction is the append transaction for setting the filesize at IO completion, but we can also need a transaction for allocation of blocks. If the log is low on space due to reservations and small log, the append transaction can be granted after wating for space as the only active transaction in the system. This then attempts a reservation for an allocation, which there isn't space in the log for, and the reservation sleeps. The result is that there is nothing left in the system to wake up all the processes waiting for log space to come free. The stack trace that shows this deadlock is relatively innocuous: xlog_grant_head_wait xlog_grant_head_check xfs_log_reserve xfs_trans_reserve xfs_iomap_write_direct __xfs_get_blocks xfs_get_blocks_direct do_blockdev_direct_IO __blockdev_direct_IO xfs_vm_direct_IO generic_file_direct_write xfs_file_dio_aio_writ xfs_file_aio_write do_sync_write vfs_write This was discovered on a filesystem with a log of only 10MB, and a log stripe unit of 256k whih increased the base reservations by 512k. Hence a allocation transaction requires 1.2MB of log space to be available instead of only 260k, and so greatly increased the chance that there wouldn't be enough log space available for the nested transaction to succeed. The key to reproducing it is this mkfs command: mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV The test case was a 1000 fsstress processes running with random freeze and unfreezes every few seconds. Thanks to Eryu Guan (eguan@redhat.com) for writing the test that found this on a system with a somewhat unique default configuration.... cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 81 ++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 52 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71361da1f77c..4111a40ebe1a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -187,7 +189,8 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -205,15 +208,6 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (ioend->io_append_trans) { - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - rwsem_acquire_read( - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -226,35 +220,31 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. */ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -1432,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. */ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_destroy_ioend; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1460,15 +1446,6 @@ xfs_vm_direct_IO( return ret; -out_trans_cancel: - if (ioend->io_append_trans) { - current_set_flags_nested(&ioend->io_append_trans->t_pflags, - PF_FSTRANS); - rwsem_acquire_read( - &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - xfs_trans_cancel(ioend->io_append_trans, 0); - } out_destroy_ioend: xfs_destroy_ioend(ioend); return ret; -- cgit v1.2.3