From 37eb17e604ac7398bbb133c82f281475d704fff7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:46 +1100 Subject: xfs: drop buffer io reference when a bad bio is built Error handling in xfs_buf_ioapply_map() does not handle IO reference counts correctly. We increment the b_io_remaining count before building the bio, but then fail to decrement it in the failure case. This leads to the buffer never running IO completion and releasing the reference that the IO holds, so at unmount we can leak the buffer. This leak is captured by this assert failure during unmount: XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273 This is not a new bug - the b_io_remaining accounting has had this problem for a long, long time - it's just very hard to get a zero length bio being built by this code... Further, the buffer IO error can be overwritten on a multi-segment buffer by subsequent bio completions for partial sections of the buffer. Hence we should only set the buffer error status if the buffer is not already carrying an error status. This ensures that a partial IO error on a multi-segment buffer will not be lost. This part of the problem is a regression, however. cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_buf.c') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..4b0b8dd1b7b0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1284,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } -- cgit v1.2.3 From c3f8fc73ac97b76a12692088ef9cace9af8422c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:01 +1100 Subject: xfs: make buffer read verication an IO completion function Add a verifier function callback capability to the buffer read interfaces. This will be used by the callers to supply a function that verifies the contents of the buffer when it is read from disk. This patch does not provide callback functions, but simply modifies the interfaces to allow them to be called. The reason for adding this to the read interfaces is that it is very difficult to tell fom the outside is a buffer was just read from disk or whether we just pulled it out of cache. Supplying a callbck allows the buffer cache to use it's internal knowledge of the buffer to execute it only when the buffer is read from disk. It is intended that the verifier functions will mark the buffer with an EFSCORRUPTED error when verification fails. This allows the reading context to distinguish a verification error from an IO error, and potentially take further actions on the buffer (e.g. attempt repair) based on the error reported. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_buf.c') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4b0b8dd1b7b0..0298dd684798 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -654,7 +654,8 @@ xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { struct xfs_buf *bp; @@ -666,6 +667,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_iodone = verify; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -691,13 +693,14 @@ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, - int nmaps) + int nmaps, + xfs_buf_iodone_t verify) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); } /* @@ -709,7 +712,8 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; int error; @@ -723,6 +727,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; + bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); -- cgit v1.2.3 From eab4e63368b4cfa597dbdac66d1a7a836a693b7d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:02 +1100 Subject: xfs: uncached buffer reads need to return an error With verification being done as an IO completion callback, different errors can be returned from a read. Uncached reads only return a buffer or NULL on failure, which means the verification error cannot be returned to the caller. Split the error handling for these reads into two - a failure to get a buffer will still return NULL, but a read error will return a referenced buffer with b_error set rather than NULL. The caller is responsible for checking the error state of the buffer returned. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/xfs/xfs_buf.c') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0298dd684798..fbc965fc075a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -715,8 +715,7 @@ xfs_buf_read_uncached( int flags, xfs_buf_iodone_t verify) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) @@ -730,11 +729,7 @@ xfs_buf_read_uncached( bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_relse(bp); - return NULL; - } + xfs_buf_iowait(bp); return bp; } -- cgit v1.2.3 From cfb02852226aa449fe27075caffe88726507668c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:19 +1100 Subject: xfs: add buffer pre-write callback Add a callback to the buffer write path to enable verification of the buffer and CRC calculation prior to issuing the write to the underlying storage. If the callback function detects some kind of failure or error condition, it must mark the buffer with an error so that the caller can take appropriate action. In the case of xfs_buf_ioapply(), a corrupt metadta buffer willt rigger a shutdown of the filesystem, because something is clearly wrong and we can't allow corrupt metadata to be written to disk. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/xfs/xfs_buf.c') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fbc965fc075a..bd1a948ee39c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -569,7 +569,9 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_pre_io = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -1323,6 +1325,20 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; + /* + * run the pre-io callback function if it exists. If this function + * fails it will mark the buffer with an error and the IO should + * not be dispatched. + */ + if (bp->b_pre_io) { + bp->b_pre_io(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } + /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - -- cgit v1.2.3 From 1813dd64057490e7a0678a885c4fe6d02f78bdc1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:54:40 +1100 Subject: xfs: convert buffer verifiers to an ops structure. To separate the verifiers from iodone functions and associate read and write verifiers at the same time, introduce a buffer verifier operations structure to the xfs_buf. This avoids the need for assigning the write verifier, clearing the iodone function and re-running ioend processing in the read verifier, and gets rid of the nasty "b_pre_io" name for the write verifier function pointer. If we ever need to, it will also be easier to add further content specific callbacks to a buffer with an ops structure in place. We also avoid needing to export verifier functions, instead we can simply export the ops structures for those that are needed outside the function they are defined in. This patch also fixes a directory block readahead verifier issue it exposed. This patch also adds ops callbacks to the inode/alloc btree blocks initialised by growfs. These will need more work before they will work with CRCs. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 63 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 26 deletions(-) (limited to 'fs/xfs/xfs_buf.c') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bd1a948ee39c..26673a0b20e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -571,7 +571,7 @@ found: ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; - bp->b_pre_io = NULL; + bp->b_ops = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -657,7 +657,7 @@ xfs_buf_read_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -669,7 +669,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); - bp->b_iodone = verify; + bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -696,13 +696,13 @@ xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); } /* @@ -715,7 +715,7 @@ xfs_buf_read_uncached( xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -728,7 +728,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(target->bt_mount, bp); xfs_buf_iowait(bp); @@ -1001,27 +1001,37 @@ STATIC void xfs_buf_iodone_work( struct work_struct *work) { - xfs_buf_t *bp = + struct xfs_buf *bp = container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (read && bp->b_ops) + bp->b_ops->verify_read(bp); if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } } void xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) + struct xfs_buf *bp, + int schedule) { + bool read = !!(bp->b_flags & XBF_READ); + trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); @@ -1029,6 +1039,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); } } @@ -1316,6 +1327,20 @@ _xfs_buf_ioapply( rw |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; } else { @@ -1325,20 +1350,6 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; - /* - * run the pre-io callback function if it exists. If this function - * fails it will mark the buffer with an error and the IO should - * not be dispatched. - */ - if (bp->b_pre_io) { - bp->b_pre_io(bp); - if (bp->b_error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_CORRUPT_INCORE); - return; - } - } - /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - -- cgit v1.2.3