From d3cf209476b72c83907a412b6708c5e498410aa7 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:49:27 +1000 Subject: [XFS] propogate return codes from flush routines This patch handles error return values in fs_flush_pages and fs_flushinval_pages. It changes the prototype of fs_flushinval_pages so we can propogate the errors and handle them at higher layers. I also modified xfs_itruncate_start so that it could propogate the error further. SGI-PV: 961990 SGI-Modid: xfs-linux-melb:xfs-kern:28231a Signed-off-by: Lachlan McIlroy Signed-off-by: Stewart Smith Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_lrw.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/xfs/linux-2.6/xfs_lrw.c') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index ff8d64eba9f8..8e46c9798fbf 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -191,7 +191,7 @@ xfs_read( struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t size = 0; - ssize_t ret; + ssize_t ret = 0; xfs_fsize_t n; xfs_inode_t *ip; xfs_mount_t *mp; @@ -263,9 +263,13 @@ xfs_read( if (unlikely(ioflags & IO_ISDIRECT)) { if (VN_CACHED(vp)) - bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), + ret = bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), -1, FI_REMAPF_LOCKED); mutex_unlock(&inode->i_mutex); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; + } } xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, @@ -814,8 +818,10 @@ retry: if (need_flush) { xfs_inval_cached_trace(io, pos, -1, ctooff(offtoct(pos)), -1); - bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)), + error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)), -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_internal; } if (need_i_mutex) { -- cgit v1.2.3 From 2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:49:39 +1000 Subject: [XFS] Fix race condition in xfs_write(). This change addresses a race in xfs_write() where, for direct I/O, the flags need_i_mutex and need_flush are setup before the iolock is acquired. The logic used to setup the flags may change between setting the flags and acquiring the iolock resulting in these flags having incorrect values. For example, if a file is not currently cached then need_i_mutex is set to zero and then if the file is cached before the iolock is acquired we will fail to do the flushinval before the direct write. The flush (and also the call to xfs_zero_eof()) need to be done with the iolock held exclusive so we need to acquire the iolock before checking for cached data (or if the write begins after eof) to prevent this state from changing. For direct I/O I've chosen to always acquire the iolock in shared mode initially and if there is a need to promote it then drop it and reacquire it. There's also some other tidy-ups including removing the O_APPEND offset adjustment since that work is done in generic_write_checks() (and we don't use offset as an input parameter anywhere). SGI-PV: 962170 SGI-Modid: xfs-linux-melb:xfs-kern:28319a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_lrw.c | 61 ++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'fs/xfs/linux-2.6/xfs_lrw.c') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 8e46c9798fbf..80fe31233471 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -649,7 +649,7 @@ xfs_write( bhv_vrwlock_t locktype; size_t ocount = 0, count; loff_t pos; - int need_i_mutex = 1, need_flush = 0; + int need_i_mutex; XFS_STATS_INC(xs_write_calls); @@ -689,39 +689,20 @@ xfs_write( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) - return XFS_ERROR(-EINVAL); - - if (!VN_CACHED(vp) && pos < i_size_read(inode)) - need_i_mutex = 0; - - if (VN_CACHED(vp)) - need_flush = 1; - } - relock: - if (need_i_mutex) { + if (ioflags & IO_ISDIRECT) { + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + need_i_mutex = 0; + } else { iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; - + need_i_mutex = 1; mutex_lock(&inode->i_mutex); - } else { - iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - isize = i_size_read(inode); - - if (file->f_flags & O_APPEND) - *offset = isize; - start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); @@ -730,6 +711,29 @@ start: goto out_unlock_mutex; } + isize = i_size_read(inode); + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->bt_smask) || (count & target->bt_smask)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + return XFS_ERROR(-EINVAL); + } + + if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + iolock = XFS_IOLOCK_EXCL; + locktype = VRWLOCK_WRITE; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + goto start; + } + } + new_size = pos + count; if (new_size > isize) io->io_new_size = new_size; @@ -761,7 +765,6 @@ start: * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != isize) { - pos = isize = xip->i_d.di_size; goto start; } } @@ -815,7 +818,8 @@ retry: current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { - if (need_flush) { + if (VN_CACHED(vp)) { + WARN_ON(need_i_mutex == 0); xfs_inval_cached_trace(io, pos, -1, ctooff(offtoct(pos)), -1); error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)), @@ -849,7 +853,6 @@ retry: pos += ret; count -= ret; - need_i_mutex = 1; ioflags &= ~IO_ISDIRECT; xfs_iunlock(xip, iolock); goto relock; -- cgit v1.2.3 From ba87ea699ebd9dd577bf055ebc4a98200e337542 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:49:46 +1000 Subject: [XFS] Fix to prevent the notorious 'NULL files' problem after a crash. The problem that has been addressed is that of synchronising updates of the file size with writes that extend a file. Without the fix the update of a file's size, as a result of a write beyond eof, is independent of when the cached data is flushed to disk. Often the file size update would be written to the filesystem log before the data is flushed to disk. When a system crashes between these two events and the filesystem log is replayed on mount the file's size will be set but since the contents never made it to disk the file is full of holes. If some of the cached data was flushed to disk then it may just be a section of the file at the end that has holes. There are existing fixes to help alleviate this problem, particularly in the case where a file has been truncated, that force cached data to be flushed to disk when the file is closed. If the system crashes while the file(s) are still open then this flushing will never occur. The fix that we have implemented is to introduce a second file size, called the in-memory file size, that represents the current file size as viewed by the user. The existing file size, called the on-disk file size, is the one that get's written to the filesystem log and we only update it when it is safe to do so. When we write to a file beyond eof we only update the in- memory file size in the write operation. Later when the I/O operation, that flushes the cached data to disk completes, an I/O completion routine will update the on-disk file size. The on-disk file size will be updated to the maximum offset of the I/O or to the value of the in-memory file size if the I/O includes eof. SGI-PV: 958522 SGI-Modid: xfs-linux-melb:xfs-kern:28322a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_lrw.c | 91 ++++++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 36 deletions(-) (limited to 'fs/xfs/linux-2.6/xfs_lrw.c') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 80fe31233471..82ab792c7fc9 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -224,7 +224,7 @@ xfs_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((*offset & target->bt_smask) || (size & target->bt_smask)) { - if (*offset == ip->i_d.di_size) { + if (*offset == ip->i_size) { return (0); } return -XFS_ERROR(EINVAL); @@ -387,9 +387,10 @@ xfs_splice_write( { xfs_inode_t *ip = XFS_BHVTOI(bdp); xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; ssize_t ret; struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize; + xfs_fsize_t isize, new_size; XFS_STATS_INC(xs_write_calls); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -410,6 +411,14 @@ xfs_splice_write( return -error; } } + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + io->io_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, pipe, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); @@ -420,14 +429,18 @@ xfs_splice_write( if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) *ppos = isize; - if (*ppos > ip->i_d.di_size) { + if (*ppos > ip->i_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_d.di_size) { - ip->i_d.di_size = *ppos; - i_size_write(inode, *ppos); - ip->i_update_core = 1; - ip->i_update_size = 1; - } + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (io->io_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + io->io_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; xfs_iunlock(ip, XFS_ILOCK_EXCL); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -711,8 +724,6 @@ start: goto out_unlock_mutex; } - isize = i_size_read(inode); - if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? @@ -723,7 +734,7 @@ start: return XFS_ERROR(-EINVAL); } - if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) { + if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; @@ -735,7 +746,7 @@ start: } new_size = pos + count; - if (new_size > isize) + if (new_size > xip->i_size) io->io_new_size = new_size; if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && @@ -751,8 +762,7 @@ start: pos, count, dmflags, &locktype); if (error) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; + goto out_unlock_internal; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; @@ -764,9 +774,8 @@ start: * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ - if ((file->f_flags & O_APPEND) && savedsize != isize) { + if ((file->f_flags & O_APPEND) && savedsize != xip->i_size) goto start; - } } if (likely(!(ioflags & IO_INVIS))) { @@ -784,11 +793,11 @@ start: * to zero it out up to the new size. */ - if (pos > isize) { - error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); + if (pos > xip->i_size) { + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size); if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + goto out_unlock_internal; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); @@ -808,8 +817,7 @@ start: if (likely(!error)) error = -remove_suid(file->f_path.dentry); if (unlikely(error)) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; + goto out_unlock_internal; } } @@ -879,12 +887,12 @@ retry: error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ - if (error) - goto out_nounlocks; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_rwlock(bdp, locktype); - pos = xip->i_d.di_size; + if (error) + goto out_unlock_internal; + pos = xip->i_size; ret = 0; goto retry; } @@ -893,14 +901,10 @@ retry: if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; - if (*offset > xip->i_d.di_size) { + if (*offset > xip->i_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_d.di_size) { - xip->i_d.di_size = *offset; - i_size_write(inode, *offset); - xip->i_update_core = 1; - xip->i_update_size = 1; - } + if (*offset > xip->i_size) + xip->i_size = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); } @@ -922,16 +926,31 @@ retry: error = sync_page_range(inode, mapping, pos, ret); if (!error) - error = ret; - return error; + error = -ret; + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_rwlock(bdp, locktype); } out_unlock_internal: + if (io->io_new_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + io->io_new_size = 0; + /* + * If this was a direct or synchronous I/O that failed (such + * as ENOSPC) then part of the I/O may have been written to + * disk before the error occured. In this case the on-disk + * file size may have been adjusted beyond the in-memory file + * size and now needs to be truncated back. + */ + if (xip->i_d.di_size > xip->i_size) + xip->i_d.di_size = xip->i_size; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } xfs_rwunlock(bdp, locktype); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); - out_nounlocks: return -error; } -- cgit v1.2.3 From 71dfd5a396d11512aa6c8ed0d35b268bc084bb9b Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:50:12 +1000 Subject: [XFS] Fix race in xfs_write() b/w dmapi callout and direct I/O checks. In xfs_write() the iolock is dropped and reacquired in XFS_SEND_DATA() which means that the file could change from not-cached to cached and we need to redo the direct I/O checks. We should also redo the direct I/O checks when the file size changes regardless if O_APPEND is set or not. SGI-PV: 963483 SGI-Modid: xfs-linux-melb:xfs-kern:28440a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_lrw.c | 53 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'fs/xfs/linux-2.6/xfs_lrw.c') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 82ab792c7fc9..b2a1beb33888 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -724,34 +724,8 @@ start: goto out_unlock_mutex; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - locktype = VRWLOCK_WRITE; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } - - new_size = pos + count; - if (new_size > xip->i_size) - io->io_new_size = new_size; - if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { - loff_t savedsize = pos; int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) @@ -774,10 +748,35 @@ start: * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ - if ((file->f_flags & O_APPEND) && savedsize != xip->i_size) + if ((file->f_flags & O_APPEND) && pos != xip->i_size) goto start; } + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->bt_smask) || (count & target->bt_smask)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + return XFS_ERROR(-EINVAL); + } + + if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + iolock = XFS_IOLOCK_EXCL; + locktype = VRWLOCK_WRITE; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + goto start; + } + } + + new_size = pos + count; + if (new_size > xip->i_size) + io->io_new_size = new_size; + if (likely(!(ioflags & IO_INVIS))) { file_update_time(file); xfs_ichgtime_fast(xip, inode, -- cgit v1.2.3