From 178ea73521d64ba41d7aa5488fb9f549c6d4507d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 11:31:30 -0400 Subject: kill check_acl callback of generic_permission() its value depends only on inode and does not change; we might as well store it in ->i_op->check_acl and be done with that. Signed-off-by: Al Viro --- fs/ocfs2/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b1e35a392ca5..d058cb7e12d4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1293,7 +1293,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) goto out; } - ret = generic_permission(inode, mask, flags, ocfs2_check_acl); + ret = generic_permission(inode, mask, flags); ocfs2_inode_unlock(inode, 0); out: @@ -2593,12 +2593,14 @@ const struct inode_operations ocfs2_file_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, + .check_acl = ocfs2_check_acl, }; const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .check_acl = ocfs2_check_acl, }; /* -- cgit v1.2.3 From 2830ba7f34ebb27c4e5b8b6ef408cd6d74860890 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:16:29 -0400 Subject: ->permission() sanitizing: don't pass flags to generic_permission() redundant; all callers get it duplicated in mask & MAY_NOT_BLOCK and none of them removes that bit. Signed-off-by: Al Viro --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d058cb7e12d4..ecb52b028964 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1293,7 +1293,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) goto out; } - ret = generic_permission(inode, mask, flags); + ret = generic_permission(inode, mask); ocfs2_inode_unlock(inode, 0); out: -- cgit v1.2.3 From 10556cb21a0d0b24d95f00ea6df16f599a3345b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jun 2011 19:28:19 -0400 Subject: ->permission() sanitizing: don't pass flags to ->permission() not used by the instances anymore. Signed-off-by: Al Viro --- fs/ocfs2/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ecb52b028964..1406c37a5722 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1279,11 +1279,11 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) +int ocfs2_permission(struct inode *inode, int mask) { int ret; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; ret = ocfs2_inode_lock(inode, NULL, 0); -- cgit v1.2.3 From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:43 -0400 Subject: fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and it's write side is always mirrored by real exclusion. It's intended use it to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall way - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches non-zero we wake up a pending truncate using wake_up_bit on a new bit in i_flags - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ocfs2/file.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1406c37a5722..2c3a465514a2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, ocfs2_iocb_clear_sem_locked(iocb); relock: - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { - down_read(&inode->i_alloc_sem); + atomic_inc(&inode->i_dio_count); have_alloc_sem = 1; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_sem_locked(iocb); @@ -2290,7 +2290,7 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); have_alloc_sem = 0; rw_level = -1; @@ -2361,8 +2361,7 @@ out_dio: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. (it's the clustered equivalent of - * i_alloc_sem; protects truncate from racing with pending ios). + * it can unlock our rw lock. * Unfortunately there are error cases which call end_io and others * that don't. so we don't have to unlock the rw_lock if either an * async dio is going to do it in the future or an end_io after an @@ -2379,7 +2378,7 @@ out: out_sems: if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } @@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * need locks to protect pending reads from racing with truncate. */ if (filp->f_flags & O_DIRECT) { - down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + atomic_inc(&inode->i_dio_count); ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); @@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, bail: if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } if (rw_level != -1) -- cgit v1.2.3 From 562c72aa57c36b178eacc3500a0215651eca9429 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:45 -0400 Subject: fs: move inode_dio_wait calls into ->setattr Let filesystems handle waiting for direct I/O requests themselves instead of doing it beforehand. This means filesystem-specific locks to prevent new dio referenes from appearing can be held. This is important to allow generalizing i_dio_count to non-DIO_LOCKING filesystems. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ocfs2/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2c3a465514a2..736283ca4a4c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1142,6 +1142,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) goto bail_unlock; + inode_dio_wait(inode); + if (i_size_read(inode) > attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, -- cgit v1.2.3 From df2d6f26586f12a24f3ae5df4e236dc5c08d6eb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:46 -0400 Subject: fs: always maintain i_dio_count Maintain i_dio_count for all filesystems, not just those using DIO_LOCKING. This these filesystems to also protect truncate against direct I/O requests by using common code. Right now the only non-DIO_LOCKING filesystem that appears to do so is XFS, which uses an opencoded variant of the i_dio_count scheme. Behaviour doesn't change for filesystems never calling inode_dio_wait. For ext4 behaviour changes when using the dioread_nonlock option, which previously was missing any protection between truncate and direct I/O reads. For ocfs2 that handcrafted i_dio_count manipulations are replaced with the common code now enable. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ocfs2/file.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 736283ca4a4c..22d604601957 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2240,7 +2240,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, relock: /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { - atomic_inc(&inode->i_dio_count); have_alloc_sem = 1; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_sem_locked(iocb); @@ -2292,7 +2291,6 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - inode_dio_done(inode); have_alloc_sem = 0; rw_level = -1; @@ -2379,10 +2377,8 @@ out: ocfs2_rw_unlock(inode, rw_level); out_sems: - if (have_alloc_sem) { - inode_dio_done(inode); + if (have_alloc_sem) ocfs2_iocb_clear_sem_locked(iocb); - } mutex_unlock(&inode->i_mutex); @@ -2533,7 +2529,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, */ if (filp->f_flags & O_DIRECT) { have_alloc_sem = 1; - atomic_inc(&inode->i_dio_count); ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); @@ -2575,10 +2570,9 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } bail: - if (have_alloc_sem) { - inode_dio_done(inode); + if (have_alloc_sem) ocfs2_iocb_clear_sem_locked(iocb); - } + if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); -- cgit v1.2.3 From 02c24a82187d5a628c68edfe71ae60dc135cd178 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 16 Jul 2011 20:44:56 -0400 Subject: fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers Btrfs needs to be able to control how filemap_write_and_wait_range() is called in fsync to make it less of a painful operation, so push down taking i_mutex and the calling of filemap_write_and_wait() down into the ->fsync() handlers. Some file systems can drop taking the i_mutex altogether it seems, like ext3 and ocfs2. For correctness sake I just pushed everything down in all cases to make sure that we keep the current behavior the same for everybody, and then each individual fs maintainer can make up their mind about what to do from there. Thanks, Acked-by: Jan Kara Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- fs/ocfs2/file.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 22d604601957..0fc2bd34039d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -171,7 +171,8 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file) return 0; } -static int ocfs2_sync_file(struct file *file, int datasync) +static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, + int datasync) { int err = 0; journal_t *journal; @@ -184,6 +185,16 @@ static int ocfs2_sync_file(struct file *file, int datasync) file->f_path.dentry->d_name.name, (unsigned long long)datasync); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + /* + * Probably don't need the i_mutex at all in here, just putting it here + * to be consistent with how fsync used to be called, someone more + * familiar with the fs could possibly remove it. + */ + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { /* * We still have to flush drive's caches to get data to the @@ -200,6 +211,7 @@ static int ocfs2_sync_file(struct file *file, int datasync) bail: if (err) mlog_errno(err); + mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } -- cgit v1.2.3 From 93862d5e1ab875664c6cc95254fc365028a48bb1 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 25 Jul 2011 14:58:15 -0700 Subject: ocfs2: Implement llseek() ocfs2 implements its own llseek() to provide the SEEK_HOLE/SEEK_DATA functionality. SEEK_HOLE sets the file pointer to the start of either a hole or an unwritten (preallocated) extent, that is greater than or equal to the supplied offset. SEEK_DATA sets the file pointer to the start of an allocated extent (not unwritten) that is greater than or equal to the supplied offset. If the supplied offset is on a desired region, then the file pointer is set to it. Offsets greater than or equal to the file size return -ENXIO. Unwritten (preallocated) extents are considered holes because the file system treats reads to such regions in the same way as it does to holes. Signed-off-by: Sunil Mushran --- fs/ocfs2/file.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2/file.c') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0fc2bd34039d..c0f015e11c28 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2591,6 +2591,57 @@ bail: return ret; } +/* Refer generic_file_llseek_unlocked() */ +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + switch (origin) { + case SEEK_SET: + break; + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + case SEEK_HOLE: + ret = ocfs2_seek_data_hole_offset(file, &offset, origin); + if (ret) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + ret = -EINVAL; + if (!ret && offset > inode->i_sb->s_maxbytes) + ret = -EINVAL; + if (ret) + goto out; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out: + mutex_unlock(&inode->i_mutex); + if (ret) + return ret; + return offset; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2615,7 +2666,7 @@ const struct inode_operations ocfs2_special_file_iops = { * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! */ const struct file_operations ocfs2_fops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, @@ -2663,7 +2714,7 @@ const struct file_operations ocfs2_dops = { * the cluster. */ const struct file_operations ocfs2_fops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, -- cgit v1.2.3