summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/ext4/ext4.h10
-rw-r--r--fs/ext4/extents.c10
-rw-r--r--fs/ext4/file.c60
-rw-r--r--fs/ext4/page-io.c25
-rw-r--r--fs/ext4/super.c13
5 files changed, 100 insertions, 18 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..3aa0b72b3b94 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
+ atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ 37
+#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..ccce8a7e94ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* that this IO needs to convertion to written when IO is
* completed
*/
- if (io)
+ if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
io->flag = EXT4_IO_END_UNWRITTEN;
- else
+ atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ } else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* that we need to perform convertion when IO is done.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
+ if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
io->flag = EXT4_IO_END_UNWRITTEN;
- else
+ atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ } else
ext4_set_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
+static void ext4_aiodio_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete. Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block. If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct super_block *sb = inode->i_sb;
+ int blockmask = sb->s_blocksize - 1;
+ size_t count = iov_length(iov, nr_segs);
+ loff_t final_size = pos + count;
+
+ if (pos >= inode->i_size)
+ return 0;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+
+ return 0;
+}
+
static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ int unaligned_aio = 0;
+ int ret;
/*
* If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
sbi->s_bitmap_maxbytes - pos);
}
+ } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+ !is_sync_kiocb(iocb))) {
+ unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
}
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ /* Unaligned direct AIO must be serialized; see comment above */
+ if (unaligned_aio) {
+ static unsigned long unaligned_warn_time;
+
+ /* Warn about this once per day */
+ if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+ ext4_msg(inode->i_sb, KERN_WARNING,
+ "Unaligned AIO/DIO on inode %ld by %s; "
+ "performance will be poor.",
+ inode->i_ino, current->comm);
+ mutex_lock(ext4_aio_mutex(inode));
+ ext4_aiodio_wait(inode);
+ }
+
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+ if (unaligned_aio)
+ mutex_unlock(ext4_aio_mutex(inode));
+
+ return ret;
}
static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4e9b0a242f4c..955cc309142f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
static struct kmem_cache *io_page_cachep, *io_end_cachep;
-#define WQ_HASH_SZ 37
-#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
-static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
-
int __init ext4_init_pageio(void)
{
- int i;
-
io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
if (io_page_cachep == NULL)
return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
kmem_cache_destroy(io_page_cachep);
return -ENOMEM;
}
- for (i = 0; i < WQ_HASH_SZ; i++)
- init_waitqueue_head(&ioend_wq[i]);
-
return 0;
}
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
void ext4_ioend_wait(struct inode *inode)
{
- wait_queue_head_t *wq = to_ioend_wq(inode);
+ wait_queue_head_t *wq = ext4_ioend_wq(inode);
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
for (i = 0; i < io->num_io_pages; i++)
put_io_page(io->pages[i]);
io->num_io_pages = 0;
- wq = to_ioend_wq(io->inode);
+ wq = ext4_ioend_wq(io->inode);
if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
waitqueue_active(wq))
wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
+ wait_queue_head_t *wq;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
/* clear the DIO AIO unwritten flag */
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ if (io->flag & EXT4_IO_END_UNWRITTEN) {
+ io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ wq = ext4_ioend_wq(io->inode);
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
+ waitqueue_active(wq)) {
+ wake_up_all(wq);
+ }
+ }
+
return ret;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 86b05486dc63..f6a318f836b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
+ atomic_set(&ei->i_aiodio_unwritten, 0);
return &ei->vfs_inode;
}
@@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void)
kfree(ext4_feat);
}
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
static int __init ext4_init_fs(void)
{
- int err;
+ int i, err;
ext4_check_flag_values();
+
+ for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+ mutex_init(&ext4__aio_mutex[i]);
+ init_waitqueue_head(&ext4__ioend_wq[i]);
+ }
+
err = ext4_init_pageio();
if (err)
return err;