From 1af60fbd759d31f565552fea315c2033947cfbe6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 2 Oct 2009 18:56:53 -0400 Subject: block: get rid of the WRITE_ODIRECT flag Hi, The WRITE_ODIRECT flag is only used in one place, and that code path happens to also call blk_run_address_space. The introduction of this flag, then, could result in the device being unplugged twice for every I/O. Further, with the batching changes in the next patch, we don't want an O_DIRECT write to imply a queue unplug. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 8b10b87dc01a..c86d35f142de 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_ODIRECT; + rw = WRITE_SYNC_PLUG; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); -- cgit v1.2.3 From cfb1e33eed48165763edc7a4a067cf5f74898d0b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 2 Oct 2009 18:57:36 -0400 Subject: aio: implement request batching Hi, Some workloads issue batches of small I/O, and the performance is poor due to the call to blk_run_address_space for every single iocb. Nathan Roberts pointed this out, and suggested that by deferring this call until all I/Os in the iocb array are submitted to the block layer, we can realize some impressive performance gains (up to 30% for sequential 4k reads in batches of 16). Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/aio.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/direct-io.c | 8 ++++---- 2 files changed, 63 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 02a2c9340573..cf0bef428f88 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include @@ -60,6 +63,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine); static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); +#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */ +#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS) +struct aio_batch_entry { + struct hlist_node list; + struct address_space *mapping; +}; +mempool_t *abe_pool; + static void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); @@ -73,6 +84,8 @@ static int __init aio_setup(void) kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); aio_wq = create_workqueue("aio"); + abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); + BUG_ON(!abe_pool); pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); @@ -1531,8 +1544,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode, return 1; } +static void aio_batch_add(struct address_space *mapping, + struct hlist_head *batch_hash) +{ + struct aio_batch_entry *abe; + struct hlist_node *pos; + unsigned bucket; + + bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); + hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) { + if (abe->mapping == mapping) + return; + } + + abe = mempool_alloc(abe_pool, GFP_KERNEL); + BUG_ON(!igrab(mapping->host)); + abe->mapping = mapping; + hlist_add_head(&abe->list, &batch_hash[bucket]); + return; +} + +static void aio_batch_free(struct hlist_head *batch_hash) +{ + struct aio_batch_entry *abe; + struct hlist_node *pos, *n; + int i; + + for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { + hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) { + blk_run_address_space(abe->mapping); + iput(abe->mapping->host); + hlist_del(&abe->list); + mempool_free(abe, abe_pool); + } + } +} + static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb) + struct iocb *iocb, struct hlist_head *batch_hash) { struct kiocb *req; struct file *file; @@ -1608,6 +1657,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ; } spin_unlock_irq(&ctx->ctx_lock); + if (req->ki_opcode == IOCB_CMD_PREAD || + req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITE || + req->ki_opcode == IOCB_CMD_PWRITEV) + aio_batch_add(file->f_mapping, batch_hash); + aio_put_req(req); /* drop extra ref to req */ return 0; @@ -1635,6 +1690,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct kioctx *ctx; long ret = 0; int i; + struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; if (unlikely(nr < 0)) return -EINVAL; @@ -1666,10 +1722,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp); + ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); if (ret) break; } + aio_batch_free(batch_hash); put_ioctx(ctx); return i ? i : ret; diff --git a/fs/direct-io.c b/fs/direct-io.c index c86d35f142de..3af761c8c5cc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (dio->bio) dio_bio_submit(dio); - /* All IO is now issued, send it on its way */ - blk_run_address_space(inode->i_mapping); - /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. @@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ((rw & READ) || (dio->result == dio->size))) ret = -EIOCBQUEUED; - if (ret != -EIOCBQUEUED) + if (ret != -EIOCBQUEUED) { + /* All IO is now issued, send it on its way */ + blk_run_address_space(inode->i_mapping); dio_await_completion(dio); + } /* * Sync will always be dropping the final ref and completing the -- cgit v1.2.3 From b9d128f1088ea5245109dfc9bbceb128b6371a77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Oct 2009 13:59:26 +0100 Subject: block: move bdi/address_space unplug functions to backing-dev.h There's nothing block related about them, the backing device is used by things like NFS etc as well. This gets rid of the need to protect such calls by CONFIG_BLOCK. Signed-off-by: Jens Axboe --- fs/aio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index cf0bef428f88..c30dfc006108 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #define DEBUG 0 -- cgit v1.2.3 From ab0a9735e06914ce4d2a94ffa41497dbc142fe7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2009 14:14:04 +0100 Subject: blkdev: flush disk cache on ->fsync Currently there is no barrier support in the block device code. That means we cannot guarantee any sort of data integerity when using the block device node with dis kwrite caches enabled. Using the raw block device node is a typical use case for virtualization (and I assume databases, too). This patch changes block_fsync to issue a cache flush and thus make fsync on block device nodes actually useful. Note that in mainline we would also need to add such code to the ->aio_write method for O_SYNC handling, but assuming that Jan's patch series for the O_SYNC rewrite goes in it will also call into ->fsync for 2.6.32. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9cf4b926f8e4..dde91e7e1c3a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) { - return sync_blockdev(I_BDEV(filp->f_mapping->host)); + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + int error; + + error = sync_blockdev(bdev); + if (error) + return error; + + error = blkdev_issue_flush(bdev, NULL); + if (error == -EOPNOTSUPP) + error = 0; + return error; } /* -- cgit v1.2.3 From cc56f7de7f00d188c7c4da1e9861581853b9e92f Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 4 Nov 2009 09:09:52 +0100 Subject: sendfile(): check f_op.splice_write() rather than f_op.sendpage() sendfile(2) was reworked with the splice infrastructure, but it still checks f_op.sendpage() instead of f_op.splice_write() wrongly. Although if f_op.sendpage() exists, f_op.splice_write() always exists at the same time currently, the assumption will be broken in future silently. This patch also brings a side effect: sendfile(2) can work with any output file. Some security checks related to f_op are added too. Signed-off-by: Changli Gao Signed-off-by: Jens Axboe --- fs/read_write.c | 2 -- fs/splice.c | 24 +++++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 3ac28987f22a..b7f4a1f94d48 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(out_file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - if (!out_file->f_op || !out_file->f_op->sendpage) - goto fput_out; in_inode = in_file->f_path.dentry->d_inode; out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); diff --git a/fs/splice.c b/fs/splice.c index 7394e9e17534..39208663aaf1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, ret = buf->ops->confirm(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - - ret = file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); + if (file->f_op && file->f_op->sendpage) + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + else + ret = -EINVAL; } return ret; @@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; - splice_write = out->f_op->splice_write; - if (!splice_write) + if (out->f_op && out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else splice_write = default_file_splice_write; return splice_write(pipe, out, ppos, len, flags); @@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - splice_read = in->f_op->splice_read; - if (!splice_read) + if (in->f_op && in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else splice_read = default_file_splice_read; return splice_read(in, ppos, pipe, len, flags); @@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) return -ESPIPE; if (off_out) { - if (out->f_op->llseek == no_llseek) + if (!out->f_op || !out->f_op->llseek || + out->f_op->llseek == no_llseek) return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; @@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) return -ESPIPE; if (off_in) { - if (in->f_op->llseek == no_llseek) + if (!in->f_op || !in->f_op->llseek || + in->f_op->llseek == no_llseek) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; -- cgit v1.2.3 From 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 10 Nov 2009 11:50:21 +0100 Subject: block: Expose discard granularity While SSDs track block usage on a per-sector basis, RAID arrays often have allocation blocks that are bigger. Allow the discard granularity and alignment to be set and teach the topology stacking logic how to handle them. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/partitions/check.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e10cbad..64bc8998ac9a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); } +ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%u\n", p->discard_alignment); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, + NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = { &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); + p->discard_alignment = queue_sector_discard_alignment(disk->queue, + start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); -- cgit v1.2.3 From 7d13af3279985f554784a45cc961f706dbcdbdd1 Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Mon, 23 Nov 2009 09:29:13 +0100 Subject: partitions: use sector size for EFI GPT Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing. That's wrong. UEFI standard (version 2.3, May 2009, 5.3.1 GUID Format overview, page 95) defines that LBA is always based on the logical block size. It means bdev_logical_block_size() (aka BLKSSZGET) for Linux. This patch removes static sector size from EFI GPT parser. The problem is reproducible with the latest GNU Parted: # modprobe scsi_debug dev_size_mb=50 sector_size=4096 # ./parted /dev/sdb print Model: Linux scsi_debug (scsi) Disk /dev/sdb: 52.4MB Sector size (logical/physical): 4096B/4096B Partition Table: gpt Number Start End Size File system Name Flags 1 24.6kB 3002kB 2978kB primary 2 3002kB 6001kB 2998kB primary 3 6001kB 9003kB 3002kB primary # blockdev --rereadpt /dev/sdb # dmesg | tail -1 sdb: unknown partition table <---- !!! with this patch: # blockdev --rereadpt /dev/sdb # dmesg | tail -1 sdb: sdb1 sdb2 sdb3 Signed-off-by: Karel Zak Signed-off-by: Jens Axboe --- fs/partitions/efi.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 038a6022152f..80eeff5fdfe0 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -1,7 +1,9 @@ /************************************************************ * EFI GUID Partition Table handling - * Per Intel EFI Specification v1.02 - * http://developer.intel.com/technology/efi/efi.htm + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * * efi.[ch] by Matt Domsch * Copyright 2000,2001,2002,2004 Dell Inc. * @@ -92,6 +94,7 @@ * ************************************************************/ #include +#include #include "check.h" #include "efi.h" @@ -141,7 +144,8 @@ last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; } static inline int @@ -188,6 +192,7 @@ static size_t read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) { size_t totalreadcount = 0; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); if (!bdev || !buffer || lba > last_lba(bdev)) return 0; @@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, lba++, §); + unsigned char *data = read_dev_sector(bdev, n++, §); if (!data) break; if (copied > count) @@ -601,6 +606,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; + unsigned ssz = bdev_logical_block_size(bdev) / 512; if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -611,13 +617,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + if (!is_pte_valid(&ptes[i], last_lba(bdev))) continue; - put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), - (le64_to_cpu(ptes[i].ending_lba) - - le64_to_cpu(ptes[i].starting_lba) + - 1ULL)); + put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, -- cgit v1.2.3 From 87038c2d5bda2418fda8b1456a0ae81cc3ff5bd8 Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Mon, 23 Nov 2009 09:29:58 +0100 Subject: partitions: read whole sector with EFI GPT header The size of EFI GPT header is not static, but whole sector is allocated for the header. The HeaderSize field must be greater than 92 (= sizeof(struct gpt_header) and must be less than or equal to the logical block size. It means we have to read whole sector with the header, because the header crc32 checksum is calculated according to HeaderSize. For more details see UEFI standard (version 2.3, May 2009): - 5.3.1 GUID Format overview, page 93 - Table 13. GUID Partition Table Header, page 96 Signed-off-by: Karel Zak Signed-off-by: Jens Axboe --- fs/partitions/efi.c | 7 ++++--- fs/partitions/efi.h | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 80eeff5fdfe0..49cfd5f54238 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -262,15 +262,16 @@ static gpt_header * alloc_read_gpt_header(struct block_device *bdev, u64 lba) { gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(bdev); + if (!bdev) return NULL; - gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); + gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, - sizeof (gpt_header)) < sizeof (gpt_header)) { + if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 2cc89d0475bf..6998b589abf9 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -37,7 +37,6 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE -#define GPT_BLOCK_SIZE 512 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 @@ -79,7 +78,12 @@ typedef struct _gpt_header { __le32 num_partition_entries; __le32 sizeof_partition_entry; __le32 partition_entry_array_crc32; - u8 reserved2[GPT_BLOCK_SIZE - 92]; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ } __attribute__ ((packed)) gpt_header; typedef struct _gpt_entry_attributes { -- cgit v1.2.3 From 2d4dc890b5c8fabd818a8586607e6843c4375e62 Mon Sep 17 00:00:00 2001 From: Ilya Loginov Date: Thu, 26 Nov 2009 09:16:19 +0100 Subject: block: add helpers to run flush_dcache_page() against a bio and a request's pages Mtdblock driver doesn't call flush_dcache_page for pages in request. So, this causes problems on architectures where the icache doesn't fill from the dcache or with dcache aliases. The patch fixes this. The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid pointless empty cache-thrashing loops on architectures for which flush_dcache_page() is a no-op. Every architecture was provided with this flush pages on architectires where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is equal 1 or do nothing otherwise. See "fix mtd_blkdevs problem with caches on some architectures" discussion on LKML for more information. Signed-off-by: Ilya Loginov Cc: Ingo Molnar Cc: David Woodhouse Cc: Peter Horton Cc: "Ed L. Cashin" Signed-off-by: Jens Axboe --- fs/bio.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 12da5db8682c..e23a63f4f7de 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio) } } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void bio_flush_dcache_pages(struct bio *bi) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment(bvec, bi, i) + flush_dcache_page(bvec->bv_page); +} +EXPORT_SYMBOL(bio_flush_dcache_pages); +#endif + /** * bio_endio - end I/O on a bio * @bio: bio -- cgit v1.2.3 From d9449ce35a1e8fb58dd2d419f9215562a14ecca0 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 26 Nov 2009 09:45:40 +0100 Subject: Fix regression in direct writes performance due to WRITE_ODIRECT flag removal There seems to be a regression in direct write path due to following commit in for-2.6.33 branch of block tree. commit 1af60fbd759d31f565552fea315c2033947cfbe6 Author: Jeff Moyer Date: Fri Oct 2 18:56:53 2009 -0400 block: get rid of the WRITE_ODIRECT flag Marking direct writes as WRITE_SYNC_PLUG instead of WRITE_ODIRECT, sets the NOIDLE flag in bio and hence in request. This tells CFQ to not expect more request from the queue and not idle on it (despite the fact that queue's think time is less and it is not seeky). So direct writers lose big time when competing with sequential readers. Using fio, I have run one direct writer and two sequential readers and following are the results with 2.6.32-rc7 kernel and with for-2.6.33 branch. Test ==== 1 direct writer and 2 sequential reader running simultaneously. [global] directory=/mnt/sdc/fio/ runtime=10 [seqwrite] rw=write size=4G direct=1 [seqread] rw=read size=2G numjobs=2 2.6.32-rc7 ========== direct writes: aggrb=2,968KB/s readers : aggrb=101MB/s for-2.6.33 branch ================= direct write: aggrb=19KB/s readers aggrb=137MB/s This patch brings back the WRITE_ODIRECT flag, with the difference that we don't set the BIO_RW_UNPLUG flag so that device is not unplugged after submission of request and an explicit unplug from submitter is required. That way we fix the jeff's issue of not enough merging taking place in aio path as well as make sure direct writes get their fair share. After the fix ============= for-2.6.33 + fix ---------------- direct writes: aggrb=2,728KB/s reads: aggrb=103MB/s Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 3af761c8c5cc..b912270942fa 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC_PLUG; + rw = WRITE_ODIRECT_PLUG; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); -- cgit v1.2.3 From 951c30d135390a108f102b0f6e3cfa6241f2a1aa Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: writeback: remove the always false bdi_cap_writeback_dirty() test This is dead code because no bdi flush thread will be started for !bdi_cap_writeback_dirty bdi. Signed-off-by: Wu Fengguang Cc: Jens Axboe Cc: Trond Myklebust Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9d5360c4c2af..0306c8e7d6b5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { struct super_block *sb = wbc->sb, *pin_sb = NULL; - const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ spin_lock(&inode_lock); @@ -635,23 +634,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, continue; } - if (!bdi_cap_writeback_dirty(wb->bdi)) { - redirty_tail(inode); - if (is_blkdev_sb) { - /* - * Dirty memory-backed blockdev: the ramdisk - * driver does this. Skip just this inode - */ - continue; - } - /* - * Dirty memory-backed inode against a filesystem other - * than the kernel-internal bdev filesystem. Skip the - * entire superblock. - */ - break; - } - if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; -- cgit v1.2.3 From b17621fed6aa039387e35f9b4d34d98f213e5673 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: writeback: introduce wbc.for_background It will lower the flush priority for NFS, and maybe more in future. Signed-off-by: Wu Fengguang Cc: Trond Myklebust Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 1 + fs/nfs/write.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0306c8e7d6b5..0793961f7699 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -738,6 +738,7 @@ static long wb_writeback(struct bdi_writeback *wb, .sync_mode = args->sync_mode, .older_than_this = NULL, .for_kupdate = args->for_kupdate, + .for_background = args->for_background, .range_cyclic = args->range_cyclic, }; unsigned long oldest_jif; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53eb26c16b50..c84b5cc1a943 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; - if (wbc->for_kupdate) + if (wbc->for_kupdate || wbc->for_background) return FLUSH_LOWPRI; return 0; } -- cgit v1.2.3 From 0d99519efef15fd0cf84a849492c7b1deee1e4b7 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: writeback: remove unused nonblocking and congestion checks - no one is calling wb_writeback and write_cache_pages with wbc.nonblocking=1 any more - lumpy pageout will want to do nonblocking writeback without the congestion wait So remove the congestion checks as suggested by Chris. Signed-off-by: Wu Fengguang Cc: Chris Mason Cc: Jens Axboe Cc: Trond Myklebust Cc: Christoph Hellwig Cc: Dave Chinner Cc: Evgeniy Polyakov Cc: Alex Elder Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 --------- fs/xfs/linux-2.6/xfs_aops.c | 9 +-------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0793961f7699..49bc1b8e8f19 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -639,14 +639,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, continue; } - if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { - wbc->encountered_congestion = 1; - if (!is_blkdev_sb) - break; /* Skip a congested fs */ - requeue_io(inode); - continue; /* Skip a congested blockdev */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -770,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; writeback_inodes_wb(wb, &wbc); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c2e30eea74dc..70f989895d15 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -904,16 +904,9 @@ xfs_convert_page( if (startio) { if (count) { - struct backing_dev_info *bdi; - - bdi = inode->i_mapping->backing_dev_info; wbc->nr_to_write--; - if (bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } else if (wbc->nr_to_write <= 0) { + if (wbc->nr_to_write <= 0) done = 1; - } } xfs_start_page_writeback(page, !page_dirty, count); } -- cgit v1.2.3