summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-12-08 08:19:16 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-08 08:19:16 -0800
commit6035ccd8e9e40bb654fbfdef325902ab531679a5 (patch)
treec1810d8a4d4ef150cdf14af72e6087dfc3f4b6e0 /fs
parent23eb3b64b5e44680c867e165fe1cd18e57fba255 (diff)
parent878eaddd05d251cefa9632c2b8046833c5eead66 (diff)
Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits) cfq-iosched: Do not access cfqq after freeing it block: include linux/err.h to use ERR_PTR cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit blkio: Allow CFQ group IO scheduling even when CFQ is a module blkio: Implement dynamic io controlling policy registration blkio: Export some symbols from blkio as its user CFQ can be a module block: Fix io_context leak after failure of clone with CLONE_IO block: Fix io_context leak after clone with CLONE_IO cfq-iosched: make nonrot check logic consistent io controller: quick fix for blk-cgroup and modular CFQ cfq-iosched: move IO controller declerations to a header file cfq-iosched: fix compile problem with !CONFIG_CGROUP blkio: Documentation blkio: Wait on sync-noidle queue even if rq_noidle = 1 blkio: Implement group_isolation tunable blkio: Determine async workload length based on total number of queues blkio: Wait for cfq queue to get backlogged if group is empty blkio: Propagate cgroup weight updation to cfq groups blkio: Drop the reference to queue once the task changes cgroup blkio: Provide some isolation between groups ...
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c62
-rw-r--r--fs/bio.c12
-rw-r--r--fs/block_dev.c12
-rw-r--r--fs/direct-io.c10
-rw-r--r--fs/fs-writeback.c28
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/partitions/check.c12
-rw-r--r--fs/partitions/efi.c30
-rw-r--r--fs/partitions/efi.h8
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/splice.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c9
12 files changed, 143 insertions, 68 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..c30dfc006108 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
#include <linux/aio_abi.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/backing-dev.h>
#include <linux/uio.h>
#define DEBUG 0
@@ -32,6 +33,9 @@
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
static DEFINE_SPINLOCK(fput_lock);
static LIST_HEAD(fput_head);
+#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+ struct hlist_node list;
+ struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
static void aio_kick_handler(struct work_struct *);
static void aio_queue_work(struct kioctx *);
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
aio_wq = create_workqueue("aio");
+ abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+ BUG_ON(!abe_pool);
pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@@ -1531,8 +1545,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
return 1;
}
+static void aio_batch_add(struct address_space *mapping,
+ struct hlist_head *batch_hash)
+{
+ struct aio_batch_entry *abe;
+ struct hlist_node *pos;
+ unsigned bucket;
+
+ bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+ hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+ if (abe->mapping == mapping)
+ return;
+ }
+
+ abe = mempool_alloc(abe_pool, GFP_KERNEL);
+ BUG_ON(!igrab(mapping->host));
+ abe->mapping = mapping;
+ hlist_add_head(&abe->list, &batch_hash[bucket]);
+ return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
+{
+ struct aio_batch_entry *abe;
+ struct hlist_node *pos, *n;
+ int i;
+
+ for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+ blk_run_address_space(abe->mapping);
+ iput(abe->mapping->host);
+ hlist_del(&abe->list);
+ mempool_free(abe, abe_pool);
+ }
+ }
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb)
+ struct iocb *iocb, struct hlist_head *batch_hash)
{
struct kiocb *req;
struct file *file;
@@ -1608,6 +1658,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
;
}
spin_unlock_irq(&ctx->ctx_lock);
+ if (req->ki_opcode == IOCB_CMD_PREAD ||
+ req->ki_opcode == IOCB_CMD_PREADV ||
+ req->ki_opcode == IOCB_CMD_PWRITE ||
+ req->ki_opcode == IOCB_CMD_PWRITEV)
+ aio_batch_add(file->f_mapping, batch_hash);
+
aio_put_req(req); /* drop extra ref to req */
return 0;
@@ -1635,6 +1691,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
struct kioctx *ctx;
long ret = 0;
int i;
+ struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
if (unlikely(nr < 0))
return -EINVAL;
@@ -1666,10 +1723,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp);
+ ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
if (ret)
break;
}
+ aio_batch_free(batch_hash);
put_ioctx(ctx);
return i ? i : ret;
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..e23a63f4f7de 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
}
}
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void bio_flush_dcache_pages(struct bio *bi)
+{
+ int i;
+ struct bio_vec *bvec;
+
+ bio_for_each_segment(bvec, bi, i)
+ flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL(bio_flush_dcache_pages);
+#endif
+
/**
* bio_endio - end I/O on a bio
* @bio: bio
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..73d6a735b8f3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
- return sync_blockdev(I_BDEV(filp->f_mapping->host));
+ struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ int error;
+
+ error = sync_blockdev(bdev);
+ if (error)
+ return error;
+
+ error = blkdev_issue_flush(bdev, NULL);
+ if (error == -EOPNOTSUPP)
+ error = 0;
+ return error;
}
/*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..b912270942fa 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
if (dio->bio)
dio_bio_submit(dio);
- /* All IO is now issued, send it on its way */
- blk_run_address_space(inode->i_mapping);
-
/*
* It is possible that, we return short IO due to end of file.
* In that case, we need to release all the pages we got hold on.
@@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
((rw & READ) || (dio->result == dio->size)))
ret = -EIOCBQUEUED;
- if (ret != -EIOCBQUEUED)
+ if (ret != -EIOCBQUEUED) {
+ /* All IO is now issued, send it on its way */
+ blk_run_address_space(inode->i_mapping);
dio_await_completion(dio);
+ }
/*
* Sync will always be dropping the final ref and completing the
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
int acquire_i_mutex = 0;
if (rw & WRITE)
- rw = WRITE_ODIRECT;
+ rw = WRITE_ODIRECT_PLUG;
if (bdev)
bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..49bc1b8e8f19 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
struct super_block *sb = wbc->sb, *pin_sb = NULL;
- const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
spin_lock(&inode_lock);
@@ -635,36 +634,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
continue;
}
- if (!bdi_cap_writeback_dirty(wb->bdi)) {
- redirty_tail(inode);
- if (is_blkdev_sb) {
- /*
- * Dirty memory-backed blockdev: the ramdisk
- * driver does this. Skip just this inode
- */
- continue;
- }
- /*
- * Dirty memory-backed inode against a filesystem other
- * than the kernel-internal bdev filesystem. Skip the
- * entire superblock.
- */
- break;
- }
-
if (inode->i_state & (I_NEW | I_WILL_FREE)) {
requeue_io(inode);
continue;
}
- if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
- wbc->encountered_congestion = 1;
- if (!is_blkdev_sb)
- break; /* Skip a congested fs */
- requeue_io(inode);
- continue; /* Skip a congested blockdev */
- }
-
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -756,6 +730,7 @@ static long wb_writeback(struct bdi_writeback *wb,
.sync_mode = args->sync_mode,
.older_than_this = NULL,
.for_kupdate = args->for_kupdate,
+ .for_background = args->for_background,
.range_cyclic = args->range_cyclic,
};
unsigned long oldest_jif;
@@ -787,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
wbc.more_io = 0;
- wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
writeback_inodes_wb(wb, &wbc);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..c84b5cc1a943 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
{
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
- if (wbc->for_kupdate)
+ if (wbc->for_kupdate || wbc->for_background)
return FLUSH_LOWPRI;
return 0;
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..64bc8998ac9a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
}
+ssize_t part_discard_alignment_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%u\n", p->discard_alignment);
+}
+
ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+ NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_start.attr,
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
+ &dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
p->start_sect = start;
p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+ p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+ start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..49cfd5f54238 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
/************************************************************
* EFI GUID Partition Table handling
- * Per Intel EFI Specification v1.02
- * http://developer.intel.com/technology/efi/efi.htm
+ *
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
* efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
* Copyright 2000,2001,2002,2004 Dell Inc.
*
@@ -92,6 +94,7 @@
*
************************************************************/
#include <linux/crc32.h>
+#include <linux/math64.h>
#include "check.h"
#include "efi.h"
@@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
- return (bdev->bd_inode->i_size >> 9) - 1ULL;
+ return div_u64(bdev->bd_inode->i_size,
+ bdev_logical_block_size(bdev)) - 1ULL;
}
static inline int
@@ -188,6 +192,7 @@ static size_t
read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
{
size_t totalreadcount = 0;
+ sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
if (!bdev || !buffer || lba > last_lba(bdev))
return 0;
@@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
while (count) {
int copied = 512;
Sector sect;
- unsigned char *data = read_dev_sector(bdev, lba++, &sect);
+ unsigned char *data = read_dev_sector(bdev, n++, &sect);
if (!data)
break;
if (copied > count)
@@ -257,15 +262,16 @@ static gpt_header *
alloc_read_gpt_header(struct block_device *bdev, u64 lba)
{
gpt_header *gpt;
+ unsigned ssz = bdev_logical_block_size(bdev);
+
if (!bdev)
return NULL;
- gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
+ gpt = kzalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
- if (read_lba(bdev, lba, (u8 *) gpt,
- sizeof (gpt_header)) < sizeof (gpt_header)) {
+ if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt=NULL;
return NULL;
@@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
+ unsigned ssz = bdev_logical_block_size(bdev) / 512;
if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+ u64 start = le64_to_cpu(ptes[i].starting_lba);
+ u64 size = le64_to_cpu(ptes[i].ending_lba) -
+ le64_to_cpu(ptes[i].starting_lba) + 1ULL;
+
if (!is_pte_valid(&ptes[i], last_lba(bdev)))
continue;
- put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
- (le64_to_cpu(ptes[i].ending_lba) -
- le64_to_cpu(ptes[i].starting_lba) +
- 1ULL));
+ put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid,
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
-#define GPT_BLOCK_SIZE 512
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
__le32 num_partition_entries;
__le32 sizeof_partition_entry;
__le32 partition_entry_array_crc32;
- u8 reserved2[GPT_BLOCK_SIZE - 92];
+
+ /* The rest of the logical block is reserved by UEFI and must be zero.
+ * EFI standard handles this by:
+ *
+ * uint8_t reserved2[ BlockSize - 92 ];
+ */
} __attribute__ ((packed)) gpt_header;
typedef struct _gpt_entry_attributes {
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22a..b7f4a1f94d48 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!(out_file->f_mode & FMODE_WRITE))
goto fput_out;
retval = -EINVAL;
- if (!out_file->f_op || !out_file->f_op->sendpage)
- goto fput_out;
in_inode = in_file->f_path.dentry->d_inode;
out_inode = out_file->f_path.dentry->d_inode;
retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..39208663aaf1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
ret = buf->ops->confirm(pipe, buf);
if (!ret) {
more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
-
- ret = file->f_op->sendpage(file, buf->page, buf->offset,
- sd->len, &pos, more);
+ if (file->f_op && file->f_op->sendpage)
+ ret = file->f_op->sendpage(file, buf->page, buf->offset,
+ sd->len, &pos, more);
+ else
+ ret = -EINVAL;
}
return ret;
@@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
if (unlikely(ret < 0))
return ret;
- splice_write = out->f_op->splice_write;
- if (!splice_write)
+ if (out->f_op && out->f_op->splice_write)
+ splice_write = out->f_op->splice_write;
+ else
splice_write = default_file_splice_write;
return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
- splice_read = in->f_op->splice_read;
- if (!splice_read)
+ if (in->f_op && in->f_op->splice_read)
+ splice_read = in->f_op->splice_read;
+ else
splice_read = default_file_splice_read;
return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (off_in)
return -ESPIPE;
if (off_out) {
- if (out->f_op->llseek == no_llseek)
+ if (!out->f_op || !out->f_op->llseek ||
+ out->f_op->llseek == no_llseek)
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
@@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (off_out)
return -ESPIPE;
if (off_in) {
- if (in->f_op->llseek == no_llseek)
+ if (!in->f_op || !in->f_op->llseek ||
+ in->f_op->llseek == no_llseek)
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74dc..70f989895d15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -904,16 +904,9 @@ xfs_convert_page(
if (startio) {
if (count) {
- struct backing_dev_info *bdi;
-
- bdi = inode->i_mapping->backing_dev_info;
wbc->nr_to_write--;
- if (bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- done = 1;
- } else if (wbc->nr_to_write <= 0) {
+ if (wbc->nr_to_write <= 0)
done = 1;
- }
}
xfs_start_page_writeback(page, !page_dirty, count);
}