diff options
Diffstat (limited to 'fs')
60 files changed, 995 insertions, 513 deletions
diff --git a/fs/afs/cache.h b/fs/afs/cache.h deleted file mode 100644 index 5c4f6b499e90..000000000000 --- a/fs/afs/cache.h +++ /dev/null @@ -1,12 +0,0 @@ -/* AFS local cache management interface - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/fscache.h> diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 106be66dafd2..6ece2a13bf71 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -18,10 +18,10 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> +#include <linux/fscache.h> #include "afs.h" #include "afs_vl.h" -#include "cache.h" #define AFS_CELL_MAX_ADDRS 15 diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index d11c51fc2a3f..2ca7a7cafdbf 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -8,8 +8,10 @@ * */ +#include <linux/cred.h> #include <linux/file.h> #include <linux/poll.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/fs.h> @@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } +EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -257,6 +258,7 @@ void bio_init(struct bio *bio) bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } +EXPORT_SYMBOL(bio_init); /** * bio_alloc_bioset - allocate a bio for I/O @@ -311,6 +313,7 @@ err_free: mempool_free(p, bs->bio_pool); return NULL; } +EXPORT_SYMBOL(bio_alloc_bioset); static void bio_fs_destructor(struct bio *bio) { @@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +EXPORT_SYMBOL(bio_alloc); static void bio_kmalloc_destructor(struct bio *bio) { @@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { @@ -416,6 +421,7 @@ void bio_put(struct bio *bio) bio->bi_destructor(bio); } } +EXPORT_SYMBOL(bio_put); inline int bio_phys_segments(struct request_queue *q, struct bio *bio) { @@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) return bio->bi_phys_segments; } +EXPORT_SYMBOL(bio_phys_segments); /** * __bio_clone - clone a bio @@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) bio->bi_size = bio_src->bi_size; bio->bi_idx = bio_src->bi_idx; } +EXPORT_SYMBOL(__bio_clone); /** * bio_clone - clone a bio @@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } +EXPORT_SYMBOL(bio_clone); /** * bio_get_nr_vecs - return approx number of vecs @@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev) return nr_pages; } +EXPORT_SYMBOL(bio_get_nr_vecs); static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, @@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, return __bio_add_page(q, bio, page, len, offset, queue_max_hw_sectors(q)); } +EXPORT_SYMBOL(bio_add_pc_page); /** * bio_add_page - attempt to add page to bio @@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct request_queue *q = bdev_get_queue(bio->bi_bdev); return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } +EXPORT_SYMBOL(bio_add_page); struct bio_map_data { struct bio_vec *iovecs; @@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio) bio_put(bio); return ret; } +EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio @@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); } +EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, @@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); } +EXPORT_SYMBOL(bio_map_user); /** * bio_map_user_iov - map user sg_iovec table into bio @@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio) __bio_unmap_user(bio); bio_put(bio); } +EXPORT_SYMBOL(bio_unmap_user); static void bio_map_kern_endio(struct bio *bio, int err) { bio_put(bio); } - static struct bio *__bio_map_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { @@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, bio_put(bio); return ERR_PTR(-EINVAL); } +EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio, int err) { @@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, return bio; } +EXPORT_SYMBOL(bio_copy_kern); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions @@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error) if (bio->bi_end_io) bio->bi_end_io(bio, error); } +EXPORT_SYMBOL(bio_endio); void bio_pair_release(struct bio_pair *bp) { @@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp) mempool_free(bp, bp->bio2.bi_private); } } +EXPORT_SYMBOL(bio_pair_release); static void bio_pair_end_1(struct bio *bi, int err) { @@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; } +EXPORT_SYMBOL(bio_split); /** * bio_sector_offset - Find hardware sector offset in bio @@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs) kfree(bs); } +EXPORT_SYMBOL(bioset_free); /** * bioset_create - Create a bio_set @@ -1592,6 +1613,7 @@ bad: bioset_free(bs); return NULL; } +EXPORT_SYMBOL(bioset_create); static void __init biovec_init_slabs(void) { @@ -1636,29 +1658,4 @@ static int __init init_bio(void) return 0; } - subsys_initcall(init_bio); - -EXPORT_SYMBOL(bio_alloc); -EXPORT_SYMBOL(bio_kmalloc); -EXPORT_SYMBOL(bio_put); -EXPORT_SYMBOL(bio_free); -EXPORT_SYMBOL(bio_endio); -EXPORT_SYMBOL(bio_init); -EXPORT_SYMBOL(__bio_clone); -EXPORT_SYMBOL(bio_clone); -EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_add_page); -EXPORT_SYMBOL(bio_add_pc_page); -EXPORT_SYMBOL(bio_get_nr_vecs); -EXPORT_SYMBOL(bio_map_user); -EXPORT_SYMBOL(bio_unmap_user); -EXPORT_SYMBOL(bio_map_kern); -EXPORT_SYMBOL(bio_copy_kern); -EXPORT_SYMBOL(bio_pair_release); -EXPORT_SYMBOL(bio_split); -EXPORT_SYMBOL(bio_copy_user); -EXPORT_SYMBOL(bio_uncopy_user); -EXPORT_SYMBOL(bioset_create); -EXPORT_SYMBOL(bioset_free); -EXPORT_SYMBOL(bio_alloc_bioset); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f128427b995b..69b355ae7f49 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_FS_POSIX_ACL */ +#else /* CONFIG_BTRFS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_POSIX_ACL */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 82ee56bba299..a54d354cefcb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -128,6 +128,14 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * These two counters are for delalloc metadata reservations. We keep + * track of how many extents we've accounted for vs how many extents we + * have. + */ + int delalloc_reserved_extents; + int delalloc_extents; + + /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80599b4e42bd..dd8ced9814c4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,18 +675,19 @@ struct btrfs_space_info { current allocations */ u64 bytes_readonly; /* total bytes that are read only */ u64 bytes_super; /* total bytes reserved for the super blocks */ - - /* delalloc accounting */ - u64 bytes_delalloc; /* number of bytes reserved for allocation, - this space is not necessarily reserved yet - by the allocator */ + u64 bytes_root; /* the number of bytes needed to commit a + transaction */ u64 bytes_may_use; /* number of bytes that may be used for - delalloc */ + delalloc/allocations */ + u64 bytes_delalloc; /* number of bytes currently reserved for + delayed allocation */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ + int force_delalloc; /* make people start doing filemap_flush until + we're under a threshold */ struct list_head list; @@ -695,6 +696,9 @@ struct btrfs_space_info { spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; + + int allocating_chunk; + wait_queue_head_t wait; }; /* @@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct btrfs_root *root, @@ -2326,7 +2335,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); -extern struct file_operations btrfs_file_operations; +extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, @@ -2357,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 644e796fd643..af0435f79fa6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, int btrfs_write_tree_block(struct extent_buffer *buf) { - return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, - buf->start + buf->len - 1, WB_SYNC_ALL); + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, - buf->start, buf->start + buf->len - 1); + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, @@ -1630,7 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - fs_info->metadata_ratio = 8; + fs_info->metadata_ratio = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 993f93ff7ba6..359a754c782c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -2765,67 +2767,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) alloc_target); } +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + /* - * for now this just makes sure we have at least 5% of our metadata space free - * for use. + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. */ -int btrfs_check_metadata_free_space(struct btrfs_root *root) +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; - u64 alloc_target, thresh; - int committed = 0, ret; + u64 num_bytes; + u64 alloc_target; + bool bug = false; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); - if (!meta_sinfo) - goto alloc; -again: + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + spin_lock(&meta_sinfo->lock); - if (!meta_sinfo->full) - thresh = meta_sinfo->total_bytes * 80; - else - thresh = meta_sinfo->total_bytes * 95; + if (BTRFS_I(inode)->delalloc_reserved_extents <= + BTRFS_I(inode)->delalloc_extents) { + spin_unlock(&meta_sinfo->lock); + return 0; + } + + BTRFS_I(inode)->delalloc_reserved_extents--; + BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + BUG_ON(bug); + + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super > thresh) { - struct btrfs_trans_handle *trans; - if (!meta_sinfo->full) { - meta_sinfo->force_alloc = 1; +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 5gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)5 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk = 0; + spin_unlock(&info->lock); + wake_up(&info->wait); + + if (ret) + return 0; + return 1; +} + +/* + * Reserve metadata space for delalloc. + */ +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int flushed = 0; + int force_delalloc; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); +again: + spin_lock(&meta_sinfo->lock); + + force_delalloc = meta_sinfo->force_delalloc; + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!flushed) + meta_sinfo->bytes_delalloc += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + flushed++; + + if (flushed == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + flushed++; + } else { spin_unlock(&meta_sinfo->lock); -alloc: - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + } - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, alloc_target, 0); - btrfs_end_transaction(trans, root); - if (!meta_sinfo) { - meta_sinfo = __find_space_info(info, - alloc_target); - } + if (flushed == 2) { + filemap_flush(inode->i_mapping); + goto again; + } else if (flushed == 3) { + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); + printk(KERN_ERR "enospc, has %d, reserved %d\n", + BTRFS_I(inode)->delalloc_extents, + BTRFS_I(inode)->delalloc_reserved_extents); + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } - if (!committed) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; + BTRFS_I(inode)->delalloc_reserved_extents++; + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + if (!flushed && force_delalloc) + filemap_flush(inode->i_mapping); + + return 0; +} + +/* + * unreserve num_items number of items worth of metadata space. This needs to + * be paired with btrfs_reserve_metadata_space. + * + * NOTE: if you have the option, run this _AFTER_ you do a + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref + * oprations which will result in more used metadata, so we want to make sure we + * can do that without issue. + */ +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); + + spin_lock(&meta_sinfo->lock); + if (meta_sinfo->bytes_may_use < num_bytes) { + bug = true; + meta_sinfo->bytes_may_use = 0; + } else { + meta_sinfo->bytes_may_use -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +/* + * Reserve some metadata space for use. We'll calculate the worste case number + * of bytes that would be needed to modify num_items number of items. If we + * have space, fantastic, if not, you get -ENOSPC. Please call + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of + * items you reserved, since whatever metadata you needed should have already + * been allocated. + * + * This will commit the transaction to make more space if we don't have enough + * metadata space. THe only time we don't do this is if we're reserving space + * inside of a transaction, then we will just return -ENOSPC and it is the + * callers responsibility to handle it properly. + */ +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int retries = 0; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); +again: + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!retries) + meta_sinfo->bytes_may_use += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + retries++; + if (retries == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + retries++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (retries == 2) { + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_may_use -= num_bytes; + spin_unlock(&meta_sinfo->lock); + + dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } + + check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); return 0; @@ -2888,7 +3169,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed) { + if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) @@ -2916,7 +3197,7 @@ alloc: BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); - return btrfs_check_metadata_free_space(root); + return 0; } /* @@ -3015,17 +3296,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(!space_info); spin_lock(&space_info->lock); - if (space_info->force_alloc) { + if (space_info->force_alloc) force = 1; - space_info->force_alloc = 0; - } if (space_info->full) { spin_unlock(&space_info->lock); goto out; } thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 6); + thresh = div_factor(thresh, 8); if (!force && (space_info->bytes_used + space_info->bytes_pinned + space_info->bytes_reserved + alloc_bytes) < thresh) { @@ -3039,7 +3318,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, * we keep a reasonable number of metadata chunks allocated in the * FS as well. */ - if (flags & BTRFS_BLOCK_GROUP_DATA) { + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++; if (!(fs_info->data_chunk_allocations % fs_info->metadata_ratio)) @@ -3047,8 +3326,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); if (ret) space_info->full = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; @@ -4063,21 +4345,32 @@ loop: return ret; } -static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) { struct btrfs_block_group_cache *cache; + spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved), + info->bytes_pinned - info->bytes_reserved - + info->bytes_super), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" + "\n", (unsigned long long)info->total_bytes, (unsigned long long)info->bytes_pinned, (unsigned long long)info->bytes_delalloc, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used); + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_root, + (unsigned long long)info->bytes_super, + (unsigned long long)info->bytes_reserved); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -4145,7 +4438,7 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes); + dump_space_info(sinfo, num_bytes, 1); } return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0cb88f8146ea..de1793ba004a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, return NULL; } +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { + merge_cb(tree, state, other); state->start = other->start; other->tree = NULL; rb_erase(&other->rb_node, &tree->state); @@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { + merge_cb(tree, state, other); other->start = state->start; state->tree = NULL; rb_erase(&state->rb_node, &tree->state); free_extent_state(state); + state = NULL; } } + return 0; } -static void set_state_cb(struct extent_io_tree *tree, +static int set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { if (tree->ops && tree->ops->set_bit_hook) { - tree->ops->set_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); + return tree->ops->set_bit_hook(tree->mapping->host, + state->start, state->end, + state->state, bits); } + + return 0; } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { - if (tree->ops && tree->ops->clear_bit_hook) { - tree->ops->clear_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); - } + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } /* @@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, int bits) { struct rb_node *node; + int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree, (unsigned long long)start); WARN_ON(1); } - if (bits & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; state->start = start; state->end = end; - set_state_cb(tree, state, bits); + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; state->state |= bits; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { @@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, return 0; } +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { struct rb_node *node; + + split_cb(tree, orig, split); + prealloc->start = orig->start; prealloc->end = split - 1; prealloc->state = orig->state; @@ -542,8 +571,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, - wake, delete); + set |= clear_state_bit(tree, state, bits, wake, + delete); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -561,12 +590,11 @@ hit_next: prealloc = alloc_extent_state(GFP_ATOMIC); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); + set |= clear_state_bit(tree, prealloc, bits, wake, delete); + prealloc = NULL; goto out; } @@ -667,16 +695,23 @@ out: return 0; } -static void set_state_bits(struct extent_io_tree *tree, +static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int bits) { + int ret; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - set_state_cb(tree, state, bits); state->state |= bits; + + return 0; } static void cache_state(struct extent_state *state, @@ -758,7 +793,10 @@ hit_next: goto out; } - set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, bits); + if (err) + goto out; + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -805,7 +843,9 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, bits); + if (err) + goto out; cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -829,11 +869,13 @@ hit_next: this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, bits); - cache_state(prealloc, cached_state); - prealloc = NULL; BUG_ON(err == -EEXIST); - if (err) + if (err) { + prealloc = NULL; goto out; + } + cache_state(prealloc, cached_state); + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -852,7 +894,11 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, bits); + if (err) { + prealloc = NULL; + goto out; + } cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 14ed16fd862d..4794ec891fed 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -60,8 +60,13 @@ struct extent_io_ops { struct extent_state *state, int uptodate); int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits); - int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); int (*write_cache_pages_lock_hook)(struct page *page); }; @@ -79,10 +84,14 @@ struct extent_state { u64 start; u64 end; /* inclusive */ struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ struct extent_io_tree *tree; wait_queue_head_t wq; atomic_t refs; unsigned long state; + u64 split_start; + u64 split_end; /* for use by the FS */ u64 private; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a3492a3ad96b..f19e1259a971 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, root->sectorsize - 1) & ~((u64)root->sectorsize - 1); end_of_last_block = start_pos + num_bytes - 1; - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + if (err) + return err; + for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; SetPageUptodate(p); @@ -917,21 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* do the reserve before the mutex lock in case we have to do some + * flushing. We wouldn't deadlock, but this is more polite. + */ + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (err) + goto out_nolock; + + mutex_lock(&inode->i_mutex); + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) - goto out_nolock; + goto out; + if (count == 0) - goto out_nolock; + goto out; err = file_remove_suid(file); if (err) - goto out_nolock; + goto out; + file_update_time(file); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - mutex_lock(&inode->i_mutex); + /* generic_write_checks can change our pos */ + start_pos = pos; + BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; @@ -1005,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } if (will_write) { - btrfs_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1, - WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); @@ -1028,6 +1044,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); out_nolock: kfree(pages); @@ -1196,7 +1213,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -struct file_operations btrfs_file_operations = { +const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e9b76bcd1c12..112e5aa85892 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; -static struct file_operations btrfs_dir_file_operations; +static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; @@ -1159,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 size; + + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + size = orig->end - orig->start + 1; + if (size > root->fs_info->max_extent) { + u64 num_extents; + u64 new_size; + + new_size = orig->end - split + 1; + num_extents = div64_u64(size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + + /* + * if we break a large extent up then leave delalloc_extents be, + * since we've already accounted for the large extent. + */ + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) < num_extents) + return 0; + } + + BTRFS_I(inode)->delalloc_extents++; + + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 new_size, old_size; + u64 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + old_size = other->end - other->start + 1; + if (new->start < other->start) + new_size = other->end - new->start + 1; + else + new_size = new->end - other->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= root->fs_info->max_extent) { + BTRFS_I(inode)->delalloc_extents--; + return 0; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) > num_extents) + return 0; + + BTRFS_I(inode)->delalloc_extents--; + + return 0; +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1167,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC @@ -1174,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + + BTRFS_I(inode)->delalloc_extents++; btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1190,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, unsigned long bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + BTRFS_I(inode)->delalloc_extents--; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + spin_lock(&root->fs_info->delalloc_lock); - if (end - start + 1 > root->fs_info->delalloc_bytes) { + if (state->end - state->start + 1 > + root->fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs warning: delalloc account " "%llu %llu\n", - (unsigned long long)end - start + 1, + (unsigned long long) + state->end - state->start + 1, (unsigned long long) root->fs_info->delalloc_bytes); btrfs_delalloc_free_space(root, inode, (u64)-1); @@ -1213,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, BTRFS_I(inode)->delalloc_bytes = 0; } else { btrfs_delalloc_free_space(root, inode, - end - start + 1); - root->fs_info->delalloc_bytes -= end - start + 1; - BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + state->end - + state->start + 1); + root->fs_info->delalloc_bytes -= state->end - + state->start + 1; + BTRFS_I(inode)->delalloc_bytes -= state->end - + state->start + 1; } if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { @@ -2950,7 +3038,12 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + goto out_unlock; + } + ret = 0; if (offset != PAGE_CACHE_SIZE) { kaddr = kmap(page); @@ -2981,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err; + int err = 0; if (size <= hole_start) return 0; - err = btrfs_check_metadata_free_space(root); - if (err) - return err; - btrfs_truncate_page(inode->i_mapping, inode->i_size); while (1) { @@ -3024,12 +3113,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) cur_offset, &hint_byte, 1); if (err) break; + + err = btrfs_reserve_metadata_space(root, 1); + if (err) + break; + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); + btrfs_unreserve_metadata_space(root, 1); } free_extent_map(em); cur_offset = last_byte; @@ -3990,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -4032,6 +4134,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4052,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -4096,6 +4207,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4118,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; - btrfs_inc_nlink(inode); - err = btrfs_check_metadata_free_space(root); + /* + * 1 item for inode ref + * 2 items for dir items + */ + err = btrfs_reserve_metadata_space(root, 3); if (err) - goto fail; + return err; + + btrfs_inc_nlink(inode); + err = btrfs_set_inode_index(dir, &index); if (err) goto fail; @@ -4145,6 +4263,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4164,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_unlock; + return err; trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (!trans) { + err = -ENOMEM; goto out_unlock; } + btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); if (err) { @@ -4223,6 +4346,7 @@ out_fail: btrfs_end_transaction_throttle(trans, root); out_unlock: + btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4747,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -4778,7 +4909,23 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + goto out_unlock; + } ret = 0; /* page is wholly or partially inside EOF */ @@ -4801,6 +4948,7 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); @@ -4917,6 +5065,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return NULL; ei->last_trans = 0; ei->logged_trans = 0; + ei->delalloc_extents = 0; + ei->delalloc_reserved_extents = 0; btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); @@ -5070,7 +5220,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - ret = btrfs_check_metadata_free_space(root); + /* + * 2 items for dir items + * 1 item for orphan entry + * 1 item for ref + */ + ret = btrfs_reserve_metadata_space(root, 4); if (ret) return ret; @@ -5185,6 +5340,8 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); + + btrfs_unreserve_metadata_space(root, 4); return ret; } @@ -5256,11 +5413,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto out_fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -5341,6 +5505,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); out_fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5362,6 +5527,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + goto out; + ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); @@ -5381,6 +5551,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; + btrfs_unreserve_metadata_space(root, 1); } out: if (cur_offset > start) { @@ -5544,7 +5715,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { .permission = btrfs_permission, }; -static struct file_operations btrfs_dir_file_operations = { +static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = btrfs_real_readdir, @@ -5566,6 +5737,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a8577a7f26ab..9a780c8d0ac8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) return ret; @@ -340,6 +346,9 @@ fail: err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; + + btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); return ret; } @@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) goto fail_unlock; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } memcpy(pending_snapshot->name, name, namelen); @@ -1215,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; + int ret; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto out; - if (file->private_data) { - ret = -EINPROGRESS; + ret = -EINPROGRESS; + if (file->private_data) goto out; - } ret = mnt_want_write(file->f_path.mnt); if (ret) @@ -1233,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file) root->fs_info->open_ioctl_trans++; mutex_unlock(&root->fs_info->trans_mutex); + ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (trans) - file->private_data = trans; - else - ret = -ENOMEM; - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + if (!trans) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); out: return ret; } @@ -1254,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; trans = file->private_data; - if (!trans) { - ret = -EINVAL; - goto out; - } - btrfs_end_transaction(trans, root); + if (!trans) + return -EINVAL; file->private_data = NULL; + btrfs_end_transaction(trans, root); + mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; mutex_unlock(&root->fs_info->trans_mutex); mnt_drop_write(file->f_path.mnt); - -out: - return ret; + return 0; } long btrfs_ioctl(struct file *file, unsigned int diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b5d6d24726b0..897fba835f89 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -458,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -488,17 +488,15 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again * with WB_SYNC_ALL will end up waiting for the IO to actually start. */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - btrfs_wait_on_page_writeback_range(inode->i_mapping, - start >> PAGE_CACHE_SHIFT, - orig_end >> PAGE_CACHE_SHIFT); + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; @@ -716,89 +714,6 @@ out: } -/** - * taken from mm/filemap.c because it isn't exported - * - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation - * - * Start writeback against all of a mapping's dirty pages that lie - * within the byte offsets <start, end> inclusive. - * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. - */ -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, - .range_start = start, - .range_end = end, - }; - return btrfs_writepages(mapping, &wbc); -} - -/** - * taken from mm/filemap.c because it isn't exported - * - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index - * - * Wait for writeback to complete against pages indexed by start->end - * inclusive - */ -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - struct pagevec pvec; - int nr_pages; - int ret = 0; - pgoff_t index; - - if (end < start) - return 0; - - pagevec_init(&pvec, 0); - index = start; - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - pagevec_release(&pvec); - cond_resched(); - } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; - - return ret; -} - /* * add a given inode to the list of inodes that must be fully on * disk before a transaction commit finishes. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 993a7ea45c70..f82e87488ca8 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); int btrfs_ordered_update_i_size(struct inode *inode, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 67035385444c..9de9b2236419 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_POSIX_ACL sb->s_flags |= MS_POSIXACL; +#endif tree_root = open_ctree(sb, fs_devices, (char *)data); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 88f866f85e7a..0b8f36d4400a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; + if (!current->journal_info) + current->journal_info = h; + root->fs_info->running_transaction->use_count++; record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); @@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); mutex_unlock(&info->trans_mutex); + + if (current->journal_info == trans) + current->journal_info = NULL; memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); + btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); + if (current->journal_info == trans) + current->journal_info = NULL; + kmem_cache_free(btrfs_trans_handle_cachep, trans); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 23e7d36ff325..7eda483d7b5a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) goto error; device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) + if (!device->name) { + kfree(device); goto error; + } device->devid = orig_dev->devid; device->work.func = pending_bios_fn; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a9d3bf4d2689..b0fc93f95fd0 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0376ac66c44a..be4392ca2098 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -22,6 +22,7 @@ #include <linux/kernel.h> #include <linux/major.h> #include <linux/time.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/fcntl.h> diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 8aadb99b7634..1cd6d9d3e29a 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,8 +1,9 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET + depends on EXPERIMENTAL && KEYS && CRYPTO select CRYPTO_ECB select CRYPTO_CBC + select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/filesystems/ecryptfs.txt> to learn more about diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c7b1ee..c6ac85d6c701 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> +#include <linux/ima.h> #include "ecryptfs_kernel.h" /** @@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } + } else + opened_lower_file = 1; } mutex_unlock(&inode_info->lower_file_mutex); + if (opened_lower_file) + ima_counts_get(inode_info->lower_file); return rc; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index d5c0ea2e8f2d..9f2d45d75b1a 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -26,20 +26,6 @@ config EXT4_FS If unsure, say N. -config EXT4DEV_COMPAT - bool "Enable ext4dev compatibility" - depends on EXT4_FS - help - Starting with 2.6.28, the name of the ext4 filesystem was - renamed from ext4dev to ext4. Unfortunately there are some - legacy userspace programs (such as klibc's fstype) have - "ext4dev" hardcoded. - - To enable backwards compatibility so that systems that are - still expecting to mount ext4 filesystems using ext4dev, - choose Y here. This feature will go away by 2.6.31, so - please arrange to get your userspace programs fixed! - config EXT4_FS_XATTR bool "Ext4 extended attributes" depends on EXT4_FS diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ec367bce7215..5c5bc5dafff8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1146,8 +1146,8 @@ static int check_block_validity(struct inode *inode, const char *msg, } /* - * Return the number of dirty pages in the given inode starting at - * page frame idx. + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. */ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, unsigned int max_pages) @@ -1181,15 +1181,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, unlock_page(page); break; } - head = page_buffers(page); - bh = head; - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) { - done = 1; - break; - } - } while ((bh = bh->b_this_page) != head); + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } unlock_page(page); if (done) break; @@ -3378,6 +3378,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -3400,9 +3401,12 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; @@ -5612,14 +5616,12 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) */ void ext4_dirty_inode(struct inode *inode) { - handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; - jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 12e726a7073f..312211ee05af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3966,27 +3966,6 @@ static struct file_system_type ext4_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -#ifdef CONFIG_EXT4DEV_COMPAT -static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data,struct vfsmount *mnt) -{ - printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs " - "to mount using ext4\n", dev_name); - printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility " - "will go away by 2.6.31\n", dev_name); - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); -} - -static struct file_system_type ext4dev_fs_type = { - .owner = THIS_MODULE, - .name = "ext4dev", - .get_sb = ext4dev_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS("ext4dev"); -#endif - static int __init init_ext4_fs(void) { int err; @@ -4011,13 +3990,6 @@ static int __init init_ext4_fs(void) err = register_filesystem(&ext4_fs_type); if (err) goto out; -#ifdef CONFIG_EXT4DEV_COMPAT - err = register_filesystem(&ext4dev_fs_type); - if (err) { - unregister_filesystem(&ext4_fs_type); - goto out; - } -#endif return 0; out: destroy_inodecache(); @@ -4036,9 +4008,6 @@ out4: static void __exit exit_ext4_fs(void) { unregister_filesystem(&ext4_fs_type); -#ifdef CONFIG_EXT4DEV_COMPAT - unregister_filesystem(&ext4dev_fs_type); -#endif destroy_inodecache(); exit_ext4_xattr(); exit_ext4_mballoc(); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 761af77491f5..b0ab5219becb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -770,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations jbd2_seq_info_fops = { +static const struct file_operations jbd2_seq_info_fops = { .owner = THIS_MODULE, .open = jbd2_seq_info_open, .read = seq_read, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 63976c0ccc25..99ea196f071f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp, 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2636c26d56fa..fa3408f20112 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) - return mnt; + return ERR_CAST(mnt_path); mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; @@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (buf->len <= 0 || buf->len >= maxbuflen) continue; - mountdata->addr = (struct sockaddr *)&addr; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - mountdata->addrlen = nfs_parse_server_name(buf->data, - buf->len, - mountdata->addr, mountdata->addrlen); + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + (struct sockaddr *)&addr, sizeof(addr)); if (mountdata->addrlen == 0) continue; + + mountdata->addr = (struct sockaddr *)&addr; rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e27c6cef18f2..0156c01c212c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) } void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - -void nfs4_kill_renewd(struct nfs_client *clp) { cancel_delayed_work_sync(&clp->cl_renewd); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29786d3b9326..6dabf6feec94 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb) unlock_kernel(); } -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) { struct nfs_parsed_mount_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->flags = flags; data->rsize = NFS_MAX_FILE_IO_SIZE; data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_UNIX; data->auth_flavor_len = 1; + data->version = version; data->minorversion = 0; } return data; @@ -776,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ -static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, +static void nfs_set_port(struct sockaddr *sap, int *port, const unsigned short default_port) { - unsigned short port = default_port; + if (*port == NFS_UNSPEC_PORT) + *port = default_port; - if (parsed_port != NFS_UNSPEC_PORT) - port = parsed_port; - - rpc_set_port(sap, port); + rpc_set_port(sap, *port); } /* @@ -1475,7 +1475,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - nfs_set_default_port(request.sap, args->mount_server.port, 0); + nfs_set_port(request.sap, &args->mount_server.port, 0); /* * Now ask the mount server to map our export path @@ -1711,8 +1711,6 @@ static int nfs_validate_mount_data(void *options, if (!(data->flags & NFS_MOUNT_TCP)) args->nfs_server.protocol = XPRT_TRANSPORT_UDP; - else - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; @@ -1767,7 +1765,7 @@ static int nfs_validate_mount_data(void *options, goto out_v4_not_compiled; #endif - nfs_set_default_port(sap, args->nfs_server.port, 0); + nfs_set_port(sap, &args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -1848,9 +1846,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen) != 0) + !rpc_cmp_addr(&data->nfs_server.address, + &nfss->nfs_client->cl_addr)) return -EINVAL; return 0; @@ -1893,6 +1892,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->acdirmin = nfss->acdirmin / HZ; data->acdirmax = nfss->acdirmax / HZ; data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); @@ -2106,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); + data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2331,7 +2331,7 @@ static int nfs4_validate_text_mount_data(void *options, { struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); nfs_validate_transport_protocol(args); @@ -2376,7 +2376,6 @@ static int nfs4_validate_mount_data(void *options, if (data == NULL) goto out_no_data; - args->version = 4; switch (data->version) { case 1: if (data->host_addrlen > sizeof(args->nfs_server.address)) @@ -2660,7 +2659,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs_parsed_mount_data *data; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(0); + data = nfs_alloc_parsed_mount_data(4); if (data == NULL) goto out_free_data; @@ -2690,7 +2689,6 @@ static void nfs4_kill_super(struct super_block *sb) dprintk("--> %s\n", __func__); nfs_super_return_all_delegations(sb); kill_anon_super(sb); - nfs4_renewd_prepare_shutdown(server); nfs_fscache_release_super_cookie(sb); nfs_free_server(server); dprintk("<-- %s\n", __func__); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 00388d2a3c99..5c01fc148ce8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -176,7 +176,7 @@ static const struct file_operations exports_operations = { extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); -static struct file_operations pool_stats_operations = { +static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 1a4fa04cf071..e097099bfc8f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -697,7 +697,7 @@ not_empty: return 0; } -struct file_operations nilfs_dir_operations = { +const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = nilfs_readdir, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 7d7b4983dee3..30292df443ce 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) * We have mostly NULL's here: the current defaults are ok for * the nilfs filesystem. */ -struct file_operations nilfs_file_operations = { +const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index b18c4998f8d0..f6326112d647 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = { }; static const struct inode_operations def_mdt_iops; -static struct file_operations def_mdt_fops; +static const struct file_operations def_mdt_fops; /* * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index bad7368782d0..4da6f67e9a91 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *); /* * Inodes and files operations */ -extern struct file_operations nilfs_dir_operations; +extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; -extern struct file_operations nilfs_file_operations; +extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 09cc25d04611..c452d116b892 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations o2hb_debug_fops = { +static const struct file_operations o2hb_debug_fops = { .open = o2hb_debug_open, .release = o2hb_debug_release, .read = o2hb_debug_read, diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index cfb2be708abe..da794bc07a6c 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations nst_seq_fops = { +static const struct file_operations nst_seq_fops = { .open = nst_fop_open, .read = seq_read, .llseek = seq_lseek, @@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations sc_seq_fops = { +static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index ca46002ec10e..42b0bad7a612 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -478,7 +478,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_purgelist_fops = { +static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -538,7 +538,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_mle_fops = { +static const struct file_operations debug_mle_fops = { .open = debug_mle_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations debug_lockres_fops = { +static const struct file_operations debug_lockres_fops = { .open = debug_lockres_open, .release = debug_lockres_release, .read = seq_read, @@ -925,7 +925,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_state_fops = { +static const struct file_operations debug_state_fops = { .open = debug_state_open, .release = debug_buffer_release, .read = debug_buffer_read, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4cc3c890a2cd..c0e48aeebb1c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations ocfs2_osb_debug_fops = { +static const struct file_operations ocfs2_osb_debug_fops = { .open = ocfs2_osb_debug_open, .release = ocfs2_debug_release, .read = ocfs2_debug_read, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 3680bae335b5..b42d62419034 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = { .rmdir = omfs_rmdir, }; -struct file_operations omfs_dir_operations = { +const struct file_operations omfs_dir_operations = { .read = generic_read_dir, .readdir = omfs_readdir, .llseek = generic_file_llseek, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 4845fbb18e6e..399487c09364 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, omfs_get_block); } -struct file_operations omfs_file_operations = { +const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index df71039945ac..ebe2fdbe535e 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request, extern int omfs_clear_range(struct super_block *sb, u64 block, int count); /* dir.c */ -extern struct file_operations omfs_dir_operations; +extern const struct file_operations omfs_dir_operations; extern const struct inode_operations omfs_dir_inops; extern int omfs_make_empty(struct inode *inode, struct super_block *sb); extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, u64 fsblock); /* file.c */ -extern struct file_operations omfs_file_operations; +extern const struct file_operations omfs_file_operations; extern const struct inode_operations omfs_file_inops; extern const struct address_space_operations omfs_aops; extern void omfs_make_empty_table(struct buffer_head *bh, int offset); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e10cbad..f38fee0311a7 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,19 +248,11 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), + p->in_flight, jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } -ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); -} - #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -289,7 +281,6 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -301,7 +292,6 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, - &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 56013371f9f3..a44a7897fd4d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,6 @@ #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> -#include <linux/mm.h> #include <linux/memory.h> #include <asm/sections.h> diff --git a/fs/proc/page.c b/fs/proc/page.c index 2281c2cbfe2b..5033ce0d254b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 #define KPF_KSM 21 @@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif diff --git a/fs/select.c b/fs/select.c index a201fc370223..fd38ce2e32e3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,6 +15,7 @@ */ #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/syscalls.h> #include <linux/module.h> #include <linux/slab.h> diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 381854461b28..c2e30eea74dc 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -186,19 +186,37 @@ xfs_destroy_ioend( } /* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + +/* * Update on-disk file size now that data has been written to disk. * The current in-memory file size is i_size. If a write is beyond * eof i_new_size will be the intended file size until i_size is * updated. If this write does not extend all the way to the valid * file size then restrict this update to the end of the write. */ + STATIC void xfs_setfilesize( xfs_ioend_t *ioend) { xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - xfs_fsize_t bsize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); @@ -206,16 +224,10 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return; - bsize = ioend->io_offset + ioend->io_size; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; xfs_mark_inode_dirty_sync(ip); } @@ -404,10 +416,16 @@ xfs_submit_ioend_bio( struct bio *bio) { atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + submit_bio(WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 629370974e57..eff61e2732af 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -176,14 +176,7 @@ xfs_file_fsync( struct dentry *dentry, int datasync) { - struct inode *inode = dentry->d_inode; - struct xfs_inode *ip = XFS_I(inode); - int error; - - /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(inode->i_mapping); - if (error) - return error; + struct xfs_inode *ip = XFS_I(dentry->d_inode); xfs_iflags_clear(ip, XFS_ITRUNCATED); return -xfs_fsync(ip); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index da0159d99f82..cd42ef78f6b5 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,19 +57,22 @@ #include <linux/fiemap.h> /* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. */ void -xfs_synchronize_atime( +xfs_synchronize_times( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & I_CLEAR)) { - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; } /* @@ -106,32 +109,20 @@ xfs_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. + * Update complete - now make sure everyone knows that the inode + * is dirty. */ - if (sync_it) { - SYNCHRONIZE(); - ip->i_update_core = 1; + if (sync_it) xfs_mark_inode_dirty_sync(ip); - } } /* @@ -506,10 +497,8 @@ xfs_vn_getattr( stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; - stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; - stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 49e4a6aea73c..072050f8d346 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -667,7 +667,7 @@ start: xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bdd41c8c342f..18a4b8e11df2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -977,6 +977,28 @@ xfs_fs_inode_init_once( } /* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. Care must be taken + * here - the transaction code calls mark_inode_dirty_sync() to mark the + * VFS inode dirty in a transaction and clears the i_update_core field; + * it must clear the field after calling mark_inode_dirty_sync() to + * correctly indicate that the dirty state has been propagated into the + * inode log item. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + +/* * Attempt to flush the inode, this will actually fail * if the inode is pinned, but we dirty the inode again * at the point when it is unpinned after a log write, @@ -1126,7 +1148,7 @@ xfs_fs_put_super( } STATIC int -xfs_fs_sync_super( +xfs_fs_sync_fs( struct super_block *sb, int wait) { @@ -1134,23 +1156,23 @@ xfs_fs_sync_super( int error; /* - * Treat a sync operation like a freeze. This is to work - * around a race in sync_inodes() which works in two phases - * - an asynchronous flush, which can write out an inode - * without waiting for file size updates to complete, and a - * synchronous flush, which wont do anything because the - * async flush removed the inode's dirty flag. Also - * sync_inodes() will not see any files that just have - * outstanding transactions to be flushed because we don't - * dirty the Linux inode until after the transaction I/O - * completes. + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - error = xfs_quiesce_data(mp); - else - error = xfs_sync_fsdata(mp, 0); + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; - if (unlikely(laptop_mode)) { + if (laptop_mode) { int prev_sync_seq = mp->m_sync_seq; /* @@ -1169,7 +1191,7 @@ xfs_fs_sync_super( mp->m_sync_seq != prev_sync_seq); } - return -error; + return 0; } STATIC int @@ -1539,10 +1561,11 @@ xfs_fs_get_sb( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_super, + .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 320be6aea492..961df0a22c78 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -309,11 +309,15 @@ xfs_sync_attr( STATIC int xfs_commit_dummy_trans( struct xfs_mount *mp, - uint log_flags) + uint flags) { struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; + int log_flags = XFS_LOG_FORCE; + + if (flags & SYNC_WAIT) + log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -331,13 +335,12 @@ xfs_commit_dummy_trans( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* XXX(hch): ignoring the error here.. */ error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* the log force ensures this transaction is pushed to disk */ xfs_log_force(mp, 0, log_flags); - return 0; + return error; } int @@ -385,7 +388,20 @@ xfs_sync_fsdata( else XFS_BUF_ASYNC(bp); - return xfs_bwrite(mp, bp); + error = xfs_bwrite(mp, bp); + if (error) + return error; + + /* + * If this is a data integrity sync make sure all pending buffers + * are flushed out for the log coverage check below. + */ + if (flags & SYNC_WAIT) + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, flags); + return error; out_brelse: xfs_buf_relse(bp); @@ -419,14 +435,16 @@ xfs_quiesce_data( /* push non-blocking */ xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_filestream_flush(mp); - /* push and block */ + /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* drop inode references pinned by filestreams */ + xfs_filestream_flush(mp); + /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, 0); + error = xfs_sync_fsdata(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -570,8 +588,6 @@ xfs_sync_worker( /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7465f9ee125f..ab89a7e94a0f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -206,10 +206,10 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index fa913e459442..41ad537c49e9 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents( */ ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); /* * If we don't have as many as we want, and we haven't @@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents( */ ptr += length; curoff += length; - bufsize -= length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1dc7ef5a1d8..b92a4fa2a0a1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3068,9 +3068,9 @@ xfs_iflush_int( SYNCHRONIZE(); /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); + xfs_synchronize_times(ip); if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0b38b9a869ec..41555de1d1db 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_atime(xfs_inode_t *); +void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 47d5b663c37e..9794b876d6ff 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -232,6 +232,15 @@ xfs_inode_item_format( nvecs = 1; /* + * Make sure the linux inode is dirty. We do this before + * clearing i_update_core as the VFS will call back into + * XFS here and set i_update_core, so we need to dirty the + * inode first so that the ordering of i_update_core and + * unlogged modifications still works as described below. + */ + xfs_mark_inode_dirty_sync(ip); + + /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging * and we're about to log them with the rest of the core. @@ -263,14 +272,9 @@ xfs_inode_item_format( } /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); - - /* - * make sure the linux inode is dirty - */ - xfs_mark_inode_dirty_sync(ip); + xfs_synchronize_times(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b68f9107e26c..62efab2f3839 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,6 +59,7 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + struct inode *inode; int error; error = xfs_iget(mp, NULL, ino, @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; + inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -83,16 +85,19 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + /* - * We are reading the atime from the Linux inode because the - * dinode might not be uptodate. + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. */ - buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; - buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a434f287962d..b572f7e840e0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2476,12 +2476,6 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - - /* * If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode and the * linux inode and clean up the xfs inode later. This avoids flushing |