diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/block_dev.c | 10 | ||||
-rw-r--r-- | fs/btrfs/async-thread.c | 4 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 121 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 29 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 15 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 516 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 1003 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.h | 8 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 2 | ||||
-rw-r--r-- | fs/btrfs/print-tree.c | 6 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 3 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 40 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 2 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 46 | ||||
-rw-r--r-- | fs/cifs/connect.c | 8 | ||||
-rw-r--r-- | fs/cifs/inode.c | 9 | ||||
-rw-r--r-- | fs/ecryptfs/keystore.c | 13 | ||||
-rw-r--r-- | fs/ext3/dir.c | 3 | ||||
-rw-r--r-- | fs/ext3/inode.c | 32 | ||||
-rw-r--r-- | fs/jbd/journal.c | 26 | ||||
-rw-r--r-- | fs/jbd/transaction.c | 68 | ||||
-rw-r--r-- | fs/jfs/acl.c | 4 | ||||
-rw-r--r-- | fs/notify/Kconfig | 12 | ||||
-rw-r--r-- | fs/notify/dnotify/Kconfig | 2 | ||||
-rw-r--r-- | fs/notify/fsnotify.c | 4 | ||||
-rw-r--r-- | fs/notify/inotify/Kconfig | 2 | ||||
-rw-r--r-- | fs/notify/inotify/inotify_user.c | 109 | ||||
-rw-r--r-- | fs/notify/notification.c | 19 | ||||
-rw-r--r-- | fs/ramfs/file-nommu.c | 1 | ||||
-rw-r--r-- | fs/sysfs/dir.c | 2 |
30 files changed, 1536 insertions, 583 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 3a6d4fb2a329..94dfda24c06e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev) EXPORT_SYMBOL(bdget); +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_inode->i_count); + return bdev; +} + long nr_blockdev_pages(void) { struct block_device *bdev; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 6e4f6c50a120..019e8af449ab 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work) * list */ if (worker->idle) { - spin_lock_irqsave(&worker->workers->lock, flags); + spin_lock(&worker->workers->lock); worker->idle = 0; list_move_tail(&worker->worker_list, &worker->workers->worker_list); - spin_unlock_irqrestore(&worker->workers->lock, flags); + spin_unlock(&worker->workers->lock); } if (!worker->working) { wake = 1; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 60a45f3a4e91..3fdcc0512d3a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) btrfs_disk_key_to_cpu(&k1, disk); - if (k1.objectid > k2->objectid) - return 1; - if (k1.objectid < k2->objectid) - return -1; - if (k1.type > k2->type) - return 1; - if (k1.type < k2->type) - return -1; - if (k1.offset > k2->offset) - return 1; - if (k1.offset < k2->offset) - return -1; - return 0; + return btrfs_comp_cpu_keys(&k1, k2); } /* @@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) > 2) - return 0; - if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root struct extent_buffer *b; int slot; int ret; + int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -1737,8 +1723,6 @@ again: p->locks[level] = 1; if (cow) { - int wret; - /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -1749,12 +1733,12 @@ again: btrfs_set_path_blocking(p); - wret = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); - if (wret) { + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { free_extent_buffer(b); - ret = wret; + ret = err; goto done; } } @@ -1793,41 +1777,45 @@ cow_done: ret = bin_search(b, key, level, &slot); if (level != 0) { - if (ret && slot > 0) + int dec = 0; + if (ret && slot > 0) { + dec = 1; slot -= 1; + } p->slots[level] = slot; - ret = setup_nodes_for_search(trans, root, p, b, level, + err = setup_nodes_for_search(trans, root, p, b, level, ins_len); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - else if (ret) + if (err) { + ret = err; goto done; + } b = p->nodes[level]; slot = p->slots[level]; unlock_up(p, level, lowest_unlock); - /* this is only true while dropping a snapshot */ if (level == lowest_level) { - ret = 0; + if (dec) + p->slots[level]++; goto done; } - ret = read_block_for_search(trans, root, p, + err = read_block_for_search(trans, root, p, &b, level, slot, key); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - - if (ret == -EIO) + if (err) { + ret = err; goto done; + } if (!p->skip_locking) { - int lret; - btrfs_clear_path_blocking(p, NULL); - lret = btrfs_try_spin_lock(b); + err = btrfs_try_spin_lock(b); - if (!lret) { + if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); btrfs_clear_path_blocking(p, b); @@ -1837,16 +1825,14 @@ cow_done: p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret; - btrfs_set_path_blocking(p); - sret = split_leaf(trans, root, key, - p, ins_len, ret == 0); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); btrfs_clear_path_blocking(p, NULL); - BUG_ON(sret > 0); - if (sret) { - ret = sret; + BUG_ON(err > 0); + if (err) { + ret = err; goto done; } } @@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -4042,10 +4028,9 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level, + struct btrfs_key *key, int level, int cache_only, u64 min_trans) { - int level = lowest_level; int slot; struct extent_buffer *c; @@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, c = path->nodes[level]; next: if (slot >= btrfs_header_nritems(c)) { - level++; - if (level == BTRFS_MAX_LEVEL) + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) return 1; - continue; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(root, path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; } + if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { @@ -4146,7 +4160,8 @@ again: * advance the path if there are now more items available. */ if (nritems > 0 && path->slots[0] < nritems - 1) { - path->slots[0]++; + if (ret == 0) + path->slots[0]++; ret = 0; goto done; } @@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root, path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.type == type) - return 0; if (found_key.objectid < min_objectid) break; + if (found_key.type == type) + return 0; if (found_key.objectid == min_objectid && found_key.type < type) break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98a873838717..215ef8cae823 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -481,7 +481,7 @@ struct btrfs_shared_data_ref { struct btrfs_extent_inline_ref { u8 type; - u64 offset; + __le64 offset; } __attribute__ ((__packed__)); /* old style backrefs item */ @@ -689,6 +689,7 @@ struct btrfs_space_info { struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; + atomic_t caching_threads; }; /* @@ -707,6 +708,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* if this cluster simply points at a bitmap in the block group */ + bool points_to_bitmap; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -716,24 +720,37 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; spinlock_t lock; - struct mutex cache_mutex; u64 pinned; u64 reserved; u64 flags; - int cached; + u64 sectorsize; + int extents_thresh; + int free_extents; + int total_bitmaps; int ro; int dirty; + /* cache tracking stuff */ + wait_queue_head_t caching_q; + int cached; + struct btrfs_space_info *space_info; /* free space cache stuff */ spinlock_t tree_lock; - struct rb_root free_space_bytes; struct rb_root free_space_offset; + u64 free_space; /* block group cache stuff */ struct rb_node cache_node; @@ -942,6 +959,9 @@ struct btrfs_root { /* the node lock is held while changing the node pointer */ spinlock_t node_lock; + /* taken when updating the commit root */ + struct rw_semaphore commit_root_sem; + struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; @@ -1988,6 +2008,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d28d29c95f7c..7dcaa8138864 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -909,6 +909,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->inode_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + init_rwsem(&root->commit_root_sem); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); @@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); @@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + fs_info->closing = 2; + smp_mb(); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); @@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); + btrfs_free_pinned_extents(root->fs_info); del_fs_roots(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5aca3997d42..fadf69a2764b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -21,6 +21,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> +#include <linux/kthread.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } /* + * We always set EXTENT_LOCKED for the super mirror extents so we don't + * overwrite them, so those bits need to be unset. Also, if we are unmounting + * with pinned extents still sitting there because we had a block group caching, + * we need to clear those now, since we are done. + */ +void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +{ + u64 start, end, last = 0; + int ret; + + while (1) { + ret = find_first_extent_bit(&info->pinned_extents, last, + &start, &end, + EXTENT_LOCKED|EXTENT_DIRTY); + if (ret) + break; + + clear_extent_bits(&info->pinned_extents, start, end, + EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); + last = end+1; + } +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + try_lock_extent(&fs_info->pinned_extents, + logical[nr], + logical[nr] + stripe_len - 1, GFP_NOFS); + } + kfree(logical); + } + + return 0; +} + +/* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static int add_new_free_space(struct btrfs_block_group_cache *block_group, +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *info, u64 start, u64 end) { - u64 extent_start, extent_end, size; + u64 extent_start, extent_end, size, total_added = 0; int ret; while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY); + EXTENT_DIRTY|EXTENT_LOCKED); if (ret) break; @@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); @@ -178,84 +237,79 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, if (start < end) { size = end - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - return 0; + return total_added; } -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(&root->fs_info->mapping_tree, - cache->key.objectid, bytenr, 0, - &logical, &nr, &stripe_len); - BUG_ON(ret); - while (nr--) { - btrfs_remove_free_space(cache, logical[nr], - stripe_len); - } - kfree(logical); - } - return 0; -} - -static int cache_block_group(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group) +static int caching_kthread(void *data) { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 last = 0; struct btrfs_path *path; int ret = 0; struct btrfs_key key; struct extent_buffer *leaf; int slot; - u64 last; - - if (!block_group) - return 0; + u64 total_found = 0; - root = root->fs_info->extent_root; - - if (block_group->cached) - return 0; + BUG_ON(!fs_info); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + atomic_inc(&block_group->space_info->caching_threads); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_root->commit_root_sem); + /* - * we get into deadlocks with paths held by callers of this function. - * since the alloc_mutex is protecting things right now, just - * skip the locking here + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only */ path->skip_locking = 1; - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + path->search_commit_root = 1; + path->reada = 2; + key.objectid = last; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto err; while (1) { + smp_mb(); + if (block_group->fs_info->closing > 1) { + last = (u64)-1; + break; + } + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->extent_root, path); if (ret < 0) goto err; - if (ret == 0) - continue; - else + else if (ret) break; + + if (need_resched()) { + btrfs_release_path(fs_info->extent_root, path); + up_read(&fs_info->extent_root->commit_root_sem); + cond_resched(); + goto again; + } + + continue; } btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid < block_group->key.objectid) @@ -266,24 +320,59 @@ static int cache_block_group(struct btrfs_root *root, break; if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { - add_new_free_space(block_group, root->fs_info, last, - key.objectid); - + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); last = key.objectid + key.offset; } + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&block_group->caching_q); + } next: path->slots[0]++; } + ret = 0; - add_new_free_space(block_group, root->fs_info, last, - block_group->key.objectid + - block_group->key.offset); + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + spin_lock(&block_group->lock); + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); - block_group->cached = 1; - remove_sb_from_cache(root, block_group); - ret = 0; err: btrfs_free_path(path); + up_read(&fs_info->extent_root->commit_root_sem); + atomic_dec(&block_group->space_info->caching_threads); + wake_up(&block_group->caching_q); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache) +{ + struct task_struct *tsk; + int ret = 0; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return ret; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + return ret; } @@ -2387,13 +2476,29 @@ fail: } +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + atomic_inc(&cache->count); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_group_cache *cache, *entry; - struct rb_node *n; + struct btrfs_block_group_cache *cache; int err = 0; - int werr = 0; struct btrfs_path *path; u64 last = 0; @@ -2402,39 +2507,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; while (1) { - cache = NULL; - spin_lock(&root->fs_info->block_group_cache_lock); - for (n = rb_first(&root->fs_info->block_group_cache_tree); - n; n = rb_next(n)) { - entry = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - if (entry->dirty) { - cache = entry; - break; - } + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); } - spin_unlock(&root->fs_info->block_group_cache_lock); - if (!cache) - break; + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } cache->dirty = 0; - last += cache->key.offset; + last = cache->key.objectid + cache->key.offset; - err = write_one_cache_group(trans, root, - path, cache); - /* - * if we fail to write the cache group, we want - * to keep it marked dirty in hopes that a later - * write will work - */ - if (err) { - werr = err; - continue; - } + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); } + btrfs_free_path(path); - return werr; + return 0; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2484,6 +2585,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->force_alloc = 0; *space_info = found; list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); return 0; } @@ -2947,13 +3049,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - if (pin) { + if (pin) set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); - } else { - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - } while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2969,14 +3067,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); fs_info->total_pinned += len; } else { + int unpin = 0; + + /* + * in order to not race with the block group caching, we + * only want to unpin the extent if we are cached. If + * we aren't cached, we want to start async caching this + * block group so we can free the extent the next time + * around. + */ spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; + unpin = (cache->cached == BTRFS_CACHE_FINISHED); + if (likely(unpin)) { + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + fs_info->total_pinned -= len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - fs_info->total_pinned -= len; - if (cache->cached) + + if (likely(unpin)) + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + len -1, + GFP_NOFS); + else + cache_block_group(cache); + + if (unpin) btrfs_add_free_space(cache, bytenr, len); } btrfs_put_block_group(cache); @@ -3030,6 +3148,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) &start, &end, EXTENT_DIRTY); if (ret) break; + set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } @@ -3058,6 +3177,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + return ret; } @@ -3436,6 +3556,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) } /* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); + + if (block_group_cache_done(cache)) { + finish_wait(&cache->caching_q, &wait); + return 0; + } + schedule(); + finish_wait(&cache->caching_q, &wait); + + wait_event(cache->caching_q, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); + return 0; +} + +enum btrfs_loop_type { + LOOP_CACHED_ONLY = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == block start @@ -3460,6 +3619,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + bool found_uncached_bg = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3491,15 +3651,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (!last_ptr) { + if (!last_ptr) empty_cluster = 0; - loop = 1; - } if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (block_group && block_group_bits(block_group, data)) { + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + */ + if (block_group && block_group_bits(block_group, data) && + block_group_cache_done(block_group)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -3522,21 +3685,35 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { u64 offset; + int cached; atomic_inc(&block_group->count); search_start = block_group->key.objectid; have_block_group: - if (unlikely(!block_group->cached)) { - mutex_lock(&block_group->cache_mutex); - ret = cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); - if (ret) { - btrfs_put_block_group(block_group); - break; + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + /* + * we want to start caching kthreads, but not too many + * right off the bat so we don't overwhelm the system, + * so only start them if there are less than 2 and we're + * in the initial allocation phase. + */ + if (loop > LOOP_CACHING_NOWAIT || + atomic_read(&space_info->caching_threads) < 2) { + ret = cache_block_group(block_group); + BUG_ON(ret); } } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { + found_uncached_bg = true; + + /* if we only want cached bgs, loop */ + if (loop == LOOP_CACHED_ONLY) + goto loop; + } + if (unlikely(block_group->ro)) goto loop; @@ -3615,14 +3792,21 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } + } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + spin_unlock(&last_ptr->refill_lock); + + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; } + /* * at this point we either didn't find a cluster * or we weren't able to allocate a block from our * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < 2) { + if (loop < LOOP_NO_EMPTY_SIZE) { btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); @@ -3633,11 +3817,17 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset) + if (!offset && (cached || (!cached && + loop == LOOP_CACHING_NOWAIT))) { goto loop; + } else if (!offset && (!cached && + loop > LOOP_CACHING_NOWAIT)) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + goto have_block_group; + } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ if (search_start + num_bytes >= search_end) { btrfs_add_free_space(block_group, offset, num_bytes); @@ -3683,13 +3873,26 @@ loop: } up_read(&space_info->groups_sem); - /* loop == 0, try to find a clustered alloc in every block group - * loop == 1, try again after forcing a chunk allocation - * loop == 2, set empty_size and empty_cluster to 0 and try again + /* LOOP_CACHED_ONLY, only search fully cached block groups + * LOOP_CACHING_NOWAIT, search partially cached block groups, but + * dont wait foR them to finish caching + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again */ - if (!ins->objectid && loop < 3 && - (empty_size || empty_cluster || allowed_chunk_alloc)) { - if (loop >= 2) { + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && + (found_uncached_bg || empty_size || empty_cluster || + allowed_chunk_alloc)) { + if (found_uncached_bg) { + found_uncached_bg = false; + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; + } + } + + if (loop == LOOP_ALLOC_CHUNK) { empty_size = 0; empty_cluster = 0; } @@ -3702,7 +3905,7 @@ loop: space_info->force_alloc = 1; } - if (loop < 3) { + if (loop < LOOP_NO_EMPTY_SIZE) { loop++; goto search; } @@ -3798,7 +4001,7 @@ again: num_bytes, data, 1); goto again; } - if (ret) { + if (ret == -ENOSPC) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -3806,7 +4009,6 @@ again: "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); dump_space_info(sinfo, num_bytes); - BUG(); } return ret; @@ -3844,7 +4046,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); - update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (!ret) + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; } @@ -4006,9 +4210,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - mutex_lock(&block_group->cache_mutex); - cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); + cache_block_group(block_group); + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); @@ -4039,7 +4243,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, empty_size, hint_byte, search_end, ins, 0); - BUG_ON(ret); + if (ret) + return ret; if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6955,11 +7160,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) &info->block_group_cache_tree); spin_unlock(&info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + WARN_ON(atomic_read(&block_group->count) != 1); kfree(block_group); @@ -7025,9 +7235,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + cache->fs_info = info; + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); + read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -7036,6 +7256,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + remove_sb_from_cache(root, cache); + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->cached = BTRFS_CACHE_FINISHED; + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + } ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), @@ -7079,10 +7319,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.objectid = chunk_offset; cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + + /* + * we only want to have 32k of ram per block group for keeping track + * of free space, and if we pass 1/2 of that we want to start + * converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7091,6 +7340,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->cached = BTRFS_CACHE_FINISHED; + remove_sb_from_cache(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); @@ -7149,7 +7404,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); spin_unlock(&root->fs_info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); /* * we must use list_del_init so people can check to see if they @@ -7158,11 +7413,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; spin_unlock(&block_group->space_info->lock); - block_group->space_info->full = 0; + + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4538e48581a5..af99b78b288e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,45 +16,46 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/math64.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; -}; +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node) +static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, + u64 offset) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_free_space *info; + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div64_u64(offset, sectorsize)); +} - while (*p) { - parent = *p; - info = rb_entry(parent, struct btrfs_free_space, offset_index); +static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +{ + return (unsigned long)(div64_u64(bytes, sectorsize)); +} - if (offset < info->offset) - p = &(*p)->rb_left; - else if (offset > info->offset) - p = &(*p)->rb_right; - else - return -EEXIST; - } +static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; - rb_link_node(node, parent, p); - rb_insert_color(node, root); + bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; + bitmap_start = offset - block_group->key.objectid; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += block_group->key.objectid; - return 0; + return bitmap_start; } -static int tree_insert_bytes(struct rb_root *root, u64 bytes, - struct rb_node *node) +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int bitmap) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, while (*p) { parent = *p; - info = rb_entry(parent, struct btrfs_free_space, bytes_index); + info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (bytes < info->bytes) + if (offset < info->offset) { p = &(*p)->rb_left; - else + } else if (offset > info->offset) { p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + WARN_ON(info->bitmap); + p = &(*p)->rb_right; + } else { + WARN_ON(!info->bitmap); + p = &(*p)->rb_left; + } + } } rb_link_node(node, parent, p); @@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, /* * searches the tree for the given offset. * - * fuzzy == 1: this is used for allocations where we are given a hint of where - * to look for free space. Because the hint may not be completely on an offset - * mark, or the hint may no longer point to free space we need to fudge our - * results a bit. So we look for free space starting at or after offset with at - * least bytes size. We prefer to find as close to the given offset as we can. - * Also if the offset is within a free space range, then we will return the free - * space that contains the given offset, which means we can return a free space - * chunk with an offset before the provided offset. - * - * fuzzy == 0: this is just a normal tree search. Give us the free space that - * starts at the given offset which is at least bytes size, and if its not there - * return NULL. + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. */ -static struct btrfs_free_space *tree_search_offset(struct rb_root *root, - u64 offset, u64 bytes, - int fuzzy) +static struct btrfs_free_space * +tree_search_offset(struct btrfs_block_group_cache *block_group, + u64 offset, int bitmap_only, int fuzzy) { - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; + struct rb_node *n = block_group->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } - while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; - if (offset < entry->offset) { - if (fuzzy && - (!ret || entry->offset < ret->offset) && - (bytes <= entry->bytes)) - ret = entry; + if (offset < entry->offset) n = n->rb_left; - } else if (offset > entry->offset) { - if (fuzzy && - (entry->offset + entry->bytes - 1) >= offset && - bytes <= entry->bytes) { - ret = entry; - break; - } + else if (offset > entry->offset) n = n->rb_right; - } else { - if (bytes > entry->bytes) { - n = n->rb_right; - continue; - } - ret = entry; + else break; - } } - return ret; -} - -/* - * return a chunk at least bytes size, as close to offset that we can get. - */ -static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, - u64 offset, u64 bytes) -{ - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; - while (n) { - entry = rb_entry(n, struct btrfs_free_space, bytes_index); + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; - if (bytes < entry->bytes) { + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { /* - * We prefer to get a hole size as close to the size we - * are asking for so we don't take small slivers out of - * huge holes, but we also want to get as close to the - * offset as possible so we don't have a whole lot of - * fragmentation. + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry */ - if (offset <= entry->offset) { - if (!ret) - ret = entry; - else if (entry->bytes < ret->bytes) - ret = entry; - else if (entry->offset < ret->offset) - ret = entry; + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } } - n = n->rb_left; - } else if (bytes > entry->bytes) { - n = n->rb_right; + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); } else { - /* - * Ok we may have multiple chunks of the wanted size, - * so we don't want to take the first one we find, we - * want to take the one closest to our given offset, so - * keep searching just in case theres a better match. - */ - n = n->rb_right; - if (offset > entry->offset) - continue; - else if (!ret || entry->offset < ret->offset) - ret = entry; + if (fuzzy) + return entry; + else + return NULL; } } - return ret; + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; } static void unlink_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); - rb_erase(&info->bytes_index, &block_group->free_space_bytes); + block_group->free_extents--; + block_group->free_space -= info->bytes; } static int link_free_space(struct btrfs_block_group_cache *block_group, @@ -190,17 +246,314 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, { int ret = 0; - - BUG_ON(!info->bytes); + BUG_ON(!info->bitmap && !info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, - &info->offset_index); + &info->offset_index, (info->bitmap != NULL)); if (ret) return ret; - ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, - &info->bytes_index); - if (ret) - return ret; + block_group->free_space += info->bytes; + block_group->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +{ + u64 max_bytes, possible_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + + possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + + (sizeof(struct btrfs_free_space) * + block_group->extents_thresh); + + if (possible_bytes > max_bytes) { + int extent_bytes = max_bytes - + (block_group->total_bitmaps * PAGE_CACHE_SIZE); + + if (extent_bytes <= 0) { + block_group->extents_thresh = 0; + return; + } + + block_group->extents_thresh = extent_bytes / + (sizeof(struct btrfs_free_space)); + } +} + +static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + clear_bit(i, info->bitmap); + + info->bytes -= bytes; + block_group->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + set_bit(i, info->bitmap); + + info->bytes += bytes; + block_group->free_space += bytes; +} + +static int search_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, block_group->sectorsize); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * block_group->sectorsize) + + bitmap_info->offset; + *bytes = (u64)(found_bits) * block_group->sectorsize; + return 0; + } + + return -1; +} + +static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache + *block_group, u64 *offset, + u64 *bytes, int debug) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!block_group->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, *offset), + 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset) +{ + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = (int)div64_u64(block_group->key.offset + + bytes_per_bg - 1, bytes_per_bg); + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); + link_free_space(block_group, info); + block_group->total_bitmaps++; + + recalculate_thresholds(block_group); +} + +static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + +again: + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, + end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, + *offset), + 1, 0); + if (!bitmap_info) + return -EINVAL; + + if (!bitmap_info->bitmap) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + return 0; +} + +static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + int added = 0; + u64 bytes, offset, end; + int ret; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (block_group->free_extents < block_group->extents_thresh && + info->bytes > block_group->sectorsize * 4) + return 0; + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return 0; + + bytes = info->bytes; + offset = info->offset; + +again: + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize); + + if (offset >= bitmap_info->offset && offset + bytes > end) { + bitmap_set_bits(block_group, bitmap_info, offset, + end - offset); + bytes -= end - offset; + offset = end; + added = 0; + } else if (offset >= bitmap_info->offset && offset + bytes <= end) { + bitmap_set_bits(block_group, bitmap_info, offset, bytes); + bytes = 0; + } else { + BUG(); + } + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(block_group, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&block_group->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kzalloc(sizeof(struct btrfs_free_space), + GFP_NOFS); + if (!info) { + spin_lock(&block_group->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&block_group->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kfree(info); + } return ret; } @@ -208,8 +561,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { - struct btrfs_free_space *right_info; - struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info = NULL; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *info = NULL; int ret = 0; @@ -227,18 +580,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, * are adding, if there is remove that struct and add a new one to * cover the entire range */ - right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 0); - left_info = tree_search_offset(&block_group->free_space_offset, - offset-1, 0, 1); + right_info = tree_search_offset(block_group, offset + bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(block_group, offset - 1, 0, 0); - if (right_info) { + /* + * If there was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + if ((!left_info || left_info->bitmap) && + (!right_info || right_info->bitmap)) { + ret = insert_into_bitmap(block_group, info); + + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } + } + + if (right_info && !right_info->bitmap) { unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); } - if (left_info && left_info->offset + left_info->bytes == offset) { + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; @@ -248,11 +621,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, ret = link_free_space(block_group, info); if (ret) kfree(info); - +out: spin_unlock(&block_group->tree_lock); if (ret) { - printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); BUG_ON(ret == -EEXIST); } @@ -263,40 +636,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; int ret = 0; spin_lock(&block_group->tree_lock); - info = tree_search_offset(&block_group->free_space_offset, offset, 0, - 1); - if (info && info->offset == offset) { - if (info->bytes < bytes) { - printk(KERN_ERR "Found free space at %llu, size %llu," - "trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); +again: + info = tree_search_offset(block_group, offset, 0, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + BITS_PER_BITMAP * + block_group->sectorsize - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; - spin_unlock(&block_group->tree_lock); - goto out; + goto out_lock; } - unlink_free_space(block_group, info); - if (info->bytes == bytes) { - kfree(info); - spin_unlock(&block_group->tree_lock); - goto out; + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(block_group, info); + if (info->bitmap) { + kfree(info->bitmap); + block_group->total_bitmaps--; } + kfree(info); + goto out_lock; + } + if (!info->bitmap && info->offset == offset) { + unlink_free_space(block_group, info); info->offset += bytes; info->bytes -= bytes; + link_free_space(block_group, info); + goto out_lock; + } - ret = link_free_space(block_group, info); - spin_unlock(&block_group->tree_lock); - BUG_ON(ret); - } else if (info && info->offset < offset && - info->offset + info->bytes >= offset + bytes) { + if (!info->bitmap && info->offset <= offset && + info->offset + info->bytes >= offset + bytes) { u64 old_start = info->offset; /* * we're freeing space in the middle of the info, @@ -312,7 +710,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->offset = offset + bytes; info->bytes = old_end - info->offset; ret = link_free_space(block_group, info); - BUG_ON(ret); + WARN_ON(ret); + if (ret) + goto out_lock; } else { /* the hole we're creating ends at the end * of the info struct, just free the info @@ -320,32 +720,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, kfree(info); } spin_unlock(&block_group->tree_lock); - /* step two, insert a new info struct to cover anything - * before the hole + + /* step two, insert a new info struct to cover + * anything before the hole */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - BUG_ON(ret); - } else { - spin_unlock(&block_group->tree_lock); - if (!info) { - printk(KERN_ERR "couldn't find space %llu to free\n", - (unsigned long long)offset); - printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->key.offset); - btrfs_dump_free_space(block_group, bytes); - } else if (info) { - printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " - "but wanted offset=%llu bytes=%llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - } - WARN_ON(1); + WARN_ON(ret); + goto out; } + + ret = remove_from_bitmap(block_group, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&block_group->tree_lock); out: return ret; } @@ -361,10 +751,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", (unsigned long long)info->offset, - (unsigned long long)info->bytes); + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); } @@ -397,26 +790,35 @@ __btrfs_return_cluster_to_free_space( { struct btrfs_free_space *entry; struct rb_node *node; + bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; + bitmap = cluster->points_to_bitmap; + cluster->block_group = NULL; cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + cluster->points_to_bitmap = false; + + if (bitmap) + goto out; + node = rb_first(&cluster->root); - while(node) { + while (node) { entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - link_free_space(block_group, entry); + BUG_ON(entry->bitmap); + tree_insert_offset(&block_group->free_space_offset, + entry->offset, &entry->offset_index, 0); } - list_del_init(&cluster->block_group_list); - - btrfs_put_block_group(cluster->block_group); - cluster->block_group = NULL; cluster->root.rb_node = NULL; + out: spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); return 0; } @@ -425,20 +827,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) struct btrfs_free_space *info; struct rb_node *node; struct btrfs_free_cluster *cluster; - struct btrfs_free_cluster *safe; + struct list_head *head; spin_lock(&block_group->tree_lock); - - list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, - block_group_list) { + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&block_group->tree_lock); + cond_resched(); + spin_lock(&block_group->tree_lock); + } } - while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, bytes_index); + while ((node = rb_last(&block_group->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); unlink_free_space(block_group, info); + if (info->bitmap) + kfree(info->bitmap); kfree(info); if (need_resched()) { spin_unlock(&block_group->tree_lock); @@ -446,6 +856,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) spin_lock(&block_group->tree_lock); } } + spin_unlock(&block_group->tree_lock); } @@ -453,25 +864,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; u64 ret = 0; spin_lock(&block_group->tree_lock); - entry = tree_search_offset(&block_group->free_space_offset, offset, - bytes + empty_size, 1); + entry = find_free_space(block_group, &offset, &bytes_search, 0); if (!entry) - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes + empty_size); - if (entry) { + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(block_group, entry, offset, bytes); + if (!entry->bytes) { + unlink_free_space(block_group, entry); + kfree(entry->bitmap); + kfree(entry); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + } else { unlink_free_space(block_group, entry); - ret = entry->offset; entry->offset += bytes; entry->bytes -= bytes; - if (!entry->bytes) kfree(entry); else link_free_space(block_group, entry); } + +out: spin_unlock(&block_group->tree_lock); return ret; @@ -517,6 +938,47 @@ int btrfs_return_cluster_to_free_space( return ret; } +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space *entry; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + if (!cluster->points_to_bitmap) + goto out; + + if (cluster->block_group != block_group) + goto out; + + entry = tree_search_offset(block_group, search_start, 0, 0); + + if (!entry || !entry->bitmap) + goto out; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(block_group, entry, &search_start, + &search_bytes); + if (err) + goto out; + + ret = search_start; + bitmap_clear_bits(block_group, entry, ret, bytes); +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + /* * given a cluster, try to allocate 'bytes' from it, returns 0 * if it couldn't find anything suitably large, or a logical disk offset @@ -530,6 +992,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct rb_node *node; u64 ret = 0; + if (cluster->points_to_bitmap) + return btrfs_alloc_from_bitmap(block_group, cluster, bytes, + min_start); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -567,9 +1033,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, } out: spin_unlock(&cluster->lock); + return ret; } +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + total_bits = bytes_to_bits(bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -1; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + cluster->points_to_bitmap = true; + + return 0; +} + /* * here we try to find a cluster of blocks in a block group. The goal * is to find at least bytes free and up to empty_size + bytes free. @@ -587,12 +1117,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry = NULL; struct rb_node *node; struct btrfs_free_space *next; - struct btrfs_free_space *last; + struct btrfs_free_space *last = NULL; u64 min_bytes; u64 window_start; u64 window_free; u64 max_extent = 0; - int total_retries = 0; + bool found_bitmap = false; int ret; /* for metadata, allow allocates with more holes */ @@ -620,31 +1150,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } again: - min_bytes = min(min_bytes, bytes + empty_size); - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, min_bytes); + entry = tree_search_offset(block_group, offset, found_bitmap, 1); if (!entry) { ret = -ENOSPC; goto out; } + + /* + * If found_bitmap is true, we exhausted our search for extent entries, + * and we just want to search all of the bitmaps that we can find, and + * ignore any extent entries we find. + */ + while (entry->bitmap || found_bitmap || + (!entry->bitmap && entry->bytes < min_bytes)) { + struct rb_node *node = rb_next(&entry->offset_index); + + if (entry->bitmap && entry->bytes > bytes + empty_size) { + ret = btrfs_bitmap_cluster(block_group, entry, cluster, + offset, bytes + empty_size, + min_bytes); + if (!ret) + goto got_it; + } + + if (!node) { + ret = -ENOSPC; + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + /* + * We already searched all the extent entries from the passed in offset + * to the end and didn't find enough space for the cluster, and we also + * didn't find any bitmaps that met our criteria, just go ahead and exit + */ + if (found_bitmap) { + ret = -ENOSPC; + goto out; + } + + cluster->points_to_bitmap = false; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = entry->bytes; - while(1) { + while (1) { /* out window is just right, lets fill it */ if (window_free >= bytes + empty_size) break; node = rb_next(&last->offset_index); if (!node) { + if (found_bitmap) + goto again; ret = -ENOSPC; goto out; } next = rb_entry(node, struct btrfs_free_space, offset_index); /* + * we found a bitmap, so if this search doesn't result in a + * cluster, we know to go and search again for the bitmaps and + * start looking for space there + */ + if (next->bitmap) { + if (!found_bitmap) + offset = next->offset; + found_bitmap = true; + last = next; + continue; + } + + /* * we haven't filled the empty size and the window is * very large. reset and try again */ @@ -655,19 +1234,6 @@ again: window_free = entry->bytes; last = entry; max_extent = 0; - total_retries++; - if (total_retries % 64 == 0) { - if (min_bytes >= (bytes + empty_size)) { - ret = -ENOSPC; - goto out; - } - /* - * grow our allocation a bit, we're not having - * much luck - */ - min_bytes *= 2; - goto again; - } } else { last = next; window_free += next->bytes; @@ -685,11 +1251,19 @@ again: * The cluster includes an rbtree, but only uses the offset index * of each free space cache entry. */ - while(1) { + while (1) { node = rb_next(&entry->offset_index); - unlink_free_space(block_group, entry); + if (entry->bitmap && node) { + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } else if (entry->bitmap && !node) { + break; + } + + rb_erase(&entry->offset_index, &block_group->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index); + &entry->offset_index, 0); BUG_ON(ret); if (!node || entry == last) @@ -697,8 +1271,10 @@ again: entry = rb_entry(node, struct btrfs_free_space, offset_index); } - ret = 0; + cluster->max_size = max_extent; +got_it: + ret = 0; atomic_inc(&block_group->count); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; @@ -718,6 +1294,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root.rb_node = NULL; cluster->max_size = 0; + cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 266fb8764054..890a8e79011b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -19,6 +19,14 @@ #ifndef __BTRFS_FREE_SPACE_CACHE #define __BTRFS_FREE_SPACE_CACHE +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 791eab19e330..56fe83fa60c4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); path = btrfs_alloc_path(); - path->reada = -1; BUG_ON(!path); + path->reada = -1; /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6d6523da0a30..0d126be22b63 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) } printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", (unsigned long long)btrfs_header_bytenr(c), - btrfs_header_level(c), nr, + level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); @@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && - btrfs_header_level(c) != 1) + level != 1) BUG(); if (btrfs_header_level(next) != - btrfs_header_level(c) - 1) + level - 1) BUG(); btrfs_print_tree(root, next); free_extent_buffer(next); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 008397934778..e71264d1c2c9 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -670,6 +670,8 @@ again: err = ret; goto out; } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; eb = path2->nodes[level]; WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != @@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; if (ret < 0) { btrfs_free_path(path); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2dbf1c1f56ee..e51d2bc532f8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,6 +40,14 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) } } +static noinline void switch_commit_root(struct btrfs_root *root) +{ + down_write(&root->commit_root_sem); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + up_write(&root->commit_root_sem); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -444,9 +452,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); - while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) @@ -457,13 +462,11 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_key, &root->root_item); BUG_ON(ret); - btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + switch_commit_root(root); return 0; } @@ -495,9 +498,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); } return 0; } @@ -544,8 +544,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (root->commit_root != root->node) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + switch_commit_root(root); btrfs_set_root_node(&root->root_item, root->node); } @@ -943,9 +942,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (flush_on_commit || snap_pending) { - if (flush_on_commit) - btrfs_start_delalloc_inodes(root); + if (flush_on_commit) { + btrfs_start_delalloc_inodes(root); + ret = btrfs_wait_ordered_extents(root, 0); + BUG_ON(ret); + } else if (snap_pending) { ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } @@ -1009,15 +1010,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - free_extent_buffer(root->fs_info->tree_root->commit_root); - root->fs_info->tree_root->commit_root = - btrfs_root_node(root->fs_info->tree_root); + switch_commit_root(root->fs_info->tree_root); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - root->fs_info->chunk_root->commit_root = - btrfs_root_node(root->fs_info->chunk_root); + switch_commit_root(root->fs_info->chunk_root); update_super_roots(root); @@ -1057,6 +1054,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); put_transaction(cur_trans); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c13922206d1b..d91b0de7c502 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, return -ENOENT; inode = read_one_inode(root, key->objectid); - BUG_ON(!dir); + BUG_ON(!inode); ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9cd767..5dbefd11b4af 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -721,7 +721,8 @@ error: */ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 num_bytes, u64 *start) + u64 num_bytes, u64 *start, + u64 *max_avail) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) goto error; - ret = btrfs_previous_item(root, path, 0, key.type); - if (ret < 0) - goto error; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto error; + if (ret > 0) + start_found = 1; + } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); while (1) { @@ -803,6 +808,10 @@ no_more_items: if (last_byte < search_start) last_byte = search_start; hole_size = key.offset - last_byte; + + if (hole_size > *max_avail) + *max_avail = hole_size; + if (key.offset > last_byte && hole_size >= num_bytes) { *start = last_byte; @@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + device->disk_total_bytes = new_size; btrfs_clear_space_info_full(device->dev_root->fs_info); return btrfs_update_device(trans, device); @@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; if (ret) { ret = 0; - goto done; + break; } l = path->nodes[0]; @@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) - goto done; + break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); @@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size); again: + max_avail = 0; if (!map || map->num_stripes != num_stripes) { kfree(map); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -2219,7 +2230,8 @@ again: if (device->in_fs_metadata && avail >= min_free) { ret = find_free_dev_extent(trans, device, - min_free, &dev_offset); + min_free, &dev_offset, + &max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); @@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, } } - for (i = 0; i > nr; i++) { - struct btrfs_multi_bio *multi; - struct btrfs_bio_stripe *stripe; - int ret; - - length = 1; - ret = btrfs_map_block(map_tree, WRITE, buf[i], - &length, &multi, 0); - BUG_ON(ret); - - stripe = multi->stripes; - for (j = 0; j < multi->num_stripes; j++) { - if (stripe->physical >= physical && - physical < stripe->physical + length) - break; - } - BUG_ON(j >= multi->num_stripes); - kfree(multi); - } - *logical = buf; *naddrs = nr; *stripe_len = map->stripe_len; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9bb5c8750736..fc44d316d0bb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2452,10 +2452,10 @@ try_mount_again: tcon->local_lease = volume_info->local_lease; } if (pSesInfo) { - if (pSesInfo->capabilities & CAP_LARGE_FILES) { - sb->s_maxbytes = (u64) 1 << 63; - } else - sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ + if (pSesInfo->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; } /* BB FIXME fix time_gran to be larger for LANMAN sessions */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 18afe57b2461..82d83839655e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, * junction to the new submount (ie to setup the fake directory * which represents a DFS referral). */ -void +static void cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -void +static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, struct cifs_sb_info *cifs_sb, bool adjust_tz) { @@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { - /* BB EOPNOSUPP disable SERVER_INUM? */ cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); + /* disable serverino if call not supported */ + if (rc1 == -EINVAL) + cifs_sb->mnt_cifs_flags &= + ~CIFS_MOUNT_SERVER_INUM; } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index af737bb56cb7..259525c9abb8 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, } (*new_auth_tok)->session_key.encrypted_key_size = (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } if (unlikely(data[(*packet_size)++] != 0x04)) { printk(KERN_WARNING "Unknown version number [%d]\n", data[(*packet_size) - 1]); @@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, rc = -EINVAL; goto out; } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } if (data[(*packet_size)++] != 0x62) { printk(KERN_WARNING "Unrecognizable packet\n"); rc = -EINVAL; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882f..373fa90c796a 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5f51fed5c750..b49908a167ae 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -788,7 +788,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1193,15 +1186,16 @@ write_begin_failed: * i_size_read because we hold i_mutex. * * Add inode to orphan list in case we crash before truncate - * finishes. + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) @@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { @@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b5..f96f85092d1c 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -361,14 +367,6 @@ repeat: kunmap_atomic(mapped_data, KM_USER0); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -385,7 +383,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); @@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b1..c03ac11f74be 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -583,14 +564,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 91fa3ad6e8c2..a29c7c3e3fb8 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) acl = posix_acl_from_xattr(value, size); } kfree(value); - if (!IS_ERR(acl)) { + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); - posix_acl_release(acl); - } return acl; } diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 31dac7e3b0f1..dffbb0911d02 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,15 +1,5 @@ config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. + def_bool n source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 904ff8d5405a..f9c1ca139d8f 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,6 +1,6 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY + select FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd76818..037e878e03fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); + event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, cookie, + GFP_KERNEL); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 5356884289a1..3e56dbffe729 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,7 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + select FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff27a2965844..f30d9bbc2e1b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; /* * When inotify registers a new group it increments this and uses that @@ -365,6 +364,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; } +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark_entry *ientry) +{ + struct idr *idr; + + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + idr_remove(idr, ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + ientry->wd = -1; +} /* * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the * internal reference help on the mark because it is in the idr. @@ -373,13 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; + struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; - struct idr *idr; + + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) goto skip_send_ignore; @@ -388,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); + fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); /* did the private data get added? */ if (list_empty(&fsn_event_priv->event_list)) @@ -396,14 +412,16 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, skip_send_ignore: + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + inotify_remove_from_idr(group, ientry); /* removed from idr, drop that reference */ fsnotify_put_mark(entry); + + atomic_dec(&group->inotify_data.user->inotify_watches); } /* ding dong the mark is dead */ @@ -418,6 +436,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { struct fsnotify_mark_entry *entry = NULL; struct inotify_inode_mark_entry *ientry; + struct inotify_inode_mark_entry *tmp_ientry; int ret = 0; int add = (arg & IN_MASK_ADD); __u32 mask; @@ -428,54 +447,66 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod if (unlikely(!mask)) return -EINVAL; - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) return -ENOMEM; /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->wd = -1; find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) goto out_err; - - entry = &ientry->fsn_entry; retry: ret = -ENOMEM; if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) goto out_err; spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. remember to drop this reference - * when entry is removed from idr */ - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { if (ret == -EAGAIN) goto retry; goto out_err; } + + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + inotify_remove_from_idr(group, tmp_ientry); + if (ret == -EEXIST) + goto find_entry; + goto out_err; + } + + /* tmp_ientry has been added to the inode, so we are all set up. + * now we just need to make sure tmp_ientry doesn't get freed and + * we need to set up entry and ientry so the generic code can + * do its thing. */ + ientry = tmp_ientry; + entry = &ientry->fsn_entry; + tmp_ientry = NULL; + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* update the idr hint */ + group->inotify_data.last_wd = ientry->wd; + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(entry); } + ret = ientry->wd; + spin_lock(&entry->lock); old_mask = entry->mask; @@ -506,14 +537,19 @@ retry: fsnotify_recalc_group_mask(group); } - return ientry->wd; + /* this either matches fsnotify_find_mark_entry, or init_mark_entry + * depending on which path we took... */ + fsnotify_put_mark(entry); out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); + /* could be an error, could be that we found an existing mark */ + if (tmp_ientry) { + /* on the idr but didn't make it on the inode */ + if (tmp_ientry->wd != -1) + inotify_remove_from_idr(group, tmp_ientry); + kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); } + return ret; } @@ -721,9 +757,6 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 959b73e756fd..521368574e97 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -136,18 +136,24 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new { if ((old->mask == new->mask) && (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { switch (old->data_type) { case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (old->name_len && + !strcmp(old->file_name, new->file_name)) return true; break; case (FSNOTIFY_EVENT_PATH): if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): - return true; + return false; }; } return false; @@ -339,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event) * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) + int data_type, const char *name, u32 cookie, + gfp_t gfp) { struct fsnotify_event *event; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fsnotify_event_cachep, gfp); if (!event) return NULL; initialize_event(event); if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); + event->file_name = kstrdup(name, gfp); if (!event->file_name) { kmem_cache_free(fsnotify_event_cachep, event); return NULL; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ebb2c417912c..11f0c06316de 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -20,6 +20,7 @@ #include <linux/ramfs.h> #include <linux/pagevec.h> #include <linux/mman.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index d88d0fac9fa5..14f2d71ea3ce 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -939,8 +939,10 @@ again: /* Remove from old parent's list and insert into new parent's list. */ sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); + drop_nlink(old_parent->d_inode); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; + inc_nlink(new_parent->d_inode); sysfs_link_sibling(sd); out_unlock: |