From 5db0276014b80484689eb6c1bf7b94af1c7d5b1a Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 1 Nov 2011 17:04:16 +0100
Subject: Btrfs: add optional integrity check code

The two files added in this patch contain all the code that is
required to implement the integrity checks.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/check-integrity.c | 3068 ++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/check-integrity.h |   36 +
 2 files changed, 3104 insertions(+)
 create mode 100644 fs/btrfs/check-integrity.c
 create mode 100644 fs/btrfs/check-integrity.h

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..ad0b3ba735b7
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3068 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ *    currently referenced by the super block (either directly
+ *    or indirectly).
+ * 2. When a super block is written, it is verified that all
+ *    referenced (directly or indirectly) blocks fulfill the
+ *    following requirements:
+ *    2a. All referenced blocks have either been present when
+ *        the file system was mounted, (i.e., they have been
+ *        referenced by the super block) or they have been
+ *        written since then and the write completion callback
+ *        was called and a FLUSH request to the device where
+ *        these blocks are located was received and completed.
+ *    2b. All referenced blocks need to have a generation
+ *        number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
+							 * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE			0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION		0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE			0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE			0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH			0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH			0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE				0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE				0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE				0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES			0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE			0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES				0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS		0x00001000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+	u32 magic_num;		/* only used for debug purposes */
+	unsigned int is_metadata:1;	/* if it is meta-data, not data-data */
+	unsigned int is_superblock:1;	/* if it is one of the superblocks */
+	unsigned int is_iodone:1;	/* if is done by lower subsystem */
+	unsigned int iodone_w_error:1;	/* error was indicated to endio */
+	unsigned int never_written:1;	/* block was added because it was
+					 * referenced, not because it was
+					 * written */
+	unsigned int mirror_num:2;	/* large enough to hold
+					 * BTRFS_SUPER_MIRROR_MAX */
+	struct btrfsic_dev_state *dev_state;
+	u64 dev_bytenr;		/* key, physical byte num on disk */
+	u64 logical_bytenr;	/* logical byte num on disk */
+	u64 generation;
+	struct btrfs_disk_key disk_key;	/* extra info to print in case of
+					 * issues, will not always be correct */
+	struct list_head collision_resolving_node;	/* list node */
+	struct list_head all_blocks_node;	/* list node */
+
+	/* the following two lists contain block_link items */
+	struct list_head ref_to_list;	/* list */
+	struct list_head ref_from_list;	/* list */
+	struct btrfsic_block *next_in_same_bio;
+	void *orig_bio_bh_private;
+	union {
+		bio_end_io_t *bio;
+		bh_end_io_t *bh;
+	} orig_bio_bh_end_io;
+	int submit_bio_bh_rw;
+	u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+	u32 magic_num;		/* only used for debug purposes */
+	u32 ref_cnt;
+	struct list_head node_ref_to;	/* list node */
+	struct list_head node_ref_from;	/* list node */
+	struct list_head collision_resolving_node;	/* list node */
+	struct btrfsic_block *block_ref_to;
+	struct btrfsic_block *block_ref_from;
+	u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+	u32 magic_num;		/* only used for debug purposes */
+	struct block_device *bdev;
+	struct btrfsic_state *state;
+	struct list_head collision_resolving_node;	/* list node */
+	struct btrfsic_block dummy_block_for_bio_bh_flush;
+	u64 last_flush_gen;
+	char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+	struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+	struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+	struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+	u64 start;		/* virtual bytenr */
+	u64 dev_bytenr;		/* physical bytenr on device */
+	u32 len;
+	struct btrfsic_dev_state *dev;
+	char *data;
+	struct buffer_head *bh;	/* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+	u32 magic;
+	u32 nr;
+	int error;
+	int i;
+	int limit_nesting;
+	int num_copies;
+	int mirror_num;
+	struct btrfsic_block *block;
+	struct btrfsic_block_data_ctx *block_ctx;
+	struct btrfsic_block *next_block;
+	struct btrfsic_block_data_ctx next_block_ctx;
+	struct btrfs_header *hdr;
+	struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+	u32 print_mask;
+	int include_extent_data;
+	int csum_size;
+	struct list_head all_blocks_list;
+	struct btrfsic_block_hashtable block_hashtable;
+	struct btrfsic_block_link_hashtable block_link_hashtable;
+	struct btrfs_root *root;
+	u64 max_superblock_generation;
+	struct btrfsic_block *latest_superblock;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+					struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+		struct block_device *bdev,
+		u64 dev_bytenr,
+		struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+		struct btrfsic_block_link *l,
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+		struct block_device *bdev_ref_to,
+		u64 dev_bytenr_ref_to,
+		struct block_device *bdev_ref_from,
+		u64 dev_bytenr_ref_from,
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+		struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+		struct btrfsic_dev_state *ds,
+		struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+		struct block_device *bdev,
+		struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+				      struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+				     struct btrfsic_block *block,
+				     struct btrfsic_block_data_ctx *block_ctx,
+				     struct btrfs_header *hdr,
+				     int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx
+		*block_ctx, u64 next_bytenr,
+		int limit_nesting,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block **next_blockp,
+		int force_iodone_flag,
+		int *num_copiesp, int *mirror_nump,
+		struct btrfs_disk_key *disk_key,
+		u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+				      struct btrfsic_block *block,
+				      struct btrfsic_block_data_ctx *block_ctx,
+				      u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+			     struct btrfsic_block_data_ctx *block_ctx_out,
+			     int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+				  u32 len, struct block_device *bdev,
+				  struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+			      struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+				     const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+					  u64 dev_bytenr, u8 *mapped_data,
+					  unsigned int len, struct bio *bio,
+					  int *bio_is_patched,
+					  struct buffer_head *bh,
+					  int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const block,
+		struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+					      const struct btrfsic_block *block,
+					      int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+					struct btrfsic_block *const block,
+					int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+				   const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+				  const struct btrfsic_block *block,
+				  int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block *next_block,
+		struct btrfsic_block *from_block,
+		u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *block_ctx,
+		const char *additional_string,
+		int is_metadata,
+		int is_iodone,
+		int never_written,
+		int mirror_num,
+		int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+		struct btrfsic_state *state,
+		struct btrfsic_dev_state *dev_state,
+		struct btrfs_device *device,
+		int superblock_mirror_num,
+		struct btrfsic_dev_state **selected_dev_state,
+		struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+		struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+					   u64 bytenr,
+					   struct btrfsic_dev_state *dev_state,
+					   u64 dev_bytenr, char *data);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+	b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+	b->dev_state = NULL;
+	b->dev_bytenr = 0;
+	b->logical_bytenr = 0;
+	b->generation = BTRFSIC_GENERATION_UNKNOWN;
+	b->disk_key.objectid = 0;
+	b->disk_key.type = 0;
+	b->disk_key.offset = 0;
+	b->is_metadata = 0;
+	b->is_superblock = 0;
+	b->is_iodone = 0;
+	b->iodone_w_error = 0;
+	b->never_written = 0;
+	b->mirror_num = 0;
+	b->next_in_same_bio = NULL;
+	b->orig_bio_bh_private = NULL;
+	b->orig_bio_bh_end_io.bio = NULL;
+	INIT_LIST_HEAD(&b->collision_resolving_node);
+	INIT_LIST_HEAD(&b->all_blocks_node);
+	INIT_LIST_HEAD(&b->ref_to_list);
+	INIT_LIST_HEAD(&b->ref_from_list);
+	b->submit_bio_bh_rw = 0;
+	b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+	struct btrfsic_block *b;
+
+	b = kzalloc(sizeof(*b), GFP_NOFS);
+	if (NULL != b)
+		btrfsic_block_init(b);
+
+	return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+	BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+	kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+	l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+	l->ref_cnt = 1;
+	INIT_LIST_HEAD(&l->node_ref_to);
+	INIT_LIST_HEAD(&l->node_ref_from);
+	INIT_LIST_HEAD(&l->collision_resolving_node);
+	l->block_ref_to = NULL;
+	l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+	struct btrfsic_block_link *l;
+
+	l = kzalloc(sizeof(*l), GFP_NOFS);
+	if (NULL != l)
+		btrfsic_block_link_init(l);
+
+	return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+	BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+	kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+	ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+	ds->bdev = NULL;
+	ds->state = NULL;
+	ds->name[0] = '\0';
+	INIT_LIST_HEAD(&ds->collision_resolving_node);
+	ds->last_flush_gen = 0;
+	btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+	ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+	ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+	struct btrfsic_dev_state *ds;
+
+	ds = kzalloc(sizeof(*ds), GFP_NOFS);
+	if (NULL != ds)
+		btrfsic_dev_state_init(ds);
+
+	return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+	BUG_ON(!(NULL == ds ||
+		 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+	kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+	int i;
+
+	for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+		INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+					struct btrfsic_block_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(b->dev_bytenr >> 16)) ^
+	     ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+	list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+	list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+		struct block_device *bdev,
+		u64 dev_bytenr,
+		struct btrfsic_block_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(dev_bytenr >> 16)) ^
+	     ((unsigned int)((uintptr_t)bdev))) &
+	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+	struct list_head *elem;
+
+	list_for_each(elem, h->table + hashval) {
+		struct btrfsic_block *const b =
+		    list_entry(elem, struct btrfsic_block,
+			       collision_resolving_node);
+
+		if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+			return b;
+	}
+
+	return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+		struct btrfsic_block_link_hashtable *h)
+{
+	int i;
+
+	for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+		INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+		struct btrfsic_block_link *l,
+		struct btrfsic_block_link_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+	     ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+	     ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+	     ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+	     & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+	BUG_ON(NULL == l->block_ref_to);
+	BUG_ON(NULL == l->block_ref_from);
+	list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+	list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+		struct block_device *bdev_ref_to,
+		u64 dev_bytenr_ref_to,
+		struct block_device *bdev_ref_from,
+		u64 dev_bytenr_ref_from,
+		struct btrfsic_block_link_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+	     ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+	     ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+	     ((unsigned int)((uintptr_t)bdev_ref_from))) &
+	     (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+	struct list_head *elem;
+
+	list_for_each(elem, h->table + hashval) {
+		struct btrfsic_block_link *const l =
+		    list_entry(elem, struct btrfsic_block_link,
+			       collision_resolving_node);
+
+		BUG_ON(NULL == l->block_ref_to);
+		BUG_ON(NULL == l->block_ref_from);
+		if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+		    l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+		    l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+		    l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+			return l;
+	}
+
+	return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+		struct btrfsic_dev_state_hashtable *h)
+{
+	int i;
+
+	for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+		INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+		struct btrfsic_dev_state *ds,
+		struct btrfsic_dev_state_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)((uintptr_t)ds->bdev)) &
+	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+	list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+	list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+		struct block_device *bdev,
+		struct btrfsic_dev_state_hashtable *h)
+{
+	const unsigned int hashval =
+	    (((unsigned int)((uintptr_t)bdev)) &
+	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+	struct list_head *elem;
+
+	list_for_each(elem, h->table + hashval) {
+		struct btrfsic_dev_state *const ds =
+		    list_entry(elem, struct btrfsic_dev_state,
+			       collision_resolving_node);
+
+		if (ds->bdev == bdev)
+			return ds;
+	}
+
+	return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+				      struct btrfs_fs_devices *fs_devices)
+{
+	int ret;
+	struct btrfs_super_block *selected_super;
+	struct list_head *dev_head = &fs_devices->devices;
+	struct btrfs_device *device;
+	struct btrfsic_dev_state *selected_dev_state = NULL;
+	int pass;
+
+	BUG_ON(NULL == state);
+	selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+	if (NULL == selected_super) {
+		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+		return -1;
+	}
+
+	list_for_each_entry(device, dev_head, dev_list) {
+		int i;
+		struct btrfsic_dev_state *dev_state;
+
+		if (!device->bdev || !device->name)
+			continue;
+
+		dev_state = btrfsic_dev_state_lookup(device->bdev);
+		BUG_ON(NULL == dev_state);
+		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+			ret = btrfsic_process_superblock_dev_mirror(
+					state, dev_state, device, i,
+					&selected_dev_state, selected_super);
+			if (0 != ret && 0 == i) {
+				kfree(selected_super);
+				return ret;
+			}
+		}
+	}
+
+	if (NULL == state->latest_superblock) {
+		printk(KERN_INFO "btrfsic: no superblock found!\n");
+		kfree(selected_super);
+		return -1;
+	}
+
+	state->csum_size = btrfs_super_csum_size(selected_super);
+
+	for (pass = 0; pass < 3; pass++) {
+		int num_copies;
+		int mirror_num;
+		u64 next_bytenr;
+
+		switch (pass) {
+		case 0:
+			next_bytenr = btrfs_super_root(selected_super);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "root@%llu\n",
+				       (unsigned long long)next_bytenr);
+			break;
+		case 1:
+			next_bytenr = btrfs_super_chunk_root(selected_super);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "chunk@%llu\n",
+				       (unsigned long long)next_bytenr);
+			break;
+		case 2:
+			next_bytenr = btrfs_super_log_root(selected_super);
+			if (0 == next_bytenr)
+				continue;
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "log@%llu\n",
+				       (unsigned long long)next_bytenr);
+			break;
+		}
+
+		num_copies =
+		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+				     next_bytenr, PAGE_SIZE);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       (unsigned long long)next_bytenr, num_copies);
+
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			struct btrfsic_block *next_block;
+			struct btrfsic_block_data_ctx tmp_next_block_ctx;
+			struct btrfsic_block_link *l;
+			struct btrfs_header *hdr;
+
+			ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+						&tmp_next_block_ctx,
+						mirror_num);
+			if (ret) {
+				printk(KERN_INFO "btrfsic:"
+				       " btrfsic_map_block(root @%llu,"
+				       " mirror %d) failed!\n",
+				       (unsigned long long)next_bytenr,
+				       mirror_num);
+				kfree(selected_super);
+				return -1;
+			}
+
+			next_block = btrfsic_block_hashtable_lookup(
+					tmp_next_block_ctx.dev->bdev,
+					tmp_next_block_ctx.dev_bytenr,
+					&state->block_hashtable);
+			BUG_ON(NULL == next_block);
+
+			l = btrfsic_block_link_hashtable_lookup(
+					tmp_next_block_ctx.dev->bdev,
+					tmp_next_block_ctx.dev_bytenr,
+					state->latest_superblock->dev_state->
+					bdev,
+					state->latest_superblock->dev_bytenr,
+					&state->block_link_hashtable);
+			BUG_ON(NULL == l);
+
+			ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+			if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+				printk(KERN_INFO
+				       "btrfsic: read @logical %llu failed!\n",
+				       (unsigned long long)
+				       tmp_next_block_ctx.start);
+				btrfsic_release_block_ctx(&tmp_next_block_ctx);
+				kfree(selected_super);
+				return -1;
+			}
+
+			hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
+			ret = btrfsic_process_metablock(state,
+							next_block,
+							&tmp_next_block_ctx,
+							hdr,
+							BTRFS_MAX_LEVEL + 3, 1);
+			btrfsic_release_block_ctx(&tmp_next_block_ctx);
+		}
+	}
+
+	kfree(selected_super);
+	return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+		struct btrfsic_state *state,
+		struct btrfsic_dev_state *dev_state,
+		struct btrfs_device *device,
+		int superblock_mirror_num,
+		struct btrfsic_dev_state **selected_dev_state,
+		struct btrfs_super_block *selected_super)
+{
+	struct btrfs_super_block *super_tmp;
+	u64 dev_bytenr;
+	struct buffer_head *bh;
+	struct btrfsic_block *superblock_tmp;
+	int pass;
+	struct block_device *const superblock_bdev = device->bdev;
+
+	/* super block bytenr is always the unmapped device bytenr */
+	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+	bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+	if (NULL == bh)
+		return -1;
+	super_tmp = (struct btrfs_super_block *)
+	    (bh->b_data + (dev_bytenr & 4095));
+
+	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+		    sizeof(super_tmp->magic)) ||
+	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+		brelse(bh);
+		return 0;
+	}
+
+	superblock_tmp =
+	    btrfsic_block_hashtable_lookup(superblock_bdev,
+					   dev_bytenr,
+					   &state->block_hashtable);
+	if (NULL == superblock_tmp) {
+		superblock_tmp = btrfsic_block_alloc();
+		if (NULL == superblock_tmp) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			brelse(bh);
+			return -1;
+		}
+		/* for superblock, only the dev_bytenr makes sense */
+		superblock_tmp->dev_bytenr = dev_bytenr;
+		superblock_tmp->dev_state = dev_state;
+		superblock_tmp->logical_bytenr = dev_bytenr;
+		superblock_tmp->generation = btrfs_super_generation(super_tmp);
+		superblock_tmp->is_metadata = 1;
+		superblock_tmp->is_superblock = 1;
+		superblock_tmp->is_iodone = 1;
+		superblock_tmp->never_written = 0;
+		superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+			printk(KERN_INFO "New initial S-block (bdev %p, %s)"
+			       " @%llu (%s/%llu/%d)\n",
+			       superblock_bdev, device->name,
+			       (unsigned long long)dev_bytenr,
+			       dev_state->name,
+			       (unsigned long long)dev_bytenr,
+			       superblock_mirror_num);
+		list_add(&superblock_tmp->all_blocks_node,
+			 &state->all_blocks_list);
+		btrfsic_block_hashtable_add(superblock_tmp,
+					    &state->block_hashtable);
+	}
+
+	/* select the one with the highest generation field */
+	if (btrfs_super_generation(super_tmp) >
+	    state->max_superblock_generation ||
+	    0 == state->max_superblock_generation) {
+		memcpy(selected_super, super_tmp, sizeof(*selected_super));
+		*selected_dev_state = dev_state;
+		state->max_superblock_generation =
+		    btrfs_super_generation(super_tmp);
+		state->latest_superblock = superblock_tmp;
+	}
+
+	for (pass = 0; pass < 3; pass++) {
+		u64 next_bytenr;
+		int num_copies;
+		int mirror_num;
+		const char *additional_string = NULL;
+		struct btrfs_disk_key tmp_disk_key;
+
+		tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_disk_key.offset = 0;
+		switch (pass) {
+		case 0:
+			tmp_disk_key.objectid =
+			    cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+			additional_string = "initial root ";
+			next_bytenr = btrfs_super_root(super_tmp);
+			break;
+		case 1:
+			tmp_disk_key.objectid =
+			    cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+			additional_string = "initial chunk ";
+			next_bytenr = btrfs_super_chunk_root(super_tmp);
+			break;
+		case 2:
+			tmp_disk_key.objectid =
+			    cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+			additional_string = "initial log ";
+			next_bytenr = btrfs_super_log_root(super_tmp);
+			if (0 == next_bytenr)
+				continue;
+			break;
+		}
+
+		num_copies =
+		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+				     next_bytenr, PAGE_SIZE);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       (unsigned long long)next_bytenr, num_copies);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			struct btrfsic_block *next_block;
+			struct btrfsic_block_data_ctx tmp_next_block_ctx;
+			struct btrfsic_block_link *l;
+
+			if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+					      &tmp_next_block_ctx,
+					      mirror_num)) {
+				printk(KERN_INFO "btrfsic: btrfsic_map_block("
+				       "bytenr @%llu, mirror %d) failed!\n",
+				       (unsigned long long)next_bytenr,
+				       mirror_num);
+				brelse(bh);
+				return -1;
+			}
+
+			next_block = btrfsic_block_lookup_or_add(
+					state, &tmp_next_block_ctx,
+					additional_string, 1, 1, 0,
+					mirror_num, NULL);
+			if (NULL == next_block) {
+				btrfsic_release_block_ctx(&tmp_next_block_ctx);
+				brelse(bh);
+				return -1;
+			}
+
+			next_block->disk_key = tmp_disk_key;
+			next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+			l = btrfsic_block_link_lookup_or_add(
+					state, &tmp_next_block_ctx,
+					next_block, superblock_tmp,
+					BTRFSIC_GENERATION_UNKNOWN);
+			btrfsic_release_block_ctx(&tmp_next_block_ctx);
+			if (NULL == l) {
+				brelse(bh);
+				return -1;
+			}
+		}
+	}
+	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+		btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+	brelse(bh);
+	return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+	struct btrfsic_stack_frame *sf;
+
+	sf = kzalloc(sizeof(*sf), GFP_NOFS);
+	if (NULL == sf)
+		printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+	else
+		sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+	return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+	BUG_ON(!(NULL == sf ||
+		 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+	kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const first_block,
+		struct btrfsic_block_data_ctx *const first_block_ctx,
+		struct btrfs_header *const first_hdr,
+		int first_limit_nesting, int force_iodone_flag)
+{
+	struct btrfsic_stack_frame initial_stack_frame = { 0 };
+	struct btrfsic_stack_frame *sf;
+	struct btrfsic_stack_frame *next_stack;
+
+	sf = &initial_stack_frame;
+	sf->error = 0;
+	sf->i = -1;
+	sf->limit_nesting = first_limit_nesting;
+	sf->block = first_block;
+	sf->block_ctx = first_block_ctx;
+	sf->next_block = NULL;
+	sf->hdr = first_hdr;
+	sf->prev = NULL;
+
+continue_with_new_stack_frame:
+	sf->block->generation = le64_to_cpu(sf->hdr->generation);
+	if (0 == sf->hdr->level) {
+		struct btrfs_leaf *const leafhdr =
+		    (struct btrfs_leaf *)sf->hdr;
+
+		if (-1 == sf->i) {
+			sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "leaf %llu items %d generation %llu"
+				       " owner %llu\n",
+				       (unsigned long long)
+				       sf->block_ctx->start,
+				       sf->nr,
+				       (unsigned long long)
+				       le64_to_cpu(leafhdr->header.generation),
+				       (unsigned long long)
+				       le64_to_cpu(leafhdr->header.owner));
+		}
+
+continue_with_current_leaf_stack_frame:
+		if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+			sf->i++;
+			sf->num_copies = 0;
+		}
+
+		if (sf->i < sf->nr) {
+			struct btrfs_item *disk_item = leafhdr->items + sf->i;
+			struct btrfs_disk_key *disk_key = &disk_item->key;
+			u8 type;
+			const u32 item_offset = le32_to_cpu(disk_item->offset);
+
+			type = disk_key->type;
+
+			if (BTRFS_ROOT_ITEM_KEY == type) {
+				const struct btrfs_root_item *const root_item =
+				    (struct btrfs_root_item *)
+				    (sf->block_ctx->data +
+				     offsetof(struct btrfs_leaf, items) +
+				     item_offset);
+				const u64 next_bytenr =
+				    le64_to_cpu(root_item->bytenr);
+
+				sf->error =
+				    btrfsic_create_link_to_next_block(
+						state,
+						sf->block,
+						sf->block_ctx,
+						next_bytenr,
+						sf->limit_nesting,
+						&sf->next_block_ctx,
+						&sf->next_block,
+						force_iodone_flag,
+						&sf->num_copies,
+						&sf->mirror_num,
+						disk_key,
+						le64_to_cpu(root_item->
+						generation));
+				if (sf->error)
+					goto one_stack_frame_backwards;
+
+				if (NULL != sf->next_block) {
+					struct btrfs_header *const next_hdr =
+					    (struct btrfs_header *)
+					    sf->next_block_ctx.data;
+
+					next_stack =
+					    btrfsic_stack_frame_alloc();
+					if (NULL == next_stack) {
+						btrfsic_release_block_ctx(
+								&sf->
+								next_block_ctx);
+						goto one_stack_frame_backwards;
+					}
+
+					next_stack->i = -1;
+					next_stack->block = sf->next_block;
+					next_stack->block_ctx =
+					    &sf->next_block_ctx;
+					next_stack->next_block = NULL;
+					next_stack->hdr = next_hdr;
+					next_stack->limit_nesting =
+					    sf->limit_nesting - 1;
+					next_stack->prev = sf;
+					sf = next_stack;
+					goto continue_with_new_stack_frame;
+				}
+			} else if (BTRFS_EXTENT_DATA_KEY == type &&
+				   state->include_extent_data) {
+				sf->error = btrfsic_handle_extent_data(
+						state,
+						sf->block,
+						sf->block_ctx,
+						item_offset,
+						force_iodone_flag);
+				if (sf->error)
+					goto one_stack_frame_backwards;
+			}
+
+			goto continue_with_current_leaf_stack_frame;
+		}
+	} else {
+		struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+		if (-1 == sf->i) {
+			sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO "node %llu level %d items %d"
+				       " generation %llu owner %llu\n",
+				       (unsigned long long)
+				       sf->block_ctx->start,
+				       nodehdr->header.level, sf->nr,
+				       (unsigned long long)
+				       le64_to_cpu(nodehdr->header.generation),
+				       (unsigned long long)
+				       le64_to_cpu(nodehdr->header.owner));
+		}
+
+continue_with_current_node_stack_frame:
+		if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+			sf->i++;
+			sf->num_copies = 0;
+		}
+
+		if (sf->i < sf->nr) {
+			struct btrfs_key_ptr *disk_key_ptr =
+			    nodehdr->ptrs + sf->i;
+			const u64 next_bytenr =
+			    le64_to_cpu(disk_key_ptr->blockptr);
+
+			sf->error = btrfsic_create_link_to_next_block(
+					state,
+					sf->block,
+					sf->block_ctx,
+					next_bytenr,
+					sf->limit_nesting,
+					&sf->next_block_ctx,
+					&sf->next_block,
+					force_iodone_flag,
+					&sf->num_copies,
+					&sf->mirror_num,
+					&disk_key_ptr->key,
+					le64_to_cpu(disk_key_ptr->generation));
+			if (sf->error)
+				goto one_stack_frame_backwards;
+
+			if (NULL != sf->next_block) {
+				struct btrfs_header *const next_hdr =
+				    (struct btrfs_header *)
+				    sf->next_block_ctx.data;
+
+				next_stack = btrfsic_stack_frame_alloc();
+				if (NULL == next_stack)
+					goto one_stack_frame_backwards;
+
+				next_stack->i = -1;
+				next_stack->block = sf->next_block;
+				next_stack->block_ctx = &sf->next_block_ctx;
+				next_stack->next_block = NULL;
+				next_stack->hdr = next_hdr;
+				next_stack->limit_nesting =
+				    sf->limit_nesting - 1;
+				next_stack->prev = sf;
+				sf = next_stack;
+				goto continue_with_new_stack_frame;
+			}
+
+			goto continue_with_current_node_stack_frame;
+		}
+	}
+
+one_stack_frame_backwards:
+	if (NULL != sf->prev) {
+		struct btrfsic_stack_frame *const prev = sf->prev;
+
+		/* the one for the initial block is freed in the caller */
+		btrfsic_release_block_ctx(sf->block_ctx);
+
+		if (sf->error) {
+			prev->error = sf->error;
+			btrfsic_stack_frame_free(sf);
+			sf = prev;
+			goto one_stack_frame_backwards;
+		}
+
+		btrfsic_stack_frame_free(sf);
+		sf = prev;
+		goto continue_with_new_stack_frame;
+	} else {
+		BUG_ON(&initial_stack_frame != sf);
+	}
+
+	return sf->error;
+}
+
+static int btrfsic_create_link_to_next_block(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx *block_ctx,
+		u64 next_bytenr,
+		int limit_nesting,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block **next_blockp,
+		int force_iodone_flag,
+		int *num_copiesp, int *mirror_nump,
+		struct btrfs_disk_key *disk_key,
+		u64 parent_generation)
+{
+	struct btrfsic_block *next_block = NULL;
+	int ret;
+	struct btrfsic_block_link *l;
+	int did_alloc_block_link;
+	int block_was_created;
+
+	*next_blockp = NULL;
+	if (0 == *num_copiesp) {
+		*num_copiesp =
+		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+				     next_bytenr, PAGE_SIZE);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       (unsigned long long)next_bytenr, *num_copiesp);
+		*mirror_nump = 1;
+	}
+
+	if (*mirror_nump > *num_copiesp)
+		return 0;
+
+	if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+		printk(KERN_INFO
+		       "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+		       *mirror_nump);
+	ret = btrfsic_map_block(state, next_bytenr,
+				BTRFSIC_BLOCK_SIZE,
+				next_block_ctx, *mirror_nump);
+	if (ret) {
+		printk(KERN_INFO
+		       "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+		       (unsigned long long)next_bytenr, *mirror_nump);
+		btrfsic_release_block_ctx(next_block_ctx);
+		*next_blockp = NULL;
+		return -1;
+	}
+
+	next_block = btrfsic_block_lookup_or_add(state,
+						 next_block_ctx, "referenced ",
+						 1, force_iodone_flag,
+						 !force_iodone_flag,
+						 *mirror_nump,
+						 &block_was_created);
+	if (NULL == next_block) {
+		btrfsic_release_block_ctx(next_block_ctx);
+		*next_blockp = NULL;
+		return -1;
+	}
+	if (block_was_created) {
+		l = NULL;
+		next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+	} else {
+		if (next_block->logical_bytenr != next_bytenr &&
+		    !(!next_block->is_metadata &&
+		      0 == next_block->logical_bytenr)) {
+			printk(KERN_INFO
+			       "Referenced block @%llu (%s/%llu/%d)"
+			       " found in hash table, %c,"
+			       " bytenr mismatch (!= stored %llu).\n",
+			       (unsigned long long)next_bytenr,
+			       next_block_ctx->dev->name,
+			       (unsigned long long)next_block_ctx->dev_bytenr,
+			       *mirror_nump,
+			       btrfsic_get_block_type(state, next_block),
+			       (unsigned long long)next_block->logical_bytenr);
+		} else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "Referenced block @%llu (%s/%llu/%d)"
+			       " found in hash table, %c.\n",
+			       (unsigned long long)next_bytenr,
+			       next_block_ctx->dev->name,
+			       (unsigned long long)next_block_ctx->dev_bytenr,
+			       *mirror_nump,
+			       btrfsic_get_block_type(state, next_block));
+		next_block->logical_bytenr = next_bytenr;
+
+		next_block->mirror_num = *mirror_nump;
+		l = btrfsic_block_link_hashtable_lookup(
+				next_block_ctx->dev->bdev,
+				next_block_ctx->dev_bytenr,
+				block_ctx->dev->bdev,
+				block_ctx->dev_bytenr,
+				&state->block_link_hashtable);
+	}
+
+	next_block->disk_key = *disk_key;
+	if (NULL == l) {
+		l = btrfsic_block_link_alloc();
+		if (NULL == l) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			btrfsic_release_block_ctx(next_block_ctx);
+			*next_blockp = NULL;
+			return -1;
+		}
+
+		did_alloc_block_link = 1;
+		l->block_ref_to = next_block;
+		l->block_ref_from = block;
+		l->ref_cnt = 1;
+		l->parent_generation = parent_generation;
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			btrfsic_print_add_link(state, l);
+
+		list_add(&l->node_ref_to, &block->ref_to_list);
+		list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+		btrfsic_block_link_hashtable_add(l,
+						 &state->block_link_hashtable);
+	} else {
+		did_alloc_block_link = 0;
+		if (0 == limit_nesting) {
+			l->ref_cnt++;
+			l->parent_generation = parent_generation;
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				btrfsic_print_add_link(state, l);
+		}
+	}
+
+	if (limit_nesting > 0 && did_alloc_block_link) {
+		ret = btrfsic_read_block(state, next_block_ctx);
+		if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+			printk(KERN_INFO
+			       "btrfsic: read block @logical %llu failed!\n",
+			       (unsigned long long)next_bytenr);
+			btrfsic_release_block_ctx(next_block_ctx);
+			*next_blockp = NULL;
+			return -1;
+		}
+
+		*next_blockp = next_block;
+	} else {
+		*next_blockp = NULL;
+	}
+	(*mirror_nump)++;
+
+	return 0;
+}
+
+static int btrfsic_handle_extent_data(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx *block_ctx,
+		u32 item_offset, int force_iodone_flag)
+{
+	int ret;
+	struct btrfs_file_extent_item *file_extent_item =
+	    (struct btrfs_file_extent_item *)(block_ctx->data +
+					      offsetof(struct btrfs_leaf,
+						       items) + item_offset);
+	u64 next_bytenr =
+	    le64_to_cpu(file_extent_item->disk_bytenr) +
+	    le64_to_cpu(file_extent_item->offset);
+	u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
+	u64 generation = le64_to_cpu(file_extent_item->generation);
+	struct btrfsic_block_link *l;
+
+	if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+		printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+		       " offset = %llu, num_bytes = %llu\n",
+		       file_extent_item->type,
+		       (unsigned long long)
+		       le64_to_cpu(file_extent_item->disk_bytenr),
+		       (unsigned long long)
+		       le64_to_cpu(file_extent_item->offset),
+		       (unsigned long long)
+		       le64_to_cpu(file_extent_item->num_bytes));
+	if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
+	    ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
+		return 0;
+	while (num_bytes > 0) {
+		u32 chunk_len;
+		int num_copies;
+		int mirror_num;
+
+		if (num_bytes > BTRFSIC_BLOCK_SIZE)
+			chunk_len = BTRFSIC_BLOCK_SIZE;
+		else
+			chunk_len = num_bytes;
+
+		num_copies =
+		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+				     next_bytenr, PAGE_SIZE);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       (unsigned long long)next_bytenr, num_copies);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			struct btrfsic_block_data_ctx next_block_ctx;
+			struct btrfsic_block *next_block;
+			int block_was_created;
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO "btrfsic_handle_extent_data("
+				       "mirror_num=%d)\n", mirror_num);
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+				printk(KERN_INFO
+				       "\tdisk_bytenr = %llu, num_bytes %u\n",
+				       (unsigned long long)next_bytenr,
+				       chunk_len);
+			ret = btrfsic_map_block(state, next_bytenr,
+						chunk_len, &next_block_ctx,
+						mirror_num);
+			if (ret) {
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_map_block(@%llu,"
+				       " mirror=%d) failed!\n",
+				       (unsigned long long)next_bytenr,
+				       mirror_num);
+				return -1;
+			}
+
+			next_block = btrfsic_block_lookup_or_add(
+					state,
+					&next_block_ctx,
+					"referenced ",
+					0,
+					force_iodone_flag,
+					!force_iodone_flag,
+					mirror_num,
+					&block_was_created);
+			if (NULL == next_block) {
+				printk(KERN_INFO
+				       "btrfsic: error, kmalloc failed!\n");
+				btrfsic_release_block_ctx(&next_block_ctx);
+				return -1;
+			}
+			if (!block_was_created) {
+				if (next_block->logical_bytenr != next_bytenr &&
+				    !(!next_block->is_metadata &&
+				      0 == next_block->logical_bytenr)) {
+					printk(KERN_INFO
+					       "Referenced block"
+					       " @%llu (%s/%llu/%d)"
+					       " found in hash table, D,"
+					       " bytenr mismatch"
+					       " (!= stored %llu).\n",
+					       (unsigned long long)next_bytenr,
+					       next_block_ctx.dev->name,
+					       (unsigned long long)
+					       next_block_ctx.dev_bytenr,
+					       mirror_num,
+					       (unsigned long long)
+					       next_block->logical_bytenr);
+				}
+				next_block->logical_bytenr = next_bytenr;
+				next_block->mirror_num = mirror_num;
+			}
+
+			l = btrfsic_block_link_lookup_or_add(state,
+							     &next_block_ctx,
+							     next_block, block,
+							     generation);
+			btrfsic_release_block_ctx(&next_block_ctx);
+			if (NULL == l)
+				return -1;
+		}
+
+		next_bytenr += chunk_len;
+		num_bytes -= chunk_len;
+	}
+
+	return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+			     struct btrfsic_block_data_ctx *block_ctx_out,
+			     int mirror_num)
+{
+	int ret;
+	u64 length;
+	struct btrfs_bio *multi = NULL;
+	struct btrfs_device *device;
+
+	length = len;
+	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+			      bytenr, &length, &multi, mirror_num);
+
+	device = multi->stripes[0].dev;
+	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+	block_ctx_out->start = bytenr;
+	block_ctx_out->len = len;
+	block_ctx_out->data = NULL;
+	block_ctx_out->bh = NULL;
+
+	if (0 == ret)
+		kfree(multi);
+	if (NULL == block_ctx_out->dev) {
+		ret = -ENXIO;
+		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+	}
+
+	return ret;
+}
+
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+				  u32 len, struct block_device *bdev,
+				  struct btrfsic_block_data_ctx *block_ctx_out)
+{
+	block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+	block_ctx_out->dev_bytenr = bytenr;
+	block_ctx_out->start = bytenr;
+	block_ctx_out->len = len;
+	block_ctx_out->data = NULL;
+	block_ctx_out->bh = NULL;
+	if (NULL != block_ctx_out->dev) {
+		return 0;
+	} else {
+		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+		return -ENXIO;
+	}
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+	if (NULL != block_ctx->bh) {
+		brelse(block_ctx->bh);
+		block_ctx->bh = NULL;
+	}
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+			      struct btrfsic_block_data_ctx *block_ctx)
+{
+	block_ctx->bh = NULL;
+	if (block_ctx->dev_bytenr & 4095) {
+		printk(KERN_INFO
+		       "btrfsic: read_block() with unaligned bytenr %llu\n",
+		       (unsigned long long)block_ctx->dev_bytenr);
+		return -1;
+	}
+	if (block_ctx->len > 4096) {
+		printk(KERN_INFO
+		       "btrfsic: read_block() with too huge size %d\n",
+		       block_ctx->len);
+		return -1;
+	}
+
+	block_ctx->bh = __bread(block_ctx->dev->bdev,
+				block_ctx->dev_bytenr >> 12, 4096);
+	if (NULL == block_ctx->bh)
+		return -1;
+	block_ctx->data = block_ctx->bh->b_data;
+
+	return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+	struct list_head *elem_all;
+
+	BUG_ON(NULL == state);
+
+	printk(KERN_INFO "all_blocks_list:\n");
+	list_for_each(elem_all, &state->all_blocks_list) {
+		const struct btrfsic_block *const b_all =
+		    list_entry(elem_all, struct btrfsic_block,
+			       all_blocks_node);
+		struct list_head *elem_ref_to;
+		struct list_head *elem_ref_from;
+
+		printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+		       btrfsic_get_block_type(state, b_all),
+		       (unsigned long long)b_all->logical_bytenr,
+		       b_all->dev_state->name,
+		       (unsigned long long)b_all->dev_bytenr,
+		       b_all->mirror_num);
+
+		list_for_each(elem_ref_to, &b_all->ref_to_list) {
+			const struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_to,
+				       struct btrfsic_block_link,
+				       node_ref_to);
+
+			printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+			       " refers %u* to"
+			       " %c @%llu (%s/%llu/%d)\n",
+			       btrfsic_get_block_type(state, b_all),
+			       (unsigned long long)b_all->logical_bytenr,
+			       b_all->dev_state->name,
+			       (unsigned long long)b_all->dev_bytenr,
+			       b_all->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+		}
+
+		list_for_each(elem_ref_from, &b_all->ref_from_list) {
+			const struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_from,
+				       struct btrfsic_block_link,
+				       node_ref_from);
+
+			printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+			       " is ref %u* from"
+			       " %c @%llu (%s/%llu/%d)\n",
+			       btrfsic_get_block_type(state, b_all),
+			       (unsigned long long)b_all->logical_bytenr,
+			       b_all->dev_state->name,
+			       (unsigned long long)b_all->dev_bytenr,
+			       b_all->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_from),
+			       (unsigned long long)
+			       l->block_ref_from->logical_bytenr,
+			       l->block_ref_from->dev_state->name,
+			       (unsigned long long)
+			       l->block_ref_from->dev_bytenr,
+			       l->block_ref_from->mirror_num);
+		}
+
+		printk(KERN_INFO "\n");
+	}
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+				     const u8 *data, unsigned int size)
+{
+	struct btrfs_header *h;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	int fail = 0;
+	int crc_fail = 0;
+
+	h = (struct btrfs_header *)data;
+
+	if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+		fail++;
+
+	crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+	btrfs_csum_final(crc, csum);
+	if (memcmp(csum, h->csum, state->csum_size))
+		crc_fail++;
+
+	return fail || crc_fail;
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+					  u64 dev_bytenr,
+					  u8 *mapped_data, unsigned int len,
+					  struct bio *bio,
+					  int *bio_is_patched,
+					  struct buffer_head *bh,
+					  int submit_bio_bh_rw)
+{
+	int is_metadata;
+	struct btrfsic_block *block;
+	struct btrfsic_block_data_ctx block_ctx;
+	int ret;
+	struct btrfsic_state *state = dev_state->state;
+	struct block_device *bdev = dev_state->bdev;
+
+	WARN_ON(len > PAGE_SIZE);
+	is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
+	if (NULL != bio_is_patched)
+		*bio_is_patched = 0;
+
+	block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+					       &state->block_hashtable);
+	if (NULL != block) {
+		u64 bytenr;
+		struct list_head *elem_ref_to;
+		struct list_head *tmp_ref_to;
+
+		if (block->is_superblock) {
+			bytenr = le64_to_cpu(((struct btrfs_super_block *)
+					      mapped_data)->bytenr);
+			is_metadata = 1;
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+				printk(KERN_INFO
+				       "[before new superblock is written]:\n");
+				btrfsic_dump_tree_sub(state, block, 0);
+			}
+		}
+		if (is_metadata) {
+			if (!block->is_superblock) {
+				bytenr = le64_to_cpu(((struct btrfs_header *)
+						      mapped_data)->bytenr);
+				btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+							       dev_state,
+							       dev_bytenr,
+							       mapped_data);
+			}
+			if (block->logical_bytenr != bytenr) {
+				printk(KERN_INFO
+				       "Written block @%llu (%s/%llu/%d)"
+				       " found in hash table, %c,"
+				       " bytenr mismatch"
+				       " (!= stored %llu).\n",
+				       (unsigned long long)bytenr,
+				       dev_state->name,
+				       (unsigned long long)dev_bytenr,
+				       block->mirror_num,
+				       btrfsic_get_block_type(state, block),
+				       (unsigned long long)
+				       block->logical_bytenr);
+				block->logical_bytenr = bytenr;
+			} else if (state->print_mask &
+				   BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "Written block @%llu (%s/%llu/%d)"
+				       " found in hash table, %c.\n",
+				       (unsigned long long)bytenr,
+				       dev_state->name,
+				       (unsigned long long)dev_bytenr,
+				       block->mirror_num,
+				       btrfsic_get_block_type(state, block));
+		} else {
+			bytenr = block->logical_bytenr;
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "Written block @%llu (%s/%llu/%d)"
+				       " found in hash table, %c.\n",
+				       (unsigned long long)bytenr,
+				       dev_state->name,
+				       (unsigned long long)dev_bytenr,
+				       block->mirror_num,
+				       btrfsic_get_block_type(state, block));
+		}
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "ref_to_list: %cE, ref_from_list: %cE\n",
+			       list_empty(&block->ref_to_list) ? ' ' : '!',
+			       list_empty(&block->ref_from_list) ? ' ' : '!');
+		if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+			printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+			       " @%llu (%s/%llu/%d), old(gen=%llu,"
+			       " objectid=%llu, type=%d, offset=%llu),"
+			       " new(gen=%llu),"
+			       " which is referenced by most recent superblock"
+			       " (superblockgen=%llu)!\n",
+			       btrfsic_get_block_type(state, block),
+			       (unsigned long long)bytenr,
+			       dev_state->name,
+			       (unsigned long long)dev_bytenr,
+			       block->mirror_num,
+			       (unsigned long long)block->generation,
+			       (unsigned long long)
+			       le64_to_cpu(block->disk_key.objectid),
+			       block->disk_key.type,
+			       (unsigned long long)
+			       le64_to_cpu(block->disk_key.offset),
+			       (unsigned long long)
+			       le64_to_cpu(((struct btrfs_header *)
+					    mapped_data)->generation),
+			       (unsigned long long)
+			       state->max_superblock_generation);
+			btrfsic_dump_tree(state);
+		}
+
+		if (!block->is_iodone && !block->never_written) {
+			printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+			       " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+			       " which is not yet iodone!\n",
+			       btrfsic_get_block_type(state, block),
+			       (unsigned long long)bytenr,
+			       dev_state->name,
+			       (unsigned long long)dev_bytenr,
+			       block->mirror_num,
+			       (unsigned long long)block->generation,
+			       (unsigned long long)
+			       le64_to_cpu(((struct btrfs_header *)
+					    mapped_data)->generation));
+			/* it would not be safe to go on */
+			btrfsic_dump_tree(state);
+			return;
+		}
+
+		/*
+		 * Clear all references of this block. Do not free
+		 * the block itself even if is not referenced anymore
+		 * because it still carries valueable information
+		 * like whether it was ever written and IO completed.
+		 */
+		list_for_each_safe(elem_ref_to, tmp_ref_to,
+				   &block->ref_to_list) {
+			struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_to,
+				       struct btrfsic_block_link,
+				       node_ref_to);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				btrfsic_print_rem_link(state, l);
+			l->ref_cnt--;
+			if (0 == l->ref_cnt) {
+				list_del(&l->node_ref_to);
+				list_del(&l->node_ref_from);
+				btrfsic_block_link_hashtable_remove(l);
+				btrfsic_block_link_free(l);
+			}
+		}
+
+		if (block->is_superblock)
+			ret = btrfsic_map_superblock(state, bytenr, len,
+						     bdev, &block_ctx);
+		else
+			ret = btrfsic_map_block(state, bytenr, len,
+						&block_ctx, 0);
+		if (ret) {
+			printk(KERN_INFO
+			       "btrfsic: btrfsic_map_block(root @%llu)"
+			       " failed!\n", (unsigned long long)bytenr);
+			return;
+		}
+		block_ctx.data = mapped_data;
+		/* the following is required in case of writes to mirrors,
+		 * use the same that was used for the lookup */
+		block_ctx.dev = dev_state;
+		block_ctx.dev_bytenr = dev_bytenr;
+
+		if (is_metadata || state->include_extent_data) {
+			block->never_written = 0;
+			block->iodone_w_error = 0;
+			if (NULL != bio) {
+				block->is_iodone = 0;
+				BUG_ON(NULL == bio_is_patched);
+				if (!*bio_is_patched) {
+					block->orig_bio_bh_private =
+					    bio->bi_private;
+					block->orig_bio_bh_end_io.bio =
+					    bio->bi_end_io;
+					block->next_in_same_bio = NULL;
+					bio->bi_private = block;
+					bio->bi_end_io = btrfsic_bio_end_io;
+					*bio_is_patched = 1;
+				} else {
+					struct btrfsic_block *chained_block =
+					    (struct btrfsic_block *)
+					    bio->bi_private;
+
+					BUG_ON(NULL == chained_block);
+					block->orig_bio_bh_private =
+					    chained_block->orig_bio_bh_private;
+					block->orig_bio_bh_end_io.bio =
+					    chained_block->orig_bio_bh_end_io.
+					    bio;
+					block->next_in_same_bio = chained_block;
+					bio->bi_private = block;
+				}
+			} else if (NULL != bh) {
+				block->is_iodone = 0;
+				block->orig_bio_bh_private = bh->b_private;
+				block->orig_bio_bh_end_io.bh = bh->b_end_io;
+				block->next_in_same_bio = NULL;
+				bh->b_private = block;
+				bh->b_end_io = btrfsic_bh_end_io;
+			} else {
+				block->is_iodone = 1;
+				block->orig_bio_bh_private = NULL;
+				block->orig_bio_bh_end_io.bio = NULL;
+				block->next_in_same_bio = NULL;
+			}
+		}
+
+		block->flush_gen = dev_state->last_flush_gen + 1;
+		block->submit_bio_bh_rw = submit_bio_bh_rw;
+		if (is_metadata) {
+			block->logical_bytenr = bytenr;
+			block->is_metadata = 1;
+			if (block->is_superblock) {
+				ret = btrfsic_process_written_superblock(
+						state,
+						block,
+						(struct btrfs_super_block *)
+						mapped_data);
+				if (state->print_mask &
+				    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+					printk(KERN_INFO
+					"[after new superblock is written]:\n");
+					btrfsic_dump_tree_sub(state, block, 0);
+				}
+			} else {
+				block->mirror_num = 0;	/* unknown */
+				ret = btrfsic_process_metablock(
+						state,
+						block,
+						&block_ctx,
+						(struct btrfs_header *)
+						block_ctx.data,
+						0, 0);
+			}
+			if (ret)
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_process_metablock"
+				       "(root @%llu) failed!\n",
+				       (unsigned long long)dev_bytenr);
+		} else {
+			block->is_metadata = 0;
+			block->mirror_num = 0;	/* unknown */
+			block->generation = BTRFSIC_GENERATION_UNKNOWN;
+			if (!state->include_extent_data
+			    && list_empty(&block->ref_from_list)) {
+				/*
+				 * disk block is overwritten with extent
+				 * data (not meta data) and we are configured
+				 * to not include extent data: take the
+				 * chance and free the block's memory
+				 */
+				btrfsic_block_hashtable_remove(block);
+				list_del(&block->all_blocks_node);
+				btrfsic_block_free(block);
+			}
+		}
+		btrfsic_release_block_ctx(&block_ctx);
+	} else {
+		/* block has not been found in hash table */
+		u64 bytenr;
+
+		if (!is_metadata) {
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO "Written block (%s/%llu/?)"
+				       " !found in hash table, D.\n",
+				       dev_state->name,
+				       (unsigned long long)dev_bytenr);
+			if (!state->include_extent_data)
+				return;	/* ignore that written D block */
+
+			/* this is getting ugly for the
+			 * include_extent_data case... */
+			bytenr = 0;	/* unknown */
+			block_ctx.start = bytenr;
+			block_ctx.len = len;
+			block_ctx.bh = NULL;
+		} else {
+			bytenr = le64_to_cpu(((struct btrfs_header *)
+					      mapped_data)->bytenr);
+			btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+						       dev_bytenr,
+						       mapped_data);
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "Written block @%llu (%s/%llu/?)"
+				       " !found in hash table, M.\n",
+				       (unsigned long long)bytenr,
+				       dev_state->name,
+				       (unsigned long long)dev_bytenr);
+
+			ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
+						0);
+			if (ret) {
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_map_block(root @%llu)"
+				       " failed!\n",
+				       (unsigned long long)dev_bytenr);
+				return;
+			}
+		}
+		block_ctx.data = mapped_data;
+		/* the following is required in case of writes to mirrors,
+		 * use the same that was used for the lookup */
+		block_ctx.dev = dev_state;
+		block_ctx.dev_bytenr = dev_bytenr;
+
+		block = btrfsic_block_alloc();
+		if (NULL == block) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			btrfsic_release_block_ctx(&block_ctx);
+			return;
+		}
+		block->dev_state = dev_state;
+		block->dev_bytenr = dev_bytenr;
+		block->logical_bytenr = bytenr;
+		block->is_metadata = is_metadata;
+		block->never_written = 0;
+		block->iodone_w_error = 0;
+		block->mirror_num = 0;	/* unknown */
+		block->flush_gen = dev_state->last_flush_gen + 1;
+		block->submit_bio_bh_rw = submit_bio_bh_rw;
+		if (NULL != bio) {
+			block->is_iodone = 0;
+			BUG_ON(NULL == bio_is_patched);
+			if (!*bio_is_patched) {
+				block->orig_bio_bh_private = bio->bi_private;
+				block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+				block->next_in_same_bio = NULL;
+				bio->bi_private = block;
+				bio->bi_end_io = btrfsic_bio_end_io;
+				*bio_is_patched = 1;
+			} else {
+				struct btrfsic_block *chained_block =
+				    (struct btrfsic_block *)
+				    bio->bi_private;
+
+				BUG_ON(NULL == chained_block);
+				block->orig_bio_bh_private =
+				    chained_block->orig_bio_bh_private;
+				block->orig_bio_bh_end_io.bio =
+				    chained_block->orig_bio_bh_end_io.bio;
+				block->next_in_same_bio = chained_block;
+				bio->bi_private = block;
+			}
+		} else if (NULL != bh) {
+			block->is_iodone = 0;
+			block->orig_bio_bh_private = bh->b_private;
+			block->orig_bio_bh_end_io.bh = bh->b_end_io;
+			block->next_in_same_bio = NULL;
+			bh->b_private = block;
+			bh->b_end_io = btrfsic_bh_end_io;
+		} else {
+			block->is_iodone = 1;
+			block->orig_bio_bh_private = NULL;
+			block->orig_bio_bh_end_io.bio = NULL;
+			block->next_in_same_bio = NULL;
+		}
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "New written %c-block @%llu (%s/%llu/%d)\n",
+			       is_metadata ? 'M' : 'D',
+			       (unsigned long long)block->logical_bytenr,
+			       block->dev_state->name,
+			       (unsigned long long)block->dev_bytenr,
+			       block->mirror_num);
+		list_add(&block->all_blocks_node, &state->all_blocks_list);
+		btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+		if (is_metadata) {
+			ret = btrfsic_process_metablock(state, block,
+							&block_ctx,
+							(struct btrfs_header *)
+							block_ctx.data, 0, 0);
+			if (ret)
+				printk(KERN_INFO
+				       "btrfsic: process_metablock(root @%llu)"
+				       " failed!\n",
+				       (unsigned long long)dev_bytenr);
+		}
+		btrfsic_release_block_ctx(&block_ctx);
+	}
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+	struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+	int iodone_w_error;
+
+	/* mutex is not held! This is not save if IO is not yet completed
+	 * on umount */
+	iodone_w_error = 0;
+	if (bio_error_status)
+		iodone_w_error = 1;
+
+	BUG_ON(NULL == block);
+	bp->bi_private = block->orig_bio_bh_private;
+	bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+	do {
+		struct btrfsic_block *next_block;
+		struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+		if ((dev_state->state->print_mask &
+		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+			printk(KERN_INFO
+			       "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+			       bio_error_status,
+			       btrfsic_get_block_type(dev_state->state, block),
+			       (unsigned long long)block->logical_bytenr,
+			       dev_state->name,
+			       (unsigned long long)block->dev_bytenr,
+			       block->mirror_num);
+		next_block = block->next_in_same_bio;
+		block->iodone_w_error = iodone_w_error;
+		if (block->submit_bio_bh_rw & REQ_FLUSH) {
+			dev_state->last_flush_gen++;
+			if ((dev_state->state->print_mask &
+			     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+				printk(KERN_INFO
+				       "bio_end_io() new %s flush_gen=%llu\n",
+				       dev_state->name,
+				       (unsigned long long)
+				       dev_state->last_flush_gen);
+		}
+		if (block->submit_bio_bh_rw & REQ_FUA)
+			block->flush_gen = 0; /* FUA completed means block is
+					       * on disk */
+		block->is_iodone = 1; /* for FLUSH, this releases the block */
+		block = next_block;
+	} while (NULL != block);
+
+	bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+	struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+	int iodone_w_error = !uptodate;
+	struct btrfsic_dev_state *dev_state;
+
+	BUG_ON(NULL == block);
+	dev_state = block->dev_state;
+	if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+		printk(KERN_INFO
+		       "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+		       iodone_w_error,
+		       btrfsic_get_block_type(dev_state->state, block),
+		       (unsigned long long)block->logical_bytenr,
+		       block->dev_state->name,
+		       (unsigned long long)block->dev_bytenr,
+		       block->mirror_num);
+
+	block->iodone_w_error = iodone_w_error;
+	if (block->submit_bio_bh_rw & REQ_FLUSH) {
+		dev_state->last_flush_gen++;
+		if ((dev_state->state->print_mask &
+		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+			printk(KERN_INFO
+			       "bh_end_io() new %s flush_gen=%llu\n",
+			       dev_state->name,
+			       (unsigned long long)dev_state->last_flush_gen);
+	}
+	if (block->submit_bio_bh_rw & REQ_FUA)
+		block->flush_gen = 0; /* FUA completed means block is on disk */
+
+	bh->b_private = block->orig_bio_bh_private;
+	bh->b_end_io = block->orig_bio_bh_end_io.bh;
+	block->is_iodone = 1; /* for FLUSH, this releases the block */
+	bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const superblock,
+		struct btrfs_super_block *const super_hdr)
+{
+	int pass;
+
+	superblock->generation = btrfs_super_generation(super_hdr);
+	if (!(superblock->generation > state->max_superblock_generation ||
+	      0 == state->max_superblock_generation)) {
+		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+			printk(KERN_INFO
+			       "btrfsic: superblock @%llu (%s/%llu/%d)"
+			       " with old gen %llu <= %llu\n",
+			       (unsigned long long)superblock->logical_bytenr,
+			       superblock->dev_state->name,
+			       (unsigned long long)superblock->dev_bytenr,
+			       superblock->mirror_num,
+			       (unsigned long long)
+			       btrfs_super_generation(super_hdr),
+			       (unsigned long long)
+			       state->max_superblock_generation);
+	} else {
+		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+			printk(KERN_INFO
+			       "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+			       " with new gen %llu > %llu\n",
+			       (unsigned long long)superblock->logical_bytenr,
+			       superblock->dev_state->name,
+			       (unsigned long long)superblock->dev_bytenr,
+			       superblock->mirror_num,
+			       (unsigned long long)
+			       btrfs_super_generation(super_hdr),
+			       (unsigned long long)
+			       state->max_superblock_generation);
+
+		state->max_superblock_generation =
+		    btrfs_super_generation(super_hdr);
+		state->latest_superblock = superblock;
+	}
+
+	for (pass = 0; pass < 3; pass++) {
+		int ret;
+		u64 next_bytenr;
+		struct btrfsic_block *next_block;
+		struct btrfsic_block_data_ctx tmp_next_block_ctx;
+		struct btrfsic_block_link *l;
+		int num_copies;
+		int mirror_num;
+		const char *additional_string = NULL;
+		struct btrfs_disk_key tmp_disk_key;
+
+		tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_disk_key.offset = 0;
+
+		switch (pass) {
+		case 0:
+			tmp_disk_key.objectid =
+			    cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+			additional_string = "root ";
+			next_bytenr = btrfs_super_root(super_hdr);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "root@%llu\n",
+				       (unsigned long long)next_bytenr);
+			break;
+		case 1:
+			tmp_disk_key.objectid =
+			    cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+			additional_string = "chunk ";
+			next_bytenr = btrfs_super_chunk_root(super_hdr);
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "chunk@%llu\n",
+				       (unsigned long long)next_bytenr);
+			break;
+		case 2:
+			tmp_disk_key.objectid =
+			    cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+			additional_string = "log ";
+			next_bytenr = btrfs_super_log_root(super_hdr);
+			if (0 == next_bytenr)
+				continue;
+			if (state->print_mask &
+			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+				printk(KERN_INFO "log@%llu\n",
+				       (unsigned long long)next_bytenr);
+			break;
+		}
+
+		num_copies =
+		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+				     next_bytenr, PAGE_SIZE);
+		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+			       (unsigned long long)next_bytenr, num_copies);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			int was_created;
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				printk(KERN_INFO
+				       "btrfsic_process_written_superblock("
+				       "mirror_num=%d)\n", mirror_num);
+			ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+						&tmp_next_block_ctx,
+						mirror_num);
+			if (ret) {
+				printk(KERN_INFO
+				       "btrfsic: btrfsic_map_block(@%llu,"
+				       " mirror=%d) failed!\n",
+				       (unsigned long long)next_bytenr,
+				       mirror_num);
+				return -1;
+			}
+
+			next_block = btrfsic_block_lookup_or_add(
+					state,
+					&tmp_next_block_ctx,
+					additional_string,
+					1, 0, 1,
+					mirror_num,
+					&was_created);
+			if (NULL == next_block) {
+				printk(KERN_INFO
+				       "btrfsic: error, kmalloc failed!\n");
+				btrfsic_release_block_ctx(&tmp_next_block_ctx);
+				return -1;
+			}
+
+			next_block->disk_key = tmp_disk_key;
+			if (was_created)
+				next_block->generation =
+				    BTRFSIC_GENERATION_UNKNOWN;
+			l = btrfsic_block_link_lookup_or_add(
+					state,
+					&tmp_next_block_ctx,
+					next_block,
+					superblock,
+					BTRFSIC_GENERATION_UNKNOWN);
+			btrfsic_release_block_ctx(&tmp_next_block_ctx);
+			if (NULL == l)
+				return -1;
+		}
+	}
+
+	if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+		WARN_ON(1);
+		btrfsic_dump_tree(state);
+	}
+
+	return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+					struct btrfsic_block *const block,
+					int recursion_level)
+{
+	struct list_head *elem_ref_to;
+	int ret = 0;
+
+	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+		/*
+		 * Note that this situation can happen and does not
+		 * indicate an error in regular cases. It happens
+		 * when disk blocks are freed and later reused.
+		 * The check-integrity module is not aware of any
+		 * block free operations, it just recognizes block
+		 * write operations. Therefore it keeps the linkage
+		 * information for a block until a block is
+		 * rewritten. This can temporarily cause incorrect
+		 * and even circular linkage informations. This
+		 * causes no harm unless such blocks are referenced
+		 * by the most recent super block.
+		 */
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "btrfsic: abort cyclic linkage (case 1).\n");
+
+		return ret;
+	}
+
+	/*
+	 * This algorithm is recursive because the amount of used stack
+	 * space is very small and the max recursion depth is limited.
+	 */
+	list_for_each(elem_ref_to, &block->ref_to_list) {
+		const struct btrfsic_block_link *const l =
+		    list_entry(elem_ref_to, struct btrfsic_block_link,
+			       node_ref_to);
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "rl=%d, %c @%llu (%s/%llu/%d)"
+			       " %u* refers to %c @%llu (%s/%llu/%d)\n",
+			       recursion_level,
+			       btrfsic_get_block_type(state, block),
+			       (unsigned long long)block->logical_bytenr,
+			       block->dev_state->name,
+			       (unsigned long long)block->dev_bytenr,
+			       block->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+		if (l->block_ref_to->never_written) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which is never written!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
+		} else if (!l->block_ref_to->is_iodone) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which is not yet iodone!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
+		} else if (l->parent_generation !=
+			   l->block_ref_to->generation &&
+			   BTRFSIC_GENERATION_UNKNOWN !=
+			   l->parent_generation &&
+			   BTRFSIC_GENERATION_UNKNOWN !=
+			   l->block_ref_to->generation) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " with generation %llu !="
+			       " parent generation %llu!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num,
+			       (unsigned long long)l->block_ref_to->generation,
+			       (unsigned long long)l->parent_generation);
+			ret = -1;
+		} else if (l->block_ref_to->flush_gen >
+			   l->block_ref_to->dev_state->last_flush_gen) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which is not flushed out of disk's write cache"
+			       " (block flush_gen=%llu,"
+			       " dev->flush_gen=%llu)!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num,
+			       (unsigned long long)block->flush_gen,
+			       (unsigned long long)
+			       l->block_ref_to->dev_state->last_flush_gen);
+			ret = -1;
+		} else if (-1 == btrfsic_check_all_ref_blocks(state,
+							      l->block_ref_to,
+							      recursion_level +
+							      1)) {
+			ret = -1;
+		}
+	}
+
+	return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+		const struct btrfsic_state *state,
+		const struct btrfsic_block *block,
+		int recursion_level)
+{
+	struct list_head *elem_ref_from;
+
+	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+		/* refer to comment at "abort cyclic linkage (case 1)" */
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "btrfsic: abort cyclic linkage (case 2).\n");
+
+		return 0;
+	}
+
+	/*
+	 * This algorithm is recursive because the amount of used stack space
+	 * is very small and the max recursion depth is limited.
+	 */
+	list_for_each(elem_ref_from, &block->ref_from_list) {
+		const struct btrfsic_block_link *const l =
+		    list_entry(elem_ref_from, struct btrfsic_block_link,
+			       node_ref_from);
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "rl=%d, %c @%llu (%s/%llu/%d)"
+			       " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+			       recursion_level,
+			       btrfsic_get_block_type(state, block),
+			       (unsigned long long)block->logical_bytenr,
+			       block->dev_state->name,
+			       (unsigned long long)block->dev_bytenr,
+			       block->mirror_num,
+			       l->ref_cnt,
+			       btrfsic_get_block_type(state, l->block_ref_from),
+			       (unsigned long long)
+			       l->block_ref_from->logical_bytenr,
+			       l->block_ref_from->dev_state->name,
+			       (unsigned long long)
+			       l->block_ref_from->dev_bytenr,
+			       l->block_ref_from->mirror_num);
+		if (l->block_ref_from->is_superblock &&
+		    state->latest_superblock->dev_bytenr ==
+		    l->block_ref_from->dev_bytenr &&
+		    state->latest_superblock->dev_state->bdev ==
+		    l->block_ref_from->dev_state->bdev)
+			return 1;
+		else if (btrfsic_is_block_ref_by_superblock(state,
+							    l->block_ref_from,
+							    recursion_level +
+							    1))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l)
+{
+	printk(KERN_INFO
+	       "Add %u* link from %c @%llu (%s/%llu/%d)"
+	       " to %c @%llu (%s/%llu/%d).\n",
+	       l->ref_cnt,
+	       btrfsic_get_block_type(state, l->block_ref_from),
+	       (unsigned long long)l->block_ref_from->logical_bytenr,
+	       l->block_ref_from->dev_state->name,
+	       (unsigned long long)l->block_ref_from->dev_bytenr,
+	       l->block_ref_from->mirror_num,
+	       btrfsic_get_block_type(state, l->block_ref_to),
+	       (unsigned long long)l->block_ref_to->logical_bytenr,
+	       l->block_ref_to->dev_state->name,
+	       (unsigned long long)l->block_ref_to->dev_bytenr,
+	       l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l)
+{
+	printk(KERN_INFO
+	       "Rem %u* link from %c @%llu (%s/%llu/%d)"
+	       " to %c @%llu (%s/%llu/%d).\n",
+	       l->ref_cnt,
+	       btrfsic_get_block_type(state, l->block_ref_from),
+	       (unsigned long long)l->block_ref_from->logical_bytenr,
+	       l->block_ref_from->dev_state->name,
+	       (unsigned long long)l->block_ref_from->dev_bytenr,
+	       l->block_ref_from->mirror_num,
+	       btrfsic_get_block_type(state, l->block_ref_to),
+	       (unsigned long long)l->block_ref_to->logical_bytenr,
+	       l->block_ref_to->dev_state->name,
+	       (unsigned long long)l->block_ref_to->dev_bytenr,
+	       l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+				   const struct btrfsic_block *block)
+{
+	if (block->is_superblock &&
+	    state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+	    state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+		return 'S';
+	else if (block->is_superblock)
+		return 's';
+	else if (block->is_metadata)
+		return 'M';
+	else
+		return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+	btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+				  const struct btrfsic_block *block,
+				  int indent_level)
+{
+	struct list_head *elem_ref_to;
+	int indent_add;
+	static char buf[80];
+	int cursor_position;
+
+	/*
+	 * Should better fill an on-stack buffer with a complete line and
+	 * dump it at once when it is time to print a newline character.
+	 */
+
+	/*
+	 * This algorithm is recursive because the amount of used stack space
+	 * is very small and the max recursion depth is limited.
+	 */
+	indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+			     btrfsic_get_block_type(state, block),
+			     (unsigned long long)block->logical_bytenr,
+			     block->dev_state->name,
+			     (unsigned long long)block->dev_bytenr,
+			     block->mirror_num);
+	if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+		printk("[...]\n");
+		return;
+	}
+	printk(buf);
+	indent_level += indent_add;
+	if (list_empty(&block->ref_to_list)) {
+		printk("\n");
+		return;
+	}
+	if (block->mirror_num > 1 &&
+	    !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+		printk(" [...]\n");
+		return;
+	}
+
+	cursor_position = indent_level;
+	list_for_each(elem_ref_to, &block->ref_to_list) {
+		const struct btrfsic_block_link *const l =
+		    list_entry(elem_ref_to, struct btrfsic_block_link,
+			       node_ref_to);
+
+		while (cursor_position < indent_level) {
+			printk(" ");
+			cursor_position++;
+		}
+		if (l->ref_cnt > 1)
+			indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+		else
+			indent_add = sprintf(buf, " --> ");
+		if (indent_level + indent_add >
+		    BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+			printk("[...]\n");
+			cursor_position = 0;
+			continue;
+		}
+
+		printk(buf);
+
+		btrfsic_dump_tree_sub(state, l->block_ref_to,
+				      indent_level + indent_add);
+		cursor_position = 0;
+	}
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block *next_block,
+		struct btrfsic_block *from_block,
+		u64 parent_generation)
+{
+	struct btrfsic_block_link *l;
+
+	l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+						next_block_ctx->dev_bytenr,
+						from_block->dev_state->bdev,
+						from_block->dev_bytenr,
+						&state->block_link_hashtable);
+	if (NULL == l) {
+		l = btrfsic_block_link_alloc();
+		if (NULL == l) {
+			printk(KERN_INFO
+			       "btrfsic: error, kmalloc" " failed!\n");
+			return NULL;
+		}
+
+		l->block_ref_to = next_block;
+		l->block_ref_from = from_block;
+		l->ref_cnt = 1;
+		l->parent_generation = parent_generation;
+
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			btrfsic_print_add_link(state, l);
+
+		list_add(&l->node_ref_to, &from_block->ref_to_list);
+		list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+		btrfsic_block_link_hashtable_add(l,
+						 &state->block_link_hashtable);
+	} else {
+		l->ref_cnt++;
+		l->parent_generation = parent_generation;
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			btrfsic_print_add_link(state, l);
+	}
+
+	return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *block_ctx,
+		const char *additional_string,
+		int is_metadata,
+		int is_iodone,
+		int never_written,
+		int mirror_num,
+		int *was_created)
+{
+	struct btrfsic_block *block;
+
+	block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+					       block_ctx->dev_bytenr,
+					       &state->block_hashtable);
+	if (NULL == block) {
+		struct btrfsic_dev_state *dev_state;
+
+		block = btrfsic_block_alloc();
+		if (NULL == block) {
+			printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+			return NULL;
+		}
+		dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+		if (NULL == dev_state) {
+			printk(KERN_INFO
+			       "btrfsic: error, lookup dev_state failed!\n");
+			btrfsic_block_free(block);
+			return NULL;
+		}
+		block->dev_state = dev_state;
+		block->dev_bytenr = block_ctx->dev_bytenr;
+		block->logical_bytenr = block_ctx->start;
+		block->is_metadata = is_metadata;
+		block->is_iodone = is_iodone;
+		block->never_written = never_written;
+		block->mirror_num = mirror_num;
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+			printk(KERN_INFO
+			       "New %s%c-block @%llu (%s/%llu/%d)\n",
+			       additional_string,
+			       btrfsic_get_block_type(state, block),
+			       (unsigned long long)block->logical_bytenr,
+			       dev_state->name,
+			       (unsigned long long)block->dev_bytenr,
+			       mirror_num);
+		list_add(&block->all_blocks_node, &state->all_blocks_list);
+		btrfsic_block_hashtable_add(block, &state->block_hashtable);
+		if (NULL != was_created)
+			*was_created = 1;
+	} else {
+		if (NULL != was_created)
+			*was_created = 0;
+	}
+
+	return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+					   u64 bytenr,
+					   struct btrfsic_dev_state *dev_state,
+					   u64 dev_bytenr, char *data)
+{
+	int num_copies;
+	int mirror_num;
+	int ret;
+	struct btrfsic_block_data_ctx block_ctx;
+	int match = 0;
+
+	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+				      bytenr, PAGE_SIZE);
+
+	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+		ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+					&block_ctx, mirror_num);
+		if (ret) {
+			printk(KERN_INFO "btrfsic:"
+			       " btrfsic_map_block(logical @%llu,"
+			       " mirror %d) failed!\n",
+			       (unsigned long long)bytenr, mirror_num);
+			continue;
+		}
+
+		if (dev_state->bdev == block_ctx.dev->bdev &&
+		    dev_bytenr == block_ctx.dev_bytenr) {
+			match++;
+			btrfsic_release_block_ctx(&block_ctx);
+			break;
+		}
+		btrfsic_release_block_ctx(&block_ctx);
+	}
+
+	if (!match) {
+		printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+		       " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+		       " phys_bytenr=%llu)!\n",
+		       (unsigned long long)bytenr, dev_state->name,
+		       (unsigned long long)dev_bytenr);
+		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+			ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+						&block_ctx, mirror_num);
+			if (ret)
+				continue;
+
+			printk(KERN_INFO "Read logical bytenr @%llu maps to"
+			       " (%s/%llu/%d)\n",
+			       (unsigned long long)bytenr,
+			       block_ctx.dev->name,
+			       (unsigned long long)block_ctx.dev_bytenr,
+			       mirror_num);
+		}
+		WARN_ON(1);
+	}
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+		struct block_device *bdev)
+{
+	struct btrfsic_dev_state *ds;
+
+	ds = btrfsic_dev_state_hashtable_lookup(bdev,
+						&btrfsic_dev_state_hashtable);
+	return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+	struct btrfsic_dev_state *dev_state;
+
+	if (!btrfsic_is_initialized)
+		return submit_bh(rw, bh);
+
+	mutex_lock(&btrfsic_mutex);
+	/* since btrfsic_submit_bh() might also be called before
+	 * btrfsic_mount(), this might return NULL */
+	dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+	/* Only called to write the superblock (incl. FLUSH/FUA) */
+	if (NULL != dev_state &&
+	    (rw & WRITE) && bh->b_size > 0) {
+		u64 dev_bytenr;
+
+		dev_bytenr = 4096 * bh->b_blocknr;
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
+			       " size=%lu, data=%p, bdev=%p)\n",
+			       rw, bh->b_blocknr,
+			       (unsigned long long)dev_bytenr, bh->b_size,
+			       bh->b_data, bh->b_bdev);
+		btrfsic_process_written_block(dev_state, dev_bytenr,
+					      bh->b_data, bh->b_size, NULL,
+					      NULL, bh, rw);
+	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+			       rw, bh->b_bdev);
+		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+			if ((dev_state->state->print_mask &
+			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+			      BTRFSIC_PRINT_MASK_VERBOSE)))
+				printk(KERN_INFO
+				       "btrfsic_submit_bh(%s) with FLUSH"
+				       " but dummy block already in use"
+				       " (ignored)!\n",
+				       dev_state->name);
+		} else {
+			struct btrfsic_block *const block =
+				&dev_state->dummy_block_for_bio_bh_flush;
+
+			block->is_iodone = 0;
+			block->never_written = 0;
+			block->iodone_w_error = 0;
+			block->flush_gen = dev_state->last_flush_gen + 1;
+			block->submit_bio_bh_rw = rw;
+			block->orig_bio_bh_private = bh->b_private;
+			block->orig_bio_bh_end_io.bh = bh->b_end_io;
+			block->next_in_same_bio = NULL;
+			bh->b_private = block;
+			bh->b_end_io = btrfsic_bh_end_io;
+		}
+	}
+	mutex_unlock(&btrfsic_mutex);
+	return submit_bh(rw, bh);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+	struct btrfsic_dev_state *dev_state;
+
+	if (!btrfsic_is_initialized) {
+		submit_bio(rw, bio);
+		return;
+	}
+
+	mutex_lock(&btrfsic_mutex);
+	/* since btrfsic_submit_bio() is also called before
+	 * btrfsic_mount(), this might return NULL */
+	dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+	if (NULL != dev_state &&
+	    (rw & WRITE) && NULL != bio->bi_io_vec) {
+		unsigned int i;
+		u64 dev_bytenr;
+		int bio_is_patched;
+
+		dev_bytenr = 512 * bio->bi_sector;
+		bio_is_patched = 0;
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bio(rw=0x%x, bi_vcnt=%u,"
+			       " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
+			       rw, bio->bi_vcnt, bio->bi_sector,
+			       (unsigned long long)dev_bytenr,
+			       bio->bi_bdev);
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			u8 *mapped_data;
+
+			mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+			if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+			     BTRFSIC_PRINT_MASK_VERBOSE) ==
+			    (dev_state->state->print_mask &
+			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+			      BTRFSIC_PRINT_MASK_VERBOSE)))
+				printk(KERN_INFO
+				       "#%u: page=%p, mapped=%p, len=%u,"
+				       " offset=%u\n",
+				       i, bio->bi_io_vec[i].bv_page,
+				       mapped_data,
+				       bio->bi_io_vec[i].bv_len,
+				       bio->bi_io_vec[i].bv_offset);
+			btrfsic_process_written_block(dev_state, dev_bytenr,
+						      mapped_data,
+						      bio->bi_io_vec[i].bv_len,
+						      bio, &bio_is_patched,
+						      NULL, rw);
+			kunmap(bio->bi_io_vec[i].bv_page);
+			dev_bytenr += bio->bi_io_vec[i].bv_len;
+		}
+	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+		if (dev_state->state->print_mask &
+		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+			printk(KERN_INFO
+			       "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+			       rw, bio->bi_bdev);
+		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+			if ((dev_state->state->print_mask &
+			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+			      BTRFSIC_PRINT_MASK_VERBOSE)))
+				printk(KERN_INFO
+				       "btrfsic_submit_bio(%s) with FLUSH"
+				       " but dummy block already in use"
+				       " (ignored)!\n",
+				       dev_state->name);
+		} else {
+			struct btrfsic_block *const block =
+				&dev_state->dummy_block_for_bio_bh_flush;
+
+			block->is_iodone = 0;
+			block->never_written = 0;
+			block->iodone_w_error = 0;
+			block->flush_gen = dev_state->last_flush_gen + 1;
+			block->submit_bio_bh_rw = rw;
+			block->orig_bio_bh_private = bio->bi_private;
+			block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+			block->next_in_same_bio = NULL;
+			bio->bi_private = block;
+			bio->bi_end_io = btrfsic_bio_end_io;
+		}
+	}
+	mutex_unlock(&btrfsic_mutex);
+
+	submit_bio(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+		  struct btrfs_fs_devices *fs_devices,
+		  int including_extent_data, u32 print_mask)
+{
+	int ret;
+	struct btrfsic_state *state;
+	struct list_head *dev_head = &fs_devices->devices;
+	struct btrfs_device *device;
+
+	state = kzalloc(sizeof(*state), GFP_NOFS);
+	if (NULL == state) {
+		printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+		return -1;
+	}
+
+	if (!btrfsic_is_initialized) {
+		mutex_init(&btrfsic_mutex);
+		btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+		btrfsic_is_initialized = 1;
+	}
+	mutex_lock(&btrfsic_mutex);
+	state->root = root;
+	state->print_mask = print_mask;
+	state->include_extent_data = including_extent_data;
+	state->csum_size = 0;
+	INIT_LIST_HEAD(&state->all_blocks_list);
+	btrfsic_block_hashtable_init(&state->block_hashtable);
+	btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+	state->max_superblock_generation = 0;
+	state->latest_superblock = NULL;
+
+	list_for_each_entry(device, dev_head, dev_list) {
+		struct btrfsic_dev_state *ds;
+		char *p;
+
+		if (!device->bdev || !device->name)
+			continue;
+
+		ds = btrfsic_dev_state_alloc();
+		if (NULL == ds) {
+			printk(KERN_INFO
+			       "btrfs check-integrity: kmalloc() failed!\n");
+			mutex_unlock(&btrfsic_mutex);
+			return -1;
+		}
+		ds->bdev = device->bdev;
+		ds->state = state;
+		bdevname(ds->bdev, ds->name);
+		ds->name[BDEVNAME_SIZE - 1] = '\0';
+		for (p = ds->name; *p != '\0'; p++);
+		while (p > ds->name && *p != '/')
+			p--;
+		if (*p == '/')
+			p++;
+		strlcpy(ds->name, p, sizeof(ds->name));
+		btrfsic_dev_state_hashtable_add(ds,
+						&btrfsic_dev_state_hashtable);
+	}
+
+	ret = btrfsic_process_superblock(state, fs_devices);
+	if (0 != ret) {
+		mutex_unlock(&btrfsic_mutex);
+		btrfsic_unmount(root, fs_devices);
+		return ret;
+	}
+
+	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+		btrfsic_dump_database(state);
+	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+		btrfsic_dump_tree(state);
+
+	mutex_unlock(&btrfsic_mutex);
+	return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+		     struct btrfs_fs_devices *fs_devices)
+{
+	struct list_head *elem_all;
+	struct list_head *tmp_all;
+	struct btrfsic_state *state;
+	struct list_head *dev_head = &fs_devices->devices;
+	struct btrfs_device *device;
+
+	if (!btrfsic_is_initialized)
+		return;
+
+	mutex_lock(&btrfsic_mutex);
+
+	state = NULL;
+	list_for_each_entry(device, dev_head, dev_list) {
+		struct btrfsic_dev_state *ds;
+
+		if (!device->bdev || !device->name)
+			continue;
+
+		ds = btrfsic_dev_state_hashtable_lookup(
+				device->bdev,
+				&btrfsic_dev_state_hashtable);
+		if (NULL != ds) {
+			state = ds->state;
+			btrfsic_dev_state_hashtable_remove(ds);
+			btrfsic_dev_state_free(ds);
+		}
+	}
+
+	if (NULL == state) {
+		printk(KERN_INFO
+		       "btrfsic: error, cannot find state information"
+		       " on umount!\n");
+		mutex_unlock(&btrfsic_mutex);
+		return;
+	}
+
+	/*
+	 * Don't care about keeping the lists' state up to date,
+	 * just free all memory that was allocated dynamically.
+	 * Free the blocks and the block_links.
+	 */
+	list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+		struct btrfsic_block *const b_all =
+		    list_entry(elem_all, struct btrfsic_block,
+			       all_blocks_node);
+		struct list_head *elem_ref_to;
+		struct list_head *tmp_ref_to;
+
+		list_for_each_safe(elem_ref_to, tmp_ref_to,
+				   &b_all->ref_to_list) {
+			struct btrfsic_block_link *const l =
+			    list_entry(elem_ref_to,
+				       struct btrfsic_block_link,
+				       node_ref_to);
+
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+				btrfsic_print_rem_link(state, l);
+
+			l->ref_cnt--;
+			if (0 == l->ref_cnt)
+				btrfsic_block_link_free(l);
+		}
+
+		if (b_all->is_iodone)
+			btrfsic_block_free(b_all);
+		else
+			printk(KERN_INFO "btrfs: attempt to free %c-block"
+			       " @%llu (%s/%llu/%d) on umount which is"
+			       " not yet iodone!\n",
+			       btrfsic_get_block_type(state, b_all),
+			       (unsigned long long)b_all->logical_bytenr,
+			       b_all->dev_state->name,
+			       (unsigned long long)b_all->dev_bytenr,
+			       b_all->mirror_num);
+	}
+
+	mutex_unlock(&btrfsic_mutex);
+
+	kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+		  struct btrfs_fs_devices *fs_devices,
+		  int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+		     struct btrfs_fs_devices *fs_devices);
+
+#endif
-- 
cgit v1.2.3


From c975dd469d748ce619c510050d4fb407c2398591 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 1 Nov 2011 17:06:04 +0100
Subject: Btrfs: add config option to enable btrfs integrity check

Added the BTRFS_FS_CHECK_INTEGRITY option to Kconfig. It depends on
BTRFS_FS.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/Kconfig | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+	depends on BTRFS_FS
+	help
+	  Adds code that examines all block write requests (including
+	  writes of the super block). The goal is to verify that the
+	  state of the filesystem on disk is always consistent, i.e.,
+	  after a power-loss or kernel panic event the filesystem is
+	  in a consistent state.
+
+	  If the integrity check tool is included and activated in
+	  the mount options, plenty of kernel memory is used, and
+	  plenty of additional CPU cycles are spent. Enabling this
+	  functionality is not intended for normal use.
+
+	  In most cases, unless you are a btrfs developer who needs
+	  to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N
-- 
cgit v1.2.3


From f11e4d7f533249ddfa110116200c5c3a509f9218 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 1 Nov 2011 17:06:39 +0100
Subject: Btrfs: Makefile changes to optionally include btrfs integrity check

If the btrfs integrity check is enabled, the files required to
implement the checks are included in the build.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..bc5b3556cee6 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,3 +11,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
-- 
cgit v1.2.3


From 21adbd5cbb5344a3fca6bb7ddb2ab6cb03c44546 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Wed, 9 Nov 2011 13:44:05 +0100
Subject: Btrfs: integrate integrity check module into btrfs

This is the last part of the patch series. It modifies the btrfs
code to use the integrity check module if configured to do so
with the define BTRFS_FS_CHECK_INTEGRITY. If this define is not set,
the only effective change is that code is added that handles the
mount option to activate the integrity check. If the mount option is
set and the define BTRFS_FS_CHECK_INTEGRITY is not set, that code
complains in the log and the mount fails with EINVAL.

Add the mount option to activate the usage of the integrity check
code.
Add invocation of btrfs integrity check code init and cleanup
function on mount and umount, respectively.
Add hook to call btrfs integrity check code version of
submit_bh/submit_bio.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/ctree.h     |  8 +++++++-
 fs/btrfs/disk-io.c   | 26 ++++++++++++++++++++++++--
 fs/btrfs/extent_io.c |  5 +++--
 fs/btrfs/scrub.c     |  5 +++--
 fs/btrfs/super.c     | 39 ++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.c   |  7 ++++---
 6 files changed, 79 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..39f6188688e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -971,7 +971,7 @@ struct btrfs_fs_info {
 	 * is required instead of the faster short fsync log commits
 	 */
 	u64 last_trans_log_full_commit;
-	unsigned long mount_opt:20;
+	unsigned long mount_opt:21;
 	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
@@ -1155,6 +1155,10 @@ struct btrfs_fs_info {
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	u32 check_integrity_print_mask;
+#endif
+
 	/* filesystem state */
 	u64 fs_state;
 
@@ -1413,6 +1417,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
 #define BTRFS_MOUNT_RECOVERY		(1 << 18)
+#define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 20)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f9d5551e582..f363c6d9c3de 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
 #include "tree-log.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "check-integrity.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -2001,6 +2002,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
 	init_rwsem(&fs_info->scrub_super_lock);
 	fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	fs_info->check_integrity_print_mask = 0;
+#endif
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
@@ -2356,6 +2360,19 @@ retry_root_backup:
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+		ret = btrfsic_mount(tree_root, fs_devices,
+				    btrfs_test_opt(tree_root,
+					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+				    1 : 0,
+				    fs_info->check_integrity_print_mask);
+		if (ret)
+			printk(KERN_WARNING "btrfs: failed to initialize"
+			       " integrity check module %s\n", sb->s_id);
+	}
+#endif
+
 	/* do not make disk changes in broken FS */
 	if (btrfs_super_log_root(disk_super) != 0 &&
 	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2634,7 +2651,7 @@ static int write_dev_supers(struct btrfs_device *device,
 		 * we fua the first super.  The others we allow
 		 * to go down lazy.
 		 */
-		ret = submit_bh(WRITE_FUA, bh);
+		ret = btrfsic_submit_bh(WRITE_FUA, bh);
 		if (ret)
 			errors++;
 	}
@@ -2711,7 +2728,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	device->flush_bio = bio;
 
 	bio_get(bio);
-	submit_bio(WRITE_FLUSH, bio);
+	btrfsic_submit_bio(WRITE_FLUSH, bio);
 
 	return 0;
 }
@@ -3057,6 +3074,11 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->caching_workers);
 	btrfs_stop_workers(&fs_info->readahead_workers);
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(root, CHECK_INTEGRITY))
+		btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..246669296e02 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
+#include "check-integrity.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 	}
 	bio->bi_bdev = dev->bdev;
 	bio_add_page(bio, page, length, start-page_offset(page));
-	submit_bio(WRITE_SYNC, bio);
+	btrfsic_submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&compl);
 
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num, bio_flags, start);
 	else
-		submit_bio(rw, bio);
+		btrfsic_submit_bio(rw, bio);
 
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..567e148caca2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "check-integrity.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -732,7 +733,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 	bio_add_page(bio, page, PAGE_SIZE, 0);
 	bio->bi_end_io = scrub_fixup_end_io;
 	bio->bi_private = &complete;
-	submit_bio(rw, bio);
+	btrfsic_submit_bio(rw, bio);
 
 	/* this will also unplug the queue */
 	wait_for_completion(&complete);
@@ -958,7 +959,7 @@ static int scrub_submit(struct scrub_dev *sdev)
 	sdev->curr = -1;
 	atomic_inc(&sdev->in_flight);
 
-	submit_bio(READ, sbio->bio);
+	btrfsic_submit_bio(READ, sbio->bio);
 
 	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 34a8b6112ea4..22a2015f1d7b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -165,7 +165,10 @@ enum {
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
 	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-	Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+	Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+	Opt_check_integrity, Opt_check_integrity_including_extent_data,
+	Opt_check_integrity_print_mask,
+	Opt_err,
 };
 
 static match_table_t tokens = {
@@ -200,6 +203,9 @@ static match_table_t tokens = {
 	{Opt_inode_cache, "inode_cache"},
 	{Opt_no_space_cache, "nospace_cache"},
 	{Opt_recovery, "recovery"},
+	{Opt_check_integrity, "check_int"},
+	{Opt_check_integrity_including_extent_data, "check_int_data"},
+	{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
 	{Opt_err, NULL},
 };
 
@@ -398,6 +404,37 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: enabling auto recovery");
 			btrfs_set_opt(info->mount_opt, RECOVERY);
 			break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+		case Opt_check_integrity_including_extent_data:
+			printk(KERN_INFO "btrfs: enabling check integrity"
+			       " including extent data\n");
+			btrfs_set_opt(info->mount_opt,
+				      CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+			break;
+		case Opt_check_integrity:
+			printk(KERN_INFO "btrfs: enabling check integrity\n");
+			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+			break;
+		case Opt_check_integrity_print_mask:
+			intarg = 0;
+			match_int(&args[0], &intarg);
+			if (intarg) {
+				info->check_integrity_print_mask = intarg;
+				printk(KERN_INFO "btrfs:"
+				       " check_integrity_print_mask 0x%x\n",
+				       info->check_integrity_print_mask);
+			}
+			break;
+#else
+		case Opt_check_integrity_including_extent_data:
+		case Opt_check_integrity:
+		case Opt_check_integrity_print_mask:
+			printk(KERN_ERR "btrfs: support for check_integrity*"
+			       " not compiled in!\n");
+			ret = -EINVAL;
+			goto out;
+#endif
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..821334f6e3a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -32,6 +32,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "async-thread.h"
+#include "check-integrity.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
@@ -246,7 +247,7 @@ loop_lock:
 			sync_pending = 0;
 		}
 
-		submit_bio(cur->bi_rw, cur);
+		btrfsic_submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
 		if (need_resched())
@@ -3304,7 +3305,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
-		submit_bio(rw, bio);
+		btrfsic_submit_bio(rw, bio);
 		bio_put(bio);
 		return 0;
 	}
@@ -3399,7 +3400,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 			if (async_submit)
 				schedule_bio(root, dev, rw, bio);
 			else
-				submit_bio(rw, bio);
+				btrfsic_submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
-- 
cgit v1.2.3


From da5c81356426c476112f2b59fe64bdb1b37f079d Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 12:29:12 +0200
Subject: Btrfs: generic data structure to build unique lists

ulist is a generic data structures to hold a collection of unique u64
values. The only operations it supports is adding to the list and
enumerating it.

It is possible to store an auxiliary value along with the key. The
implementation is preliminary and can probably be sped up significantly.

It is used by btrfs_find_all_roots() quota to translate recursions into
iterative loops.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/Makefile |   2 +-
 fs/btrfs/ulist.c  | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ulist.h  |  68 +++++++++++++++++
 3 files changed, 289 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/ulist.c
 create mode 100644 fs/btrfs/ulist.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..70798407b9a2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o
+	   reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "ulist.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * elem = NULL;
+ *
+ * while ((elem = ulist_next(ulist, elem)) {
+ * 	for (all child nodes n in elem)
+ *		ulist_add(ulist, n);
+ *	do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are adressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist:	the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+	ulist->nnodes = 0;
+	ulist->nodes = ulist->int_nodes;
+	ulist->nodes_alloced = ULIST_SIZE;
+}
+EXPORT_SYMBOL(ulist_init);
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist:	the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+void ulist_fini(struct ulist *ulist)
+{
+	/*
+	 * The first ULIST_SIZE elements are stored inline in struct ulist.
+	 * Only if more elements are alocated they need to be freed.
+	 */
+	if (ulist->nodes_alloced > ULIST_SIZE)
+		kfree(ulist->nodes);
+	ulist->nodes_alloced = 0;	/* in case ulist_fini is called twice */
+}
+EXPORT_SYMBOL(ulist_fini);
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist:	ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+	ulist_fini(ulist);
+	ulist_init(ulist);
+}
+EXPORT_SYMBOL(ulist_reinit);
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask:	allocation flags to for base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(unsigned long gfp_mask)
+{
+	struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+	if (!ulist)
+		return NULL;
+
+	ulist_init(ulist);
+
+	return ulist;
+}
+EXPORT_SYMBOL(ulist_alloc);
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist:	ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+	if (!ulist)
+		return;
+	ulist_fini(ulist);
+	kfree(ulist);
+}
+EXPORT_SYMBOL(ulist_free);
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist:	ulist to add the element to
+ * @val:	value to add to ulist
+ * @aux:	auxiliary value to store along with val
+ * @gfp_mask:	flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ *       locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+	      unsigned long gfp_mask)
+{
+	int i;
+
+	for (i = 0; i < ulist->nnodes; ++i) {
+		if (ulist->nodes[i].val == val)
+			return 0;
+	}
+
+	if (ulist->nnodes >= ulist->nodes_alloced) {
+		u64 new_alloced = ulist->nodes_alloced + 128;
+		struct ulist_node *new_nodes;
+		void *old = NULL;
+
+		/*
+		 * if nodes_alloced == ULIST_SIZE no memory has been allocated
+		 * yet, so pass NULL to krealloc
+		 */
+		if (ulist->nodes_alloced > ULIST_SIZE)
+			old = ulist->nodes;
+
+		new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
+				     gfp_mask);
+		if (!new_nodes)
+			return -ENOMEM;
+
+		if (!old)
+			memcpy(new_nodes, ulist->int_nodes,
+			       sizeof(ulist->int_nodes));
+
+		ulist->nodes = new_nodes;
+		ulist->nodes_alloced = new_alloced;
+	}
+	ulist->nodes[ulist->nnodes].val = val;
+	ulist->nodes[ulist->nnodes].aux = aux;
+	++ulist->nnodes;
+
+	return 1;
+}
+EXPORT_SYMBOL(ulist_add);
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist:	ulist to iterate
+ * @prev:	previously returned element or %NULL to start iteration
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ *       locking is needed
+ *
+ * This function is used to iterate an ulist. The iteration is started with
+ * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+{
+	int next;
+
+	if (ulist->nnodes == 0)
+		return NULL;
+
+	if (!prev)
+		return &ulist->nodes[0];
+
+	next = (prev - ulist->nodes) + 1;
+	if (next < 0 || next >= ulist->nnodes)
+		return NULL;
+
+	return &ulist->nodes[next];
+}
+EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ */
+
+/*
+ * number of elements statically allocated inside struct ulist
+ */
+#define ULIST_SIZE 16
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+	u64 val;		/* value to store */
+	unsigned long aux;	/* auxiliary value saved along with the val */
+};
+
+struct ulist {
+	/*
+	 * number of elements stored in list
+	 */
+	unsigned long nnodes;
+
+	/*
+	 * number of nodes we already have room for
+	 */
+	unsigned long nodes_alloced;
+
+	/*
+	 * pointer to the array storing the elements. The first ULIST_SIZE
+	 * elements are stored inline. In this case the it points to int_nodes.
+	 * After exceeding ULIST_SIZE, dynamic memory is allocated.
+	 */
+	struct ulist_node *nodes;
+
+	/*
+	 * inline storage space for the first ULIST_SIZE entries
+	 */
+	struct ulist_node int_nodes[ULIST_SIZE];
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_fini(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(unsigned long gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+	      unsigned long gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+
+#endif
-- 
cgit v1.2.3


From c7d22a3c3cdb73d8a0151e2ccc8cf4a48c48310b Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Tue, 22 Nov 2011 15:14:33 +0100
Subject: Btrfs: added helper btrfs_next_item()

btrfs_next_item() makes the btrfs path point to the next item, crossing leaf
boundaries if needed.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50634abef9b4..3e4a07b79817 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2482,6 +2482,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+	++p->slots[0];
+	if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+		return btrfs_next_leaf(root, p);
+	return 0;
+}
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 void btrfs_drop_snapshot(struct btrfs_root *root,
-- 
cgit v1.2.3


From 66d7e7f09f77456fe68683247d77721032a00ee5 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 12 Sep 2011 15:26:38 +0200
Subject: Btrfs: mark delayed refs as for cow

Add a for_cow parameter to add_delayed_*_ref and pass the appropriate value
from every call site. The for_cow parameter will later on be used to
determine if a ref will change anything with respect to qgroups.

Delayed refs coming from relocation are always counted as for_cow, as they
don't change subvol quota.

Also pass in the fs_info for later use.

btrfs_find_all_roots() will use this as an optimization, as changes that are
for_cow will not change anything with respect to which root points to a
certain leaf. Thus, we don't need to add the current sequence number to
those delayed refs.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c       |  42 ++++++++++----------
 fs/btrfs/ctree.h       |  17 +++++----
 fs/btrfs/delayed-ref.c |  50 ++++++++++++++----------
 fs/btrfs/delayed-ref.h |  15 +++++---
 fs/btrfs/disk-io.c     |   3 +-
 fs/btrfs/extent-tree.c | 101 +++++++++++++++++++++++++++++--------------------
 fs/btrfs/file.c        |  10 ++---
 fs/btrfs/inode.c       |   2 +-
 fs/btrfs/ioctl.c       |   5 ++-
 fs/btrfs/relocation.c  |  18 +++++----
 fs/btrfs/transaction.c |   4 +-
 fs/btrfs/tree-log.c    |   2 +-
 12 files changed, 155 insertions(+), 114 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
 	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
 				     new_root_objectid, &disk_key, level,
-				     buf->start, 0);
+				     buf->start, 0, 1);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-		ret = btrfs_inc_ref(trans, root, cow, 1);
+		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 	else
-		ret = btrfs_inc_ref(trans, root, cow, 0);
+		ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 
 	if (ret)
 		return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 		if ((owner == root->root_key.objectid ||
 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-			ret = btrfs_inc_ref(trans, root, buf, 1);
+			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
 			BUG_ON(ret);
 
 			if (root->root_key.objectid ==
 			    BTRFS_TREE_RELOC_OBJECTID) {
-				ret = btrfs_dec_ref(trans, root, buf, 0);
+				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
 				BUG_ON(ret);
-				ret = btrfs_inc_ref(trans, root, cow, 1);
+				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 				BUG_ON(ret);
 			}
 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 
 			if (root->root_key.objectid ==
 			    BTRFS_TREE_RELOC_OBJECTID)
-				ret = btrfs_inc_ref(trans, root, cow, 1);
+				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 			else
-				ret = btrfs_inc_ref(trans, root, cow, 0);
+				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 			BUG_ON(ret);
 		}
 		if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
 			if (root->root_key.objectid ==
 			    BTRFS_TREE_RELOC_OBJECTID)
-				ret = btrfs_inc_ref(trans, root, cow, 1);
+				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 			else
-				ret = btrfs_inc_ref(trans, root, cow, 0);
+				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 			BUG_ON(ret);
-			ret = btrfs_dec_ref(trans, root, buf, 1);
+			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
 			BUG_ON(ret);
 		}
 		clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
 				     root->root_key.objectid, &disk_key,
-				     level, search_start, empty_size);
+				     level, search_start, empty_size, 1);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		rcu_assign_pointer(root->node, cow);
 
 		btrfs_free_tree_block(trans, root, buf, parent_start,
-				      last_ref);
+				      last_ref, 1);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
-				      last_ref);
+				      last_ref, 1);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		free_extent_buffer(mid);
 
 		root_sub_used(root, mid->len);
-		btrfs_free_tree_block(trans, root, mid, 0, 1);
+		btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			if (wret)
 				ret = wret;
 			root_sub_used(root, right->len);
-			btrfs_free_tree_block(trans, root, right, 0, 1);
+			btrfs_free_tree_block(trans, root, right, 0, 1, 0);
 			free_extent_buffer(right);
 			right = NULL;
 		} else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret)
 			ret = wret;
 		root_sub_used(root, mid->len);
-		btrfs_free_tree_block(trans, root, mid, 0, 1);
+		btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
 		free_extent_buffer(mid);
 		mid = NULL;
 	} else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
 				   root->root_key.objectid, &lower_key,
-				   level, root->node->start, 0);
+				   level, root->node->start, 0, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
 	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
 					root->root_key.objectid,
-					&disk_key, level, c->start, 0);
+					&disk_key, level, c->start, 0, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -2970,7 +2970,7 @@ again:
 
 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
-					&disk_key, 0, l->start, 0);
+					&disk_key, 0, l->start, 0, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
 	root_sub_used(root, leaf->len);
 
-	btrfs_free_tree_block(trans, root, leaf, 0, 1);
+	btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
 	return 0;
 }
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e4a07b79817..543f60bddb39 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2277,11 +2277,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root, u32 blocksize,
 					u64 parent, u64 root_objectid,
 					struct btrfs_disk_key *key, int level,
-					u64 hint, u64 empty_size);
+					u64 hint, u64 empty_size, int for_cow);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
-			   u64 parent, int last_ref);
+			   u64 parent, int last_ref, int for_cow);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
@@ -2301,17 +2301,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
+		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
+		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
 				int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 owner, u64 offset);
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int for_cow);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2323,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset);
+			 u64 root_objectid, u64 owner, u64 offset, int for_cow);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
@@ -2492,7 +2492,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 void btrfs_drop_snapshot(struct btrfs_root *root,
-			 struct btrfs_block_rsv *block_rsv, int update_ref);
+			 struct btrfs_block_rsv *block_rsv, int update_ref,
+			 int for_reloc);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..3a0f0ab804f4 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -390,7 +390,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
  * this does all the dirty work in terms of maintaining the correct
  * overall modification count.
  */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+					struct btrfs_trans_handle *trans,
 					struct btrfs_delayed_ref_node *ref,
 					u64 bytenr, u64 num_bytes,
 					int action, int is_data)
@@ -468,10 +469,12 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed tree ref into the rbtree.
  */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+					 struct btrfs_trans_handle *trans,
 					 struct btrfs_delayed_ref_node *ref,
 					 u64 bytenr, u64 num_bytes, u64 parent,
-					 u64 ref_root, int level, int action)
+					 u64 ref_root, int level, int action,
+					 int for_cow)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
@@ -522,11 +525,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed data ref into the rbtree.
  */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+					 struct btrfs_trans_handle *trans,
 					 struct btrfs_delayed_ref_node *ref,
 					 u64 bytenr, u64 num_bytes, u64 parent,
 					 u64 ref_root, u64 owner, u64 offset,
-					 int action)
+					 int action, int for_cow)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
@@ -554,6 +558,7 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
 		full_ref->root = ref_root;
 		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
 	}
+
 	full_ref->objectid = owner;
 	full_ref->offset = offset;
 
@@ -580,10 +585,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root,  int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
@@ -610,12 +617,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 0);
+	ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+				   num_bytes, action, 0);
 	BUG_ON(ret);
 
-	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, level, action);
+	ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+				   num_bytes, parent, ref_root, level, action,
+				   for_cow);
 	BUG_ON(ret);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
@@ -624,11 +632,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +665,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 1);
+	ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+				   num_bytes, action, 1);
 	BUG_ON(ret);
 
-	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, owner, offset, action);
+	ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+				   num_bytes, parent, ref_root, owner, offset,
+				   action, for_cow);
 	BUG_ON(ret);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
 
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op)
 {
@@ -683,7 +695,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+	ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
 				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
 				   extent_op->is_data);
 	BUG_ON(ret);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..8316bff18d30 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -151,16 +151,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 	}
 }
 
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 94abc25392f6..6f8cd17c9a9f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1243,7 +1243,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	root->ref_cows = 0;
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+				      BTRFS_TREE_LOG_OBJECTID, NULL,
+				      0, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813c6bb96c9a..dc8b9a834596 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset)
+			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
 {
 	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+					num_bytes,
 					parent, root_objectid, (int)owner,
-					BTRFS_ADD_DELAYED_REF, NULL);
+					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
 	} else {
-		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+					num_bytes,
 					parent, root_objectid, owner, offset,
-					BTRFS_ADD_DELAYED_REF, NULL);
+					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
 	}
 	return ret;
 }
@@ -2405,7 +2409,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 	extent_op->update_key = 0;
 	extent_op->is_data = is_data ? 1 : 0;
 
-	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+					  num_bytes, extent_op);
 	if (ret)
 		kfree(extent_op);
 	return ret;
@@ -2590,7 +2595,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
-			   int full_backref, int inc)
+			   int full_backref, int inc, int for_cow)
 {
 	u64 bytenr;
 	u64 num_bytes;
@@ -2603,7 +2608,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int level;
 	int ret = 0;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, int);
 
 	ref_root = btrfs_header_owner(buf);
 	nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2645,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			key.offset -= btrfs_file_extent_offset(buf, fi);
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, key.objectid,
-					   key.offset);
+					   key.offset, for_cow);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			num_bytes = btrfs_level_size(root, level - 1);
 			ret = process_func(trans, root, bytenr, num_bytes,
-					   parent, ref_root, level - 1, 0);
+					   parent, ref_root, level - 1, 0,
+					   for_cow);
 			if (ret)
 				goto fail;
 		}
@@ -2659,15 +2665,15 @@ fail:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref)
+		  struct extent_buffer *buf, int full_backref, int for_cow)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref)
+		  struct extent_buffer *buf, int full_backref, int for_cow)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -4937,16 +4943,17 @@ out:
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
-			   u64 parent, int last_ref)
+			   u64 parent, int last_ref, int for_cow)
 {
 	struct btrfs_block_group_cache *cache = NULL;
 	int ret;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
-						parent, root->root_key.objectid,
-						btrfs_header_level(buf),
-						BTRFS_DROP_DELAYED_REF, NULL);
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+					buf->start, buf->len,
+					parent, root->root_key.objectid,
+					btrfs_header_level(buf),
+					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
 		BUG_ON(ret);
 	}
 
@@ -4981,12 +4988,12 @@ out:
 	btrfs_put_block_group(cache);
 }
 
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int for_cow)
 {
 	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	/*
 	 * tree log blocks never actually go into the extent allocation
@@ -4998,14 +5005,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+					num_bytes,
 					parent, root_objectid, (int)owner,
-					BTRFS_DROP_DELAYED_REF, NULL);
+					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
 		BUG_ON(ret);
 	} else {
-		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
-					parent, root_objectid, owner,
-					offset, BTRFS_DROP_DELAYED_REF, NULL);
+		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+						num_bytes,
+						parent, root_objectid, owner,
+						offset, BTRFS_DROP_DELAYED_REF,
+						NULL, for_cow);
 		BUG_ON(ret);
 	}
 	return ret;
@@ -5826,9 +5836,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 
 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
-	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-					 0, root_objectid, owner, offset,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL);
+	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+					 ins->offset, 0,
+					 root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
 	return ret;
 }
 
@@ -5998,7 +6009,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root, u32 blocksize,
 					u64 parent, u64 root_objectid,
 					struct btrfs_disk_key *key, int level,
-					u64 hint, u64 empty_size)
+					u64 hint, u64 empty_size, int for_cow)
 {
 	struct btrfs_key ins;
 	struct btrfs_block_rsv *block_rsv;
@@ -6042,10 +6053,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		extent_op->update_flags = 1;
 		extent_op->is_data = 0;
 
-		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+					ins.objectid,
 					ins.offset, parent, root_objectid,
 					level, BTRFS_ADD_DELAYED_EXTENT,
-					extent_op);
+					extent_op, for_cow);
 		BUG_ON(ret);
 	}
 	return buf;
@@ -6062,6 +6074,7 @@ struct walk_control {
 	int keep_locks;
 	int reada_slot;
 	int reada_count;
+	int for_reloc;
 };
 
 #define DROP_REFERENCE	1
@@ -6200,9 +6213,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 	/* wc->stage == UPDATE_BACKREF */
 	if (!(wc->flags[level] & flag)) {
 		BUG_ON(!path->locks[level]);
-		ret = btrfs_inc_ref(trans, root, eb, 1);
+		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
 		BUG_ON(ret);
-		ret = btrfs_dec_ref(trans, root, eb, 0);
+		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
 		BUG_ON(ret);
 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
 						  eb->len, flag, 0);
@@ -6346,7 +6359,7 @@ skip:
 		}
 
 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-					root->root_key.objectid, level - 1, 0);
+				root->root_key.objectid, level - 1, 0, 0);
 		BUG_ON(ret);
 	}
 	btrfs_tree_unlock(next);
@@ -6420,9 +6433,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 	if (wc->refs[level] == 1) {
 		if (level == 0) {
 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-				ret = btrfs_dec_ref(trans, root, eb, 1);
+				ret = btrfs_dec_ref(trans, root, eb, 1,
+						    wc->for_reloc);
 			else
-				ret = btrfs_dec_ref(trans, root, eb, 0);
+				ret = btrfs_dec_ref(trans, root, eb, 0,
+						    wc->for_reloc);
 			BUG_ON(ret);
 		}
 		/* make block locked assertion in clean_tree_block happy */
@@ -6449,7 +6464,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			       btrfs_header_owner(path->nodes[level + 1]));
 	}
 
-	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
 out:
 	wc->refs[level] = 0;
 	wc->flags[level] = 0;
@@ -6533,7 +6548,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * blocks are properly updated.
  */
 void btrfs_drop_snapshot(struct btrfs_root *root,
-			 struct btrfs_block_rsv *block_rsv, int update_ref)
+			 struct btrfs_block_rsv *block_rsv, int update_ref,
+			 int for_reloc)
 {
 	struct btrfs_path *path;
 	struct btrfs_trans_handle *trans;
@@ -6621,6 +6637,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
 	wc->stage = DROP_REFERENCE;
 	wc->update_ref = update_ref;
 	wc->keep_locks = 0;
+	wc->for_reloc = for_reloc;
 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
@@ -6705,6 +6722,7 @@ out:
  * drop subtree rooted at tree block 'node'.
  *
  * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
  */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
@@ -6749,6 +6767,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	wc->stage = DROP_REFERENCE;
 	wc->update_ref = 0;
 	wc->keep_locks = 1;
+	wc->for_reloc = 1;
 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f2e928289600..d2b60ed6c33f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						new_key.objectid,
-						start - extent_offset);
+						start - extent_offset, 0);
 				BUG_ON(ret);
 				*hint_byte = disk_bytenr;
 			}
@@ -753,7 +753,7 @@ next_slot:
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						key.objectid, key.offset -
-						extent_offset);
+						extent_offset, 0);
 				BUG_ON(ret);
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
 
 		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 					   root->root_key.objectid,
-					   ino, orig_offset);
+					   ino, orig_offset, 0);
 		BUG_ON(ret);
 
 		if (split == start) {
@@ -989,7 +989,7 @@ again:
 		del_nr++;
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
-					ino, orig_offset);
+					ino, orig_offset, 0);
 		BUG_ON(ret);
 	}
 	other_start = 0;
@@ -1006,7 +1006,7 @@ again:
 		del_nr++;
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
-					ino, orig_offset);
+					ino, orig_offset, 0);
 		BUG_ON(ret);
 	}
 	if (del_nr == 0) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5ccec23984c..ea819386b864 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3139,7 +3139,7 @@ delete:
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
-						ino, extent_offset);
+						ino, extent_offset, 0);
 			BUG_ON(ret);
 		}
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 72d461656f60..c48f2e931ea0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -358,7 +358,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 		return PTR_ERR(trans);
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      0, objectid, NULL, 0, 0, 0);
+				      0, objectid, NULL, 0, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		goto fail;
@@ -2425,7 +2425,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 							disko, diskl, 0,
 							root->root_key.objectid,
 							btrfs_ino(inode),
-							new_key.offset - datao);
+							new_key.offset - datao,
+							0);
 					BUG_ON(ret);
 				}
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dff29d5e151a..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
 					   num_bytes, parent,
 					   btrfs_header_owner(leaf),
-					   key.objectid, key.offset);
+					   key.objectid, key.offset, 1);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					parent, btrfs_header_owner(leaf),
-					key.objectid, key.offset);
+					key.objectid, key.offset, 1);
 		BUG_ON(ret);
 	}
 	if (dirty)
@@ -1778,21 +1778,23 @@ again:
 
 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
 					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0);
+					src->root_key.objectid, level - 1, 0,
+					1);
 		BUG_ON(ret);
 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
 					0, dest->root_key.objectid, level - 1,
-					0);
+					0, 1);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
 					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0);
+					src->root_key.objectid, level - 1, 0,
+					1);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
 					0, dest->root_key.objectid, level - 1,
-					0);
+					0, 1);
 		BUG_ON(ret);
 
 		btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
 		} else {
 			list_del_init(&reloc_root->root_list);
 		}
-		btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+		btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
 	}
 
 	if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 						node->eb->start, blocksize,
 						upper->eb->start,
 						btrfs_header_owner(upper->eb),
-						node->level, 0);
+						node->level, 0, 1);
 			BUG_ON(ret);
 
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..a2bfedcbcabc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 
 		if (btrfs_header_backref_rev(root->node) <
 		    BTRFS_MIXED_BACKREF_REV)
-			btrfs_drop_snapshot(root, NULL, 0);
+			btrfs_drop_snapshot(root, NULL, 0, 0);
 		else
-			btrfs_drop_snapshot(root, NULL, 1);
+			btrfs_drop_snapshot(root, NULL, 1, 0);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f4d81c06d48f..fce7b9ec3bfa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 				ret = btrfs_inc_extent_ref(trans, root,
 						ins.objectid, ins.offset,
 						0, root->root_key.objectid,
-						key->objectid, offset);
+						key->objectid, offset, 0);
 				BUG_ON(ret);
 			} else {
 				/*
-- 
cgit v1.2.3


From eebe063b7f916087cd5c61de57b20a3a30894a96 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 14 Sep 2011 14:01:24 +0200
Subject: Btrfs: always save ref_root in delayed refs

For consistent backref walking and (later) qgroup calculation the
information to which root a delayed ref belongs is useful even for shared
refs.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/delayed-ref.c | 18 ++++++++----------
 fs/btrfs/delayed-ref.h | 12 ++++--------
 2 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 3a0f0ab804f4..babd37badb43 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -495,13 +495,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->in_tree = 1;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
+	full_ref->parent = parent;
+	full_ref->root = ref_root;
+	if (parent)
 		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-	} else {
-		full_ref->root = ref_root;
+	else
 		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-	}
 	full_ref->level = level;
 
 	trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -551,13 +550,12 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->in_tree = 1;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
+	full_ref->parent = parent;
+	full_ref->root = ref_root;
+	if (parent)
 		ref->type = BTRFS_SHARED_DATA_REF_KEY;
-	} else {
-		full_ref->root = ref_root;
+	else
 		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-	}
 
 	full_ref->objectid = owner;
 	full_ref->offset = offset;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 8316bff18d30..a5fb2bc83732 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -98,19 +98,15 @@ struct btrfs_delayed_ref_head {
 
 struct btrfs_delayed_tree_ref {
 	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
+	u64 root;
+	u64 parent;
 	int level;
 };
 
 struct btrfs_delayed_data_ref {
 	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
+	u64 root;
+	u64 parent;
 	u64 objectid;
 	u64 offset;
 };
-- 
cgit v1.2.3


From 5b25f70f4200766355cdabda604e131d2fb6010d Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 10:55:48 +0200
Subject: Btrfs: add nested locking mode for paths

This patch adds the possibilty to read-lock an extent even if it is already
write-locked from the same thread. btrfs_find_all_roots() needs this
capability.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.c |  1 +
 fs/btrfs/extent_io.h |  2 ++
 fs/btrfs/locking.c   | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index be1bf627a14b..dd8d140eb27b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3571,6 +3571,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	atomic_set(&eb->blocking_writers, 0);
 	atomic_set(&eb->spinning_readers, 0);
 	atomic_set(&eb->spinning_writers, 0);
+	eb->lock_nested = 0;
 	init_waitqueue_head(&eb->write_lock_wq);
 	init_waitqueue_head(&eb->read_lock_wq);
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
 	struct list_head leak_list;
 	struct rcu_head rcu_head;
 	atomic_t refs;
+	pid_t lock_owner;
 
 	/* count of read lock holders on the extent buffer */
 	atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
 	atomic_t blocking_readers;
 	atomic_t spinning_readers;
 	atomic_t spinning_writers;
+	int lock_nested;
 
 	/* protects write locks */
 	rwlock_t lock;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
  */
 void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	if (rw == BTRFS_WRITE_LOCK) {
 		if (atomic_read(&eb->blocking_writers) == 0) {
 			WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
  */
 void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (&eb->lock_nested && current->pid == eb->lock_owner) {
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
 		BUG_ON(atomic_read(&eb->blocking_writers) != 1);
 		write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
 again:
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers) &&
+	    current->pid == eb->lock_owner) {
+		/*
+		 * This extent is already write-locked by our thread. We allow
+		 * an additional read lock to be added because it's for the same
+		 * thread. btrfs_find_all_roots() depends on this as it may be
+		 * called on a partly (write-)locked tree.
+		 */
+		BUG_ON(eb->lock_nested);
+		eb->lock_nested = 1;
+		read_unlock(&eb->lock);
+		return;
+	}
+	read_unlock(&eb->lock);
 	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
 	read_lock(&eb->lock);
 	if (atomic_read(&eb->blocking_writers)) {
 		read_unlock(&eb->lock);
-		wait_event(eb->write_lock_wq,
-			   atomic_read(&eb->blocking_writers) == 0);
 		goto again;
 	}
 	atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 	}
 	atomic_inc(&eb->write_locks);
 	atomic_inc(&eb->spinning_writers);
+	eb->lock_owner = current->pid;
 	return 1;
 }
 
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
  */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			eb->lock_nested = 0;
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	btrfs_assert_tree_read_locked(eb);
 	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
 	atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
  */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			eb->lock_nested = 0;
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	btrfs_assert_tree_read_locked(eb);
 	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
 	if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
 	WARN_ON(atomic_read(&eb->spinning_writers));
 	atomic_inc(&eb->spinning_writers);
 	atomic_inc(&eb->write_locks);
+	eb->lock_owner = current->pid;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 00f04b88791ff49dc64ada18819d40a5b0671709 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 14 Sep 2011 12:37:00 +0200
Subject: Btrfs: add sequence numbers to delayed refs

Sequence numbers are needed to reconstruct the backrefs of a given extent to
a certain point in time. The total set of backrefs consist of the set of
backrefs recorded on disk plus the enqueued delayed refs for it that existed
at that moment.

This patch also adds a list that records all delayed refs which are
currently in the process of being added.

When walking all refs of an extent in btrfs_find_all_roots(), we freeze the
current state of delayed refs, honor anythinh up to this point and prevent
processing newer delayed refs to assert consistency.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/delayed-ref.c | 34 ++++++++++++++++++++++++
 fs/btrfs/delayed-ref.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.c |  4 +++
 3 files changed, 108 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index babd37badb43..a405db0320e8 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 		return -1;
 	if (ref1->type > ref2->type)
 		return 1;
+	/* merging of sequenced refs is not allowed */
+	if (ref1->seq < ref2->seq)
+		return -1;
+	if (ref1->seq > ref2->seq)
+		return 1;
 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -209,6 +214,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq)
+{
+	struct seq_list *elem;
+
+	assert_spin_locked(&delayed_refs->lock);
+	if (list_empty(&delayed_refs->seq_head))
+		return 0;
+
+	elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+	if (seq >= elem->seq) {
+		pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+			 seq, elem->seq, delayed_refs);
+		return 1;
+	}
+	return 0;
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 start)
 {
@@ -438,6 +461,7 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	ref->action  = 0;
 	ref->is_head = 1;
 	ref->in_tree = 1;
+	ref->seq = 0;
 
 	head_ref = btrfs_delayed_node_to_head(ref);
 	head_ref->must_insert_reserved = must_insert_reserved;
@@ -479,6 +503,7 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	u64 seq = 0;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -494,6 +519,10 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
+	if (need_ref_seq(for_cow, ref_root))
+		seq = inc_delayed_seq(delayed_refs);
+	ref->seq = seq;
+
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
 	full_ref->parent = parent;
 	full_ref->root = ref_root;
@@ -534,6 +563,7 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	u64 seq = 0;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -549,6 +579,10 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
+	if (need_ref_seq(for_cow, ref_root))
+		seq = inc_delayed_seq(delayed_refs);
+	ref->seq = seq;
+
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
 	full_ref->parent = parent;
 	full_ref->root = ref_root;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a5fb2bc83732..174416f7882b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
 	/* the size of the extent */
 	u64 num_bytes;
 
+	/* seq number to keep track of insertion order */
+	u64 seq;
+
 	/* ref count on this data structure */
 	atomic_t refs;
 
@@ -136,6 +139,20 @@ struct btrfs_delayed_ref_root {
 	int flushing;
 
 	u64 run_delayed_start;
+
+	/*
+	 * seq number of delayed refs. We need to know if a backref was being
+	 * added before the currently processed ref or afterwards.
+	 */
+	u64 seq;
+
+	/*
+	 * seq_list holds a list of all seq numbers that are currently being
+	 * added to the list. While walking backrefs (btrfs_find_all_roots,
+	 * qgroups), which might take some time, no newer ref must be processed,
+	 * as it might influence the outcome of the walk.
+	 */
+	struct list_head seq_head;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -171,6 +188,59 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 			   struct btrfs_delayed_ref_head *head);
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 search_start);
+
+struct seq_list {
+	struct list_head list;
+	u64 seq;
+};
+
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+	assert_spin_locked(&delayed_refs->lock);
+	++delayed_refs->seq;
+	return delayed_refs->seq;
+}
+
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+		      struct seq_list *elem)
+{
+	assert_spin_locked(&delayed_refs->lock);
+	elem->seq = delayed_refs->seq;
+	list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+		      struct seq_list *elem)
+{
+	spin_lock(&delayed_refs->lock);
+	list_del(&elem->list);
+	spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq);
+
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+	if (for_cow)
+		return 0;
+
+	if (rootid == BTRFS_FS_TREE_OBJECTID)
+		return 1;
+
+	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+
+	return 0;
+}
+
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a2bfedcbcabc..31a7393af64e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
+		WARN_ON(transaction->delayed_refs.root.rb_node);
+		WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
 		memset(transaction, 0, sizeof(*transaction));
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
@@ -108,8 +110,10 @@ loop:
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
+	cur_trans->delayed_refs.seq = 1;
 	spin_lock_init(&cur_trans->commit_lock);
 	spin_lock_init(&cur_trans->delayed_refs.lock);
+	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-- 
cgit v1.2.3


From d1270cd91f308c9d22b2804720c36ccd32dbc35e Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 15:16:43 +0200
Subject: Btrfs: put back delayed refs that are too new

When processing a delayed ref, first check if there are still old refs in
the process of being added. If so, put this ref back to the tree. To avoid
looping on this ref, choose a newer one in the next loop.
btrfs_find_ref_cluster has to take care of that.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/delayed-ref.c | 43 +++++++++++++++++++++++++------------------
 fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++++-----
 2 files changed, 47 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a405db0320e8..ee181989d444 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -155,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 
 /*
  * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
  */
 static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
 				  u64 bytenr,
-				  struct btrfs_delayed_ref_node **last)
+				  struct btrfs_delayed_ref_node **last,
+				  int return_bigger)
 {
-	struct rb_node *n = root->rb_node;
+	struct rb_node *n;
 	struct btrfs_delayed_ref_node *entry;
-	int cmp;
+	int cmp = 0;
 
+again:
+	n = root->rb_node;
+	entry = NULL;
 	while (n) {
 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 		WARN_ON(!entry->in_tree);
@@ -187,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
 		else
 			return entry;
 	}
+	if (entry && return_bigger) {
+		if (cmp > 0) {
+			n = rb_next(&entry->rb_node);
+			if (!n)
+				n = rb_first(root);
+			entry = rb_entry(n, struct btrfs_delayed_ref_node,
+					 rb_node);
+			bytenr = entry->bytenr;
+			return_bigger = 0;
+			goto again;
+		}
+		return entry;
+	}
 	return NULL;
 }
 
@@ -246,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 		node = rb_first(&delayed_refs->root);
 	} else {
 		ref = NULL;
-		find_ref_head(&delayed_refs->root, start, &ref);
+		find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
 		if (ref) {
-			struct btrfs_delayed_ref_node *tmp;
-
-			node = rb_prev(&ref->rb_node);
-			while (node) {
-				tmp = rb_entry(node,
-					       struct btrfs_delayed_ref_node,
-					       rb_node);
-				if (tmp->bytenr < start)
-					break;
-				ref = tmp;
-				node = rb_prev(&ref->rb_node);
-			}
 			node = &ref->rb_node;
 		} else
 			node = rb_first(&delayed_refs->root);
@@ -748,7 +755,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dc8b9a834596..bbcca12fbbba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2236,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			}
 		}
 
+		/*
+		 * locked_ref is the head node, so we have to go one
+		 * node back for any delayed ref updates
+		 */
+		ref = select_delayed_ref(locked_ref);
+
+		if (ref && ref->seq &&
+		    btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+			/*
+			 * there are still refs with lower seq numbers in the
+			 * process of being added. Don't run this ref yet.
+			 */
+			list_del_init(&locked_ref->cluster);
+			mutex_unlock(&locked_ref->mutex);
+			locked_ref = NULL;
+			delayed_refs->num_heads_ready++;
+			spin_unlock(&delayed_refs->lock);
+			cond_resched();
+			spin_lock(&delayed_refs->lock);
+			continue;
+		}
+
 		/*
 		 * record the must insert reserved flag before we
 		 * drop the spin lock.
@@ -2246,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		extent_op = locked_ref->extent_op;
 		locked_ref->extent_op = NULL;
 
-		/*
-		 * locked_ref is the head node, so we have to go one
-		 * node back for any delayed ref updates
-		 */
-		ref = select_delayed_ref(locked_ref);
 		if (!ref) {
 			/* All delayed refs have been processed, Go ahead
 			 * and send the head node to run_one_delayed_ref,
-- 
cgit v1.2.3


From a168650c08300434e1456abe7b6451f1448230d3 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 12 Dec 2011 16:10:07 +0100
Subject: Btrfs: add waitqueue instead of doing busy waiting for more delayed
 refs

Now that we may be holding back delayed refs for a limited period, we
might end up having no runnable delayed refs. Without this commit, we'd
do busy waiting in that thread until another (runnable) ref arives.
Instead, we're detecting this situation and use a waitqueue, such that
we only try to run more refs after
	a) another runnable ref was added  or
	b) delayed refs are no longer held back

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/delayed-ref.c |  8 +++++++
 fs/btrfs/delayed-ref.h |  7 ++++++
 fs/btrfs/extent-tree.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/transaction.c |  1 +
 4 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ee181989d444..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -664,6 +664,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 				   num_bytes, parent, ref_root, level, action,
 				   for_cow);
 	BUG_ON(ret);
+	if (!need_ref_seq(for_cow, ref_root) &&
+	    waitqueue_active(&delayed_refs->seq_wait))
+		wake_up(&delayed_refs->seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -712,6 +715,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 				   num_bytes, parent, ref_root, owner, offset,
 				   action, for_cow);
 	BUG_ON(ret);
+	if (!need_ref_seq(for_cow, ref_root) &&
+	    waitqueue_active(&delayed_refs->seq_wait))
+		wake_up(&delayed_refs->seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -739,6 +745,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				   extent_op->is_data);
 	BUG_ON(ret);
 
+	if (waitqueue_active(&delayed_refs->seq_wait))
+		wake_up(&delayed_refs->seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 174416f7882b..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -153,6 +153,12 @@ struct btrfs_delayed_ref_root {
 	 * as it might influence the outcome of the walk.
 	 */
 	struct list_head seq_head;
+
+	/*
+	 * when the only refs we have in the list must not be processed, we want
+	 * to wait for more refs to show up or for the end of backref walking.
+	 */
+	wait_queue_head_t seq_wait;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -216,6 +222,7 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
 {
 	spin_lock(&delayed_refs->lock);
 	list_del(&elem->list);
+	wake_up(&delayed_refs->seq_wait);
 	spin_unlock(&delayed_refs->lock);
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bbcca12fbbba..0a435e2e143e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2300,7 +2300,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
-
+		/*
+		 * we modified num_entries, but as we're currently running
+		 * delayed refs, skip
+		 *     wake_up(&delayed_refs->seq_wait);
+		 * here.
+		 */
 		spin_unlock(&delayed_refs->lock);
 
 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2317,6 +2322,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 	return count;
 }
 
+
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+			unsigned long num_refs)
+{
+	struct list_head *first_seq = delayed_refs->seq_head.next;
+
+	spin_unlock(&delayed_refs->lock);
+	pr_debug("waiting for more refs (num %ld, first %p)\n",
+		 num_refs, first_seq);
+	wait_event(delayed_refs->seq_wait,
+		   num_refs != delayed_refs->num_entries ||
+		   delayed_refs->seq_head.next != first_seq);
+	pr_debug("done waiting for more refs (num %ld, first %p)\n",
+		 delayed_refs->num_entries, delayed_refs->seq_head.next);
+	spin_lock(&delayed_refs->lock);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2332,8 +2354,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_node *ref;
 	struct list_head cluster;
 	int ret;
+	u64 delayed_start;
 	int run_all = count == (unsigned long)-1;
 	int run_most = 0;
+	unsigned long num_refs = 0;
+	int consider_waiting;
 
 	if (root == root->fs_info->extent_root)
 		root = root->fs_info->tree_root;
@@ -2341,6 +2366,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
 again:
+	consider_waiting = 0;
 	spin_lock(&delayed_refs->lock);
 	if (count == 0) {
 		count = delayed_refs->num_entries * 2;
@@ -2357,11 +2383,35 @@ again:
 		 * of refs to process starting at the first one we are able to
 		 * lock
 		 */
+		delayed_start = delayed_refs->run_delayed_start;
 		ret = btrfs_find_ref_cluster(trans, &cluster,
 					     delayed_refs->run_delayed_start);
 		if (ret)
 			break;
 
+		if (delayed_start >= delayed_refs->run_delayed_start) {
+			if (consider_waiting == 0) {
+				/*
+				 * btrfs_find_ref_cluster looped. let's do one
+				 * more cycle. if we don't run any delayed ref
+				 * during that cycle (because we can't because
+				 * all of them are blocked) and if the number of
+				 * refs doesn't change, we avoid busy waiting.
+				 */
+				consider_waiting = 1;
+				num_refs = delayed_refs->num_entries;
+			} else {
+				wait_for_more_refs(delayed_refs, num_refs);
+				/*
+				 * after waiting, things have changed. we
+				 * dropped the lock and someone else might have
+				 * run some refs, built new clusters and so on.
+				 * therefore, we restart staleness detection.
+				 */
+				consider_waiting = 0;
+			}
+		}
+
 		ret = run_clustered_refs(trans, root, &cluster);
 		BUG_ON(ret < 0);
 
@@ -2369,6 +2419,11 @@ again:
 
 		if (count == 0)
 			break;
+
+		if (ret || delayed_refs->run_delayed_start == 0) {
+			/* refs were run, let's reset staleness detection */
+			consider_waiting = 0;
+		}
 	}
 
 	if (run_all) {
@@ -4933,6 +4988,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	rb_erase(&head->node.rb_node, &delayed_refs->root);
 
 	delayed_refs->num_entries--;
+	if (waitqueue_active(&delayed_refs->seq_wait))
+		wake_up(&delayed_refs->seq_wait);
 
 	/*
 	 * we don't take a ref on the node because we're removing it from the
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 31a7393af64e..04c5c7c2c32f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -111,6 +111,7 @@ loop:
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
 	cur_trans->delayed_refs.seq = 1;
+	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
 	spin_lock_init(&cur_trans->commit_lock);
 	spin_lock_init(&cur_trans->delayed_refs.lock);
 	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
-- 
cgit v1.2.3


From 8da6d5815c592b713ecaf4f4f8b631f8359c96c4 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Wed, 23 Nov 2011 18:55:04 +0100
Subject: Btrfs: added btrfs_find_all_roots()

This function gets a byte number (a data extent), collects all the leafs
pointing to it and walks up the trees to find all fs roots pointing to those
leafs. It also returns the list of all leafs pointing to that extent.

It does proper locking for the involved trees, can be used on busy file
systems and honors delayed refs.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c | 783 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/backref.h |   5 +
 2 files changed, 788 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..03c30a1836f4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,6 +19,9 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
 
 struct __data_ref {
 	struct list_head list;
@@ -32,6 +35,786 @@ struct __shared_ref {
 	u64 disk_byte;
 };
 
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
+	struct list_head list;
+	u64 root_id;
+	struct btrfs_key key;
+	int level;
+	int count;
+	u64 parent;
+	u64 wanted_disk_byte;
+};
+
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+			    struct btrfs_key *key, int level, u64 parent,
+			    u64 wanted_disk_byte, int count)
+{
+	struct __prelim_ref *ref;
+
+	/* in case we're adding delayed refs, we're holding the refs spinlock */
+	ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->root_id = root_id;
+	if (key)
+		ref->key = *key;
+	else
+		memset(&ref->key, 0, sizeof(ref->key));
+
+	ref->level = level;
+	ref->count = count;
+	ref->parent = parent;
+	ref->wanted_disk_byte = wanted_disk_byte;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+				struct ulist *parents,
+				struct extent_buffer *eb, int level,
+				u64 wanted_objectid, u64 wanted_disk_byte)
+{
+	int ret;
+	int slot;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 disk_byte;
+
+add_parent:
+	ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+	if (ret < 0)
+		return ret;
+
+	if (level != 0)
+		return 0;
+
+	/*
+	 * if the current leaf is full with EXTENT_DATA items, we must
+	 * check the next one if that holds a reference as well.
+	 * ref->count cannot be used to skip this check.
+	 * repeat this until we don't find any additional EXTENT_DATA items.
+	 */
+	while (1) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret < 0)
+			return ret;
+		if (ret)
+			return 0;
+
+		eb = path->nodes[0];
+		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+			btrfs_item_key_to_cpu(eb, &key, slot);
+			if (key.objectid != wanted_objectid ||
+			    key.type != BTRFS_EXTENT_DATA_KEY)
+				return 0;
+			fi = btrfs_item_ptr(eb, slot,
+						struct btrfs_file_extent_item);
+			disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+			if (disk_byte == wanted_disk_byte)
+				goto add_parent;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+					struct __prelim_ref *ref,
+					struct ulist *parents)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	struct btrfs_key key = {0};
+	struct extent_buffer *eb;
+	int ret = 0;
+	int root_level;
+	int level = ref->level;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root_key.objectid = ref->root_id;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	rcu_read_lock();
+	root_level = btrfs_header_level(root->node);
+	rcu_read_unlock();
+
+	if (root_level + 1 == level)
+		goto out;
+
+	path->lowest_level = level;
+	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+		 "%d for key (%llu %u %llu)\n",
+		 (unsigned long long)ref->root_id, level, ref->count, ret,
+		 (unsigned long long)ref->key.objectid, ref->key.type,
+		 (unsigned long long)ref->key.offset);
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[level];
+	if (!eb) {
+		WARN_ON(1);
+		ret = 1;
+		goto out;
+	}
+
+	if (level == 0) {
+		if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret)
+				goto out;
+			eb = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+	}
+
+	/* the last two parameters will only be used for level == 0 */
+	ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+				ref->wanted_disk_byte);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+				   struct list_head *head)
+{
+	int err;
+	int ret = 0;
+	struct __prelim_ref *ref;
+	struct __prelim_ref *ref_safe;
+	struct __prelim_ref *new_ref;
+	struct ulist *parents;
+	struct ulist_node *node;
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	/*
+	 * _safe allows us to insert directly after the current item without
+	 * iterating over the newly inserted items.
+	 * we're also allowed to re-assign ref during iteration.
+	 */
+	list_for_each_entry_safe(ref, ref_safe, head, list) {
+		if (ref->parent)	/* already direct */
+			continue;
+		if (ref->count == 0)
+			continue;
+		err = __resolve_indirect_ref(fs_info, ref, parents);
+		if (err) {
+			if (ret == 0)
+				ret = err;
+			continue;
+		}
+
+		/* we put the first parent into the ref at hand */
+		node = ulist_next(parents, NULL);
+		ref->parent = node ? node->val : 0;
+
+		/* additional parents require new refs being added here */
+		while ((node = ulist_next(parents, node))) {
+			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+			if (!new_ref) {
+				ret = -ENOMEM;
+				break;
+			}
+			memcpy(new_ref, ref, sizeof(*ref));
+			new_ref->parent = node->val;
+			list_add(&new_ref->list, &ref->list);
+		}
+		ulist_reinit(parents);
+	}
+
+	ulist_free(parents);
+	return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+	struct list_head *pos1;
+
+	list_for_each(pos1, head) {
+		struct list_head *n2;
+		struct list_head *pos2;
+		struct __prelim_ref *ref1;
+
+		ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+		if (mode == 1 && ref1->key.type == 0)
+			continue;
+		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+		     pos2 = n2, n2 = pos2->next) {
+			struct __prelim_ref *ref2;
+
+			ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+			if (mode == 1) {
+				if (memcmp(&ref1->key, &ref2->key,
+					   sizeof(ref1->key)) ||
+				    ref1->level != ref2->level ||
+				    ref1->root_id != ref2->root_id)
+					continue;
+				ref1->count += ref2->count;
+			} else {
+				if (ref1->parent != ref2->parent)
+					continue;
+				ref1->count += ref2->count;
+			}
+			list_del(&ref2->list);
+			kfree(ref2);
+		}
+
+	}
+	return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+			      struct btrfs_key *info_key,
+			      struct list_head *prefs)
+{
+	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+	struct rb_node *n = &head->node.rb_node;
+	int sgn;
+	int ret;
+
+	if (extent_op && extent_op->update_key)
+		btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+	while ((n = rb_prev(n))) {
+		struct btrfs_delayed_ref_node *node;
+		node = rb_entry(n, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (node->bytenr != head->node.bytenr)
+			break;
+		WARN_ON(node->is_head);
+
+		if (node->seq > seq)
+			continue;
+
+		switch (node->action) {
+		case BTRFS_ADD_DELAYED_EXTENT:
+		case BTRFS_UPDATE_DELAYED_HEAD:
+			WARN_ON(1);
+			continue;
+		case BTRFS_ADD_DELAYED_REF:
+			sgn = 1;
+			break;
+		case BTRFS_DROP_DELAYED_REF:
+			sgn = -1;
+			break;
+		default:
+			BUG_ON(1);
+		}
+		switch (node->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, info_key,
+					       ref->level + 1, 0, node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, info_key,
+					       ref->level + 1, ref->parent,
+					       node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			struct btrfs_key key;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			struct btrfs_key key;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+					       ref->parent, node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+	}
+
+	return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 bytenr,
+			     struct btrfs_key *info_key, int *info_level,
+			     struct list_head *prefs)
+{
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	unsigned long end;
+	struct btrfs_extent_item *ei;
+	u64 flags;
+	u64 item_size;
+
+	/*
+	 * enumerate all inline refs
+	 */
+	leaf = path->nodes[0];
+	slot = path->slots[0] - 1;
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+		struct btrfs_disk_key disk_key;
+
+		info = (struct btrfs_tree_block_info *)ptr;
+		*info_level = btrfs_tree_block_level(leaf, info);
+		btrfs_tree_block_key(leaf, info, &disk_key);
+		btrfs_disk_key_to_cpu(info_key, &disk_key);
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		u64 offset;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+		switch (type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, info_key,
+						*info_level + 1, offset,
+						bytenr, 1);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+					       bytenr, count);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, offset, info_key,
+					       *info_level + 1, 0, bytenr, 1);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+						count);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+			    struct btrfs_path *path, u64 bytenr,
+			    struct btrfs_key *info_key, int info_level,
+			    struct list_head *prefs)
+{
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	while (1) {
+		ret = btrfs_next_item(extent_root, path);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid != bytenr)
+			break;
+		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+			continue;
+		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+			break;
+
+		switch (key.type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, info_key,
+						info_level + 1, key.offset,
+						bytenr, 1);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_shared_data_ref);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+						bytenr, count);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, key.offset, info_key,
+						info_level + 1, 0, bytenr, 1);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_extent_data_ref);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+						bytenr, count);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+	}
+
+	return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info, u64 bytenr,
+			     u64 seq, struct ulist *refs, struct ulist *roots)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_key info_key = { 0 };
+	struct btrfs_delayed_ref_root *delayed_refs = NULL;
+	struct btrfs_delayed_ref_head *head = NULL;
+	int info_level = 0;
+	int ret;
+	struct list_head prefs_delayed;
+	struct list_head prefs;
+	struct __prelim_ref *ref;
+
+	INIT_LIST_HEAD(&prefs);
+	INIT_LIST_HEAD(&prefs_delayed);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * grab both a lock on the path and a lock on the delayed ref head.
+	 * We need both to get a consistent picture of how the refs look
+	 * at a specified point in time
+	 */
+again:
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	/*
+	 * look if there are updates for this ref queued and lock the head
+	 */
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(path);
+
+			/*
+			 * Mutex was contended, block until it's
+			 * released and try again
+			 */
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto again;
+		}
+		ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+		if (ret)
+			goto out;
+	}
+	spin_unlock(&delayed_refs->lock);
+
+	if (path->slots[0]) {
+		struct extent_buffer *leaf;
+		int slot;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0] - 1;
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid == bytenr &&
+		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+			ret = __add_inline_refs(fs_info, path, bytenr,
+						&info_key, &info_level, &prefs);
+			if (ret)
+				goto out;
+			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+					       info_level, &prefs);
+			if (ret)
+				goto out;
+		}
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * when adding the delayed refs above, the info_key might not have
+	 * been known yet. Go over the list and replace the missing keys
+	 */
+	list_for_each_entry(ref, &prefs_delayed, list) {
+		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+			memcpy(&ref->key, &info_key, sizeof(ref->key));
+	}
+	list_splice_init(&prefs_delayed, &prefs);
+
+	ret = __merge_refs(&prefs, 1);
+	if (ret)
+		goto out;
+
+	ret = __resolve_indirect_refs(fs_info, &prefs);
+	if (ret)
+		goto out;
+
+	ret = __merge_refs(&prefs, 2);
+	if (ret)
+		goto out;
+
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		if (ref->count < 0)
+			WARN_ON(1);
+		if (ref->count && ref->root_id && ref->parent == 0) {
+			/* no parent == root of tree */
+			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+			BUG_ON(ret < 0);
+		}
+		if (ref->count && ref->parent) {
+			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+			BUG_ON(ret < 0);
+		}
+		kfree(ref);
+	}
+
+out:
+	if (head)
+		mutex_unlock(&head->mutex);
+	btrfs_free_path(path);
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		kfree(ref);
+	}
+	while (!list_empty(&prefs_delayed)) {
+		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+				       list);
+		list_del(&ref->list);
+		kfree(ref);
+	}
+
+	return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+	struct ulist *tmp;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*leafs = ulist_alloc(GFP_NOFS);
+	if (!*leafs) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+	ulist_free(tmp);
+
+	if (ret < 0 && ret != -ENOENT) {
+		ulist_free(*leafs);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 num_bytes, u64 seq, struct ulist **roots)
+{
+	struct ulist *tmp;
+	struct ulist_node *node = NULL;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*roots = ulist_alloc(GFP_NOFS);
+	if (!*roots) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+					tmp, *roots);
+		if (ret < 0 && ret != -ENOENT) {
+			ulist_free(tmp);
+			ulist_free(*roots);
+			return ret;
+		}
+		node = ulist_next(tmp, node);
+		if (!node)
+			break;
+		bytenr = node->val;
+	}
+
+	ulist_free(tmp);
+	return 0;
+}
+
+
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
 			struct btrfs_root *fs_root, struct btrfs_path *path,
 			struct btrfs_key *found_key)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
 	struct btrfs_path		*btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 					struct btrfs_path *path);
-- 
cgit v1.2.3


From 4692cf58aa7b81f721c1653d48db99ea41421d58 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 2 Dec 2011 14:56:41 +0100
Subject: Btrfs: new backref walking code

The old backref iteration code could only safely be used on commit roots.
Besides this limitation, it had bugs in finding the roots for these
references. This commit replaces large parts of it by btrfs_find_all_roots()
which a) really finds all roots and the correct roots, b) works correctly
under heavy file system load, c) considers delayed refs.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c | 354 +++++++++++++++--------------------------------------
 fs/btrfs/ioctl.c   |   8 +-
 fs/btrfs/scrub.c   |   7 +-
 3 files changed, 107 insertions(+), 262 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 03c30a1836f4..b9a843226de8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -23,18 +23,6 @@
 #include "transaction.h"
 #include "delayed-ref.h"
 
-struct __data_ref {
-	struct list_head list;
-	u64 inum;
-	u64 root;
-	u64 extent_data_item_offset;
-};
-
-struct __shared_ref {
-	struct list_head list;
-	u64 disk_byte;
-};
-
 /*
  * this structure records all encountered refs on the way up to the root
  */
@@ -964,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical)
+	    found_key->objectid + found_key->offset <= logical) {
+		pr_debug("logical %llu is not within any extent\n",
+			 (unsigned long long)logical);
 		return -ENOENT;
+	}
 
 	eb = path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -974,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(eb, ei);
 
+	pr_debug("logical %llu is at position %llu within the extent (%llu "
+		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+		 (unsigned long long)logical,
+		 (unsigned long long)(logical - found_key->objectid),
+		 (unsigned long long)found_key->objectid,
+		 (unsigned long long)found_key->offset,
+		 (unsigned long long)flags, item_size);
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
 	if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -1070,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 	return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-				u64 extent_data_item_offset, u64 root)
-{
-	struct __data_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->inum = inum;
-	ref->extent_data_item_offset = extent_data_item_offset;
-	ref->root = root;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-				struct btrfs_extent_data_ref *dref)
-{
-	return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-				btrfs_extent_data_ref_offset(eb, dref),
-				btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-	struct __shared_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->disk_byte = disk_byte;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-					   u64 logical, u64 inum,
-					   u64 extent_data_item_offset,
-					   u64 extent_offset,
-					   struct btrfs_path *path,
-					   struct list_head *data_refs,
-					   iterate_extent_inodes_t *iterate,
-					   void *ctx)
-{
-	u64 ref_root;
-	u32 item_size;
-	struct btrfs_key key;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_extent_inline_ref *eiref;
-	struct __data_ref *ref;
-	int ret;
-	int type;
-	int last;
-	unsigned long ptr = 0;
-
-	WARN_ON(!list_empty(data_refs));
-	ret = extent_from_logical(fs_info, logical, path, &key);
-	if (ret & BTRFS_EXTENT_FLAG_DATA)
-		ret = -EIO;
-	if (ret < 0)
-		goto out;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	ret = 0;
-	ref_root = 0;
-	/*
-	 * as done in iterate_extent_inodes, we first build a list of refs to
-	 * iterate, then free the path and then iterate them to avoid deadlocks.
-	 */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-						&eiref, &type);
-		if (last < 0) {
-			ret = last;
-			goto out;
-		}
-		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-		    type == BTRFS_SHARED_BLOCK_REF_KEY) {
-			ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __data_list_add(data_refs, inum,
-						extent_data_item_offset,
-						ref_root);
-		}
-	} while (!ret && !last);
-
-	btrfs_release_path(path);
-
-	if (ref_root == 0) {
-		printk(KERN_ERR "btrfs: failed to find tree block ref "
-			"for shared data backref %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
-	}
-
-out:
-	while (!list_empty(data_refs)) {
-		ref = list_first_entry(data_refs, struct __data_ref, list);
-		list_del(&ref->list);
-		if (!ret)
-			ret = iterate(ref->inum, extent_offset +
-					ref->extent_data_item_offset,
-					ref->root, ctx);
-		kfree(ref);
-	}
-
-	return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-				    u64 logical, u64 orig_extent_item_objectid,
-				    u64 extent_offset, struct btrfs_path *path,
-				    struct list_head *data_refs,
-				    iterate_extent_inodes_t *iterate,
-				    void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path, u64 logical,
+				u64 orig_extent_item_objectid,
+				u64 extent_item_pos, u64 root,
+				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	u64 disk_byte;
 	struct btrfs_key key;
@@ -1199,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
 	struct extent_buffer *eb;
 	int slot;
 	int nritems;
-	int ret;
-	int found = 0;
+	int ret = 0;
+	int extent_type;
+	u64 data_offset;
+	u64 data_len;
 
 	eb = read_tree_block(fs_info->tree_root, logical,
 				fs_info->tree_root->leafsize, 0);
@@ -1218,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
 		if (key.type != BTRFS_EXTENT_DATA_KEY)
 			continue;
 		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-		if (!fi) {
-			free_extent_buffer(eb);
-			return -EIO;
-		}
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
 		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (disk_byte != orig_extent_item_objectid) {
-			if (found)
-				break;
-			else
-				continue;
-		}
-		++found;
-		ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-							key.objectid,
-							key.offset,
-							extent_offset, path,
-							data_refs,
-							iterate, ctx);
-		if (ret)
-			break;
-	}
+		if (disk_byte != orig_extent_item_objectid)
+			continue;
 
-	if (!found) {
-		printk(KERN_ERR "btrfs: failed to follow shared data backref "
-			"to parent %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			continue;
+
+		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+				"root %llu\n", orig_extent_item_objectid,
+				key.objectid, key.offset, root);
+		ret = iterate(key.objectid,
+				key.offset + (extent_item_pos - data_offset),
+				root, ctx);
+		if (ret) {
+			pr_debug("stopping iteration because ret=%d\n", ret);
+			break;
+		}
 	}
 
 	free_extent_buffer(eb);
+
 	return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
+ * path is guaranteed to be in released state when iterate() is called.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				struct btrfs_path *path,
-				u64 extent_item_objectid,
-				u64 extent_offset,
+				u64 extent_item_objectid, u64 extent_item_pos,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
-	unsigned long ptr = 0;
-	int last;
 	int ret;
-	int type;
-	u64 logical;
-	u32 item_size;
-	struct btrfs_extent_inline_ref *eiref;
-	struct btrfs_extent_data_ref *dref;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
 	struct list_head data_refs = LIST_HEAD_INIT(data_refs);
 	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-	struct __data_ref *ref_d;
-	struct __shared_ref *ref_s;
+	struct btrfs_trans_handle *trans;
+	struct ulist *refs;
+	struct ulist *roots;
+	struct ulist_node *ref_node = NULL;
+	struct ulist_node *root_node = NULL;
+	struct seq_list seq_elem;
+	struct btrfs_delayed_ref_root *delayed_refs;
 
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	/* first we iterate the inline refs, ... */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-						&eiref, &type);
-		if (last == -ENOENT) {
-			ret = 0;
-			break;
-		}
-		if (last < 0) {
-			ret = last;
-			break;
-		}
+	trans = btrfs_join_transaction(fs_info->extent_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-			logical = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __shared_list_add(&shared_refs, logical);
-		}
-	} while (!ret && !last);
+	pr_debug("resolving all inodes for extent %llu\n",
+			extent_item_objectid);
 
-	/* ... then we proceed to in-tree references and ... */
-	while (!ret) {
-		++path->slots[0];
-		if (path->slots[0] > btrfs_header_nritems(eb)) {
-			ret = btrfs_next_leaf(fs_info->extent_root, path);
-			if (ret) {
-				if (ret == 1)
-					ret = 0; /* we're done */
-				break;
-			}
-			eb = path->nodes[0];
-		}
-		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-		if (key.objectid != extent_item_objectid)
-			break;
-		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = btrfs_item_ptr(eb, path->slots[0],
-						struct btrfs_extent_data_ref);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-			ret = __shared_list_add(&shared_refs, key.offset);
-		}
-	}
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+	spin_unlock(&delayed_refs->lock);
 
-	btrfs_release_path(path);
+	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+				   extent_item_pos, seq_elem.seq,
+				   &refs);
 
-	/*
-	 * ... only at the very end we can process the refs we found. this is
-	 * because the iterator function we call is allowed to make tree lookups
-	 * and we have to avoid deadlocks. additionally, we need more tree
-	 * lookups ourselves for shared data refs.
-	 */
-	while (!list_empty(&data_refs)) {
-		ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-		list_del(&ref_d->list);
-		if (!ret)
-			ret = iterate(ref_d->inum, extent_offset +
-					ref_d->extent_data_item_offset,
-					ref_d->root, ctx);
-		kfree(ref_d);
-	}
+	if (ret)
+		goto out;
 
-	while (!list_empty(&shared_refs)) {
-		ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-					list);
-		list_del(&ref_s->list);
-		if (!ret)
-			ret = __iter_shared_inline_ref(fs_info,
-							ref_s->disk_byte,
-							extent_item_objectid,
-							extent_offset, path,
-							&data_refs,
-							iterate, ctx);
-		kfree(ref_s);
+	while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+						seq_elem.seq, &roots);
+		if (ret)
+			break;
+		while (!ret && (root_node = ulist_next(roots, root_node))) {
+			pr_debug("root %llu references leaf %llu\n",
+					root_node->val, ref_node->val);
+			ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+						extent_item_objectid,
+						extent_item_pos, root_node->val,
+						iterate, ctx);
+		}
 	}
 
+	ulist_free(refs);
+	ulist_free(roots);
+out:
+	btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+	btrfs_end_transaction(trans, fs_info->extent_root);
 	return ret;
 }
 
@@ -1369,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	u64 offset;
+	u64 extent_item_pos;
 	struct btrfs_key found_key;
 
 	ret = extent_from_logical(fs_info, logical, path,
 					&found_key);
+	btrfs_release_path(path);
 	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = -EINVAL;
 	if (ret < 0)
 		return ret;
 
-	offset = logical - found_key.objectid;
+	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-					offset, iterate, ctx);
+					extent_item_pos, iterate, ctx);
 
 	return ret;
 }
@@ -1426,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
 			name_len = btrfs_inode_ref_name_len(eb, iref);
 			/* path must be released before calling iterate()! */
+			pr_debug("following ref at offset %u for inode %llu in "
+				 "tree %llu\n", cur,
+				 (unsigned long long)found_key.objectid,
+				 (unsigned long long)fs_root->objectid);
 			ret = iterate(parent, iref, eb, ctx);
 			if (ret) {
 				free_extent_buffer(eb);
@@ -1466,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
+		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
+		pr_debug("missed path, not enough space. missing bytes: %lu, "
+			 "constructed so far: %s\n",
+			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c48f2e931ea0..9b0526872b7b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2976,7 +2976,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 {
 	int ret = 0;
 	int size;
-	u64 extent_offset;
+	u64 extent_item_pos;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
@@ -3007,15 +3007,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 	}
 
 	ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+	btrfs_release_path(path);
 
 	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
 
-	extent_offset = loi->logical - key.objectid;
+	extent_item_pos = loi->logical - key.objectid;
 	ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
-					extent_offset, build_ino_list, inodes);
+					extent_item_pos, build_ino_list,
+					inodes);
 
 	if (ret < 0)
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c27bcb67f330..b5edff25a53f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -309,7 +309,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 	u8 ref_level;
 	unsigned long ptr = 0;
 	const int bufsize = 4096;
-	u64 extent_offset;
+	u64 extent_item_pos;
 
 	path = btrfs_alloc_path();
 
@@ -329,12 +329,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 	if (ret < 0)
 		goto out;
 
-	extent_offset = swarn.logical - found_key.objectid;
+	extent_item_pos = swarn.logical - found_key.objectid;
 	swarn.extent_item_size = found_key.offset;
 
 	eb = path->nodes[0];
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+	btrfs_release_path(path);
 
 	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
@@ -351,7 +352,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 	} else {
 		swarn.path = path;
 		iterate_extent_inodes(fs_info, path, found_key.objectid,
-					extent_offset,
+					extent_item_pos,
 					scrub_print_warning_inode, &swarn);
 	}
 
-- 
cgit v1.2.3


From 6bf7e080d5bcb0d399ee38ce3dabbfad64448192 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 1 Dec 2011 14:35:19 +0100
Subject: Btrfs: make sure we're not using obsolete code in btrfs_get_extent

There's code in btrfs_get_extent that should never be used. This patch turns
a WARN_ON(1) into a BUG(), hoping we can remove the transaction code from
btrfs_get_extent soon.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ea819386b864..603d740f0f1c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5022,7 +5022,7 @@ again:
 			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
-			WARN_ON(1);
+			BUG();
 			if (!trans) {
 				kunmap(page);
 				free_extent_map(em);
-- 
cgit v1.2.3


From 203bf287cb01a5dc26c20bd3737cecf3aeba1d48 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Jan 2012 15:23:57 -0500
Subject: Btrfs: run chunk allocations while we do delayed refs

Btrfs tries to batch extent allocation tree changes to improve performance
and reduce metadata trashing.  But it doesn't allocate new metadata chunks
while it is doing allocations for the extent allocation tree.

This commit changes the delayed refence code to do chunk allocations if we're
getting low on room.  It prevents crashes and improves performance.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++++++----
 fs/btrfs/transaction.c |  9 +--------
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..71549d11a09e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2267,9 +2267,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 				BUG_ON(ret);
 				kfree(extent_op);
 
-				cond_resched();
-				spin_lock(&delayed_refs->lock);
-				continue;
+				goto next;
 			}
 
 			list_del_init(&locked_ref->cluster);
@@ -2289,7 +2287,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		btrfs_put_delayed_ref(ref);
 		kfree(extent_op);
 		count++;
-
+next:
+		do_chunk_alloc(trans, root->fs_info->extent_root,
+			       2 * 1024 * 1024,
+			       btrfs_get_alloc_profile(root, 0),
+			       CHUNK_ALLOC_NO_FORCE);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
@@ -2317,6 +2319,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	if (root == root->fs_info->extent_root)
 		root = root->fs_info->tree_root;
 
+	do_chunk_alloc(trans, root->fs_info->extent_root,
+		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+		       CHUNK_ALLOC_NO_FORCE);
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
 again:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..360c2dfd1ee6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
-	while (count < 4) {
+	while (count < 2) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
 		if (cur &&
 		    trans->transaction->delayed_refs.num_heads_ready > 64) {
 			trans->delayed_ref_updates = 0;
-
-			/*
-			 * do a full flush if the transaction is trying
-			 * to close
-			 */
-			if (trans->transaction->delayed_refs.flushing)
-				cur = 0;
 			btrfs_run_delayed_refs(trans, root, cur);
 		} else {
 			break;
-- 
cgit v1.2.3


From cf1d72c9ceec391d34c48724da57282e97f01122 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Jan 2012 15:41:34 -0500
Subject: Btrfs: lower the bar for chunk allocation

The chunk allocation code has tried to keep a pretty tight lid on creating new
metadata chunks.  This is partially because in the past the reservation
code didn't give us an accurate idea of how much space was being used.

The new code is much more accurate, so we're able to get rid of some of these
checks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71549d11a09e..247d2c94f8ec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3263,27 +3263,12 @@ static int should_alloc_chunk(struct btrfs_root *root,
 		if (num_bytes - num_allocated < thresh)
 			return 1;
 	}
-
-	/*
-	 * we have two similar checks here, one based on percentage
-	 * and once based on a hard number of 256MB.  The idea
-	 * is that if we have a good amount of free
-	 * room, don't allocate a chunk.  A good mount is
-	 * less than 80% utilized of the chunks we have allocated,
-	 * or more than 256MB free
-	 */
-	if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
-		return 0;
-
-	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-		return 0;
-
 	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-	/* 256MB or 5% of the FS */
-	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+	/* 256MB or 2% of the FS */
+	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
 
-	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
 		return 0;
 	return 1;
 }
-- 
cgit v1.2.3


From 1100373f8aa69e377386499350496e3d8565605f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Jan 2012 15:47:38 -0500
Subject: Btrfs: use bigger metadata chunks on bigger filesystems

The 256MB chunk is a little small on a huge FS.  This scales up the
chunk size.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..ac00e3aa80a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2441,7 +2441,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		max_stripe_size = 1024 * 1024 * 1024;
 		max_chunk_size = 10 * max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-		max_stripe_size = 256 * 1024 * 1024;
+		/* for larger filesystems, use larger metadata chunks */
+		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+			max_stripe_size = 1024 * 1024 * 1024;
+		else
+			max_stripe_size = 256 * 1024 * 1024;
 		max_chunk_size = max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		max_stripe_size = 8 * 1024 * 1024;
-- 
cgit v1.2.3


From a5f6f719a5cd7caeee8ed8137cf3f94c3bbebc65 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <lxoliva@fsfla.org>
Date: Mon, 12 Dec 2011 04:48:19 -0200
Subject: Btrfs: test free space only for unclustered allocation

Since the clustered allocation may be taking extents from a different
block group, there's no point in spin-locking and testing the current
block group free space before attempting to allocate space from a
cluster, even more so when we might refrain from even trying the
cluster in the current block group because, after the cluster was set
up, not enough free space remained.  Furthermore, cluster creation
attempts fail fast when the block group doesn't have enough free
space, so the test was completely superfluous.

I've move the free space test past the cluster allocation attempt,
where it is more useful, and arranged for a cluster in the current
block group to be released before trying an unclustered allocation,
when we reach the LOOP_NO_EMPTY_SIZE stage, so that the free space in
the cluster stands a chance of being combined with additional free
space in the block group so as to succeed in the allocation attempt.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 247d2c94f8ec..5ea3acc53241 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5286,15 +5286,6 @@ alloc:
 		if (unlikely(block_group->ro))
 			goto loop;
 
-		spin_lock(&block_group->free_space_ctl->tree_lock);
-		if (cached &&
-		    block_group->free_space_ctl->free_space <
-		    num_bytes + empty_cluster + empty_size) {
-			spin_unlock(&block_group->free_space_ctl->tree_lock);
-			goto loop;
-		}
-		spin_unlock(&block_group->free_space_ctl->tree_lock);
-
 		/*
 		 * Ok we want to try and use the cluster allocator, so
 		 * lets look there
@@ -5340,8 +5331,15 @@ refill_cluster:
 			 * plenty of times and not have found
 			 * anything, so we are likely way too
 			 * fragmented for the clustering stuff to find
-			 * anything.  */
-			if (loop >= LOOP_NO_EMPTY_SIZE) {
+			 * anything.
+			 *
+			 * However, if the cluster is taken from the
+			 * current block group, release the cluster
+			 * first, so that we stand a better chance of
+			 * succeeding in the unclustered
+			 * allocation.  */
+			if (loop >= LOOP_NO_EMPTY_SIZE &&
+			    last_ptr->block_group != block_group) {
 				spin_unlock(&last_ptr->refill_lock);
 				goto unclustered_alloc;
 			}
@@ -5352,6 +5350,11 @@ refill_cluster:
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
@@ -5392,6 +5395,15 @@ refill_cluster:
 		}
 
 unclustered_alloc:
+		spin_lock(&block_group->free_space_ctl->tree_lock);
+		if (cached &&
+		    block_group->free_space_ctl->free_space <
+		    num_bytes + empty_cluster + empty_size) {
+			spin_unlock(&block_group->free_space_ctl->tree_lock);
+			goto loop;
+		}
+		spin_unlock(&block_group->free_space_ctl->tree_lock);
+
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
 		/*
-- 
cgit v1.2.3


From fc7c1077ceb99c35e5f9d0ce03dc7740565bb2bf Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <lxoliva@fsfla.org>
Date: Mon, 28 Nov 2011 12:36:17 -0200
Subject: Btrfs: don't set up allocation result twice

We store the allocation start and length twice in ins, once right
after the other, but with intervening calls that may prevent the
duplicate from being optimized out by the compiler.  Remove one of the
assignments.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5ea3acc53241..37594e4bf660 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5441,9 +5441,6 @@ checks:
 			goto loop;
 		}
 
-		ins->objectid = search_start;
-		ins->offset = num_bytes;
-
 		if (offset < search_start)
 			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
-- 
cgit v1.2.3


From 1bb91902dc90e25449893e693ad45605cb08fbe5 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <lxoliva@fsfla.org>
Date: Fri, 14 Oct 2011 12:10:36 -0300
Subject: Btrfs: revamp clustered allocation logic

Parameterize clusters on minimum total size, minimum chunk size and
minimum contiguous size for at least one chunk, without limits on
cluster, window or gap sizes.  Don't tolerate any fragmentation for
SSD_SPREAD; accept it for metadata, but try to keep data dense.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 112 +++++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec23d43d0c35..ce40db59c706 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2283,23 +2283,23 @@ out:
 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 				struct btrfs_free_space *entry,
 				struct btrfs_free_cluster *cluster,
-				u64 offset, u64 bytes, u64 min_bytes)
+				u64 offset, u64 bytes,
+				u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	unsigned long next_zero;
 	unsigned long i;
-	unsigned long search_bits;
-	unsigned long total_bits;
+	unsigned long want_bits;
+	unsigned long min_bits;
 	unsigned long found_bits;
 	unsigned long start = 0;
 	unsigned long total_found = 0;
 	int ret;
-	bool found = false;
 
 	i = offset_to_bit(entry->offset, block_group->sectorsize,
 			  max_t(u64, offset, entry->offset));
-	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+	want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+	min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
 
 again:
 	found_bits = 0;
@@ -2308,7 +2308,7 @@ again:
 	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
 		next_zero = find_next_zero_bit(entry->bitmap,
 					       BITS_PER_BITMAP, i);
-		if (next_zero - i >= search_bits) {
+		if (next_zero - i >= min_bits) {
 			found_bits = next_zero - i;
 			break;
 		}
@@ -2318,10 +2318,9 @@ again:
 	if (!found_bits)
 		return -ENOSPC;
 
-	if (!found) {
+	if (!total_found) {
 		start = i;
 		cluster->max_size = 0;
-		found = true;
 	}
 
 	total_found += found_bits;
@@ -2329,13 +2328,8 @@ again:
 	if (cluster->max_size < found_bits * block_group->sectorsize)
 		cluster->max_size = found_bits * block_group->sectorsize;
 
-	if (total_found < total_bits) {
-		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-		if (i - start > total_bits * 2) {
-			total_found = 0;
-			cluster->max_size = 0;
-			found = false;
-		}
+	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+		i = next_zero + 1;
 		goto again;
 	}
 
@@ -2351,23 +2345,23 @@ again:
 
 /*
  * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
  */
 static noinline int
 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 			struct btrfs_free_cluster *cluster,
 			struct list_head *bitmaps, u64 offset, u64 bytes,
-			u64 min_bytes)
+			u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *first = NULL;
 	struct btrfs_free_space *entry = NULL;
-	struct btrfs_free_space *prev = NULL;
 	struct btrfs_free_space *last;
 	struct rb_node *node;
 	u64 window_start;
 	u64 window_free;
 	u64 max_extent;
-	u64 max_gap = 128 * 1024;
 
 	entry = tree_search_offset(ctl, offset, 0, 1);
 	if (!entry)
@@ -2377,8 +2371,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 	 * We don't want bitmaps, so just move along until we find a normal
 	 * extent entry.
 	 */
-	while (entry->bitmap) {
-		if (list_empty(&entry->list))
+	while (entry->bitmap || entry->bytes < min_bytes) {
+		if (entry->bitmap && list_empty(&entry->list))
 			list_add_tail(&entry->list, bitmaps);
 		node = rb_next(&entry->offset_index);
 		if (!node)
@@ -2391,12 +2385,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 	max_extent = entry->bytes;
 	first = entry;
 	last = entry;
-	prev = entry;
 
-	while (window_free <= min_bytes) {
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
+	for (node = rb_next(&entry->offset_index); node;
+	     node = rb_next(&entry->offset_index)) {
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 
 		if (entry->bitmap) {
@@ -2405,26 +2396,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 			continue;
 		}
 
-		/*
-		 * we haven't filled the empty size and the window is
-		 * very large.  reset and try again
-		 */
-		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-		    entry->offset - window_start > (min_bytes * 2)) {
-			first = entry;
-			window_start = entry->offset;
-			window_free = entry->bytes;
-			last = entry;
+		if (entry->bytes < min_bytes)
+			continue;
+
+		last = entry;
+		window_free += entry->bytes;
+		if (entry->bytes > max_extent)
 			max_extent = entry->bytes;
-		} else {
-			last = entry;
-			window_free += entry->bytes;
-			if (entry->bytes > max_extent)
-				max_extent = entry->bytes;
-		}
-		prev = entry;
 	}
 
+	if (window_free < bytes || max_extent < cont1_bytes)
+		return -ENOSPC;
+
 	cluster->window_start = first->offset;
 
 	node = &first->offset_index;
@@ -2438,7 +2421,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 		node = rb_next(&entry->offset_index);
-		if (entry->bitmap)
+		if (entry->bitmap || entry->bytes < min_bytes)
 			continue;
 
 		rb_erase(&entry->offset_index, &ctl->free_space_offset);
@@ -2460,7 +2443,7 @@ static noinline int
 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 		     struct btrfs_free_cluster *cluster,
 		     struct list_head *bitmaps, u64 offset, u64 bytes,
-		     u64 min_bytes)
+		     u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry;
@@ -2485,7 +2468,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 		if (entry->bytes < min_bytes)
 			continue;
 		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-					   bytes, min_bytes);
+					   bytes, cont1_bytes, min_bytes);
 		if (!ret)
 			return 0;
 	}
@@ -2499,7 +2482,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 
 /*
  * here we try to find a cluster of blocks in a block group.  The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
  * We might not find them all in one contiguous area.
  *
  * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2498,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	struct btrfs_free_space *entry, *tmp;
 	LIST_HEAD(bitmaps);
 	u64 min_bytes;
+	u64 cont1_bytes;
 	int ret;
 
-	/* for metadata, allow allocates with more holes */
+	/*
+	 * Choose the minimum extent size we'll require for this
+	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+	 * For metadata, allow allocates with smaller extents.  For
+	 * data, keep it dense.
+	 */
 	if (btrfs_test_opt(root, SSD_SPREAD)) {
-		min_bytes = bytes + empty_size;
+		cont1_bytes = min_bytes = bytes + empty_size;
 	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
-		/*
-		 * we want to do larger allocations when we are
-		 * flushing out the delayed refs, it helps prevent
-		 * making more work as we go along.
-		 */
-		if (trans->transaction->delayed_refs.flushing)
-			min_bytes = max(bytes, (bytes + empty_size) >> 1);
-		else
-			min_bytes = max(bytes, (bytes + empty_size) >> 4);
-	} else
-		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+		cont1_bytes = bytes;
+		min_bytes = block_group->sectorsize;
+	} else {
+		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+		min_bytes = block_group->sectorsize;
+	}
 
 	spin_lock(&ctl->tree_lock);
 
@@ -2539,7 +2523,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	 * If we know we don't have enough space to make a cluster don't even
 	 * bother doing all the work to try and find one.
 	 */
-	if (ctl->free_space < min_bytes) {
+	if (ctl->free_space < bytes) {
 		spin_unlock(&ctl->tree_lock);
 		return -ENOSPC;
 	}
@@ -2553,10 +2537,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	}
 
 	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-				      bytes, min_bytes);
+				      bytes + empty_size,
+				      cont1_bytes, min_bytes);
 	if (ret)
 		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-					   offset, bytes, min_bytes);
+					   offset, bytes + empty_size,
+					   cont1_bytes, min_bytes);
 
 	/* Clear our temporary list */
 	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
-- 
cgit v1.2.3


From 98c7089c769048f941bd5c5285287f8fc301f8b1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:00:31 -0500
Subject: btrfs: preparation to fixing mount/umount race

We need fs_info and root to live until the moment when the victim
superblock leaves the list, so we need to postpone free_fs_info()
until after ->put_super().  The call is buried in close_ctree(),
though, so we need to lift it into the callers (including
btrfs_put_super()) first.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 3 +--
 fs/btrfs/super.c   | 6 +++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..dcb5d949b543 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2424,6 +2424,7 @@ retry_root_backup:
 		up_read(&fs_info->cleanup_work_sem);
 		if (err) {
 			close_ctree(tree_root);
+			free_fs_info(fs_info);
 			return ERR_PTR(err);
 		}
 	}
@@ -3059,8 +3060,6 @@ int close_ctree(struct btrfs_root *root)
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
-	free_fs_info(fs_info);
-
 	return 0;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966a..a3f435e58987 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -151,6 +151,7 @@ static void btrfs_put_super(struct super_block *sb)
 	int ret;
 
 	ret = close_ctree(root);
+	free_fs_info(root->fs_info);
 	sb->s_fs_info = NULL;
 
 	(void)ret; /* FIXME: need to fix VFS to return error? */
@@ -589,6 +590,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	struct inode *inode;
 	struct dentry *root_dentry;
 	struct btrfs_root *tree_root;
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_key key;
 	int err;
 
@@ -609,12 +611,13 @@ static int btrfs_fill_super(struct super_block *sb,
 		printk("btrfs: open_ctree failed\n");
 		return PTR_ERR(tree_root);
 	}
+	fs_info = tree_root->fs_info;
 	sb->s_fs_info = tree_root;
 
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+	inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail_close;
@@ -635,6 +638,7 @@ static int btrfs_fill_super(struct super_block *sb,
 
 fail_close:
 	close_ctree(tree_root);
+	free_fs_info(fs_info);
 	return err;
 }
 
-- 
cgit v1.2.3


From aea52e19dd2085617dd7d247afb76a90cf2ea40c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:22:46 -0500
Subject: btrfs: get ->kill_sb() of its own

... and move free_fs_info() to that, out of ->put_super().  Do NOT
set ->s_fs_info to NULL in the latter; we need it for sget() to
be able to see and wait for fs in the middle of umount if we get a
mount/umount race.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a3f435e58987..eca48624a4f0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,14 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 
 static void btrfs_put_super(struct super_block *sb)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
-	int ret;
-
-	ret = close_ctree(root);
-	free_fs_info(root->fs_info);
-	sb->s_fs_info = NULL;
-
-	(void)ret; /* FIXME: need to fix VFS to return error? */
+	(void)close_ctree(btrfs_sb(sb));
+	/* FIXME: need to fix VFS to return error? */
+	/* AV: return it _where_?  ->put_super() can be triggered by any number
+	 * of async events, up to and including delivery of SIGKILL to the
+	 * last process that kept it busy.  Or segfault in the aforementioned
+	 * process...  Whom would you report that to?
+	 */
 }
 
 enum {
@@ -1223,11 +1222,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static void btrfs_kill_super(struct super_block *sb)
+{
+	struct btrfs_fs_info *fs_info = NULL;
+	if (sb->s_root)
+		fs_info = btrfs_sb(sb)->fs_info;
+	kill_anon_super(sb);
+	if (fs_info)
+		free_fs_info(fs_info);
+}
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
 	.mount		= btrfs_mount,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= btrfs_kill_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-- 
cgit v1.2.3


From 6de1d09d9677dce7a04ef485d426821159ab7de9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:29:09 -0500
Subject: btrfs: fix mount/umount race

Do *NOT* skip doomed superblocks in btrfs_test_super(); we want
sget() to wait for their shutdown to complete.  Since we don't
mutilate ->s_fs_info in ->put_super() anymore (or free what it
used to point to until the superblock is past being findable
by sget()), we can just DTRT there and report a match.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index eca48624a4f0..b9fd62a0fca2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -733,20 +733,15 @@ static int btrfs_test_super(struct super_block *s, void *data)
 	struct btrfs_root *test_root = data;
 	struct btrfs_root *root = btrfs_sb(s);
 
-	/*
-	 * If this super block is going away, return false as it
-	 * can't match as an existing super block.
-	 */
-	if (!atomic_read(&s->s_active))
-		return 0;
 	return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
 }
 
 static int btrfs_set_super(struct super_block *s, void *data)
 {
-	s->s_fs_info = data;
-
-	return set_anon_super(s, data);
+	int err = set_anon_super(s, data);
+	if (!err)
+		s->s_fs_info = data;
+	return err;
 }
 
 /*
-- 
cgit v1.2.3


From 10f6327b5d26660e06120ba17cff993cb616b1d2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 15:05:22 -0500
Subject: btrfs: fix a deadlock in btrfs_scan_one_device()

pathname resolution under a global mutex, taken on some paths in ->mount()
is a Bad Idea(tm) - think what happens if said pathname resolution triggers
automount of some btrfs instance and walks into attempt to grab the same
mutex.  Deadlock - we are waiting for daemon to finish walking the path,
daemon is waiting for us to release the mutex...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/volumes.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..96bb3c0a79b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -706,8 +706,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	u64 devid;
 	u64 transid;
 
-	mutex_lock(&uuid_mutex);
-
 	flags |= FMODE_EXCL;
 	bdev = blkdev_get_by_path(path, flags, holder);
 
@@ -716,6 +714,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error;
 	}
 
+	mutex_lock(&uuid_mutex);
 	ret = set_blocksize(bdev, 4096);
 	if (ret)
 		goto error_close;
@@ -737,9 +736,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	brelse(bh);
 error_close:
+	mutex_unlock(&uuid_mutex);
 	blkdev_put(bdev, flags);
 error:
-	mutex_unlock(&uuid_mutex);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2eb34cd368b3e5cef960ae9d9971a037cd6fa9d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 00:32:06 -0500
Subject: btrfs: sanitizing ->fs_info, part 1

take assignment of ->fs_info to callers of __setup_root()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dcb5d949b543..04eba10af825 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1142,7 +1142,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->orphan_item_inserted = 0;
 	root->orphan_cleanup_state = 0;
 
-	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
 	root->highest_objectid = 0;
@@ -1193,6 +1192,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	u32 blocksize;
 	u64 generation;
 
+	root->fs_info = fs_info;
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, objectid);
@@ -1227,6 +1227,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
+	root->fs_info = fs_info;
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1330,6 +1331,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 		goto out;
 	}
 
+	root->fs_info = fs_info;
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, location->objectid);
@@ -2055,6 +2057,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	tree_root->fs_info = fs_info;
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -2247,6 +2250,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 				     btrfs_super_chunk_root_level(disk_super));
 	generation = btrfs_super_chunk_root_generation(disk_super);
 
+	chunk_root->fs_info = fs_info;
 	__setup_root(nodesize, leafsize, sectorsize, stripesize,
 		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
@@ -2373,6 +2377,7 @@ retry_root_backup:
 			goto fail_trans_kthread;
 		}
 
+		log_tree_root->fs_info = fs_info;
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
-- 
cgit v1.2.3


From 1233f546ec29ac32424327baf6ae5df81e3d9eae Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 00:34:00 -0500
Subject: btrfs: sanitizing ->fs_info, part 2

lift assignment to callers of find_and_setup_root()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 04eba10af825..b8dae517f80c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,7 +1192,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	u32 blocksize;
 	u64 generation;
 
-	root->fs_info = fs_info;
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, objectid);
@@ -1322,6 +1321,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 	if (location->offset == (u64)-1) {
+		root->fs_info = fs_info;
 		ret = find_and_setup_root(tree_root, fs_info,
 					  location->objectid, root);
 		if (ret) {
@@ -2300,18 +2300,21 @@ retry_root_backup:
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
+	extent_root->fs_info = fs_info;
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret)
 		goto recovery_tree_root;
 	extent_root->track_dirty = 1;
 
+	dev_root->fs_info = fs_info;
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	if (ret)
 		goto recovery_tree_root;
 	dev_root->track_dirty = 1;
 
+	csum_root->fs_info = fs_info;
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-- 
cgit v1.2.3


From 38a77db49ad8f78369dcdfb693b8e5a818a60104 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 00:38:54 -0500
Subject: btrfs: sanitizing ->fs_info, part 3

move assignments to ->fs_info in open_ctree() up, to the place
just after the original allocations.  Assignment for tree_root
becomes a no-op - we'd obtained fs_info from tree_root->fs_info
in the first place.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8dae517f80c..06480fcf0fb3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1320,8 +1320,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
+	root->fs_info = fs_info;
 	if (location->offset == (u64)-1) {
-		root->fs_info = fs_info;
 		ret = find_and_setup_root(tree_root, fs_info,
 					  location->objectid, root);
 		if (ret) {
@@ -1331,7 +1331,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 		goto out;
 	}
 
-	root->fs_info = fs_info;
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, location->objectid);
@@ -1914,6 +1913,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
+	chunk_root->fs_info = fs_info;
+	extent_root->fs_info = fs_info;
+	dev_root->fs_info = fs_info;
+	csum_root->fs_info = fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -2057,7 +2060,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
-	tree_root->fs_info = fs_info;
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -2250,7 +2252,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 				     btrfs_super_chunk_root_level(disk_super));
 	generation = btrfs_super_chunk_root_generation(disk_super);
 
-	chunk_root->fs_info = fs_info;
 	__setup_root(nodesize, leafsize, sectorsize, stripesize,
 		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
@@ -2300,21 +2301,18 @@ retry_root_backup:
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
-	extent_root->fs_info = fs_info;
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret)
 		goto recovery_tree_root;
 	extent_root->track_dirty = 1;
 
-	dev_root->fs_info = fs_info;
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	if (ret)
 		goto recovery_tree_root;
 	dev_root->track_dirty = 1;
 
-	csum_root->fs_info = fs_info;
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-- 
cgit v1.2.3


From 6f07e42ee6fcc252a210781d7262f4051e9fd8f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 00:46:16 -0500
Subject: btrfs: sanitizing ->fs_info, part 4

A new helper: btrfs_alloc_root(fs_info); allocates btrfs_root
and sets ->fs_info.  All places allocating the suckers converted
to it.  At that point we *never* reassign ->fs_info of btrfs_root;
it's set before anyone sees the address of newly allocated
struct btrfs_root and never assigned anywhere else.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 33 +++++++++++++++------------------
 fs/btrfs/disk-io.h |  2 ++
 fs/btrfs/super.c   |  3 +--
 3 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 06480fcf0fb3..ee846ce58846 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1215,6 +1215,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
+struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+	if (root)
+		root->fs_info = fs_info;
+	return root;
+}
+
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -1222,11 +1230,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct extent_buffer *leaf;
 
-	root = kzalloc(sizeof(*root), GFP_NOFS);
+	root = btrfs_alloc_root(fs_info);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	root->fs_info = fs_info;
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1317,10 +1324,9 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	u32 blocksize;
 	int ret = 0;
 
-	root = kzalloc(sizeof(*root), GFP_NOFS);
+	root = btrfs_alloc_root(fs_info);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
-	root->fs_info = fs_info;
 	if (location->offset == (u64)-1) {
 		ret = find_and_setup_root(tree_root, fs_info,
 					  location->objectid, root);
@@ -1900,23 +1906,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
-	extent_root = fs_info->extent_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	csum_root = fs_info->csum_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	chunk_root = fs_info->chunk_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	dev_root = fs_info->dev_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+	csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+	dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
 
 	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	chunk_root->fs_info = fs_info;
-	extent_root->fs_info = fs_info;
-	dev_root->fs_info = fs_info;
-	csum_root->fs_info = fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -2372,13 +2370,12 @@ retry_root_backup:
 		     btrfs_level_size(tree_root,
 				      btrfs_super_log_root_level(disk_super));
 
-		log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+		log_tree_root = btrfs_alloc_root(fs_info);
 		if (!log_tree_root) {
 			err = -ENOMEM;
 			goto fail_trans_kthread;
 		}
 
-		log_tree_root->fs_info = fs_info;
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..2bb5f59ddf95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -86,6 +86,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 
+struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
 void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b9fd62a0fca2..e9f876a1655b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -901,12 +901,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (!fs_info)
 		return ERR_PTR(-ENOMEM);
 
-	fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	fs_info->tree_root = btrfs_alloc_root(fs_info);
 	if (!fs_info->tree_root) {
 		error = -ENOMEM;
 		goto error_fs_info;
 	}
-	fs_info->tree_root->fs_info = fs_info;
 	fs_info->fs_devices = fs_devices;
 
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
-- 
cgit v1.2.3


From e3029d9fd426c8f582210ba35551ae5506218345 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 00:56:18 -0500
Subject: btrfs: sanitizing ->fs_info, part 5

close_ctree() uses a weird mix of accesses to root->fs_info and
its value at the beginning of function stored in local variable.
Since ->fs_info *never* changes, let's just use the local variable
to avoid confusion.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ee846ce58846..9308c7f13c17 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2982,7 +2982,7 @@ int close_ctree(struct btrfs_root *root)
 		   (atomic_read(&fs_info->defrag_running) == 0));
 
 	/* clear out the rbtree of defraggable inodes */
-	btrfs_run_defrag_inodes(root->fs_info);
+	btrfs_run_defrag_inodes(fs_info);
 
 	/*
 	 * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3011,8 +3011,8 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_put_block_group_cache(fs_info);
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
+	kthread_stop(fs_info->transaction_kthread);
+	kthread_stop(fs_info->cleaner_kthread);
 
 	fs_info->closing = 2;
 	smp_mb();
@@ -3030,14 +3030,14 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(fs_info->extent_root->commit_root);
 	free_extent_buffer(fs_info->tree_root->node);
 	free_extent_buffer(fs_info->tree_root->commit_root);
-	free_extent_buffer(root->fs_info->chunk_root->node);
-	free_extent_buffer(root->fs_info->chunk_root->commit_root);
-	free_extent_buffer(root->fs_info->dev_root->node);
-	free_extent_buffer(root->fs_info->dev_root->commit_root);
-	free_extent_buffer(root->fs_info->csum_root->node);
-	free_extent_buffer(root->fs_info->csum_root->commit_root);
-
-	btrfs_free_block_groups(root->fs_info);
+	free_extent_buffer(fs_info->chunk_root->node);
+	free_extent_buffer(fs_info->chunk_root->commit_root);
+	free_extent_buffer(fs_info->dev_root->node);
+	free_extent_buffer(fs_info->dev_root->commit_root);
+	free_extent_buffer(fs_info->csum_root->node);
+	free_extent_buffer(fs_info->csum_root->commit_root);
+
+	btrfs_free_block_groups(fs_info);
 
 	del_fs_roots(fs_info);
 
-- 
cgit v1.2.3


From ad2b2c802be2d3e8ed8364fef5ffaddabe448219 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:10:02 -0500
Subject: btrfs: make open_ctree() return int

It returns either ERR_PTR(-ve) or sb->s_fs_info.  The latter can
be found by caller just as well, TYVM, no need to return it.  Just
return -ve or 0...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 12 ++++++------
 fs/btrfs/disk-io.h |  6 +++---
 fs/btrfs/super.c   |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9308c7f13c17..78247522c693 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1880,9 +1880,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 }
 
 
-struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices,
-			      char *options)
+int open_ctree(struct super_block *sb,
+	       struct btrfs_fs_devices *fs_devices,
+	       char *options)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -2428,11 +2428,11 @@ retry_root_backup:
 		if (err) {
 			close_ctree(tree_root);
 			free_fs_info(fs_info);
-			return ERR_PTR(err);
+			return err;
 		}
 	}
 
-	return tree_root;
+	return 0;
 
 fail_trans_kthread:
 	kthread_stop(fs_info->transaction_kthread);
@@ -2479,7 +2479,7 @@ fail_srcu:
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
 	free_fs_info(fs_info);
-	return ERR_PTR(err);
+	return err;
 
 recovery_tree_root:
 	if (!btrfs_test_opt(tree_root, RECOVERY))
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2bb5f59ddf95..6f5f48778b00 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices,
-			      char *options);
+int open_ctree(struct super_block *sb,
+	       struct btrfs_fs_devices *fs_devices,
+	       char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e9f876a1655b..56e007fd6702 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -604,12 +604,12 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_flags |= MS_POSIXACL;
 #endif
 
-	tree_root = open_ctree(sb, fs_devices, (char *)data);
-
-	if (IS_ERR(tree_root)) {
+	err = open_ctree(sb, fs_devices, (char *)data);
+	if (err) {
 		printk("btrfs: open_ctree failed\n");
-		return PTR_ERR(tree_root);
+		return err;
 	}
+	tree_root = sb->s_fs_info;
 	fs_info = tree_root->fs_info;
 	sb->s_fs_info = tree_root;
 
-- 
cgit v1.2.3


From 29db78aa0ac82319b764b87a1c5030d74523e296 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:12:38 -0500
Subject: btrfs: kill pointless reassignment of ->s_fs_info in
 btrfs_fill_super()

We do not (fortunately) modify ->s_fs_info of superblock on the fly in
btrfs_fill_super(); apparent assignment is a no-op.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 56e007fd6702..a381f9f9b0c2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -588,8 +588,8 @@ static int btrfs_fill_super(struct super_block *sb,
 {
 	struct inode *inode;
 	struct dentry *root_dentry;
-	struct btrfs_root *tree_root;
-	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *tree_root = sb->s_fs_info;
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_key key;
 	int err;
 
@@ -609,9 +609,6 @@ static int btrfs_fill_super(struct super_block *sb,
 		printk("btrfs: open_ctree failed\n");
 		return err;
 	}
-	tree_root = sb->s_fs_info;
-	fs_info = tree_root->fs_info;
-	sb->s_fs_info = tree_root;
 
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
-- 
cgit v1.2.3


From be7e0950def403e90b5295ff2192c39967bf2aec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:40:05 -0500
Subject: btrfs: merge free_fs_info() calls on fill_super failures

... all the way up into btrfs_mount().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 2 --
 fs/btrfs/super.c   | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 78247522c693..b6d11eb4e430 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2427,7 +2427,6 @@ retry_root_backup:
 		up_read(&fs_info->cleanup_work_sem);
 		if (err) {
 			close_ctree(tree_root);
-			free_fs_info(fs_info);
 			return err;
 		}
 	}
@@ -2478,7 +2477,6 @@ fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
-	free_fs_info(fs_info);
 	return err;
 
 recovery_tree_root:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a381f9f9b0c2..8901b6c85260 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -634,7 +634,6 @@ static int btrfs_fill_super(struct super_block *sb,
 
 fail_close:
 	close_ctree(tree_root);
-	free_fs_info(fs_info);
 	return err;
 }
 
@@ -947,6 +946,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
+			free_fs_info(fs_info);
 			deactivate_locked_super(s);
 			return ERR_PTR(error);
 		}
-- 
cgit v1.2.3


From d22ca7de770e2a683eac000ba4db5a95e2f4c969 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:46:50 -0500
Subject: btrfs: make free_fs_info() call ->kill_sb() unconditional

... and don't bother with it after btrfs_fill_super() failure -
->kill_sb() (unlike ->put_super()) will be called even if we
have not got non-NULL ->s_root.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8901b6c85260..f628a6a79ca6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -946,7 +946,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
-			free_fs_info(fs_info);
 			deactivate_locked_super(s);
 			return ERR_PTR(error);
 		}
@@ -1215,12 +1214,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static void btrfs_kill_super(struct super_block *sb)
 {
-	struct btrfs_fs_info *fs_info = NULL;
-	if (sb->s_root)
-		fs_info = btrfs_sb(sb)->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
 	kill_anon_super(sb);
-	if (fs_info)
-		free_fs_info(fs_info);
+	free_fs_info(fs_info);
 }
 
 static struct file_system_type btrfs_fs_type = {
-- 
cgit v1.2.3


From 59553edf110e5576d91be9dd5bd53d110e0d0290 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 01:56:28 -0500
Subject: btrfs: consolidate failure exits in btrfs_mount() a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f628a6a79ca6..15c8ae571e40 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -630,6 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
 
 	save_mount_options(sb, data);
 	cleancache_init_fs(sb);
+	sb->s_flags |= MS_ACTIVE;
 	return 0;
 
 fail_close:
@@ -929,14 +930,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (s->s_root) {
-		if ((flags ^ s->s_flags) & MS_RDONLY) {
-			deactivate_locked_super(s);
-			error = -EBUSY;
-			goto error_close_devices;
-		}
-
 		btrfs_close_devices(fs_devices);
 		free_fs_info(fs_info);
+		if ((flags ^ s->s_flags) & MS_RDONLY)
+			error = -EBUSY;
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -945,19 +942,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(s);
-			return ERR_PTR(error);
-		}
-
-		s->s_flags |= MS_ACTIVE;
 	}
 
-	root = get_default_root(s, subvol_objectid);
-	if (IS_ERR(root)) {
+	root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+	if (IS_ERR(root))
 		deactivate_locked_super(s);
-		return root;
-	}
 
 	return root;
 
-- 
cgit v1.2.3


From 815745cf3e46681241ad8025602ffbf2a452d514 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 15:40:49 -0500
Subject: btrfs: let ->s_fs_info point to fs_info, not root...

the latter can be obtained from the former (by looking as ->tree_root)
just as cheaply as we currently are doing the other way round.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/export.c  |  2 +-
 fs/btrfs/ioctl.c   |  7 +++--
 fs/btrfs/super.c   | 75 +++++++++++++++++++++++++++---------------------------
 5 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..dbb999a38272 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2196,7 +2196,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
 	return btrfs_item_size(eb, e) - offset;
 }
 
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b6d11eb4e430..f7d8b9ba519b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1894,8 +1894,8 @@ int open_ctree(struct super_block *sb,
 	struct btrfs_key location;
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
-	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *csum_root;
 	struct btrfs_root *chunk_root;
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..5f77166fd01c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 				       u64 root_objectid, u32 generation,
 				       int check_generation)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *root;
 	struct inode *inode;
 	struct btrfs_key key;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480fd..0c55f8f38091 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -276,14 +276,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 
 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
 	struct btrfs_device *device;
 	struct request_queue *q;
 	struct fstrim_range range;
 	u64 minlen = ULLONG_MAX;
 	u64 num_devices = 0;
-	u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +311,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
 	range.len = min(range.len, total_bytes - range.start);
 	range.minlen = max(range.minlen, minlen);
-	ret = btrfs_trim_fs(root, &range);
+	ret = btrfs_trim_fs(fs_info->tree_root, &range);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15c8ae571e40..305adf6dc09a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,7 +147,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 
 static void btrfs_put_super(struct super_block *sb)
 {
-	(void)close_ctree(btrfs_sb(sb));
+	(void)close_ctree(btrfs_sb(sb)->tree_root);
 	/* FIXME: need to fix VFS to return error? */
 	/* AV: return it _where_?  ->put_super() can be triggered by any number
 	 * of async events, up to and including delivery of SIGKILL to the
@@ -500,7 +500,8 @@ out:
 static struct dentry *get_default_root(struct super_block *sb,
 				       u64 subvol_objectid)
 {
-	struct btrfs_root *root = sb->s_fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_root *new_root;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
@@ -530,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 	 * will mount by default if we haven't been given a specific subvolume
 	 * to mount.
 	 */
-	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+	dir_id = btrfs_super_root_dir(fs_info->super_copy);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
 	if (IS_ERR(di)) {
 		btrfs_free_path(path);
@@ -544,7 +545,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 		 */
 		btrfs_free_path(path);
 		dir_id = BTRFS_FIRST_FREE_OBJECTID;
-		new_root = root->fs_info->fs_root;
+		new_root = fs_info->fs_root;
 		goto setup_root;
 	}
 
@@ -552,7 +553,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 	btrfs_free_path(path);
 
 find_root:
-	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);
 
@@ -588,8 +589,7 @@ static int btrfs_fill_super(struct super_block *sb,
 {
 	struct inode *inode;
 	struct dentry *root_dentry;
-	struct btrfs_root *tree_root = sb->s_fs_info;
-	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_key key;
 	int err;
 
@@ -634,20 +634,21 @@ static int btrfs_fill_super(struct super_block *sb,
 	return 0;
 
 fail_close:
-	close_ctree(tree_root);
+	close_ctree(fs_info->tree_root);
 	return err;
 }
 
 int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(sb);
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
 	int ret;
 
 	trace_btrfs_sync_fs(wait);
 
 	if (!wait) {
-		filemap_flush(root->fs_info->btree_inode->i_mapping);
+		filemap_flush(fs_info->btree_inode->i_mapping);
 		return 0;
 	}
 
@@ -663,8 +664,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
-	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+	struct btrfs_root *root = info->tree_root;
 	char *compress_type;
 
 	if (btrfs_test_opt(root, DEGRADED))
@@ -727,10 +728,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 
 static int btrfs_test_super(struct super_block *s, void *data)
 {
-	struct btrfs_root *test_root = data;
-	struct btrfs_root *root = btrfs_sb(s);
+	struct btrfs_fs_info *p = data;
+	struct btrfs_fs_info *fs_info = btrfs_sb(s);
 
-	return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+	return fs_info->fs_devices == p->fs_devices;
 }
 
 static int btrfs_set_super(struct super_block *s, void *data)
@@ -922,8 +923,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super,
-		 fs_info->tree_root);
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto error_close_devices;
@@ -939,7 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 		s->s_flags = flags | MS_NOSEC;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+		btrfs_sb(s)->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 	}
@@ -959,7 +959,8 @@ error_fs_info:
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
 	int ret;
 
 	ret = btrfs_parse_options(root, data);
@@ -975,13 +976,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		ret =  btrfs_commit_super(root);
 		WARN_ON(ret);
 	} else {
-		if (root->fs_info->fs_devices->rw_devices == 0)
+		if (fs_info->fs_devices->rw_devices == 0)
 			return -EACCES;
 
-		if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
+		if (btrfs_super_log_root(fs_info->super_copy) != 0)
 			return -EINVAL;
 
-		ret = btrfs_cleanup_fs_roots(root->fs_info);
+		ret = btrfs_cleanup_fs_roots(fs_info);
 		WARN_ON(ret);
 
 		/* recover relocation */
@@ -1150,18 +1151,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
-	struct list_head *head = &root->fs_info->space_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	struct list_head *head = &fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
 	u64 total_free_data = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
-	__be32 *fsid = (__be32 *)root->fs_info->fsid;
+	__be32 *fsid = (__be32 *)fs_info->fsid;
 	int ret;
 
 	/* holding chunk_muext to avoid allocating new chunks */
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&fs_info->chunk_mutex);
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1180,14 +1181,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
 	buf->f_bavail = total_free_data;
-	ret = btrfs_calc_avail_data_space(root, &total_free_data);
+	ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
 	if (ret) {
-		mutex_unlock(&root->fs_info->chunk_mutex);
+		mutex_unlock(&fs_info->chunk_mutex);
 		return ret;
 	}
 	buf->f_bavail += total_free_data;
 	buf->f_bavail = buf->f_bavail >> bits;
-	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&fs_info->chunk_mutex);
 
 	/* We treat it as constant endianness (it doesn't matter _which_)
 	   because we want the fsid to come out the same whether mounted
@@ -1203,7 +1204,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static void btrfs_kill_super(struct super_block *sb)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	kill_anon_super(sb);
 	free_fs_info(fs_info);
 }
@@ -1246,17 +1247,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
-	mutex_lock(&root->fs_info->transaction_kthread_mutex);
-	mutex_lock(&root->fs_info->cleaner_mutex);
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	mutex_lock(&fs_info->transaction_kthread_mutex);
+	mutex_lock(&fs_info->cleaner_mutex);
 	return 0;
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
-	mutex_unlock(&root->fs_info->cleaner_mutex);
-	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	mutex_unlock(&fs_info->cleaner_mutex);
+	mutex_unlock(&fs_info->transaction_kthread_mutex);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f84a8bd60e3ee49eacc9ba824babf149ba3dad7e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Nov 2011 15:57:57 -0500
Subject: btrfs: take allocation of ->tree_root into open_ctree()

now that we don't need it for sget() anymore...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/disk-io.c | 8 +++++---
 fs/btrfs/disk-io.h | 2 --
 fs/btrfs/super.c   | 5 -----
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f7d8b9ba519b..dd71be875939 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1215,7 +1215,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (root)
@@ -1895,7 +1895,7 @@ int open_ctree(struct super_block *sb,
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *tree_root;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *csum_root;
 	struct btrfs_root *chunk_root;
@@ -1906,12 +1906,14 @@ int open_ctree(struct super_block *sb,
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
+	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
 	extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
 	csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
 	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
 	dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
 
-	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+	if (!tree_root || !extent_root || !csum_root ||
+	    !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 6f5f48778b00..e4bc4741319b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -86,8 +86,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 
-struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info);
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
 void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 305adf6dc09a..044652ee7ef5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -899,11 +899,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (!fs_info)
 		return ERR_PTR(-ENOMEM);
 
-	fs_info->tree_root = btrfs_alloc_root(fs_info);
-	if (!fs_info->tree_root) {
-		error = -ENOMEM;
-		goto error_fs_info;
-	}
 	fs_info->fs_devices = fs_devices;
 
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
-- 
cgit v1.2.3


From db804f23a72bada58f083dfad6a65d019ddb3bd4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 10 Jan 2012 16:41:01 +0800
Subject: Btrfs: add pinned extents to on-disk free space cache correctly

I got this while running xfstests:

[24256.836098] block group 317849600 has an wrong amount of free space
[24256.836100] btrfs: failed to load free space cache for block group 317849600

We should clamp the extent returned by find_first_extent_bit(),
so the start of the extent won't smaller than the start of the
block group.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec23d43d0c35..01840ef95a32 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -838,7 +838,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	struct io_ctl io_ctl;
 	struct list_head bitmap_list;
 	struct btrfs_key key;
-	u64 start, end, len;
+	u64 start, extent_start, extent_end, len;
 	int entries = 0;
 	int bitmaps = 0;
 	int ret;
@@ -857,25 +857,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				     struct btrfs_free_cluster,
 				     block_group_list);
 
-	/*
-	 * We shouldn't have switched the pinned extents yet so this is the
-	 * right one
-	 */
-	unpin = root->fs_info->pinned_extents;
-
 	/* Lock all pages first so we can lock the extent safely. */
 	io_ctl_prepare_pages(&io_ctl, inode, 0);
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
 			 0, &cached_state, GFP_NOFS);
 
-	/*
-	 * When searching for pinned extents, we need to start at our start
-	 * offset.
-	 */
-	if (block_group)
-		start = block_group->key.objectid;
-
 	node = rb_first(&ctl->free_space_offset);
 	if (!node && cluster) {
 		node = rb_first(&cluster->root);
@@ -918,9 +905,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * We want to add any pinned extents to our free space cache
 	 * so we don't leak the space
 	 */
+
+	/*
+	 * We shouldn't have switched the pinned extents yet so this is the
+	 * right one
+	 */
+	unpin = root->fs_info->pinned_extents;
+
+	if (block_group)
+		start = block_group->key.objectid;
+
 	while (block_group && (start < block_group->key.objectid +
 			       block_group->key.offset)) {
-		ret = find_first_extent_bit(unpin, start, &start, &end,
+		ret = find_first_extent_bit(unpin, start,
+					    &extent_start, &extent_end,
 					    EXTENT_DIRTY);
 		if (ret) {
 			ret = 0;
@@ -928,20 +926,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		}
 
 		/* This pinned extent is out of our range */
-		if (start >= block_group->key.objectid +
+		if (extent_start >= block_group->key.objectid +
 		    block_group->key.offset)
 			break;
 
-		len = block_group->key.objectid +
-			block_group->key.offset - start;
-		len = min(len, end + 1 - start);
+		extent_start = max(extent_start, start);
+		extent_end = min(block_group->key.objectid +
+				 block_group->key.offset, extent_end + 1);
+		len = extent_end - extent_start;
 
 		entries++;
-		ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
 		if (ret)
 			goto out_nospc;
 
-		start = end + 1;
+		start = extent_end;
 	}
 
 	/* Write out the bitmaps */
-- 
cgit v1.2.3


From a1ee5a45818acc7f9c13e560827cf3e8735ac919 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 9 Jan 2012 14:27:42 +0800
Subject: Btrfs: avoid possible NULL deref in io_ctl_drop_pages()

If we run into some failure path in io_ctl_prepare_pages(),
io_ctl->pages[] array may have some NULL pointers.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 01840ef95a32..4e55af333e19 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
 	io_ctl_unmap_page(io_ctl);
 
 	for (i = 0; i < io_ctl->num_pages; i++) {
-		ClearPageChecked(io_ctl->pages[i]);
-		unlock_page(io_ctl->pages[i]);
-		page_cache_release(io_ctl->pages[i]);
+		if (io_ctl->pages[i]) {
+			ClearPageChecked(io_ctl->pages[i]);
+			unlock_page(io_ctl->pages[i]);
+			page_cache_release(io_ctl->pages[i]);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 706efc6630c2722602541a6a2fc5900a4e38456a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 9 Jan 2012 14:36:28 +0800
Subject: Btrfs: check the return value of io_ctl_init()

It can return -ENOMEM.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4e55af333e19..e4eb222147cc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -637,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	if (!num_entries)
 		return 0;
 
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return ret;
+
 	ret = readahead_cache(inode);
 	if (ret)
 		goto out;
@@ -851,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	if (!i_size_read(inode))
 		return -1;
 
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return -1;
 
 	/* Get the cluster for this block_group if it exists */
 	if (block_group && !list_empty(&block_group->cluster_list))
-- 
cgit v1.2.3


From f062abf089ff705e09bbaa6fa1e2fd7688a0f2ea Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 29 Dec 2011 13:36:45 +0800
Subject: Btrfs: remove BUG_ON()s in btrfs_ioctl_setflags()

We can recover from errors and return -errno to user space.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c04f02c7d5bb..9619fb03a5d6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	struct btrfs_trans_handle *trans;
 	unsigned int flags, oldflags;
 	int ret;
+	u64 ip_oldflags;
+	unsigned int i_oldflags;
 
 	if (btrfs_root_readonly(root))
 		return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 	mutex_lock(&inode->i_mutex);
 
+	ip_oldflags = ip->flags;
+	i_oldflags = inode->i_flags;
+
 	flags = btrfs_mask_flags(inode->i_mode, flags);
 	oldflags = btrfs_flags_to_ioctl(ip->flags);
 	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -250,18 +255,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	}
 
 	trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_drop;
+	}
 
 	btrfs_update_iflags(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, inode);
-	BUG_ON(ret);
 
 	btrfs_end_transaction(trans, root);
+ out_drop:
+	if (ret) {
+		ip->flags = ip_oldflags;
+		inode->i_flags = i_oldflags;
+	}
 
 	mnt_drop_write(file->f_path.mnt);
-
-	ret = 0;
  out_unlock:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
-- 
cgit v1.2.3


From 4da6f1a332f6c16b6594c7892f13c31459b9b1c8 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 29 Dec 2011 13:39:50 +0800
Subject: Btrfs: reserve metadata space in btrfs_ioctl_setflags()

Check and reserve space for btrfs_update_inode().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9619fb03a5d6..fe8a60c865eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -254,7 +254,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
 	}
 
-	trans = btrfs_join_transaction(root);
+	trans = btrfs_start_transaction(root, 1);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto out_drop;
-- 
cgit v1.2.3


From 125ccb0ae6806dbec31abf4a85448971df3b4e39 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 8 Dec 2011 15:07:24 +0800
Subject: Btrfs: don't pass a trans handle unnecessarily in volumes.c

Some functions never use the transaction handle passed to them.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/volumes.c     | 18 +++++++-----------
 fs/btrfs/volumes.h     |  3 +--
 3 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8603ee4e3dfd..5b53479ce07b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7084,7 +7084,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		 * space to fit our block group in.
 		 */
 		if (device->total_bytes > device->bytes_used + min_free) {
-			ret = find_free_dev_extent(NULL, device, min_free,
+			ret = find_free_dev_extent(device, min_free,
 						   &dev_offset, NULL);
 			if (!ret)
 				dev_nr++;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..73f673c8d8d8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -829,7 +829,6 @@ out:
 
 /*
  * find_free_dev_extent - find free space in the specified device
- * @trans:	transaction handler
  * @device:	the device which we search the free space in
  * @num_bytes:	the size of the free space that we need
  * @start:	store the start of the free space.
@@ -848,8 +847,7 @@ out:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *len)
 {
 	struct btrfs_key key;
@@ -893,7 +891,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
@@ -1469,8 +1467,7 @@ error_undo:
 /*
  * does all the dirty work required for changing file system's UUID.
  */
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
 {
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
@@ -1695,7 +1692,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	if (seeding_dev) {
 		sb->s_flags &= ~MS_RDONLY;
-		ret = btrfs_prepare_sprout(trans, root);
+		ret = btrfs_prepare_sprout(root);
 		BUG_ON(ret);
 	}
 
@@ -2323,8 +2320,7 @@ done:
 	return ret;
 }
 
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
 {
@@ -2496,7 +2492,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (total_avail == 0)
 			continue;
 
-		ret = find_free_dev_extent(trans, device,
+		ret = find_free_dev_extent(device,
 					   max_stripe_size * dev_stripes,
 					   &dev_offset, &max_avail);
 		if (ret && ret != -ENOSPC)
@@ -2687,7 +2683,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+		ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
 					     item_size);
 		BUG_ON(ret);
 	}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..c1701ec9d49f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -230,7 +230,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
 #endif
-- 
cgit v1.2.3


From de11cc12df17337979e0929d2831887432f236ca Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 1 Dec 2011 12:55:47 +0800
Subject: Btrfs: don't pre-allocate btrfs bio

We pre-allocate a btrfs bio with fixed size, and then may re-allocate
memory if we find stripes are bigger than the fixed size. But this
pre-allocation is not necessary.

Also we don't have to calcuate the stripe number twice.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/volumes.c | 67 +++++++++++++++++-------------------------------------
 1 file changed, 21 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 73f673c8d8d8..540fdd25fb5e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2897,26 +2897,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
-	int stripes_allocated = 8;
-	int stripes_required = 1;
 	int stripe_index;
 	int i;
+	int ret = 0;
 	int num_stripes;
 	int max_errors = 0;
 	struct btrfs_bio *bbio = NULL;
 
-	if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
-		stripes_allocated = 1;
-again:
-	if (bbio_ret) {
-		bbio = kzalloc(btrfs_bio_size(stripes_allocated),
-				GFP_NOFS);
-		if (!bbio)
-			return -ENOMEM;
-
-		atomic_set(&bbio->error, 0);
-	}
-
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	read_unlock(&em_tree->lock);
@@ -2935,32 +2922,6 @@ again:
 	if (mirror_num > map->num_stripes)
 		mirror_num = 0;
 
-	/* if our btrfs_bio struct is too small, back off and try again */
-	if (rw & REQ_WRITE) {
-		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
-				 BTRFS_BLOCK_GROUP_DUP)) {
-			stripes_required = map->num_stripes;
-			max_errors = 1;
-		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-			stripes_required = map->sub_stripes;
-			max_errors = 1;
-		}
-	}
-	if (rw & REQ_DISCARD) {
-		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-				 BTRFS_BLOCK_GROUP_RAID1 |
-				 BTRFS_BLOCK_GROUP_DUP |
-				 BTRFS_BLOCK_GROUP_RAID10)) {
-			stripes_required = map->num_stripes;
-		}
-	}
-	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
-	    stripes_allocated < stripes_required) {
-		stripes_allocated = map->num_stripes;
-		free_extent_map(em);
-		kfree(bbio);
-		goto again;
-	}
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
@@ -3055,6 +3016,13 @@ again:
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
+	bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+	if (!bbio) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	atomic_set(&bbio->error, 0);
+
 	if (rw & REQ_DISCARD) {
 		for (i = 0; i < num_stripes; i++) {
 			bbio->stripes[i].physical =
@@ -3151,15 +3119,22 @@ again:
 			stripe_index++;
 		}
 	}
-	if (bbio_ret) {
-		*bbio_ret = bbio;
-		bbio->num_stripes = num_stripes;
-		bbio->max_errors = max_errors;
-		bbio->mirror_num = mirror_num;
+
+	if (rw & REQ_WRITE) {
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_DUP)) {
+			max_errors = 1;
+		}
 	}
+
+	*bbio_ret = bbio;
+	bbio->num_stripes = num_stripes;
+	bbio->max_errors = max_errors;
+	bbio->mirror_num = mirror_num;
 out:
 	free_extent_map(em);
-	return 0;
+	return ret;
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-- 
cgit v1.2.3


From ec9ef7a13be4dcce964c8503e8999087945e5b9e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 1 Dec 2011 14:06:42 +0800
Subject: Btrfs: simplfy calculation of stripe length for discard operation

For btrfs raid, while discarding a range of space, we'll need to know
the start offset and length to discard for each device, and it's done
in btrfs_map_block().

However the calculation is a bit complex for raid0 and raid10, so I
reimplement it based on a fact that:

        dev1          dev2           dev3    (raid0)
        -----------------------------------
        s0 s3 s6      s1 s4 s7       s2 s5

Each device has (total_stripes / nr_dev) stripes, or plus one.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/volumes.c | 95 ++++++++++++++++++------------------------------------
 1 file changed, 31 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 540fdd25fb5e..563ef650850e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3024,80 +3024,47 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	atomic_set(&bbio->error, 0);
 
 	if (rw & REQ_DISCARD) {
+		int factor = 0;
+		int sub_stripes = 0;
+		u64 stripes_per_dev = 0;
+		u32 remaining_stripes = 0;
+
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+			if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+				sub_stripes = 1;
+			else
+				sub_stripes = map->sub_stripes;
+
+			factor = map->num_stripes / sub_stripes;
+			stripes_per_dev = div_u64_rem(stripe_nr_end -
+						      stripe_nr_orig,
+						      factor,
+						      &remaining_stripes);
+		}
+
 		for (i = 0; i < num_stripes; i++) {
 			bbio->stripes[i].physical =
 				map->stripes[stripe_index].physical +
 				stripe_offset + stripe_nr * map->stripe_len;
 			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
-			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-				u64 stripes;
-				u32 last_stripe = 0;
-				int j;
-
-				div_u64_rem(stripe_nr_end - 1,
-					    map->num_stripes,
-					    &last_stripe);
-
-				for (j = 0; j < map->num_stripes; j++) {
-					u32 test;
-
-					div_u64_rem(stripe_nr_end - 1 - j,
-						    map->num_stripes, &test);
-					if (test == stripe_index)
-						break;
-				}
-				stripes = stripe_nr_end - 1 - j;
-				do_div(stripes, map->num_stripes);
-				bbio->stripes[i].length = map->stripe_len *
-					(stripes - stripe_nr + 1);
-
-				if (i == 0) {
+			if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+					 BTRFS_BLOCK_GROUP_RAID10)) {
+				bbio->stripes[i].length = stripes_per_dev *
+							  map->stripe_len;
+				if (i / sub_stripes < remaining_stripes)
+					bbio->stripes[i].length +=
+						map->stripe_len;
+				if (i < sub_stripes)
 					bbio->stripes[i].length -=
 						stripe_offset;
-					stripe_offset = 0;
-				}
-				if (stripe_index == last_stripe)
-					bbio->stripes[i].length -=
-						stripe_end_offset;
-			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-				u64 stripes;
-				int j;
-				int factor = map->num_stripes /
-					     map->sub_stripes;
-				u32 last_stripe = 0;
-
-				div_u64_rem(stripe_nr_end - 1,
-					    factor, &last_stripe);
-				last_stripe *= map->sub_stripes;
-
-				for (j = 0; j < factor; j++) {
-					u32 test;
-
-					div_u64_rem(stripe_nr_end - 1 - j,
-						    factor, &test);
-
-					if (test ==
-					    stripe_index / map->sub_stripes)
-						break;
-				}
-				stripes = stripe_nr_end - 1 - j;
-				do_div(stripes, factor);
-				bbio->stripes[i].length = map->stripe_len *
-					(stripes - stripe_nr + 1);
-
-				if (i < map->sub_stripes) {
-					bbio->stripes[i].length -=
-						stripe_offset;
-					if (i == map->sub_stripes - 1)
-						stripe_offset = 0;
-				}
-				if (stripe_index >= last_stripe &&
-				    stripe_index <= (last_stripe +
-						     map->sub_stripes - 1)) {
+				if ((i / sub_stripes + 1) %
+				    sub_stripes == remaining_stripes)
 					bbio->stripes[i].length -=
 						stripe_end_offset;
-				}
+				if (i == sub_stripes - 1)
+					stripe_offset = 0;
 			} else
 				bbio->stripes[i].length = *length;
 
-- 
cgit v1.2.3


From 7fe1e641502616220437079258506196bc4d8cbf Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 29 Dec 2011 14:47:27 +0800
Subject: Btrfs: rewrite btrfs_trim_block_group()

There are various bugs in block group trimming:

- It may trim from offset smaller than user-specified offset.
- It may trim beyond user-specified range.
- It may leak free space for extents smaller than specified minlen.
- It may truncate the last trimmed extent thus leak free space.
- With mixed extents+bitmaps, some extents may not be trimmed.
- With mixed extents+bitmaps, some bitmaps may not be trimmed (even
none will be trimmed). Even for those trimmed, not all the free space
in the bitmaps will be trimmed.

I rewrite btrfs_trim_block_group() and break it into two functions.
One is to trim extents only, and the other is to trim bitmaps only.

Before patching:

	# fstrim -v /mnt/
	/mnt/: 1496465408 bytes were trimmed

After patching:

	# fstrim -v /mnt/
	/mnt/: 2193768448 bytes were trimmed

And this matches the total free space:

	# btrfs fi df /mnt
	Data: total=3.58GB, used=1.79GB
	System, DUP: total=8.00MB, used=4.00KB
	System: total=4.00MB, used=0.00
	Metadata, DUP: total=205.12MB, used=97.14MB
	Metadata: total=8.00MB, used=0.00

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 235 +++++++++++++++++++++++++++++++-------------
 1 file changed, 164 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e4eb222147cc..b3cbb8939fa3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2594,17 +2594,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 	cluster->block_group = NULL;
 }
 
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+		       u64 *total_trimmed, u64 start, u64 bytes,
+		       u64 reserved_start, u64 reserved_bytes)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *entry = NULL;
+	struct btrfs_space_info *space_info = block_group->space_info;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	u64 bytes = 0;
-	u64 actually_trimmed;
-	int ret = 0;
+	int ret;
+	int update = 0;
+	u64 trimmed = 0;
 
-	*trimmed = 0;
+	spin_lock(&space_info->lock);
+	spin_lock(&block_group->lock);
+	if (!block_group->ro) {
+		block_group->reserved += reserved_bytes;
+		space_info->bytes_reserved += reserved_bytes;
+		update = 1;
+	}
+	spin_unlock(&block_group->lock);
+	spin_unlock(&space_info->lock);
+
+	ret = btrfs_error_discard_extent(fs_info->extent_root,
+					 start, bytes, &trimmed);
+	if (!ret)
+		*total_trimmed += trimmed;
+
+	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+	if (update) {
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+		if (block_group->ro)
+			space_info->bytes_readonly += reserved_bytes;
+		block_group->reserved -= reserved_bytes;
+		space_info->bytes_reserved -= reserved_bytes;
+		spin_unlock(&space_info->lock);
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	int ret = 0;
+	u64 extent_start;
+	u64 extent_bytes;
+	u64 bytes;
 
 	while (start < end) {
 		spin_lock(&ctl->tree_lock);
@@ -2615,81 +2655,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 		}
 
 		entry = tree_search_offset(ctl, start, 0, 1);
-		if (!entry)
-			entry = tree_search_offset(ctl,
-						   offset_to_bitmap(ctl, start),
-						   1, 1);
-
-		if (!entry || entry->offset >= end) {
+		if (!entry) {
 			spin_unlock(&ctl->tree_lock);
 			break;
 		}
 
-		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, &start, &bytes);
-			if (!ret) {
-				if (start >= end) {
-					spin_unlock(&ctl->tree_lock);
-					break;
-				}
-				bytes = min(bytes, end - start);
-				bitmap_clear_bits(ctl, entry, start, bytes);
-				if (entry->bytes == 0)
-					free_bitmap(ctl, entry);
-			} else {
-				start = entry->offset + BITS_PER_BITMAP *
-					block_group->sectorsize;
+		/* skip bitmaps */
+		while (entry->bitmap) {
+			node = rb_next(&entry->offset_index);
+			if (!node) {
 				spin_unlock(&ctl->tree_lock);
-				ret = 0;
-				continue;
+				goto out;
 			}
-		} else {
-			start = entry->offset;
-			bytes = min(entry->bytes, end - start);
-			unlink_free_space(ctl, entry);
-			kmem_cache_free(btrfs_free_space_cachep, entry);
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+		}
+
+		if (entry->offset >= end) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		extent_start = entry->offset;
+		extent_bytes = entry->bytes;
+		start = max(start, extent_start);
+		bytes = min(extent_start + extent_bytes, end) - start;
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
 		}
 
+		unlink_free_space(ctl, entry);
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+
 		spin_unlock(&ctl->tree_lock);
 
-		if (bytes >= minlen) {
-			struct btrfs_space_info *space_info;
-			int update = 0;
-
-			space_info = block_group->space_info;
-			spin_lock(&space_info->lock);
-			spin_lock(&block_group->lock);
-			if (!block_group->ro) {
-				block_group->reserved += bytes;
-				space_info->bytes_reserved += bytes;
-				update = 1;
-			}
-			spin_unlock(&block_group->lock);
-			spin_unlock(&space_info->lock);
-
-			ret = btrfs_error_discard_extent(fs_info->extent_root,
-							 start,
-							 bytes,
-							 &actually_trimmed);
-
-			btrfs_add_free_space(block_group, start, bytes);
-			if (update) {
-				spin_lock(&space_info->lock);
-				spin_lock(&block_group->lock);
-				if (block_group->ro)
-					space_info->bytes_readonly += bytes;
-				block_group->reserved -= bytes;
-				space_info->bytes_reserved -= bytes;
-				spin_unlock(&space_info->lock);
-				spin_unlock(&block_group->lock);
-			}
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  extent_start, extent_bytes);
+		if (ret)
+			break;
+next:
+		start += bytes;
 
-			if (ret)
-				break;
-			*trimmed += actually_trimmed;
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+out:
+	return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	int ret = 0;
+	int ret2;
+	u64 bytes;
+	u64 offset = offset_to_bitmap(ctl, start);
+
+	while (offset < end) {
+		bool next_bitmap = false;
+
+		spin_lock(&ctl->tree_lock);
+
+		if (ctl->free_space < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		entry = tree_search_offset(ctl, offset, 1, 0);
+		if (!entry) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = minlen;
+		ret2 = search_bitmap(ctl, entry, &start, &bytes);
+		if (ret2 || start >= end) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = min(bytes, end - start);
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		bitmap_clear_bits(ctl, entry, start, bytes);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+
+		spin_unlock(&ctl->tree_lock);
+
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  start, bytes);
+		if (ret)
+			break;
+next:
+		if (next_bitmap) {
+			offset += BITS_PER_BITMAP * ctl->unit;
+		} else {
+			start += bytes;
+			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+				offset += BITS_PER_BITMAP * ctl->unit;
 		}
-		start += bytes;
-		bytes = 0;
 
 		if (fatal_signal_pending(current)) {
 			ret = -ERESTARTSYS;
@@ -2702,6 +2779,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 	return ret;
 }
 
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+	int ret;
+
+	*trimmed = 0;
+
+	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+	if (ret)
+		return ret;
+
+	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+	return ret;
+}
+
 /*
  * Find the left-most item in the cache tree, and then return the
  * smallest inode number in the item.
-- 
cgit v1.2.3


From c7c144db531fda414e532adac56e965ce332e2a5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Dec 2011 10:39:22 +0800
Subject: Btrfs: update global block_rsv when creating a new block group

A bug was triggered while using seed device:

    # mkfs.btrfs /dev/loop1
    # btrfstune -S 1 /dev/loop1
    # mount -o /dev/loop1 /mnt
    # btrfs dev add /dev/loop2 /mnt

btrfs: block rsv returned -28
------------[ cut here ]------------
WARNING: at fs/btrfs/extent-tree.c:5969 btrfs_alloc_free_block+0x166/0x396 [btrfs]()
...
Call Trace:
...
[<f7b7c31c>] btrfs_cow_block+0x101/0x147 [btrfs]
[<f7b7eaa6>] btrfs_search_slot+0x1b8/0x55f [btrfs]
[<f7b7f844>] btrfs_insert_empty_items+0x42/0x7f [btrfs]
[<f7b7f8c1>] btrfs_insert_item+0x40/0x7e [btrfs]
[<f7b8ac02>] btrfs_make_block_group+0x243/0x2aa [btrfs]
[<f7bb3f53>] __btrfs_alloc_chunk+0x672/0x70e [btrfs]
[<f7bb41ff>] init_first_rw_device+0x77/0x13c [btrfs]
[<f7bb5a62>] btrfs_init_new_device+0x664/0x9fd [btrfs]
[<f7bbb65a>] btrfs_ioctl+0x694/0xdbe [btrfs]
[<c04f55f7>] do_vfs_ioctl+0x496/0x4cc
[<c04f5660>] sys_ioctl+0x33/0x4f
[<c07b9edf>] sysenter_do_call+0x12/0x38
---[ end trace 906adac595facc7d ]---

Since seed device is readonly, there's no usable space in the filesystem.
Afterwards we add a sprout device to it, and the kernel creates a METADATA
block group and a SYSTEM block group where comes free space we can reserve,
but we still get revervation failure because the global block_rsv hasn't
been updated accordingly.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5b53479ce07b..bf30f670cda9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7446,6 +7446,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
+	update_global_block_rsv(root->fs_info);
 
 	spin_lock(&cache->space_info->lock);
 	cache->space_info->bytes_readonly += cache->bytes_super;
-- 
cgit v1.2.3


From b367e47fb3a70f5d24ebd6faf7d42436d485fb2d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Dec 2011 11:38:24 +0800
Subject: Btrfs: fix possible deadlock when opening a seed device

The correct lock order is uuid_mutex -> volume_mutex -> chunk_mutex,
but when we mount a filesystem which has backing seed devices, we have
this lock chain:

    open_ctree()
        lock(chunk_mutex);
        read_chunk_tree();
            read_one_dev();
                open_seed_devices();
                    lock(uuid_mutex);

and then we hit a lockdep splat.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c | 2 --
 fs/btrfs/volumes.c | 9 +++++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f9d5551e582..858ab347413e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2270,9 +2270,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
 	   BTRFS_UUID_SIZE);
 
-	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
-	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
 		       sb->s_id);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 563ef650850e..fbb493b28d5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3506,7 +3506,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 	struct btrfs_fs_devices *fs_devices;
 	int ret;
 
-	mutex_lock(&uuid_mutex);
+	BUG_ON(!mutex_is_locked(&uuid_mutex));
 
 	fs_devices = root->fs_info->fs_devices->seed;
 	while (fs_devices) {
@@ -3544,7 +3544,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 	fs_devices->seed = root->fs_info->fs_devices->seed;
 	root->fs_info->fs_devices->seed = fs_devices;
 out:
-	mutex_unlock(&uuid_mutex);
 	return ret;
 }
 
@@ -3687,6 +3686,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
+	mutex_lock(&uuid_mutex);
+	lock_chunks(root);
+
 	/* first we search for all of the device items, and then we
 	 * read in all of the chunk items.  This way we can create chunk
 	 * mappings that reference all of the devices that are afound
@@ -3737,6 +3739,9 @@ again:
 	}
 	ret = 0;
 error:
+	unlock_chunks(root);
+	mutex_unlock(&uuid_mutex);
+
 	btrfs_free_path(path);
 	return ret;
 }
-- 
cgit v1.2.3


From bce41d601e58af12cee1398fe836e6b9a8fb5396 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Tue, 10 Jan 2012 15:32:29 +0200
Subject: jffs2: do not initialize variable unnecessarily

Remove unnecessary initializer for a local variable.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/erase.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index a01cdad6aad1..eafb8d37a6fb 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -335,7 +335,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 	void *ebuf;
 	uint32_t ofs;
 	size_t retlen;
-	int ret = -EIO;
+	int ret;
 	unsigned long *wordebuf;
 
 	ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
-- 
cgit v1.2.3


From 353b67d8ced4dc53281c88150ad295e24bc4b4c5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 26 Nov 2011 00:35:39 +0100
Subject: jbd: Issue cache flush after checkpointing

When we reach cleanup_journal_tail(), there is no guarantee that
checkpointed buffers are on a stable storage - especially if buffers were
written out by log_do_checkpoint(), they are likely to be only in disk's
caches. Thus when we update journal superblock, effectively removing old
transaction from journal, this write of superblock can get to stable storage
before those checkpointed buffers which can result in filesystem corruption
after a crash.

A similar problem can happen if we replay the journal and wipe it before
flushing disk's caches.

Thus we must unconditionally issue a cache flush before we update journal
superblock in these cases. The fix is slightly complicated by the fact that we
have to get log tail before we issue cache flush but we can store it in the
journal superblock only after the cache flush. Otherwise we risk races where
new tail is written before appropriate cache flush is finished.

I managed to reproduce the corruption using somewhat tweaked Chris Mason's
barrier-test scheduler. Also this should fix occasional reports of 'Bit already
freed' filesystem errors which are totally unreproducible but inspection of
several fs images I've gathered over time points to a problem like this.

CC: stable@kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/checkpoint.c | 27 ++++++++++++++++++++++-----
 fs/jbd/recovery.c   |  4 ++++
 2 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5d1a00a5041b..05f0754f2b46 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
  *
  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
  *
- * Called with the journal lock held.
- *
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
 	if (is_journal_aborted(journal))
 		return 1;
 
-	/* OK, work out the oldest transaction remaining in the log, and
+	/*
+	 * OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
 	 * If the log is now empty, we need to work out which is the
 	 * next transaction ID we will write, and where it will
-	 * start. */
-
+	 * start.
+	 */
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
 		spin_unlock(&journal->j_state_lock);
 		return 1;
 	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by log_do_checkpoint() --- are flushed out before we
+	 * drop the transactions from the journal. It's unlikely this will be
+	 * necessary, especially with an appropriately sized journal, but we
+	 * need this to guarantee correctness.  Fortunately
+	 * cleanup_journal_tail() doesn't get called all that often.
+	 */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
+	spin_lock(&journal->j_state_lock);
+	if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+		spin_unlock(&journal->j_state_lock);
+		/* Someone else cleaned up journal so return 0 */
+		return 0;
+	}
 	/* OK, update the superblock to recover the freed space.
 	 * Physical blocks come first: have we wrapped beyond the end of
 	 * the log?  */
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96788e6..008bf062fd26 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
+#include <linux/blkdev.h>
 #endif
 
 /*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
+	/* Flush disk caches to get replayed data on the permanent storage */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	return err;
 }
-- 
cgit v1.2.3


From 34b07840565004cfa444e165e88bf77a5cbcdb25 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 9 Jan 2012 15:58:37 +0100
Subject: ext2: protect inode changes in the SETVERSION and SETFLAGS ioctls

Unlock mutex after i_flags and i_ctime updates in the EXT2_IOC_SETFLAGS
ioctl.

Use i_mutex in the EXT2_IOC_SETVERSION ioctl to protect i_ctime and
i_generation updates and make the ioctl consistent since i_mutex is
also used in other places to protect timestamps and inode changes.

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Djalal Harouni <tixxdz@opendz.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/ioctl.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1089f760c847..2de655f5d625 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		flags = flags & EXT2_FL_USER_MODIFIABLE;
 		flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
 		ei->i_flags = flags;
-		mutex_unlock(&inode->i_mutex);
 
 		ext2_set_inode_flags(inode);
 		inode->i_ctime = CURRENT_TIME_SEC;
+		mutex_unlock(&inode->i_mutex);
+
 		mark_inode_dirty(inode);
 setflags_out:
 		mnt_drop_write_file(filp);
@@ -88,20 +89,29 @@ setflags_out:
 	}
 	case EXT2_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *) arg);
-	case EXT2_IOC_SETVERSION:
+	case EXT2_IOC_SETVERSION: {
+		__u32 generation;
+
 		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 		ret = mnt_want_write_file(filp);
 		if (ret)
 			return ret;
-		if (get_user(inode->i_generation, (int __user *) arg)) {
+		if (get_user(generation, (int __user *) arg)) {
 			ret = -EFAULT;
-		} else {
-			inode->i_ctime = CURRENT_TIME_SEC;
-			mark_inode_dirty(inode);
+			goto setversion_out;
 		}
+
+		mutex_lock(&inode->i_mutex);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_generation = generation;
+		mutex_unlock(&inode->i_mutex);
+
+		mark_inode_dirty(inode);
+setversion_out:
 		mnt_drop_write_file(filp);
 		return ret;
+	}
 	case EXT2_IOC_GETRSVSZ:
 		if (test_opt(inode->i_sb, RESERVATION)
 			&& S_ISREG(inode->i_mode)
-- 
cgit v1.2.3


From 46fe44ce8777f087aa8ad4a2605fdcfb9c2d63af Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 16 Nov 2011 15:03:59 +0100
Subject: quota: Pass information that quota is stored in system file to
 userspace

Quota tools need to know whether quota is stored in a system file or in
classical aquota.{user|group} files. So pass this information as a flag
in GETINFO quotactl.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5ec59b20cf76..46741970371b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2125,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		mutex_unlock(&dqopt->dqio_mutex);
 		goto out_file_init;
 	}
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
 	mutex_unlock(&dqopt->dqio_mutex);
 	spin_lock(&dq_state_lock);
 	dqopt->flags |= dquot_state_flag(flags, type);
@@ -2464,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	spin_lock(&dq_data_lock);
 	ii->dqi_bgrace = mi->dqi_bgrace;
 	ii->dqi_igrace = mi->dqi_igrace;
-	ii->dqi_flags = mi->dqi_flags & DQF_MASK;
+	ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
 	ii->dqi_valid = IIF_ALL;
 	spin_unlock(&dq_data_lock);
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
@@ -2490,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	if (ii->dqi_valid & IIF_IGRACE)
 		mi->dqi_igrace = ii->dqi_igrace;
 	if (ii->dqi_valid & IIF_FLAGS)
-		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
-				(ii->dqi_flags & DQF_MASK);
+		mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
+				(ii->dqi_flags & DQF_SETINFO_MASK);
 	spin_unlock(&dq_data_lock);
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
-- 
cgit v1.2.3


From 7250170c9ed00f3b74b11b98afefab45020672dd Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 11 Jan 2012 10:46:27 +0300
Subject: cifs: integer overflow in parse_dacl()

On 32 bit systems num_aces * sizeof(struct cifs_ace *) could overflow
leading to a smaller ppace buffer than we expected.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsacl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 72ddf23ef6f7..c1b254487388 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 		umode_t group_mask = S_IRWXG;
 		umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
 
+		if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+			return;
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
 		if (!ppace) {
-- 
cgit v1.2.3


From 673e8e597c06eb81954bf21a10f5cce74a1de8f1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:04 +0000
Subject: xfs: remove xfs_itruncate_data

This wrapper isn't overly useful, not to say rather confusing.

Around the call to xfs_itruncate_extents it does:

 - add tracing
 - add a few asserts in debug builds
 - conditionally update the inode size in two places
 - log the inode

Both the tracing and the inode logging can be moved to xfs_itruncate_extents
as they are useful for the attribute fork as well - in fact the attr code
already does an equivalent xfs_trans_log_inode call just after calling
xfs_itruncate_extents.  The conditional size updates are a mess, and there
was no reason to do them in two places anyway, as the first one was
conditional on the inode having extents - but without extents we
xfs_itruncate_extents would be a no-op and the placement wouldn't matter
anyway.  Instead move the size assignments and the asserts that make sense
to the callers that want it.

As a side effect of this clean up xfs_setattr_size by introducing variables
for the old and new inode size, and moving the size updates into a common
place.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr.c        |   4 --
 fs/xfs/xfs_inode.c       | 124 ++++-------------------------------------------
 fs/xfs/xfs_inode.h       |   2 -
 fs/xfs/xfs_iops.c        |  47 +++++++++++-------
 fs/xfs/xfs_qm_syscalls.c |   9 +++-
 fs/xfs/xfs_trace.h       |   4 +-
 fs/xfs/xfs_vnodeops.c    |  17 ++++++-
 7 files changed, 65 insertions(+), 142 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea8..08b9ac644c31 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	if (error)
 		goto out;
 
-	/*
-	 * Commit the last in the sequence of transactions.
-	 */
-	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
 	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc32848..ccd619a993f6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1165,52 +1165,6 @@ xfs_ialloc(
 	return 0;
 }
 
-/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file.  We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
-	struct xfs_inode	*ip,
-	xfs_fsize_t		isize)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		map_first;
-	int			nimaps;
-	xfs_bmbt_irec_t		imaps[2];
-	int			error;
-
-	if (!S_ISREG(ip->i_d.di_mode))
-		return;
-
-	if (XFS_IS_REALTIME_INODE(ip))
-		return;
-
-	if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-		return;
-
-	nimaps = 2;
-	map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-	/*
-	 * The filesystem could be shutting down, so bmapi may return
-	 * an error.
-	 */
-	error = xfs_bmapi_read(ip, map_first,
-			 (XFS_B_TO_FSB(mp,
-			       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
-			 imaps, &nimaps, XFS_BMAPI_ENTIRE);
-	if (error)
-		return;
-	ASSERT(nimaps == 1);
-	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else	/* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif	/* DEBUG */
-
 /*
  * Free up the underlying blocks past new_size.  The new size must be smaller
  * than the current size.  This routine can be used both for the attribute and
@@ -1258,6 +1212,8 @@ xfs_itruncate_extents(
 	ASSERT(ip->i_itemp->ili_lock_flags == 0);
 	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 
+	trace_xfs_itruncate_extents_start(ip, new_size);
+
 	/*
 	 * Since it is possible for space to become allocated beyond
 	 * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1281,14 @@ xfs_itruncate_extents(
 			goto out;
 	}
 
+	/*
+	 * Always re-log the inode so that our permanent transaction can keep
+	 * on rolling it forward in the log.
+	 */
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	trace_xfs_itruncate_extents_end(ip, new_size);
+
 out:
 	*tpp = tp;
 	return error;
@@ -1338,74 +1302,6 @@ out_bmap_cancel:
 	goto out;
 }
 
-int
-xfs_itruncate_data(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*ip,
-	xfs_fsize_t		new_size)
-{
-	int			error;
-
-	trace_xfs_itruncate_data_start(ip, new_size);
-
-	/*
-	 * The first thing we do is set the size to new_size permanently on
-	 * disk.  This way we don't have to worry about anyone ever being able
-	 * to look at the data being freed even in the face of a crash.
-	 * What we're getting around here is the case where we free a block, it
-	 * is allocated to another file, it is written to, and then we crash.
-	 * If the new data gets written to the file but the log buffers
-	 * containing the free and reallocation don't, then we'd end up with
-	 * garbage in the blocks being freed.  As long as we make the new_size
-	 * permanent before actually freeing any blocks it doesn't matter if
-	 * they get written to.
-	 */
-	if (ip->i_d.di_nextents > 0) {
-		/*
-		 * If we are not changing the file size then do not update
-		 * the on-disk file size - we may be called from
-		 * xfs_inactive_free_eofblocks().  If we update the on-disk
-		 * file size and then the system crashes before the contents
-		 * of the file are flushed to disk then the files may be
-		 * full of holes (ie NULL files bug).
-		 */
-		if (ip->i_size != new_size) {
-			ip->i_d.di_size = new_size;
-			ip->i_size = new_size;
-			xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-		}
-	}
-
-	error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
-	if (error)
-		return error;
-
-	/*
-	 * If we are not changing the file size then do not update the on-disk
-	 * file size - we may be called from xfs_inactive_free_eofblocks().
-	 * If we update the on-disk file size and then the system crashes
-	 * before the contents of the file are flushed to disk then the files
-	 * may be full of holes (ie NULL files bug).
-	 */
-	xfs_isize_check(ip, new_size);
-	if (ip->i_size != new_size) {
-		ip->i_d.di_size = new_size;
-		ip->i_size = new_size;
-	}
-
-	ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
-	ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
-	/*
-	 * Always re-log the inode so that our permanent transaction can keep
-	 * on rolling it forward in the log.
-	 */
-	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
-	trace_xfs_itruncate_data_end(ip, new_size);
-	return 0;
-}
-
 /*
  * This is called when the inode's link count goes to 0.
  * We place the on-disk inode on a list in the AGI.  It
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba37..440f2acebfa8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -491,8 +491,6 @@ int		xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_bmap_free *);
 int		xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
 				      int, xfs_fsize_t);
-int		xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
-				   xfs_fsize_t);
 int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd179223..f02eaa298d3c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -750,6 +750,7 @@ xfs_setattr_size(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct inode		*inode = VFS_I(ip);
 	int			mask = iattr->ia_valid;
+	xfs_off_t		oldsize, newsize;
 	struct xfs_trans	*tp;
 	int			error;
 	uint			lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
 		lock_flags |= XFS_IOLOCK_EXCL;
 	xfs_ilock(ip, lock_flags);
 
+	oldsize = ip->i_size;
+	newsize = iattr->ia_size;
+
 	/*
 	 * Short circuit the truncate case for zero length files.
 	 */
-	if (iattr->ia_size == 0 &&
-	    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
 		if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
 			goto out_unlock;
 
@@ -807,14 +810,14 @@ xfs_setattr_size(
 	 * the inode to the transaction, because the inode cannot be unlocked
 	 * once it is a part of the transaction.
 	 */
-	if (iattr->ia_size > ip->i_size) {
+	if (newsize > oldsize) {
 		/*
 		 * Do the first part of growing a file: zero any data in the
 		 * last block that is beyond the old EOF.  We need to do this
 		 * before the inode is joined to the transaction to modify
 		 * i_size.
 		 */
-		error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+		error = xfs_zero_eof(ip, newsize, oldsize);
 		if (error)
 			goto out_unlock;
 	}
@@ -833,8 +836,8 @@ xfs_setattr_size(
 	 * here and prevents waiting for other data not within the range we
 	 * care about here.
 	 */
-	if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
-		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+		error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
 					FI_NONE);
 		if (error)
 			goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
 	 */
 	inode_dio_wait(inode);
 
-	error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
-				     xfs_get_blocks);
+	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
 		goto out_unlock;
 
@@ -857,7 +859,7 @@ xfs_setattr_size(
 	if (error)
 		goto out_trans_cancel;
 
-	truncate_setsize(inode, iattr->ia_size);
+	truncate_setsize(inode, newsize);
 
 	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 	lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,30 @@ xfs_setattr_size(
 	 * these flags set.  For all other operations the VFS set these flags
 	 * explicitly if it wants a timestamp update.
 	 */
-	if (iattr->ia_size != ip->i_size &&
-	    (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
 		iattr->ia_ctime = iattr->ia_mtime =
 			current_fs_time(inode->i_sb);
 		mask |= ATTR_CTIME | ATTR_MTIME;
 	}
 
-	if (iattr->ia_size > ip->i_size) {
-		ip->i_d.di_size = iattr->ia_size;
-		ip->i_size = iattr->ia_size;
-	} else if (iattr->ia_size <= ip->i_size ||
-		   (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
-		error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+	/*
+	 * The first thing we do is set the size to new_size permanently on
+	 * disk.  This way we don't have to worry about anyone ever being able
+	 * to look at the data being freed even in the face of a crash.
+	 * What we're getting around here is the case where we free a block, it
+	 * is allocated to another file, it is written to, and then we crash.
+	 * If the new data gets written to the file but the log buffers
+	 * containing the free and reallocation don't, then we'd end up with
+	 * garbage in the blocks being freed.  As long as we make the new size
+	 * permanent before actually freeing any blocks it doesn't matter if
+	 * they get written to.
+	 */
+	ip->i_d.di_size = newsize;
+	ip->i_size = newsize;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	if (newsize <= oldsize) {
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
 		if (error)
 			goto out_trans_abort;
 
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc90..27378650b5cb 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -263,13 +264,19 @@ xfs_qm_scall_trunc_qfile(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 
-	error = xfs_itruncate_data(&tp, ip, 0);
+	ip->i_d.di_size = 0;
+	ip->i_size = 0;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 	if (error) {
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
 				     XFS_TRANS_ABORT);
 		goto out_unlock;
 	}
 
+	ASSERT(ip->i_d.di_nextents == 0);
+
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06efe..297f9fa6fb64 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1090,8 +1090,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
 DEFINE_EVENT(xfs_itrunc_class, name, \
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
 	TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
 
 TRACE_EVENT(xfs_pagecache_inval,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4db..96ff03421753 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -226,7 +226,14 @@ xfs_free_eofblocks(
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_ijoin(tp, ip, 0);
 
-		error = xfs_itruncate_data(&tp, ip, ip->i_size);
+		/*
+		 * Do not update the on-disk file size.  If we update the
+		 * on-disk file size and then the system crashes before the
+		 * contents of the file are flushed to disk then the files
+		 * may be full of holes (ie NULL files bug).
+		 */
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+					      ip->i_size);
 		if (error) {
 			/*
 			 * If we get an error at this point we simply don't
@@ -670,13 +677,19 @@ xfs_inactive(
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_ijoin(tp, ip, 0);
 
-		error = xfs_itruncate_data(&tp, ip, 0);
+		ip->i_d.di_size = 0;
+		ip->i_size = 0;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 		if (error) {
 			xfs_trans_cancel(tp,
 				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 			return VN_INACTIVE_CACHE;
 		}
+
+		ASSERT(ip->i_d.di_nextents == 0);
 	} else if (S_ISLNK(ip->i_d.di_mode)) {
 
 		/*
-- 
cgit v1.2.3


From bf322d983e540f66517db85b6870017613bb1e8d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:05 +0000
Subject: xfs: cleanup xfs_iomap_eof_align_last_fsb

Replace the nasty if, else if, elseif condition with more natural C flow
that expressed the logic we want here better.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_iomap.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa937..a27a44659da6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
 	xfs_fileoff_t	*last_fsb)
 {
 	xfs_fileoff_t	new_last_fsb = 0;
-	xfs_extlen_t	align;
+	xfs_extlen_t	align = 0;
 	int		eof, error;
 
-	if (XFS_IS_REALTIME_INODE(ip))
-		;
-	/*
-	 * If mounted with the "-o swalloc" option, roundup the allocation
-	 * request to a stripe width boundary if the file size is >=
-	 * stripe width and we are allocating past the allocation eof.
-	 */
-	else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
-	        (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
-		new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
-	/*
-	 * Roundup the allocation request to a stripe unit (m_dalign) boundary
-	 * if the file size is >= stripe unit size, and we are allocating past
-	 * the allocation eof.
-	 */
-	else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
-		new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+	if (!XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * Round up the allocation request to a stripe unit
+		 * (m_dalign) boundary if the file size is >= stripe unit
+		 * size, and we are allocating past the allocation eof.
+		 *
+		 * If mounted with the "-o swalloc" option the alignment is
+		 * increased from the strip unit size to the stripe width.
+		 */
+		if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+			align = mp->m_swidth;
+		else if (mp->m_dalign)
+			align = mp->m_dalign;
+
+		if (align && ip->i_size >= XFS_FSB_TO_B(mp, align))
+			new_last_fsb = roundup_64(*last_fsb, align);
+	}
 
 	/*
 	 * Always round up the allocation request to an extent boundary
-- 
cgit v1.2.3


From 3d2b3129c2c48cf0153e0f2058cf87e4b45ca3ac Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:06 +0000
Subject: xfs: remove the unused dm_attrs structure

.. and the just as dead bhv_desc forward declaration while we're at it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 440f2acebfa8..c1574721a191 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -211,7 +211,6 @@ typedef struct xfs_icdinode {
 
 #ifdef __KERNEL__
 
-struct bhv_desc;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot;
 
-typedef struct dm_attrs_s {
-	__uint32_t	da_dmevmask;	/* DMIG event mask */
-	__uint16_t	da_dmstate;	/* DMIG state info */
-	__uint16_t	da_pad;		/* DMIG extra padding */
-} dm_attrs_t;
-
 typedef struct xfs_inode {
 	/* Inode linking and identification information. */
 	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
-- 
cgit v1.2.3


From f7e6746ebae984ea67b0a1a1e23c7e6698240631 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Sat, 14 Jan 2012 00:01:01 +0000
Subject: sched/accounting, proc: Fix /proc/stat interrupts sum

Commit 3292beb340c7688 ("sched/accounting: Change cpustat fields to an array")
deleted the code which provides us with the sum of all interrupts in the
system, causing vmstat to report zero interrupts occuring in the system.

Fix this by restoring the code.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Tested-by: Russell King <rmk+kernel@arm.linux.org.uk> # [on ARM]
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Glauber Costa <glommer@parallels.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Tuner <pjt@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/stat.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d76ca6ae2b1b..121f77cfef76 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -77,6 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
 		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
 		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		sum += kstat_cpu_irqs_sum(i);
+		sum += arch_irq_stat_cpu(i);
 
 		for (j = 0; j < NR_SOFTIRQS; j++) {
 			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
-- 
cgit v1.2.3


From 6fef8df1dcb9b586268caff66df1d71ce8610132 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: get rid of *_alloc_profile fields

{data,metadata,system}_alloc_profile fields have been unused for a long
time now.  Get rid of them.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h       |  3 ---
 fs/btrfs/disk-io.c     |  3 ---
 fs/btrfs/extent-tree.c | 10 ++++------
 fs/btrfs/volumes.c     |  6 ++----
 4 files changed, 6 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..f5434ad49b99 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1135,9 +1135,6 @@ struct btrfs_fs_info {
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
-	u64 data_alloc_profile;
-	u64 metadata_alloc_profile;
-	u64 system_alloc_profile;
 
 	unsigned data_chunk_allocations;
 	unsigned metadata_ratio;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f9d5551e582..ce9d0fb3627d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2321,9 +2321,6 @@ retry_root_backup:
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
-	fs_info->data_alloc_profile = (u64)-1;
-	fs_info->metadata_alloc_profile = (u64)-1;
-	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
 	ret = btrfs_init_space_info(fs_info);
 	if (ret) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8603ee4e3dfd..f0591fd66249 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3067,14 +3067,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		flags |= root->fs_info->avail_data_alloc_bits &
-			 root->fs_info->data_alloc_profile;
+		flags |= root->fs_info->avail_data_alloc_bits;
 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		flags |= root->fs_info->avail_system_alloc_bits &
-			 root->fs_info->system_alloc_profile;
+		flags |= root->fs_info->avail_system_alloc_bits;
 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		flags |= root->fs_info->avail_metadata_alloc_bits &
-			 root->fs_info->metadata_alloc_profile;
+		flags |= root->fs_info->avail_metadata_alloc_bits;
+
 	return btrfs_reduce_alloc_profile(root, flags);
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..89096f6d6fb4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2752,8 +2752,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 		return ret;
 
 	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			(fs_info->metadata_alloc_profile &
-			 fs_info->avail_metadata_alloc_bits);
+				fs_info->avail_metadata_alloc_bits;
 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +2762,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	sys_chunk_offset = chunk_offset + chunk_size;
 
 	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			(fs_info->system_alloc_profile &
-			 fs_info->avail_system_alloc_bits);
+				fs_info->avail_system_alloc_bits;
 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
-- 
cgit v1.2.3


From 52ba692972532f8d652080214b6599ece3dd51b9 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: introduce masks for chunk type and profile

Chunk's type and profile are encoded in u64 flags field.  Introduce
masks to easily access them.  Also fix the type of BTRFS_BLOCK_GROUP_*
constants, it should be ULL.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h       | 26 +++++++++++++++++---------
 fs/btrfs/extent-tree.c | 12 +++---------
 fs/btrfs/volumes.c     | 11 ++---------
 3 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f5434ad49b99..4370a56fe81a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -751,15 +751,23 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 /* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
-#define BTRFS_NR_RAID_TYPES	   5
-
+#define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA	(1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0		(1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_NR_RAID_TYPES		5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
+					 BTRFS_BLOCK_GROUP_SYSTEM |  \
+					 BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
+					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_DUP |     \
+					 BTRFS_BLOCK_GROUP_RAID10)
 struct btrfs_block_group_item {
 	__le64 used;
 	__le64 chunk_objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f0591fd66249..a8d8204188d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
-	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-		 BTRFS_BLOCK_GROUP_METADATA;
+	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
@@ -2993,9 +2992,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
-	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-				BTRFS_BLOCK_GROUP_SYSTEM |
-				BTRFS_BLOCK_GROUP_METADATA);
+	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
 	found->total_bytes = total_bytes;
 	found->disk_total = total_bytes * factor;
 	found->bytes_used = bytes_used;
@@ -3016,10 +3013,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-				   BTRFS_BLOCK_GROUP_RAID1 |
-				   BTRFS_BLOCK_GROUP_RAID10 |
-				   BTRFS_BLOCK_GROUP_DUP);
+	u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
 	if (extra_flags) {
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
 			fs_info->avail_data_alloc_bits |= extra_flags;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 89096f6d6fb4..d5fdee56777f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2949,12 +2949,8 @@ again:
 		}
 	}
 	if (rw & REQ_DISCARD) {
-		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-				 BTRFS_BLOCK_GROUP_RAID1 |
-				 BTRFS_BLOCK_GROUP_DUP |
-				 BTRFS_BLOCK_GROUP_RAID10)) {
+		if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
 			stripes_required = map->num_stripes;
-		}
 	}
 	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
 	    stripes_allocated < stripes_required) {
@@ -2978,10 +2974,7 @@ again:
 
 	if (rw & REQ_DISCARD)
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-			      BTRFS_BLOCK_GROUP_RAID1 |
-			      BTRFS_BLOCK_GROUP_RAID10 |
-			      BTRFS_BLOCK_GROUP_DUP)) {
+	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 				map->stripe_len - stripe_offset);
-- 
cgit v1.2.3


From a46d11a8b06dd0431a3888fbc4856ea13a8e634f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: add BTRFS_AVAIL_ALLOC_BIT_SINGLE bit

Right now on-disk BTRFS_BLOCK_GROUP_* profile bits are used for
avail_{data,metadata,system}_alloc_bits fields, which gather info about
available allocation profiles in the FS.  When chunk is created or read
from disk, its profile is OR'ed with the corresponding avail_alloc_bits
field.  Since SINGLE is denoted by 0 in the on-disk format, currently
there is no way to tell when such chunks become avaialble.  Restriper
needs that information, so add a separate bit for SINGLE profile.

This bit is going to be in-memory only, it should never be written out
to disk, so it's not a disk format change.  However to avoid remappings
in future, reserve corresponding on-disk bit.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h       | 15 +++++++++++++++
 fs/btrfs/extent-tree.c | 30 +++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4370a56fe81a..3f8f11e18b53 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -758,6 +758,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
 #define BTRFS_NR_RAID_TYPES		5
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
@@ -768,6 +769,15 @@ struct btrfs_csum_item {
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE	(1ULL << 48)
+
 struct btrfs_block_group_item {
 	__le64 used;
 	__le64 chunk_objectid;
@@ -1140,6 +1150,11 @@ struct btrfs_fs_info {
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
 
+	/*
+	 * these three are in extended format (availability of single
+	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+	 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+	 */
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a8d8204188d1..15a22949da17 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3014,16 +3014,24 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
-	if (extra_flags) {
-		if (flags & BTRFS_BLOCK_GROUP_DATA)
-			fs_info->avail_data_alloc_bits |= extra_flags;
-		if (flags & BTRFS_BLOCK_GROUP_METADATA)
-			fs_info->avail_metadata_alloc_bits |= extra_flags;
-		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-			fs_info->avail_system_alloc_bits |= extra_flags;
-	}
+
+	/* chunk -> extended profile */
+	if (extra_flags == 0)
+		extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits |= extra_flags;
 }
 
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format.
+ */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	/*
@@ -3053,8 +3061,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP)))
+	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+	}
+
+	/* extended -> chunk profile */
+	flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 	return flags;
 }
 
-- 
cgit v1.2.3


From 10ea00f55a07f8f9536d9112b95108a86f700bab Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: make avail_*_alloc_bits fields dynamic

Currently when new chunks are created respective avail_alloc_bits field
is updated to reflect profiles of all chunks present in the system.
However when chunks are removed profile bits are never cleared.

This patch clears profile bit of respective avail_alloc_bits field when
the last chunk with that profile is removed.  Restriper needs this to
properly operate when "downgrading".

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15a22949da17..946b0671c5d9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7469,6 +7469,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	/* chunk -> extended profile */
+	if (extra_flags == 0)
+		extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start)
 {
@@ -7479,6 +7495,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct inode *inode;
 	int ret;
+	int index;
 	int factor;
 
 	root = root->fs_info->extent_root;
@@ -7494,6 +7511,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	free_excluded_extents(root, block_group);
 
 	memcpy(&key, &block_group->key, sizeof(key));
+	index = get_block_group_index(block_group);
 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
 				  BTRFS_BLOCK_GROUP_RAID1 |
 				  BTRFS_BLOCK_GROUP_RAID10))
@@ -7568,6 +7586,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * are still on the list after taking the semaphore
 	 */
 	list_del_init(&block_group->list);
+	if (list_empty(&block_group->space_info->block_groups[index]))
+		clear_avail_alloc_bits(root->fs_info, block_group->flags);
 	up_write(&block_group->space_info->groups_sem);
 
 	if (block_group->cached == BTRFS_CACHE_STARTED)
-- 
cgit v1.2.3


From c9e9f97bdfb64d06e9520f8e4f37674ac21cc9bc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: add basic restriper infrastructure

Add basic restriper infrastructure: extended balancing ioctl and all
related ioctl data structures, add data structure for tracking
restriper's state to fs_info, etc.  The semantics of the old balancing
ioctl are fully preserved.

Explicitly disallow any volume operations when balance is in progress.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h   |   6 +++
 fs/btrfs/disk-io.c |   4 ++
 fs/btrfs/ioctl.c   | 135 ++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/ioctl.h   |  43 +++++++++++++++++
 fs/btrfs/volumes.c | 120 +++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/volumes.h |  14 +++++-
 6 files changed, 281 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f8f11e18b53..c4d98c8df5c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -934,6 +934,7 @@ struct btrfs_block_group_cache {
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
@@ -1159,6 +1160,11 @@ struct btrfs_fs_info {
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
 
+	/* restriper state */
+	spinlock_t balance_lock;
+	struct mutex balance_mutex;
+	struct btrfs_balance_control *balance_ctl;
+
 	unsigned data_chunk_allocations;
 	unsigned metadata_ratio;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ce9d0fb3627d..190a1b24c902 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2002,6 +2002,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->scrub_super_lock);
 	fs_info->scrub_workers_refcnt = 0;
 
+	spin_lock_init(&fs_info->balance_lock);
+	mutex_init(&fs_info->balance_mutex);
+	fs_info->balance_ctl = NULL;
+
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
 	sb->s_bdi = &fs_info->bdi;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c04f02c7d5bb..d838d2cfb947 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1203,13 +1203,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	mutex_lock(&root->fs_info->volume_mutex);
+	if (root->fs_info->balance_ctl) {
+		printk(KERN_INFO "btrfs: balance in progress\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
 	devstr = strchr(sizestr, ':');
 	if (devstr) {
@@ -1226,7 +1234,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
 		       (unsigned long long)devid);
 		ret = -EINVAL;
-		goto out_unlock;
+		goto out_free;
 	}
 	if (!strcmp(sizestr, "max"))
 		new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1249,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		new_size = memparse(sizestr, NULL);
 		if (new_size == 0) {
 			ret = -EINVAL;
-			goto out_unlock;
+			goto out_free;
 		}
 	}
 
@@ -1250,7 +1258,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 	if (mod < 0) {
 		if (new_size > old_size) {
 			ret = -EINVAL;
-			goto out_unlock;
+			goto out_free;
 		}
 		new_size = old_size - new_size;
 	} else if (mod > 0) {
@@ -1259,11 +1267,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
 	if (new_size < 256 * 1024 * 1024) {
 		ret = -EINVAL;
-		goto out_unlock;
+		goto out_free;
 	}
 	if (new_size > device->bdev->bd_inode->i_size) {
 		ret = -EFBIG;
-		goto out_unlock;
+		goto out_free;
 	}
 
 	do_div(new_size, root->sectorsize);
@@ -1276,7 +1284,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
-			goto out_unlock;
+			goto out_free;
 		}
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
@@ -1284,9 +1292,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		ret = btrfs_shrink_device(device, new_size);
 	}
 
-out_unlock:
-	mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
 	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 }
 
@@ -2052,14 +2061,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	mutex_lock(&root->fs_info->volume_mutex);
+	if (root->fs_info->balance_ctl) {
+		printk(KERN_INFO "btrfs: balance in progress\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_init_new_device(root, vol_args->name);
 
 	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 }
 
@@ -2074,14 +2094,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
+	mutex_lock(&root->fs_info->volume_mutex);
+	if (root->fs_info->balance_ctl) {
+		printk(KERN_INFO "btrfs: balance in progress\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_rm_device(root, vol_args->name);
 
 	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 }
 
@@ -3034,6 +3065,76 @@ out:
 	return ret;
 }
 
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+			       struct btrfs_ioctl_balance_args *bargs)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	bargs->flags = bctl->flags;
+
+	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ioctl_balance_args *bargs;
+	struct btrfs_balance_control *bctl;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
+
+	if (arg) {
+		bargs = memdup_user(arg, sizeof(*bargs));
+		if (IS_ERR(bargs)) {
+			ret = PTR_ERR(bargs);
+			goto out;
+		}
+	} else {
+		bargs = NULL;
+	}
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out_bargs;
+	}
+
+	bctl->fs_info = fs_info;
+	if (arg) {
+		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+		bctl->flags = bargs->flags;
+	}
+
+	ret = btrfs_balance(bctl, bargs);
+	/*
+	 * bctl is freed in __cancel_balance
+	 */
+	if (arg) {
+		if (copy_to_user(arg, bargs, sizeof(*bargs)))
+			ret = -EFAULT;
+	}
+
+out_bargs:
+	kfree(bargs);
+out:
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3078,7 +3179,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_DEV_INFO:
 		return btrfs_ioctl_dev_info(root, argp);
 	case BTRFS_IOC_BALANCE:
-		return btrfs_balance(root->fs_info->dev_root);
+		return btrfs_ioctl_balance(root, NULL);
 	case BTRFS_IOC_CLONE:
 		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
 	case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3211,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_scrub_cancel(root, argp);
 	case BTRFS_IOC_SCRUB_PROGRESS:
 		return btrfs_ioctl_scrub_progress(root, argp);
+	case BTRFS_IOC_BALANCE_V2:
+		return btrfs_ioctl_balance(root, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..c8b37d2c0d77 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,47 @@ struct btrfs_ioctl_fs_info_args {
 	__u64 reserved[124];			/* pad to 1k */
 };
 
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+	__u64 profiles;
+	__u64 usage;
+	__u64 devid;
+	__u64 pstart;
+	__u64 pend;
+	__u64 vstart;
+	__u64 vend;
+
+	__u64 target;
+
+	__u64 flags;
+
+	__u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+	__u64 expected;		/* estimated # of chunks that will be
+				 * relocated to fulfill the request */
+	__u64 considered;	/* # of chunks we have considered so far */
+	__u64 completed;	/* # of chunks relocated so far */
+};
+
+struct btrfs_ioctl_balance_args {
+	__u64 flags;				/* in/out */
+	__u64 state;				/* out */
+
+	struct btrfs_balance_args data;		/* in/out */
+	struct btrfs_balance_args meta;		/* in/out */
+	struct btrfs_balance_args sys;		/* in/out */
+
+	struct btrfs_balance_progress stat;	/* out */
+
+	__u64 unused[72];			/* pad to 1k */
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
 	__u64 treeid;
@@ -272,6 +313,8 @@ struct btrfs_ioctl_logical_ino_args {
 				 struct btrfs_ioctl_dev_info_args)
 #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
 			       struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+				   struct btrfs_ioctl_balance_args)
 #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
 					struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d5fdee56777f..9fc06e6bc51a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1282,7 +1282,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
-	mutex_lock(&root->fs_info->volume_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
 		root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1451,6 @@ error_close:
 	if (bdev)
 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
-	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
 	return ret;
 error_undo:
@@ -1629,7 +1627,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
 	/*
@@ -1757,8 +1754,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		ret = btrfs_relocate_sys_chunks(root);
 		BUG_ON(ret);
 	}
-out:
-	mutex_unlock(&root->fs_info->volume_mutex);
+
 	return ret;
 error:
 	blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1762,7 @@ error:
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
 	}
-	goto out;
+	return ret;
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2073,35 @@ error:
 	return ret;
 }
 
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper.  Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+	BUG_ON(fs_info->balance_ctl);
+
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = bctl;
+	spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	BUG_ON(!fs_info->balance_ctl);
+
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = NULL;
+	spin_unlock(&fs_info->balance_lock);
+
+	kfree(bctl);
+}
+
 static u64 div_factor(u64 num, int factor)
 {
 	if (factor == 10)
@@ -2086,29 +2111,23 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
-	int ret;
-	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+	struct btrfs_root *chunk_root = fs_info->chunk_root;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct list_head *devices;
 	struct btrfs_device *device;
 	u64 old_size;
 	u64 size_to_free;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
-
-	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&dev_root->fs_info->volume_mutex);
-	dev_root = dev_root->fs_info->dev_root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int enospc_errors = 0;
 
 	/* step one make some room on all the devices */
+	devices = &fs_info->fs_devices->devices;
 	list_for_each_entry(device, devices, dev_list) {
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
@@ -2151,12 +2170,14 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		 * failed
 		 */
 		if (ret == 0)
-			break;
+			BUG(); /* FIXME break ? */
 
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
-		if (ret)
+		if (ret) {
+			ret = 0;
 			break;
+		}
 
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
@@ -2174,12 +2195,63 @@ int btrfs_balance(struct btrfs_root *dev_root)
 					   found_key.offset);
 		if (ret && ret != -ENOSPC)
 			goto error;
+		if (ret == -ENOSPC)
+			enospc_errors++;
 		key.offset = found_key.offset - 1;
 	}
-	ret = 0;
+
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&dev_root->fs_info->volume_mutex);
+	if (enospc_errors) {
+		printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+		       enospc_errors);
+		if (!ret)
+			ret = -ENOSPC;
+	}
+
+	return ret;
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	unset_balance_control(fs_info);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+			       struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+		  struct btrfs_ioctl_balance_args *bargs)
+{
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+	int ret;
+
+	if (btrfs_fs_closing(fs_info)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	set_balance_control(bctl);
+
+	mutex_unlock(&fs_info->balance_mutex);
+
+	ret = __btrfs_balance(fs_info);
+
+	mutex_lock(&fs_info->balance_mutex);
+
+	if (bargs) {
+		memset(bargs, 0, sizeof(*bargs));
+		update_ioctl_balance_args(fs_info, bargs);
+	}
+
+	__cancel_balance(fs_info);
+
+	return ret;
+out:
+	kfree(bctl);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..882582385283 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,17 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
+struct btrfs_balance_args;
+struct btrfs_balance_control {
+	struct btrfs_fs_info *fs_info;
+
+	struct btrfs_balance_args data;
+	struct btrfs_balance_args meta;
+	struct btrfs_balance_args sys;
+
+	u64 flags;
+};
+
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 				   u64 end, u64 *length);
 
@@ -228,7 +239,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 				       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+		  struct btrfs_ioctl_balance_args *bargs);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
-- 
cgit v1.2.3


From f43ffb60fd94e98be02780944e182ade6653b916 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: add basic infrastructure for selective balancing

This allows to have a separate set of filters for each chunk type
(data,meta,sys).  The code however is generic and switch on chunk type
is only done once.

This commit also adds a type filter: it allows to balance for example
meta and system chunks w/o touching data ones.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ioctl.c   |  3 +++
 fs/btrfs/volumes.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h | 11 ++++++++++
 3 files changed, 71 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d838d2cfb947..29b3a94933f0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3116,6 +3116,9 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
 		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
 
 		bctl->flags = bargs->flags;
+	} else {
+		/* balance everything - no filters */
+		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
 	}
 
 	ret = btrfs_balance(bctl, bargs);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9fc06e6bc51a..91bbf6e774c0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2102,6 +2102,30 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
 	kfree(bctl);
 }
 
+static int should_balance_chunk(struct btrfs_root *root,
+				struct extent_buffer *leaf,
+				struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+	struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+	struct btrfs_balance_args *bargs = NULL;
+	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+	/* type filter */
+	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+		return 0;
+	}
+
+	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+		bargs = &bctl->data;
+	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+		bargs = &bctl->sys;
+	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+		bargs = &bctl->meta;
+
+	return 1;
+}
+
 static u64 div_factor(u64 num, int factor)
 {
 	if (factor == 10)
@@ -2119,10 +2143,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	struct btrfs_device *device;
 	u64 old_size;
 	u64 size_to_free;
+	struct btrfs_chunk *chunk;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_trans_handle *trans;
+	struct extent_buffer *leaf;
+	int slot;
 	int ret;
 	int enospc_errors = 0;
 
@@ -2179,8 +2206,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 			break;
 		}
 
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      path->slots[0]);
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
 		if (found_key.objectid != key.objectid)
 			break;
 
@@ -2188,7 +2217,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 		if (found_key.offset == 0)
 			break;
 
+		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+		ret = should_balance_chunk(chunk_root, leaf, chunk,
+					   found_key.offset);
 		btrfs_release_path(path);
+		if (!ret)
+			goto loop;
+
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
@@ -2197,6 +2233,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 			goto error;
 		if (ret == -ENOSPC)
 			enospc_errors++;
+loop:
 		key.offset = found_key.offset - 1;
 	}
 
@@ -2227,6 +2264,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs)
 {
 	struct btrfs_fs_info *fs_info = bctl->fs_info;
+	u64 allowed;
 	int ret;
 
 	if (btrfs_fs_closing(fs_info)) {
@@ -2234,6 +2272,23 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		goto out;
 	}
 
+	/*
+	 * In case of mixed groups both data and meta should be picked,
+	 * and identical options should be given for both of them.
+	 */
+	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+	if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+	    (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+			printk(KERN_ERR "btrfs: with mixed groups data and "
+			       "metadata balance options must be the same\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
 	set_balance_control(bctl);
 
 	mutex_unlock(&fs_info->balance_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 882582385283..003e54216069 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,17 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA		(1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM		(1ULL << 1)
+#define BTRFS_BALANCE_METADATA		(1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK		(BTRFS_BALANCE_DATA |	    \
+					 BTRFS_BALANCE_SYSTEM |	    \
+					 BTRFS_BALANCE_METADATA)
+
 struct btrfs_balance_args;
 struct btrfs_balance_control {
 	struct btrfs_fs_info *fs_info;
-- 
cgit v1.2.3


From ed25e9b26f898d8d63ae4a836489f1923534143b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: profiles filter

Select chunks based on a given profile mask.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 24 ++++++++++++++++++++++++
 fs/btrfs/volumes.h |  4 ++++
 2 files changed, 28 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 91bbf6e774c0..447bd422d867 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2102,6 +2102,24 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
 	kfree(bctl);
 }
 
+/*
+ * Balance filters.  Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_profile,
+				 struct btrfs_balance_args *bargs)
+{
+	chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	if (chunk_profile == 0)
+		chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	if (bargs->profiles & chunk_profile)
+		return 0;
+
+	return 1;
+}
+
 static int should_balance_chunk(struct btrfs_root *root,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
@@ -2123,6 +2141,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
 		bargs = &bctl->meta;
 
+	/* profiles filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+	    chunk_profiles_filter(chunk_type, bargs)) {
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 003e54216069..fb20d7740440 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -196,6 +196,10 @@ struct map_lookup {
 #define BTRFS_BALANCE_TYPE_MASK		(BTRFS_BALANCE_DATA |	    \
 					 BTRFS_BALANCE_SYSTEM |	    \
 					 BTRFS_BALANCE_METADATA)
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES	(1ULL << 0)
 
 struct btrfs_balance_args;
 struct btrfs_balance_control {
-- 
cgit v1.2.3


From 5ce5b3c0916ba3a2e34cf648b94044adc5ef9e76 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: usage filter

Select chunks that are less than X percent full.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 36 ++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 2 files changed, 37 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 447bd422d867..b858242374db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2120,6 +2120,36 @@ static int chunk_profiles_filter(u64 chunk_profile,
 	return 1;
 }
 
+static u64 div_factor_fine(u64 num, int factor)
+{
+	if (factor <= 0)
+		return 0;
+	if (factor >= 100)
+		return num;
+
+	num *= factor;
+	do_div(num, 100);
+	return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+			      struct btrfs_balance_args *bargs)
+{
+	struct btrfs_block_group_cache *cache;
+	u64 chunk_used, user_thresh;
+	int ret = 1;
+
+	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	chunk_used = btrfs_block_group_used(&cache->item);
+
+	user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+	if (chunk_used < user_thresh)
+		ret = 0;
+
+	btrfs_put_block_group(cache);
+	return ret;
+}
+
 static int should_balance_chunk(struct btrfs_root *root,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
@@ -2147,6 +2177,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/* usage filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index fb20d7740440..eee77fcf812a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -200,6 +200,7 @@ struct map_lookup {
  * Balance filters
  */
 #define BTRFS_BALANCE_ARGS_PROFILES	(1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE	(1ULL << 1)
 
 struct btrfs_balance_args;
 struct btrfs_balance_control {
-- 
cgit v1.2.3


From 409d404b461afa9738619f249fd7f62a366b68c2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:47 +0200
Subject: Btrfs: devid filter

Relocate chunks which have at least one stripe located on a device with
devid X.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 23 +++++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 2 files changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b858242374db..9ff5cd09a256 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2150,6 +2150,23 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 	return ret;
 }
 
+static int chunk_devid_filter(struct extent_buffer *leaf,
+			      struct btrfs_chunk *chunk,
+			      struct btrfs_balance_args *bargs)
+{
+	struct btrfs_stripe *stripe;
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	int i;
+
+	for (i = 0; i < num_stripes; i++) {
+		stripe = btrfs_stripe_nr(chunk, i);
+		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+			return 0;
+	}
+
+	return 1;
+}
+
 static int should_balance_chunk(struct btrfs_root *root,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
@@ -2183,6 +2200,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/* devid filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+	    chunk_devid_filter(leaf, chunk, bargs)) {
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index eee77fcf812a..7cfec03c29b8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -201,6 +201,7 @@ struct map_lookup {
  */
 #define BTRFS_BALANCE_ARGS_PROFILES	(1ULL << 0)
 #define BTRFS_BALANCE_ARGS_USAGE	(1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)
 
 struct btrfs_balance_args;
 struct btrfs_balance_control {
-- 
cgit v1.2.3


From 94e60d5a5c4b98a32b1077dec88df09ada712376 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: devid subset filter

Select chunks which have at least one byte of at least one stripe
located on a device with devid X in a given [pstart,pend) physical
address range.

This filter only works when devid filter is turned on.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 2 files changed, 47 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9ff5cd09a256..c60071a448a5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2167,6 +2167,46 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
 	return 1;
 }
 
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       u64 chunk_offset,
+			       struct btrfs_balance_args *bargs)
+{
+	struct btrfs_stripe *stripe;
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	u64 stripe_offset;
+	u64 stripe_length;
+	int factor;
+	int i;
+
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+		return 0;
+
+	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
+	factor = num_stripes / factor;
+
+	for (i = 0; i < num_stripes; i++) {
+		stripe = btrfs_stripe_nr(chunk, i);
+		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+			continue;
+
+		stripe_offset = btrfs_stripe_offset(leaf, stripe);
+		stripe_length = btrfs_chunk_length(leaf, chunk);
+		do_div(stripe_length, factor);
+
+		if (stripe_offset < bargs->pend &&
+		    stripe_offset + stripe_length > bargs->pstart)
+			return 0;
+	}
+
+	return 1;
+}
+
 static int should_balance_chunk(struct btrfs_root *root,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
@@ -2206,6 +2246,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/* drange filter, makes sense only with devid filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+	    chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7cfec03c29b8..844b08e388f2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -202,6 +202,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_PROFILES	(1ULL << 0)
 #define BTRFS_BALANCE_ARGS_USAGE	(1ULL << 1)
 #define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
 
 struct btrfs_balance_args;
 struct btrfs_balance_control {
-- 
cgit v1.2.3


From ea67176ae8c024f64d85ec33873e5eadf1af7247 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: virtual address space subset filter

Select chunks which have at least one byte located inside a given
[vstart, vend) virtual address space range.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 20 ++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 2 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c60071a448a5..e86c9e4fe51e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2207,6 +2207,20 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 	return 1;
 }
 
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       u64 chunk_offset,
+			       struct btrfs_balance_args *bargs)
+{
+	if (chunk_offset < bargs->vend &&
+	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+		/* at least part of the chunk is inside this vrange */
+		return 0;
+
+	return 1;
+}
+
 static int should_balance_chunk(struct btrfs_root *root,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
@@ -2252,6 +2266,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/* vrange filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 844b08e388f2..eac26c359312 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -203,6 +203,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_USAGE	(1ULL << 1)
 #define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)
 #define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 
 struct btrfs_balance_args;
 struct btrfs_balance_control {
-- 
cgit v1.2.3


From 70922617b0099f420deceb53d5dc7f4fb30d08d0 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: do not reduce profile in do_chunk_alloc()

Every caller of do_chunk_alloc() feeds it the reduced allocation
profile, so stop trying to reduce it one more time.  Instead check the
validity of the passed profile.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h       | 18 ++++++++++++++++++
 fs/btrfs/extent-tree.c |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4d98c8df5c5..1e7aea60da2b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2536,6 +2536,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 	kfree(fs_info->super_for_commit);
 	kfree(fs_info);
 }
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+	u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+	if (extended)
+		mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	if (flags & mask)
+		return 0;
+	/* true if zero or exactly one bit set */
+	return (flags & (~flags + 1)) == flags;
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 946b0671c5d9..a1a18ea7b6c6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3295,7 +3295,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	int wait_for_alloc = 0;
 	int ret = 0;
 
-	flags = btrfs_reduce_alloc_profile(extent_root, flags);
+	BUG_ON(!profile_is_valid(flags, 0));
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
-- 
cgit v1.2.3


From e4d8ec0f65b91bfb4984a4927632ded95f9825ad Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: implement online profile changing

Profile changing is done by launching a balance with
BTRFS_BALANCE_CONVERT bits set and target fields of respective
btrfs_balance_args structs initialized.  Profile reducing code in this
case will pick restriper's target profile if it's available instead of
doing a blind reduce.  If target profile is not yet available it goes
back to a plain reduce.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/extent-tree.c | 56 +++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.c     | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h     |  5 ++++
 3 files changed, 129 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1a18ea7b6c6..e6a832e3e647 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3030,7 +3030,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 /*
  * @flags: available profiles in extended format (see ctree.h)
  *
- * Returns reduced profile in chunk format.
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
  */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
@@ -3042,6 +3044,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
 
+	/* pick restriper's target profile if it's available */
+	spin_lock(&root->fs_info->balance_lock);
+	if (root->fs_info->balance_ctl) {
+		struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+		u64 tgt = 0;
+
+		if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+		    (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		    (flags & bctl->data.target)) {
+			tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+		} else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+			   (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+			   (flags & bctl->sys.target)) {
+			tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+		} else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+			   (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+			   (flags & bctl->meta.target)) {
+			tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+		}
+
+		if (tgt) {
+			spin_unlock(&root->fs_info->balance_lock);
+			flags = tgt;
+			goto out;
+		}
+	}
+	spin_unlock(&root->fs_info->balance_lock);
+
 	if (num_devices == 1)
 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
 	if (num_devices < 4)
@@ -3065,6 +3095,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
 	}
 
+out:
 	/* extended -> chunk profile */
 	flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 	return flags;
@@ -6795,6 +6826,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
+	if (root->fs_info->balance_ctl) {
+		struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+		u64 tgt = 0;
+
+		/* pick restriper's target profile and return */
+		if (flags & BTRFS_BLOCK_GROUP_DATA &&
+		    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+			tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+		} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+			   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+			tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+		} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+			   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+			tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+		}
+
+		if (tgt) {
+			/* extended -> chunk profile */
+			tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+			return tgt;
+		}
+	}
+
 	/*
 	 * we add in the count of missing devices because we want
 	 * to make sure that any RAID levels on a degraded FS
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e86c9e4fe51e..f08210e83339 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2438,6 +2438,75 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		}
 	}
 
+	/*
+	 * Profile changing sanity checks.  Skip them if a simple
+	 * balance is requested.
+	 */
+	if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+	      BTRFS_BALANCE_ARGS_CONVERT))
+		goto do_balance;
+
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	if (fs_info->fs_devices->num_devices == 1)
+		allowed |= BTRFS_BLOCK_GROUP_DUP;
+	else if (fs_info->fs_devices->num_devices < 4)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+	else
+		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID10);
+
+	if (!profile_is_valid(bctl->data.target, 1) ||
+	    bctl->data.target & ~allowed) {
+		printk(KERN_ERR "btrfs: unable to start balance with target "
+		       "data profile %llu\n",
+		       (unsigned long long)bctl->data.target);
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!profile_is_valid(bctl->meta.target, 1) ||
+	    bctl->meta.target & ~allowed) {
+		printk(KERN_ERR "btrfs: unable to start balance with target "
+		       "metadata profile %llu\n",
+		       (unsigned long long)bctl->meta.target);
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!profile_is_valid(bctl->sys.target, 1) ||
+	    bctl->sys.target & ~allowed) {
+		printk(KERN_ERR "btrfs: unable to start balance with target "
+		       "system profile %llu\n",
+		       (unsigned long long)bctl->sys.target);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+		printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* allow to reduce meta or sys integrity only if force set */
+	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+			BTRFS_BLOCK_GROUP_RAID10;
+	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	     (fs_info->avail_system_alloc_bits & allowed) &&
+	     !(bctl->sys.target & allowed)) ||
+	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	     (fs_info->avail_metadata_alloc_bits & allowed) &&
+	     !(bctl->meta.target & allowed))) {
+		if (bctl->flags & BTRFS_BALANCE_FORCE) {
+			printk(KERN_INFO "btrfs: force reducing metadata "
+			       "integrity\n");
+		} else {
+			printk(KERN_ERR "btrfs: balance will reduce metadata "
+			       "integrity, use force if you want this\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+do_balance:
 	set_balance_control(bctl);
 
 	mutex_unlock(&fs_info->balance_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index eac26c359312..79ee9c324562 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -196,6 +196,9 @@ struct map_lookup {
 #define BTRFS_BALANCE_TYPE_MASK		(BTRFS_BALANCE_DATA |	    \
 					 BTRFS_BALANCE_SYSTEM |	    \
 					 BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE		(1ULL << 3)
+
 /*
  * Balance filters
  */
@@ -205,6 +208,8 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 
+#define BTRFS_BALANCE_ARGS_CONVERT	(1ULL << 8)
+
 struct btrfs_balance_args;
 struct btrfs_balance_control {
 	struct btrfs_fs_info *fs_info;
-- 
cgit v1.2.3


From cfa4c961cc69ffb7bda450972320a25cbd413e19 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: soft profile changing mode (aka soft convert)

When doing convert from one profile to another if soft mode is on
restriper won't touch chunks that already have the profile we are
converting to.  This is useful if e.g. half of the FS was converted
earlier.

The soft mode switch is (like every other filter) per-type.  This means
that we can convert for example meta chunks the "hard" way while
converting data chunks selectively with soft switch.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 23 +++++++++++++++++++++++
 fs/btrfs/volumes.h |  6 ++++++
 2 files changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f08210e83339..98b4067017ff 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2221,6 +2221,23 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
 	return 1;
 }
 
+static int chunk_soft_convert_filter(u64 chunk_profile,
+				     struct btrfs_balance_args *bargs)
+{
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+		return 0;
+
+	chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	if (chunk_profile == 0)
+		chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	if (bargs->target & chunk_profile)
+		return 1;
+
+	return 0;
+}
+
 static int should_balance_chunk(struct btrfs_root *root,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
@@ -2272,6 +2289,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/* soft profile changing mode */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+	    chunk_soft_convert_filter(chunk_type, bargs)) {
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 79ee9c324562..6c143c98017a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -208,7 +208,13 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
 #define BTRFS_BALANCE_ARGS_CONVERT	(1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT		(1ULL << 9)
 
 struct btrfs_balance_args;
 struct btrfs_balance_control {
-- 
cgit v1.2.3


From 0940ebf6b92ea10a6f30ae5ac3993a3b75745da6 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: save balance parameters to disk

Introduce a new btree objectid for storing balance item.  The reason is
to be able to resume restriper after a crash with the same parameters.
Balance item has a very high objectid and goes into tree of tree roots.

The key for the new item is as follows:

	[ BTRFS_BALANCE_OBJECTID ; BTRFS_BALANCE_ITEM_KEY ; 0 ]

Older kernels simply ignore it so it's safe to mount with an older
kernel and then go back to the newer one.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h   | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.c |  99 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1e7aea60da2b..9997a59e4f58 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
 	__le16 name_len;
 } __attribute__ ((__packed__));
 
+struct btrfs_disk_balance_args {
+	/*
+	 * profiles to operate on, single is denoted by
+	 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+	 */
+	__le64 profiles;
+
+	/* usage filter */
+	__le64 usage;
+
+	/* devid filter */
+	__le64 devid;
+
+	/* devid subset filter [pstart..pend) */
+	__le64 pstart;
+	__le64 pend;
+
+	/* btrfs virtual address space subset filter [vstart..vend) */
+	__le64 vstart;
+	__le64 vend;
+
+	/*
+	 * profile to convert to, single is denoted by
+	 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+	 */
+	__le64 target;
+
+	/* BTRFS_BALANCE_ARGS_* */
+	__le64 flags;
+
+	__le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+	/* BTRFS_BALANCE_* */
+	__le64 flags;
+
+	struct btrfs_disk_balance_args data;
+	struct btrfs_disk_balance_args meta;
+	struct btrfs_disk_balance_args sys;
+
+	__le64 unused[4];
+} __attribute__ ((__packed__));
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -1409,6 +1460,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY	216
 #define BTRFS_CHUNK_ITEM_KEY	228
 
+#define BTRFS_BALANCE_ITEM_KEY	248
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -2103,8 +2156,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
 		   num_devices, 64);
 
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+				      struct btrfs_balance_item *bi,
+				      struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+					  struct btrfs_balance_item *bi,
+					  struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+				      struct btrfs_balance_item *bi,
+				      struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+					  struct btrfs_balance_item *bi,
+					  struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+				     struct btrfs_balance_item *bi,
+				     struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+					 struct btrfs_balance_item *bi,
+					 struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
 
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+			       struct btrfs_disk_balance_args *disk)
+{
+	memset(cpu, 0, sizeof(*cpu));
+
+	cpu->profiles = le64_to_cpu(disk->profiles);
+	cpu->usage = le64_to_cpu(disk->usage);
+	cpu->devid = le64_to_cpu(disk->devid);
+	cpu->pstart = le64_to_cpu(disk->pstart);
+	cpu->pend = le64_to_cpu(disk->pend);
+	cpu->vstart = le64_to_cpu(disk->vstart);
+	cpu->vend = le64_to_cpu(disk->vend);
+	cpu->target = le64_to_cpu(disk->target);
+	cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+			       struct btrfs_balance_args *cpu)
+{
+	memset(disk, 0, sizeof(*disk));
+
+	disk->profiles = cpu_to_le64(cpu->profiles);
+	disk->usage = cpu_to_le64(cpu->usage);
+	disk->devid = cpu_to_le64(cpu->devid);
+	disk->pstart = cpu_to_le64(cpu->pstart);
+	disk->pend = cpu_to_le64(cpu->pend);
+	disk->vstart = cpu_to_le64(cpu->vstart);
+	disk->vend = cpu_to_le64(cpu->vend);
+	disk->target = cpu_to_le64(cpu->target);
+	disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 98b4067017ff..4c60ca0ae90b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2073,6 +2073,97 @@ error:
 	return ret;
 }
 
+static int insert_balance_item(struct btrfs_root *root,
+			       struct btrfs_balance_control *bctl)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_balance_item *item;
+	struct btrfs_disk_balance_args disk_bargs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret, err;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*item));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+	btrfs_set_balance_data(leaf, item, &disk_bargs);
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+	btrfs_set_balance_meta(leaf, item, &disk_bargs);
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+	btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+	btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+	return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret, err;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+	return ret;
+}
+
 /*
  * Should be called with both balance and volume mutexes held to
  * serialize other volume operations (add_dev/rm_dev/resize) with
@@ -2423,7 +2514,11 @@ error:
 
 static void __cancel_balance(struct btrfs_fs_info *fs_info)
 {
+	int ret;
+
 	unset_balance_control(fs_info);
+	ret = del_balance_item(fs_info->tree_root);
+	BUG_ON(ret);
 }
 
 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -2530,6 +2625,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	}
 
 do_balance:
+	ret = insert_balance_item(fs_info->tree_root, bctl);
+	if (ret)
+		goto out;
+
 	set_balance_control(bctl);
 
 	mutex_unlock(&fs_info->balance_mutex);
-- 
cgit v1.2.3


From 596410151ed71819b9e8a8018c6c9992796b256d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: recover balance on mount

On mount, if balance item is found, resume balance in a separate
kernel thread.

Try to be smart to continue roughly where previous balance (or convert)
was interrupted.  For chunk types that were being converted to some
profile we turn on soft convert, in case of a simple balance we turn on
usage filter and relocate only less-than-90%-full chunks of that type.
These are just heuristics but they help quite a bit, and can be improved
in future.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/disk-io.c |   4 ++
 fs/btrfs/volumes.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.h |   2 +
 3 files changed, 139 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 190a1b24c902..eb7a11ac5b73 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2427,6 +2427,10 @@ retry_root_backup:
 		if (!err)
 			err = btrfs_orphan_cleanup(fs_info->tree_root);
 		up_read(&fs_info->cleanup_work_sem);
+
+		if (!err)
+			err = btrfs_recover_balance(fs_info->tree_root);
+
 		if (err) {
 			close_ctree(tree_root);
 			return ERR_PTR(err);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4c60ca0ae90b..17e565388de0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -2164,6 +2165,46 @@ out:
 	return ret;
 }
 
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+	/*
+	 * Turn on soft mode for chunk types that were being converted.
+	 */
+	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+	/*
+	 * Turn on usage filter if is not already used.  The idea is
+	 * that chunks that we have already balanced should be
+	 * reasonably full.  Don't do it for chunks that are being
+	 * converted - that will keep us from relocating unconverted
+	 * (albeit full) chunks.
+	 */
+	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->data.usage = 90;
+	}
+	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->sys.usage = 90;
+	}
+	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->meta.usage = 90;
+	}
+}
+
 /*
  * Should be called with both balance and volume mutexes held to
  * serialize other volume operations (add_dev/rm_dev/resize) with
@@ -2626,10 +2667,18 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
 do_balance:
 	ret = insert_balance_item(fs_info->tree_root, bctl);
-	if (ret)
+	if (ret && ret != -EEXIST)
 		goto out;
 
-	set_balance_control(bctl);
+	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+		BUG_ON(ret == -EEXIST);
+		set_balance_control(bctl);
+	} else {
+		BUG_ON(ret != -EEXIST);
+		spin_lock(&fs_info->balance_lock);
+		update_balance_args(bctl);
+		spin_unlock(&fs_info->balance_lock);
+	}
 
 	mutex_unlock(&fs_info->balance_mutex);
 
@@ -2646,7 +2695,89 @@ do_balance:
 
 	return ret;
 out:
+	if (bctl->flags & BTRFS_BALANCE_RESUME)
+		__cancel_balance(fs_info);
+	else
+		kfree(bctl);
+	return ret;
+}
+
+static int balance_kthread(void *data)
+{
+	struct btrfs_balance_control *bctl =
+			(struct btrfs_balance_control *)data;
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+	int ret;
+
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
+
+	set_balance_control(bctl);
+
+	printk(KERN_INFO "btrfs: continuing balance\n");
+	ret = btrfs_balance(bctl, NULL);
+
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+	return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+	struct task_struct *tsk;
+	struct btrfs_balance_control *bctl;
+	struct btrfs_balance_item *item;
+	struct btrfs_disk_balance_args disk_bargs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out_bctl;
+	if (ret > 0) { /* ret = -ENOENT; */
+		ret = 0;
+		goto out_bctl;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+	bctl->fs_info = tree_root->fs_info;
+	bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+	btrfs_balance_data(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+	btrfs_balance_meta(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+	btrfs_balance_sys(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+	tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+	if (IS_ERR(tsk))
+		ret = PTR_ERR(tsk);
+	else
+		goto out;
+
+out_bctl:
 	kfree(bctl);
+out:
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6c143c98017a..cd25ea58ec35 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -198,6 +198,7 @@ struct map_lookup {
 					 BTRFS_BALANCE_METADATA)
 
 #define BTRFS_BALANCE_FORCE		(1ULL << 3)
+#define BTRFS_BALANCE_RESUME		(1ULL << 4)
 
 /*
  * Balance filters
@@ -271,6 +272,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
-- 
cgit v1.2.3


From 9555c6c180600b40f6e86bd4dc53bf47e06ed663 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:48 +0200
Subject: Btrfs: add skip_balance mount option

Since restriper kthread starts involuntarily on mount and can suck cpu
and memory bandwidth add a mount option to forcefully skip it.  The
restriper in that case hangs around in paused state and can be resumed
from userspace when it's convenient.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/super.c   | 11 +++++++++--
 fs/btrfs/volumes.c | 10 +++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9997a59e4f58..99eb2bcd9aa7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1492,6 +1492,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
 #define BTRFS_MOUNT_RECOVERY		(1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE	(1 << 19)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 34a8b6112ea4..063b521e3ded 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -164,8 +164,9 @@ enum {
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-	Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+	Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+	Opt_err,
 };
 
 static match_table_t tokens = {
@@ -200,6 +201,7 @@ static match_table_t tokens = {
 	{Opt_inode_cache, "inode_cache"},
 	{Opt_no_space_cache, "nospace_cache"},
 	{Opt_recovery, "recovery"},
+	{Opt_skip_balance, "skip_balance"},
 	{Opt_err, NULL},
 };
 
@@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: enabling auto recovery");
 			btrfs_set_opt(info->mount_opt, RECOVERY);
 			break;
+		case Opt_skip_balance:
+			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
@@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",autodefrag");
 	if (btrfs_test_opt(root, INODE_MAP_CACHE))
 		seq_puts(seq, ",inode_cache");
+	if (btrfs_test_opt(root, SKIP_BALANCE))
+		seq_puts(seq, ",skip_balance");
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 17e565388de0..e0160607e6e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2707,15 +2707,19 @@ static int balance_kthread(void *data)
 	struct btrfs_balance_control *bctl =
 			(struct btrfs_balance_control *)data;
 	struct btrfs_fs_info *fs_info = bctl->fs_info;
-	int ret;
+	int ret = 0;
 
 	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&fs_info->balance_mutex);
 
 	set_balance_control(bctl);
 
-	printk(KERN_INFO "btrfs: continuing balance\n");
-	ret = btrfs_balance(bctl, NULL);
+	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+		printk(KERN_INFO "btrfs: force skipping balance\n");
+	} else {
+		printk(KERN_INFO "btrfs: continuing balance\n");
+		ret = btrfs_balance(bctl, NULL);
+	}
 
 	mutex_unlock(&fs_info->balance_mutex);
 	mutex_unlock(&fs_info->volume_mutex);
-- 
cgit v1.2.3


From 837d5b6e46d1a4af5b6cc8f2fe83cb5de79a2961 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:49 +0200
Subject: Btrfs: allow for pausing restriper

Implement an ioctl for pausing restriper.  This pauses the relocation,
but balance is still considered to be "in progress": balance item is
not deleted, other volume operations cannot be started, etc.  If paused
in the middle of profile changing operation we will continue making
allocations with the target profile.

Add a hook to close_ctree() to pause restriper and free its data
structures on unmount.  (It's safe to unmount when restriper is in
"paused" state, we will resume with the same parameters on the next
mount)

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h   |  4 ++++
 fs/btrfs/disk-io.c |  6 ++++++
 fs/btrfs/ioctl.c   | 28 +++++++++++++++++++++++++++-
 fs/btrfs/ioctl.h   |  7 +++++++
 fs/btrfs/volumes.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h |  1 +
 6 files changed, 94 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 99eb2bcd9aa7..1afda75d5414 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1214,7 +1214,10 @@ struct btrfs_fs_info {
 	/* restriper state */
 	spinlock_t balance_lock;
 	struct mutex balance_mutex;
+	atomic_t balance_running;
+	atomic_t balance_pause_req;
 	struct btrfs_balance_control *balance_ctl;
+	wait_queue_head_t balance_wait_q;
 
 	unsigned data_chunk_allocations;
 	unsigned metadata_ratio;
@@ -2658,6 +2661,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 }
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eb7a11ac5b73..8ce837407800 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2004,7 +2004,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	spin_lock_init(&fs_info->balance_lock);
 	mutex_init(&fs_info->balance_mutex);
+	atomic_set(&fs_info->balance_running, 0);
+	atomic_set(&fs_info->balance_pause_req, 0);
 	fs_info->balance_ctl = NULL;
+	init_waitqueue_head(&fs_info->balance_wait_q);
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
@@ -2980,6 +2983,9 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	/* pause restriper - we want to resume on mount */
+	btrfs_pause_balance(root->fs_info);
+
 	btrfs_scrub_cancel(root);
 
 	/* wait for any defraggers to finish */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 29b3a94933f0..f572c53dda4f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3072,6 +3072,11 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 
 	bargs->flags = bctl->flags;
 
+	if (atomic_read(&fs_info->balance_running))
+		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+	if (atomic_read(&fs_info->balance_pause_req))
+		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+
 	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
 	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
 	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
@@ -3103,6 +3108,11 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
 		bargs = NULL;
 	}
 
+	if (fs_info->balance_ctl) {
+		ret = -EINPROGRESS;
+		goto out_bargs;
+	}
+
 	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
 	if (!bctl) {
 		ret = -ENOMEM;
@@ -3123,7 +3133,8 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
 
 	ret = btrfs_balance(bctl, bargs);
 	/*
-	 * bctl is freed in __cancel_balance
+	 * bctl is freed in __cancel_balance or in free_fs_info if
+	 * restriper was paused all the way until unmount
 	 */
 	if (arg) {
 		if (copy_to_user(arg, bargs, sizeof(*bargs)))
@@ -3138,6 +3149,19 @@ out:
 	return ret;
 }
 
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case BTRFS_BALANCE_CTL_PAUSE:
+		return btrfs_pause_balance(root->fs_info);
+	}
+
+	return -EINVAL;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3216,6 +3240,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_scrub_progress(root, argp);
 	case BTRFS_IOC_BALANCE_V2:
 		return btrfs_ioctl_balance(root, argp);
+	case BTRFS_IOC_BALANCE_CTL:
+		return btrfs_ioctl_balance_ctl(root, arg);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c8b37d2c0d77..e972e11a8d77 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,9 @@ struct btrfs_ioctl_fs_info_args {
 	__u64 reserved[124];			/* pad to 1k */
 };
 
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE		1
+
 /*
  * this is packed, because it should be exactly the same as its disk
  * byte order counterpart (struct btrfs_disk_balance_args)
@@ -137,6 +140,9 @@ struct btrfs_balance_progress {
 	__u64 completed;	/* # of chunks relocated so far */
 };
 
+#define BTRFS_BALANCE_STATE_RUNNING	(1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ	(1ULL << 1)
+
 struct btrfs_ioctl_balance_args {
 	__u64 flags;				/* in/out */
 	__u64 state;				/* out */
@@ -315,6 +321,7 @@ struct btrfs_ioctl_logical_ino_args {
 			       struct btrfs_ioctl_fs_info_args)
 #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
 				   struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
 #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
 					struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0160607e6e2..d32660ce753d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2492,6 +2492,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
+		if (atomic_read(&fs_info->balance_pause_req)) {
+			ret = -ECANCELED;
+			goto error;
+		}
+
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
@@ -2553,6 +2558,11 @@ error:
 	return ret;
 }
 
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+	return atomic_read(&fs_info->balance_pause_req) == 0;
+}
+
 static void __cancel_balance(struct btrfs_fs_info *fs_info)
 {
 	int ret;
@@ -2575,7 +2585,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	u64 allowed;
 	int ret;
 
-	if (btrfs_fs_closing(fs_info)) {
+	if (btrfs_fs_closing(fs_info) ||
+	    atomic_read(&fs_info->balance_pause_req)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2680,18 +2691,25 @@ do_balance:
 		spin_unlock(&fs_info->balance_lock);
 	}
 
+	atomic_inc(&fs_info->balance_running);
 	mutex_unlock(&fs_info->balance_mutex);
 
 	ret = __btrfs_balance(fs_info);
 
 	mutex_lock(&fs_info->balance_mutex);
+	atomic_dec(&fs_info->balance_running);
 
 	if (bargs) {
 		memset(bargs, 0, sizeof(*bargs));
 		update_ioctl_balance_args(fs_info, bargs);
 	}
 
-	__cancel_balance(fs_info);
+	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+	    balance_need_close(fs_info)) {
+		__cancel_balance(fs_info);
+	}
+
+	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
 out:
@@ -2785,6 +2803,35 @@ out:
 	return ret;
 }
 
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+	int ret = 0;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -ENOTCONN;
+	}
+
+	if (atomic_read(&fs_info->balance_running)) {
+		atomic_inc(&fs_info->balance_pause_req);
+		mutex_unlock(&fs_info->balance_mutex);
+
+		wait_event(fs_info->balance_wait_q,
+			   atomic_read(&fs_info->balance_running) == 0);
+
+		mutex_lock(&fs_info->balance_mutex);
+		/* we are good with balance_ctl ripped off from under us */
+		BUG_ON(atomic_read(&fs_info->balance_running));
+		atomic_dec(&fs_info->balance_pause_req);
+	} else {
+		ret = -ENOTCONN;
+	}
+
+	mutex_unlock(&fs_info->balance_mutex);
+	return ret;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index cd25ea58ec35..80953afb12b9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -273,6 +273,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
 int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
-- 
cgit v1.2.3


From a7e99c691af553fc15ac46a51f130b7c59a65f76 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:49 +0200
Subject: Btrfs: allow for canceling restriper

Implement an ioctl for canceling restriper.  Currently we wait until
relocation of the current block group is finished, in future this can be
done by triggering a commit.  Balance item is deleted and no memory
about the interrupted balance is kept.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/ioctl.c   |  4 ++++
 fs/btrfs/ioctl.h   |  2 ++
 fs/btrfs/volumes.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/volumes.h |  1 +
 6 files changed, 53 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1afda75d5414..dfc136cc07d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1216,6 +1216,7 @@ struct btrfs_fs_info {
 	struct mutex balance_mutex;
 	atomic_t balance_running;
 	atomic_t balance_pause_req;
+	atomic_t balance_cancel_req;
 	struct btrfs_balance_control *balance_ctl;
 	wait_queue_head_t balance_wait_q;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8ce837407800..c23b82d8ec08 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2006,6 +2006,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->balance_mutex);
 	atomic_set(&fs_info->balance_running, 0);
 	atomic_set(&fs_info->balance_pause_req, 0);
+	atomic_set(&fs_info->balance_cancel_req, 0);
 	fs_info->balance_ctl = NULL;
 	init_waitqueue_head(&fs_info->balance_wait_q);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f572c53dda4f..60852217ce9a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3076,6 +3076,8 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
 	if (atomic_read(&fs_info->balance_pause_req))
 		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+	if (atomic_read(&fs_info->balance_cancel_req))
+		bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
 
 	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
 	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
@@ -3157,6 +3159,8 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
 	switch (cmd) {
 	case BTRFS_BALANCE_CTL_PAUSE:
 		return btrfs_pause_balance(root->fs_info);
+	case BTRFS_BALANCE_CTL_CANCEL:
+		return btrfs_cancel_balance(root->fs_info);
 	}
 
 	return -EINVAL;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e972e11a8d77..cd19d10794b9 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -111,6 +111,7 @@ struct btrfs_ioctl_fs_info_args {
 
 /* balance control ioctl modes */
 #define BTRFS_BALANCE_CTL_PAUSE		1
+#define BTRFS_BALANCE_CTL_CANCEL	2
 
 /*
  * this is packed, because it should be exactly the same as its disk
@@ -142,6 +143,7 @@ struct btrfs_balance_progress {
 
 #define BTRFS_BALANCE_STATE_RUNNING	(1ULL << 0)
 #define BTRFS_BALANCE_STATE_PAUSE_REQ	(1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ	(1ULL << 2)
 
 struct btrfs_ioctl_balance_args {
 	__u64 flags;				/* in/out */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d32660ce753d..c32667318ae4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2492,7 +2492,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
-		if (atomic_read(&fs_info->balance_pause_req)) {
+		if (atomic_read(&fs_info->balance_pause_req) ||
+		    atomic_read(&fs_info->balance_cancel_req)) {
 			ret = -ECANCELED;
 			goto error;
 		}
@@ -2560,7 +2561,10 @@ error:
 
 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
 {
-	return atomic_read(&fs_info->balance_pause_req) == 0;
+	/* cancel requested || normal exit path */
+	return atomic_read(&fs_info->balance_cancel_req) ||
+		(atomic_read(&fs_info->balance_pause_req) == 0 &&
+		 atomic_read(&fs_info->balance_cancel_req) == 0);
 }
 
 static void __cancel_balance(struct btrfs_fs_info *fs_info)
@@ -2586,7 +2590,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int ret;
 
 	if (btrfs_fs_closing(fs_info) ||
-	    atomic_read(&fs_info->balance_pause_req)) {
+	    atomic_read(&fs_info->balance_pause_req) ||
+	    atomic_read(&fs_info->balance_cancel_req)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2832,6 +2837,42 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -ENOTCONN;
+	}
+
+	atomic_inc(&fs_info->balance_cancel_req);
+	/*
+	 * if we are running just wait and return, balance item is
+	 * deleted in btrfs_balance in this case
+	 */
+	if (atomic_read(&fs_info->balance_running)) {
+		mutex_unlock(&fs_info->balance_mutex);
+		wait_event(fs_info->balance_wait_q,
+			   atomic_read(&fs_info->balance_running) == 0);
+		mutex_lock(&fs_info->balance_mutex);
+	} else {
+		/* __cancel_balance needs volume_mutex */
+		mutex_unlock(&fs_info->balance_mutex);
+		mutex_lock(&fs_info->volume_mutex);
+		mutex_lock(&fs_info->balance_mutex);
+
+		if (fs_info->balance_ctl)
+			__cancel_balance(fs_info);
+
+		mutex_unlock(&fs_info->volume_mutex);
+	}
+
+	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+	atomic_dec(&fs_info->balance_cancel_req);
+	mutex_unlock(&fs_info->balance_mutex);
+	return 0;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 80953afb12b9..caa9abd218ef 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -274,6 +274,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
 int btrfs_recover_balance(struct btrfs_root *tree_root);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
-- 
cgit v1.2.3


From de322263d3a6d4ffd4ed7c4d0c6536e9497aec9b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:49 +0200
Subject: Btrfs: allow for resuming restriper after it was paused

Recognize BTRFS_BALANCE_RESUME flag passed from userspace.  We use the
same heuristics used when recovering balance after a crash to try to
start where we left off last time.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ioctl.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 60852217ce9a..85e546ffe3c7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3106,6 +3106,20 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
 			ret = PTR_ERR(bargs);
 			goto out;
 		}
+
+		if (bargs->flags & BTRFS_BALANCE_RESUME) {
+			if (!fs_info->balance_ctl) {
+				ret = -ENOTCONN;
+				goto out_bargs;
+			}
+
+			bctl = fs_info->balance_ctl;
+			spin_lock(&fs_info->balance_lock);
+			bctl->flags |= BTRFS_BALANCE_RESUME;
+			spin_unlock(&fs_info->balance_lock);
+
+			goto do_balance;
+		}
 	} else {
 		bargs = NULL;
 	}
@@ -3133,6 +3147,7 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
 		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
 	}
 
+do_balance:
 	ret = btrfs_balance(bctl, bargs);
 	/*
 	 * bctl is freed in __cancel_balance or in free_fs_info if
-- 
cgit v1.2.3


From 19a39dce3b9bf0244d19a446718ad6f7605ff099 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Jan 2012 22:04:49 +0200
Subject: Btrfs: add balance progress reporting

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/ioctl.c   | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/ioctl.h   |  2 ++
 fs/btrfs/volumes.c | 39 +++++++++++++++++++++++++++++++++++----
 fs/btrfs/volumes.h |  3 +++
 4 files changed, 84 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 85e546ffe3c7..1e7a9bac31ab 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3065,7 +3065,7 @@ out:
 	return ret;
 }
 
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
 			       struct btrfs_ioctl_balance_args *bargs)
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -3082,6 +3082,14 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
 	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
 	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+	if (lock) {
+		spin_lock(&fs_info->balance_lock);
+		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+		spin_unlock(&fs_info->balance_lock);
+	} else {
+		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+	}
 }
 
 static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
@@ -3181,6 +3189,39 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
 	return -EINVAL;
 }
 
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+					 void __user *arg)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ioctl_balance_args *bargs;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		ret = -ENOTCONN;
+		goto out;
+	}
+
+	bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+	if (!bargs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	update_ioctl_balance_args(fs_info, 1, bargs);
+
+	if (copy_to_user(arg, bargs, sizeof(*bargs)))
+		ret = -EFAULT;
+
+	kfree(bargs);
+out:
+	mutex_unlock(&fs_info->balance_mutex);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3261,6 +3302,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_balance(root, argp);
 	case BTRFS_IOC_BALANCE_CTL:
 		return btrfs_ioctl_balance_ctl(root, arg);
+	case BTRFS_IOC_BALANCE_PROGRESS:
+		return btrfs_ioctl_balance_progress(root, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index cd19d10794b9..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -324,6 +324,8 @@ struct btrfs_ioctl_logical_ino_args {
 #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
 				   struct btrfs_ioctl_balance_args)
 #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+					struct btrfs_ioctl_balance_args)
 #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
 					struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c32667318ae4..d73439b4d7da 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2441,6 +2441,7 @@ static u64 div_factor(u64 num, int factor)
 
 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
 	struct btrfs_root *dev_root = fs_info->dev_root;
 	struct list_head *devices;
@@ -2456,6 +2457,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	int slot;
 	int ret;
 	int enospc_errors = 0;
+	bool counting = true;
 
 	/* step one make some room on all the devices */
 	devices = &fs_info->fs_devices->devices;
@@ -2487,12 +2489,18 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 		ret = -ENOMEM;
 		goto error;
 	}
+
+	/* zero out stat counters */
+	spin_lock(&fs_info->balance_lock);
+	memset(&bctl->stat, 0, sizeof(bctl->stat));
+	spin_unlock(&fs_info->balance_lock);
+again:
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
-		if (atomic_read(&fs_info->balance_pause_req) ||
+		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
 		    atomic_read(&fs_info->balance_cancel_req)) {
 			ret = -ECANCELED;
 			goto error;
@@ -2529,24 +2537,47 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 
 		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 
+		if (!counting) {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.considered++;
+			spin_unlock(&fs_info->balance_lock);
+		}
+
 		ret = should_balance_chunk(chunk_root, leaf, chunk,
 					   found_key.offset);
 		btrfs_release_path(path);
 		if (!ret)
 			goto loop;
 
+		if (counting) {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.expected++;
+			spin_unlock(&fs_info->balance_lock);
+			goto loop;
+		}
+
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		if (ret && ret != -ENOSPC)
 			goto error;
-		if (ret == -ENOSPC)
+		if (ret == -ENOSPC) {
 			enospc_errors++;
+		} else {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.completed++;
+			spin_unlock(&fs_info->balance_lock);
+		}
 loop:
 		key.offset = found_key.offset - 1;
 	}
 
+	if (counting) {
+		btrfs_release_path(path);
+		counting = false;
+		goto again;
+	}
 error:
 	btrfs_free_path(path);
 	if (enospc_errors) {
@@ -2576,7 +2607,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 	BUG_ON(ret);
 }
 
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
 			       struct btrfs_ioctl_balance_args *bargs);
 
 /*
@@ -2706,7 +2737,7 @@ do_balance:
 
 	if (bargs) {
 		memset(bargs, 0, sizeof(*bargs));
-		update_ioctl_balance_args(fs_info, bargs);
+		update_ioctl_balance_args(fs_info, 0, bargs);
 	}
 
 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index caa9abd218ef..6faec9dd1f93 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -218,6 +218,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_SOFT		(1ULL << 9)
 
 struct btrfs_balance_args;
+struct btrfs_balance_progress;
 struct btrfs_balance_control {
 	struct btrfs_fs_info *fs_info;
 
@@ -226,6 +227,8 @@ struct btrfs_balance_control {
 	struct btrfs_balance_args sys;
 
 	u64 flags;
+
+	struct btrfs_balance_progress stat;
 };
 
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
-- 
cgit v1.2.3


From 7ad85bb76a61801362701b77c5cee5aa09f35369 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 12 Jan 2012 19:10:12 -0500
Subject: Btrfs: do not use btrfs_end_transaction_throttle everywhere

A user reported a problem where things like open with O_CREAT would take up to
30 seconds when he had nfs activity on the same mount.  This is because all of
our quick metadata operations, like create, symlink etc all do
btrfs_end_transaction_throttle, which if the transaction is blocked will wait
for the commit to complete before it returns.  This adds a ridiculous amount of
latency and isn't really needed.  The normal btrfs_end_transaction will mark the
transaction as blocked and wake the transaction kthread up if it thinks the
transaction needs to end (this being in the running out of global reserve space
scenario), and this is all that is really needed since we've already done
everything we're going to do, we just need to return.  This should help people
with the latency they were seeing when using synchronous heavy workloads.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 18 +++++++++---------
 fs/btrfs/xattr.c |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index acc4ff39ca4e..5f8ba210c0aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2845,7 +2845,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
 		BUG_ON(!root->fs_info->enospc_unlink);
 		root->fs_info->enospc_unlink = 0;
 	}
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3434,7 +3434,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
 		i_size_write(inode, newsize);
 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
 		ret = btrfs_update_inode(trans, root, inode);
-		btrfs_end_transaction_throttle(trans, root);
+		btrfs_end_transaction(trans, root);
 	} else {
 
 		/*
@@ -4655,7 +4655,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	}
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -4723,7 +4723,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	}
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4782,7 +4782,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -4848,7 +4848,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 out_fail:
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -6668,7 +6668,7 @@ end_trans:
 			err = ret;
 
 		nr = trans->blocks_used;
-		ret = btrfs_end_transaction_throttle(trans, root);
+		ret = btrfs_end_transaction(trans, root);
 		btrfs_btree_balance_dirty(root, nr);
 	}
 
@@ -7075,7 +7075,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		btrfs_end_log_trans(root);
 	}
 out_fail:
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 out_notrans:
 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
@@ -7247,7 +7247,7 @@ out_unlock:
 	if (!err)
 		d_instantiate(dentry, inode);
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 out:
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	return ret;
 }
 
-- 
cgit v1.2.3


From f70a9a6b94af86fca069a7552ab672c31b457786 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 12 Jan 2012 19:10:12 -0500
Subject: Btrfs: fix btrfsck error 400 when truncating a compressed

Reproduce steps:
 # mkfs.btrfs /dev/sdb5
 # mount /dev/sdb5 -o compress=lzo /mnt
 # dd if=/dev/zero of=/mnt/tmpfile bs=128K count=1
 # sync
 # truncate -s 64K /mnt/tmpfile
 root 5 inode 257 errors 400

This is because of the wrong if condition, which is used to check if we should
subtract the bytes of the dropped range from i_blocks/i_bytes of i-node or not.
When we truncate a compressed extent, btrfs substracts the bytes of the whole
extent, it's wrong. We should substract the real size that we truncate, no
matter it is a compressed extent or not. Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f8ba210c0aa..946a7f1b3295 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3009,7 +3009,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
-	int encoding;
 	int ret;
 	int err = 0;
 	u64 ino = btrfs_ino(inode);
@@ -3059,7 +3058,6 @@ search_again:
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = btrfs_key_type(&found_key);
-		encoding = 0;
 
 		if (found_key.objectid != ino)
 			break;
@@ -3072,10 +3070,6 @@ search_again:
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			extent_type = btrfs_file_extent_type(leaf, fi);
-			encoding = btrfs_file_extent_compression(leaf, fi);
-			encoding |= btrfs_file_extent_encryption(leaf, fi);
-			encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3097,7 @@ search_again:
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-			if (!del_item && !encoding) {
+			if (!del_item) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
 				extent_num_bytes = new_size -
-- 
cgit v1.2.3


From ec39e180fd3188c983c94603634bfcd019f42ae7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 12 Jan 2012 19:10:12 -0500
Subject: Btrfs: release space on error in page_mkwrite

If updating the inode gave us an ENOSPC we were just returning in page_mkwrite,
which is a problem since we make our reservation right before trying to update
the inode, so fix the out label so that we actually free our reservation.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 946a7f1b3295..85fd86ea9830 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6488,8 +6488,8 @@ out_unlock:
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 45a8090e626ab470c91142954431a93846030b0d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 12 Jan 2012 19:10:12 -0500
Subject: Btrfs: don't call btrfs_throttle in file write

Btrfs_throttle will make us wait if there is a currently committing transaction
until we can open new transactions, which is ridiculous since we don't actually
start any transactions within the file write path anyway, so all this does is
introduce big latencies if we have a sync/fsync heavy workload going on while
somebody else is trying to do work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fc97b00bd871..0f61e11a2998 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1273,7 +1273,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 						   dirty_pages);
 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root, 1);
-		btrfs_throttle(root);
 
 		pos += copied;
 		num_written += copied;
-- 
cgit v1.2.3


From 3f7de037fb3727b20bc27332cdcf2488b702394c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 10 Nov 2011 08:29:20 -0500
Subject: Btrfs: add allocator tracepoints

I used these tracepoints when figuring out what the cluster stuff was doing, so
add them to mainline in case we need to profile this stuff again.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c      |  9 +++++++++
 fs/btrfs/free-space-cache.c | 12 +++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a44072a692ab..ad1a20bc834d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5256,6 +5256,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->objectid = 0;
 	ins->offset = 0;
 
+	trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
 	space_info = __find_space_info(root->fs_info, data);
 	if (!space_info) {
 		printk(KERN_ERR "No space info for %llu\n", data);
@@ -5432,6 +5434,8 @@ alloc:
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
+				trace_btrfs_reserve_extent_cluster(root,
+					block_group, search_start, num_bytes);
 				goto checks;
 			}
 
@@ -5490,6 +5494,9 @@ refill_cluster:
 				if (offset) {
 					/* we found one, proceed */
 					spin_unlock(&last_ptr->refill_lock);
+					trace_btrfs_reserve_extent_cluster(root,
+						block_group, search_start,
+						num_bytes);
 					goto checks;
 				}
 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5576,6 +5583,8 @@ checks:
 		ins->objectid = search_start;
 		ins->offset = num_bytes;
 
+		trace_btrfs_reserve_extent(orig_root, block_group,
+					   search_start, num_bytes);
 		if (offset < search_start)
 			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6c7887a7770c..efe20032e4a1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2346,6 +2346,8 @@ again:
 				 &entry->offset_index, 1);
 	BUG_ON(ret);
 
+	trace_btrfs_setup_cluster(block_group, cluster,
+				  total_found * block_group->sectorsize, 1);
 	return 0;
 }
 
@@ -2368,6 +2370,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 	u64 window_start;
 	u64 window_free;
 	u64 max_extent;
+	u64 total_size = 0;
 
 	entry = tree_search_offset(ctl, offset, 0, 1);
 	if (!entry)
@@ -2433,11 +2436,12 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 		rb_erase(&entry->offset_index, &ctl->free_space_offset);
 		ret = tree_insert_offset(&cluster->root, entry->offset,
 					 &entry->offset_index, 0);
+		total_size += entry->bytes;
 		BUG_ON(ret);
 	} while (node && entry != last);
 
 	cluster->max_size = max_extent;
-
+	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
 	return 0;
 }
 
@@ -2542,6 +2546,10 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+				 min_bytes);
+
+	INIT_LIST_HEAD(&bitmaps);
 	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
 				      bytes + empty_size,
 				      cont1_bytes, min_bytes);
@@ -2559,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 		list_add_tail(&cluster->block_group_list,
 			      &block_group->cluster_list);
 		cluster->block_group = block_group;
+	} else {
+		trace_btrfs_failed_cluster_setup(block_group);
 	}
 out:
 	spin_unlock(&cluster->lock);
-- 
cgit v1.2.3


From 90290e19820e3323ce6b9c2888eeb68bf29c278b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 2 Dec 2011 15:44:12 -0500
Subject: Btrfs: protect orphan block rsv with spin_lock

We've been seeing warnings coming out of the orphan commit stuff forever from
ceph.  Turns out it's because we're racing with checking if the orphan block
reserve is set, because we clear it outside of the spin_lock.  So leave the
normal fastpath checks where they are, but take the spin_lock and _recheck_ to
make sure we haven't had an orphan block rsv added in the meantime.  Then clear
the root's orphan block rsv and release the lock.  With this patch a user said
the warnings went away and they usually showed up pretty soon after he started
ceph.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 85fd86ea9830..619742d37166 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root)
 {
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
 	if (!list_empty(&root->orphan_list) ||
 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
 		return;
 
+	spin_lock(&root->orphan_lock);
+	if (!list_empty(&root->orphan_list)) {
+		spin_unlock(&root->orphan_lock);
+		return;
+	}
+
+	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+		spin_unlock(&root->orphan_lock);
+		return;
+	}
+
+	block_rsv = root->orphan_block_rsv;
+	root->orphan_block_rsv = NULL;
+	spin_unlock(&root->orphan_lock);
+
 	if (root->orphan_item_inserted &&
 	    btrfs_root_refs(&root->root_item) > 0) {
 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 		root->orphan_item_inserted = 0;
 	}
 
-	if (root->orphan_block_rsv) {
-		WARN_ON(root->orphan_block_rsv->size > 0);
-		btrfs_free_block_rsv(root, root->orphan_block_rsv);
-		root->orphan_block_rsv = NULL;
+	if (block_rsv) {
+		WARN_ON(block_rsv->size > 0);
+		btrfs_free_block_rsv(root, block_rsv);
 	}
 }
 
-- 
cgit v1.2.3


From 8c2a3ca20f6233677ac3222c6506174010eb414f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 10 Jan 2012 10:31:31 -0500
Subject: Btrfs: space leak tracepoints

This in addition to a script in my btrfs-tracing tree will help track down space
leaks when we're getting space left over in block groups on umount.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/delayed-inode.c | 45 +++++++++++++++++++++++++++----------
 fs/btrfs/extent-tree.c   | 58 +++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/inode-map.c     |  4 ++++
 fs/btrfs/transaction.c   |  2 ++
 4 files changed, 89 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-	if (!ret)
+	if (!ret) {
+		trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+					      item->key.objectid,
+					      num_bytes, 1);
 		item->bytes_reserved = num_bytes;
+	}
 
 	return ret;
 }
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 		return;
 
 	rsv = &root->fs_info->delayed_block_rsv;
+	trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+				      item->key.objectid, item->bytes_reserved,
+				      0);
 	btrfs_block_rsv_release(root, rsv,
 				item->bytes_reserved);
 }
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
-	int release = false;
+	bool release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
 		 */
 		if (ret == -EAGAIN)
 			ret = -ENOSPC;
-		if (!ret)
+		if (!ret) {
 			node->bytes_reserved = num_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+						      "delayed_inode",
+						      btrfs_ino(inode),
+						      num_bytes, 1);
+		}
 		return ret;
 	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
 		spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
 	 * reservation here.  I think it may be time for a documentation page on
 	 * how block rsvs. work.
 	 */
-	if (!ret)
+	if (!ret) {
+		trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+					      btrfs_ino(inode), num_bytes, 1);
 		node->bytes_reserved = num_bytes;
+	}
 
-	if (release)
+	if (release) {
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), num_bytes, 0);
 		btrfs_block_rsv_release(root, src_rsv, num_bytes);
+	}
 
 	return ret;
 }
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
 		return;
 
 	rsv = &root->fs_info->delayed_block_rsv;
+	trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+				      node->inode_id, node->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, rsv,
 				node->bytes_reserved);
 	node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 		goto release_node;
 	}
 
-	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
-	/*
-	 * we have reserved enough space when we start a new transaction,
-	 * so reserving metadata failure is impossible
-	 */
-	BUG_ON(ret);
-
 	delayed_item->key.objectid = btrfs_ino(dir);
 	btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
 	delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 	dir_item->type = type;
 	memcpy((char *)(dir_item + 1), name, name_len);
 
+	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+	/*
+	 * we have reserved enough space when we start a new transaction,
+	 * so reserving metadata failure is impossible
+	 */
+	BUG_ON(ret);
+
+
 	mutex_lock(&delayed_node->mutex);
 	ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
 	if (unlikely(ret)) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ad1a20bc834d..556f9aa25bb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3310,6 +3310,8 @@ commit_trans:
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      (u64)data_sinfo, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
 
 	return 0;
@@ -3329,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	data_sinfo = BTRFS_I(inode)->space_info;
 	spin_lock(&data_sinfo->lock);
 	data_sinfo->bytes_may_use -= bytes;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      (u64)data_sinfo, bytes, 0);
 	spin_unlock(&data_sinfo->lock);
 }
 
@@ -3686,6 +3690,10 @@ again:
 	if (used <= space_info->total_bytes) {
 		if (used + orig_bytes <= space_info->total_bytes) {
 			space_info->bytes_may_use += orig_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+						      "space_info",
+						      (u64)space_info,
+						      orig_bytes, 1);
 			ret = 0;
 		} else {
 			/*
@@ -3753,6 +3761,10 @@ again:
 
 		if (used + num_bytes < space_info->total_bytes + avail) {
 			space_info->bytes_may_use += orig_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+						      "space_info",
+						      (u64)space_info,
+						      orig_bytes, 1);
 			ret = 0;
 		} else {
 			wait_ordered = true;
@@ -3859,7 +3871,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 	spin_unlock(&block_rsv->lock);
 }
 
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_rsv *block_rsv,
 				    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3895,6 +3908,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 		if (num_bytes) {
 			spin_lock(&space_info->lock);
 			space_info->bytes_may_use -= num_bytes;
+			trace_btrfs_space_reservation(fs_info, "space_info",
+						      (u64)space_info,
+						      num_bytes, 0);
 			space_info->reservation_progress++;
 			spin_unlock(&space_info->lock);
 		}
@@ -4051,7 +4067,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
 	if (global_rsv->full || global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
-	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+				num_bytes);
 }
 
 /*
@@ -4110,11 +4127,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 		num_bytes = sinfo->total_bytes - num_bytes;
 		block_rsv->reserved += num_bytes;
 		sinfo->bytes_may_use += num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+					      (u64)sinfo, num_bytes, 1);
 	}
 
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
 		sinfo->bytes_may_use -= num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+					      (u64)sinfo, num_bytes, 0);
 		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
@@ -4149,7 +4170,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
-	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+				(u64)-1);
 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
 	WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4166,6 +4188,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 	if (!trans->bytes_reserved)
 		return;
 
+	trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
+				      trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
@@ -4183,6 +4207,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 	 * when we are truly done with the orphan item.
 	 */
 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	trace_btrfs_space_reservation(root->fs_info, "orphan",
+				      btrfs_ino(inode), num_bytes, 1);
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
@@ -4190,6 +4216,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	trace_btrfs_space_reservation(root->fs_info, "orphan",
+				      btrfs_ino(inode), num_bytes, 0);
 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
@@ -4370,8 +4398,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		if (dropped)
 			to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
-		if (to_free)
+		if (to_free) {
 			btrfs_block_rsv_release(root, block_rsv, to_free);
+			trace_btrfs_space_reservation(root->fs_info,
+						      "delalloc",
+						      btrfs_ino(inode),
+						      to_free, 0);
+		}
 		return ret;
 	}
 
@@ -4383,6 +4416,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
+	if (to_reserve)
+		trace_btrfs_space_reservation(root->fs_info,"delalloc",
+					      btrfs_ino(inode), to_reserve, 1);
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
@@ -4412,6 +4448,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+	trace_btrfs_space_reservation(root->fs_info, "delalloc",
+				      btrfs_ino(inode), to_free, 0);
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
 }
@@ -4666,7 +4704,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 			cache->reserved += num_bytes;
 			space_info->bytes_reserved += num_bytes;
 			if (reserve == RESERVE_ALLOC) {
-				BUG_ON(space_info->bytes_may_use < num_bytes);
+				trace_btrfs_space_reservation(cache->fs_info,
+							      "space_info",
+							      (u64)space_info,
+							      num_bytes, 0);
 				space_info->bytes_may_use -= num_bytes;
 			}
 		}
@@ -6126,10 +6167,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	return ERR_PTR(-ENOSPC);
 }
 
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
 	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(block_rsv, NULL, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
 
 /*
@@ -6159,7 +6201,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
 				   empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
-		unuse_block_rsv(block_rsv, blocksize);
+		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
 		return ERR_PTR(ret);
 	}
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 					  trans->bytes_reserved);
 	if (ret)
 		goto out;
+	trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+				      trans->bytes_reserved, 1);
 again:
 	inode = lookup_free_ino_inode(root, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
 out_put:
 	iput(inode);
 out_release:
+	trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+				      trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
 	trans->block_rsv = rsv;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d5f987b49d70..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -326,6 +326,8 @@ again:
 	}
 
 	if (num_bytes) {
+		trace_btrfs_space_reservation(root->fs_info, "transaction",
+					      (u64)h, num_bytes, 1);
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
 	}
-- 
cgit v1.2.3


From f248679e86fead40cc78e724c7181d6bec1a2046 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 13 Jan 2012 12:09:22 -0500
Subject: Btrfs: add a delalloc mutex to inodes for delalloc reservations

I was using i_mutex for this, but we're getting bogus lockdep warnings by doing
that and theres no real way to get rid of those, so just stop using i_mutex to
protect delalloc metadata reservations and use a delalloc mutex instead.  This
shouldn't be contended often at all, only if you are writing and mmap writing to
the file at the same time.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/btrfs_inode.h |  3 +++
 fs/btrfs/extent-tree.c |  5 +++--
 fs/btrfs/inode.c       | 11 +----------
 fs/btrfs/ioctl.c       |  2 --
 fs/btrfs/relocation.c  |  2 --
 5 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
+	/* held while doing delalloc reservations */
+	struct mutex delalloc_mutex;
+
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 556f9aa25bb7..e0ad5f0f895e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4345,12 +4345,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	/* Need to be holding the i_mutex here if we aren't free space cache */
 	if (btrfs_is_free_space_inode(root, inode))
 		flush = 0;
-	else
-		WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
 	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
+	mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 
 	spin_lock(&BTRFS_I(inode)->lock);
@@ -4405,6 +4404,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 						      btrfs_ino(inode),
 						      to_free, 0);
 		}
+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 		return ret;
 	}
 
@@ -4415,6 +4415,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	}
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	spin_unlock(&BTRFS_I(inode)->lock);
+	mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
 	if (to_reserve)
 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 619742d37166..5977987abdb1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2239,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				continue;
 			}
 			nr_truncate++;
-			/*
-			 * Need to hold the imutex for reservation purposes, not
-			 * a huge deal here but I have a WARN_ON in
-			 * btrfs_delalloc_reserve_space to catch offenders.
-			 */
-			mutex_lock(&inode->i_mutex);
 			ret = btrfs_truncate(inode);
-			mutex_unlock(&inode->i_mutex);
 		} else {
 			nr_unlink++;
 		}
@@ -6411,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	/* Need this to keep space reservations serialized */
-	mutex_lock(&inode->i_mutex);
 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-	mutex_unlock(&inode->i_mutex);
 	if (!ret)
 		ret = btrfs_update_time(vma->vm_file);
 	if (ret) {
@@ -6758,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
 	mutex_init(&ei->log_mutex);
+	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7fdf22c2dc0d..6834be4c8709 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -868,10 +868,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
 		return 0;
 	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
 
-	mutex_lock(&inode->i_mutex);
 	ret = btrfs_delalloc_reserve_space(inode,
 					   num_pages << PAGE_CACHE_SHIFT);
-	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		return ret;
 again:
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index efe9f792544d..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2949,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
 	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
 	while (index <= last_index) {
-		mutex_lock(&inode->i_mutex);
 		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
-		mutex_unlock(&inode->i_mutex);
 		if (ret)
 			goto out;
 
-- 
cgit v1.2.3


From 96bdc7dc61fb1b1e8e858dafb13abee8482ba064 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 16 Jan 2012 08:13:11 -0500
Subject: Btrfs: use larger system chunks

system chunks by default are very small.  This makes them slightly
larger and also fixes the conditional checks to make sure we don't
allocate a billion of them at once.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 +++
 fs/btrfs/volumes.c     | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0ad5f0f895e..700879ed64cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3384,6 +3384,9 @@ static int should_alloc_chunk(struct btrfs_root *root,
 
 	/* 256MB or 2% of the FS */
 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+	/* system chunks need a much small threshold */
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		thresh = 32 * 1024 * 1024;
 
 	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
 		return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 59e878f9fdcc..7ffdb154daec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3166,7 +3166,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			max_stripe_size = 256 * 1024 * 1024;
 		max_chunk_size = max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		max_stripe_size = 8 * 1024 * 1024;
+		max_stripe_size = 32 * 1024 * 1024;
 		max_chunk_size = 2 * max_stripe_size;
 	} else {
 		printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
-- 
cgit v1.2.3


From 8096b1ebb59b94b3bc6abb6b7d121419e83447ba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:07 +0000
Subject: xfs: remove the if_ext_max field in struct xfs_ifork

We spent a lot of effort to maintain this field, but it always equals to the
fork size divided by the constant size of an extent.  The prime use of it is
to assert that the two stay in sync.  Just divide the fork size by the extent
size in the few places that we actually use it and remove the overhead
of maintaining it.  Also introduce a few helpers to consolidate the places
where we actually care about the value.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr_leaf.c  |   9 -----
 fs/xfs/xfs_bmap.c       | 101 ++++++++++++++++++++++--------------------------
 fs/xfs/xfs_dfrag.c      |  43 ++++++++++-----------
 fs/xfs/xfs_iget.c       |   2 -
 fs/xfs/xfs_inode.c      |  30 +++++---------
 fs/xfs/xfs_inode.h      |   4 +-
 fs/xfs/xfs_inode_item.c |   2 -
 fs/xfs/xfs_trace.h      |   5 +--
 8 files changed, 81 insertions(+), 115 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e596551..d25eafd4d28d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
 	dp = args->dp;
 	mp = dp->i_mount;
 	dp->i_d.di_forkoff = forkoff;
-	dp->i_df.if_ext_max =
-		XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-	dp->i_afp->if_ext_max =
-		XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
 
 	ifp = dp->i_afp;
 	ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
 	ASSERT(ip->i_d.di_anextents == 0);
 	ASSERT(ip->i_afp == NULL);
 
-	ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 				(args->op_flags & XFS_DA_OP_ADDNAME) ||
 				!(mp->m_flags & XFS_MOUNT_ATTR2) ||
 				dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
-		dp->i_afp->if_ext_max =
-			XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-		dp->i_df.if_ext_max =
-			XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
 		xfs_trans_log_inode(args->trans, dp,
 					XFS_ILOG_CORE | XFS_ILOG_ADATA);
 	}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab78837057..7a888ca2f7f6 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
 }
 
 /*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) >
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) <=
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
  * by [off, bno, len, state].
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
 			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 					bma->firstblock, bma->flist,
 					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
 			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 				bma->firstblock, bma->flist, &bma->cur, 1,
 				&tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
 			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 					bma->firstblock, bma->flist, &bma->cur,
 					1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	/* convert to a btree if necessary */
-	if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+	if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
 		ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
 	}
 
 	/* convert to a btree if necessary */
-	if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+	if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
 		ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
 	}
 
 	/* convert to a btree if necessary */
-	if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
 		ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
 	/*
 	 * Make space in the inode incore.
 	 */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
 	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
 		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
 
-		if (dfl_forkoff > ip->i_d.di_forkoff) {
+		if (dfl_forkoff > ip->i_d.di_forkoff)
 			ip->i_d.di_forkoff = dfl_forkoff;
-			ip->i_df.if_ext_max =
-				XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
-			ip->i_afp->if_ext_max =
-				XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
-		}
 	}
 }
 
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
 	int			error;		/* error return value */
 
 	ASSERT(XFS_IFORK_Q(ip) == 0);
-	ASSERT(ip->i_df.if_ext_max ==
-	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 
 	mp = ip->i_mount;
 	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
 		error = XFS_ERROR(EINVAL);
 		goto error1;
 	}
-	ip->i_df.if_ext_max =
-		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
 	ASSERT(ip->i_afp == NULL);
 	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
-	ip->i_afp->if_ext_max =
-		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
 	xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
 		} else
 			spin_unlock(&mp->m_sb_lock);
 	}
-	if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+	error = xfs_bmap_finish(&tp, &flist, &committed);
+	if (error)
 		goto error2;
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	ASSERT(ip->i_df.if_ext_max ==
-	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
-	return error;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 error2:
 	xfs_bmap_cancel(&flist);
 error1:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 error0:
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	ASSERT(ip->i_df.if_ext_max ==
-	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 	return error;
 }
 
@@ -4379,8 +4382,6 @@ xfs_bmapi_read(
 	XFS_STATS_INC(xs_blk_mapr);
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4872,6 @@ xfs_bmapi_write(
 		return XFS_ERROR(EIO);
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 
 	XFS_STATS_INC(xs_blk_mapw);
 
@@ -4981,8 +4980,7 @@ xfs_bmapi_write(
 	/*
 	 * Transform from btree to extents, give it cur.
 	 */
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+	if (xfs_bmap_wants_extents(ip, whichfork)) {
 		int		tmp_logflags = 0;
 
 		ASSERT(bma.cur);
@@ -4992,10 +4990,10 @@ xfs_bmapi_write(
 		if (error)
 			goto error0;
 	}
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
-	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+	       XFS_IFORK_NEXTENTS(ip, whichfork) >
+		XFS_IFORK_MAXEXT(ip, whichfork));
 	error = 0;
 error0:
 	/*
@@ -5095,8 +5093,7 @@ xfs_bunmapi(
 
 	ASSERT(len > 0);
 	ASSERT(nexts >= 0);
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
 	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
 	    (error = xfs_iread_extents(tp, ip, whichfork)))
 		return error;
@@ -5322,7 +5319,8 @@ xfs_bunmapi(
 		 */
 		if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
 		    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-		    XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+		    XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+			XFS_IFORK_MAXEXT(ip, whichfork) &&
 		    del.br_startoff > got.br_startoff &&
 		    del.br_startoff + del.br_blockcount <
 		    got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5351,11 @@ nodelete:
 		}
 	}
 	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
 	/*
 	 * Convert to a btree if necessary.
 	 */
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
 		ASSERT(cur == NULL);
 		error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
 			&cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5366,7 @@ nodelete:
 	/*
 	 * transform from btree to extents, give it cur
 	 */
-	else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-		 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+	else if (xfs_bmap_wants_extents(ip, whichfork)) {
 		ASSERT(cur != NULL);
 		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
 			whichfork);
@@ -5382,8 +5377,6 @@ nodelete:
 	/*
 	 * transform from extents to local?
 	 */
-	ASSERT(ifp->if_ext_max ==
-	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 	error = 0;
 error0:
 	/*
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05bac..dd974a55c77d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
 
 	/* Check temp in extent form to max in target */
 	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
 		return EINVAL;
 
 	/* Check target in extent form to max in temp */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
 		return EINVAL;
 
 	/*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
 	 * (a common defrag case) which will occur when the temp inode is in
 	 * extent format...
 	 */
-	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    ((XFS_IFORK_BOFF(ip) &&
-	      tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
-	     XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
-		return EINVAL;
+	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_BOFF(ip) &&
+		    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+			return EINVAL;
+		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+			return EINVAL;
+	}
 
 	/* Reciprocal target->temp btree format checks */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    ((XFS_IFORK_BOFF(tip) &&
-	      ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
-	     XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
-		return EINVAL;
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_BOFF(tip) &&
+		    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+			return EINVAL;
+
+		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+			return EINVAL;
+	}
 
 	return 0;
 }
@@ -348,16 +357,6 @@ xfs_swap_extents(
 	*ifp = *tifp;		/* struct copy */
 	*tifp = *tempifp;	/* struct copy */
 
-	/*
-	 * Fix the in-memory data fork values that are dependent on the fork
-	 * offset in the inode. We can't assume they remain the same as attr2
-	 * has dynamic fork offsets.
-	 */
-	ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
-					(uint)sizeof(xfs_bmbt_rec_t);
-	tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
-					(uint)sizeof(xfs_bmbt_rec_t);
-
 	/*
 	 * Fix the on-disk inode values
 	 */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7ff..f180ce896cd7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -450,8 +450,6 @@ again:
 
 	*ipp = ip;
 
-	ASSERT(ip->i_df.if_ext_max ==
-	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
 	/*
 	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
 	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ccd619a993f6..96b29e3286db 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
 {
 	xfs_attr_shortform_t	*atp;
 	int			size;
-	int			error;
+	int			error = 0;
 	xfs_fsize_t             di_size;
-	ip->i_df.if_ext_max =
-		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
-	error = 0;
 
 	if (unlikely(be32_to_cpu(dip->di_nextents) +
 		     be16_to_cpu(dip->di_anextents) >
@@ -409,10 +406,10 @@ xfs_iformat(
 	}
 	if (!XFS_DFORK_Q(dip))
 		return 0;
+
 	ASSERT(ip->i_afp == NULL);
 	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-	ip->i_afp->if_ext_max =
-		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
 	switch (dip->di_aformat) {
 	case XFS_DINODE_FMT_LOCAL:
 		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +601,11 @@ xfs_iformat_btree(
 	 * or the number of extents is greater than the number of
 	 * blocks.
 	 */
-	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
-	    || XFS_BMDR_SPACE_CALC(nrecs) >
-			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
-	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+			XFS_IFORK_MAXEXT(ip, whichfork) ||
+		     XFS_BMDR_SPACE_CALC(nrecs) >
+			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
 		xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
 			(unsigned long long) ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +833,6 @@ xfs_iread(
 		 * with the uninitialized part of it.
 		 */
 		ip->i_d.di_mode = 0;
-		/*
-		 * Initialize the per-fork minima and maxima for a new
-		 * inode here.  xfs_iformat will do it for old inodes.
-		 */
-		ip->i_df.if_ext_max =
-			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	}
 
 	/*
@@ -1740,8 +1732,6 @@ xfs_ifree(
 	ip->i_d.di_flags = 0;
 	ip->i_d.di_dmevmask = 0;
 	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
-	ip->i_df.if_ext_max =
-		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
 	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 	/*
@@ -2408,7 +2398,7 @@ xfs_iflush(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(!completion_done(&ip->i_flush));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
+	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
@@ -2524,7 +2514,7 @@ xfs_iflush_int(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(!completion_done(&ip->i_flush));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
+	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c1574721a191..47497e11c111 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
 	struct xfs_btree_block	*if_broot;	/* file's incore btree root */
 	short			if_broot_bytes;	/* bytes allocated for root */
 	unsigned char		if_flags;	/* per-fork flags */
-	unsigned char		if_ext_max;	/* max # of extent records */
 	union {
 		xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
 		xfs_ext_irec_t	*if_ext_irec;	/* irec map file exts */
@@ -206,7 +205,8 @@ typedef struct xfs_icdinode {
 	((w) == XFS_DATA_FORK ? \
 		((ip)->i_d.di_nextents = (n)) : \
 		((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
 
 
 #ifdef __KERNEL__
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3c..2b6b4fcef49e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		ASSERT(ip->i_df.if_ext_max ==
-		       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 		iip->ili_format.ilf_fields &=
 			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
 			  XFS_ILOG_DEV | XFS_ILOG_UUID);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 297f9fa6fb64..81efa0416173 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1568,7 +1568,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 		__field(xfs_ino_t, ino)
 		__field(int, format)
 		__field(int, nex)
-		__field(int, max_nex)
 		__field(int, broot_size)
 		__field(int, fork_off)
 	),
@@ -1578,18 +1577,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 		__entry->ino = ip->i_ino;
 		__entry->format = ip->i_d.di_format;
 		__entry->nex = ip->i_d.di_nextents;
-		__entry->max_nex = ip->i_df.if_ext_max;
 		__entry->broot_size = ip->i_df.if_broot_bytes;
 		__entry->fork_off = XFS_IFORK_BOFF(ip);
 	),
 	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
-		  "Max in-fork extents %d, broot size %d, fork offset %d",
+		  "broot size %d, fork offset %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
 		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
 		  __entry->nex,
-		  __entry->max_nex,
 		  __entry->broot_size,
 		  __entry->fork_off)
 )
-- 
cgit v1.2.3


From 49e4c70e52a2bc2090e5a4e003e2888af21d6a2b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:08 +0000
Subject: xfs: make i_flags an unsigned long

To be used for bit wakeup i_flags needs to be an unsigned long or we'll
run into trouble on big endian systems.  Because of the 1-byte i_update
field right after it this actually causes a fairly large size increase
on its own (4 or 8 bytes), but that increase will be more than offset
by the next two patches.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 47497e11c111..be8dc0c2cf52 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -242,7 +242,7 @@ typedef struct xfs_inode {
 	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
-	unsigned short		i_flags;	/* see defined flags below */
+	unsigned long		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
-- 
cgit v1.2.3


From 474fce067521a40dbacc722e8ba119e81c2d31bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:09 +0000
Subject: xfs: replace i_flock with a sleeping bitlock

We almost never block on i_flock, the exception is synchronous inode
flushing.  Instead of bloating the inode with a 16/24-byte completion
that we abuse as a semaphore just implement it as a bitlock that uses
a bit waitqueue for the rare sleeping path.  This primarily is a
tradeoff between a much smaller inode and a faster non-blocking
path vs faster wakeups, and we are much better off with the former.

A small downside is that we will lose lockdep checking for i_flock, but
given that it's always taken inside the ilock that should be acceptable.

Note that for example the inode writeback locking is implemented in a
very similar way.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_iget.c       | 20 +++++++++++--
 fs/xfs/xfs_inode.c      |  4 +--
 fs/xfs/xfs_inode.h      | 78 +++++++++++++++++++++++++++++++------------------
 fs/xfs/xfs_inode_item.c |  4 +--
 fs/xfs/xfs_super.c      |  7 -----
 fs/xfs/xfs_sync.c       |  9 +++---
 6 files changed, 76 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index f180ce896cd7..a7cf7139f9ad 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
 
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(!xfs_isiflocked(ip));
 	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -150,7 +150,7 @@ xfs_inode_free(
 	/* asserts to verify all state is correct here */
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(!xfs_isiflocked(ip));
 
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always
@@ -713,3 +713,19 @@ xfs_isilocked(
 	return 0;
 }
 #endif
+
+void
+__xfs_iflock(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_isiflocked(ip))
+			io_schedule();
+	} while (!xfs_iflock_nowait(ip));
+
+	finish_wait(wq, &wait.wait);
+}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96b29e3286db..eeb60d31b086 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2396,7 +2396,7 @@ xfs_iflush(
 	XFS_STATS_INC(xs_iflush_count);
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(!completion_done(&ip->i_flush));
+	ASSERT(xfs_isiflocked(ip));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
@@ -2512,7 +2512,7 @@ xfs_iflush_int(
 #endif
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(!completion_done(&ip->i_flush));
+	ASSERT(xfs_isiflocked(ip));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be8dc0c2cf52..960d2a89b3ac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -237,7 +237,6 @@ typedef struct xfs_inode {
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_iolock;	/* inode IO lock */
-	struct completion	i_flush;	/* inode flush completion q */
 	atomic_t		i_pincount;	/* inode pin count */
 	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
@@ -324,6 +323,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 	return ret;
 }
 
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+	int ret;
+
+	spin_lock(&ip->i_flags_lock);
+	ret = ip->i_flags & flags;
+	if (!ret)
+		ip->i_flags |= flags;
+	spin_unlock(&ip->i_flags_lock);
+	return ret;
+}
+
 /*
  * Project quota id helpers (previously projid was 16bit only
  * and using two 16bit values to hold new 32bit projid was chosen
@@ -343,36 +355,18 @@ xfs_set_projid(struct xfs_inode *ip,
 	ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
 }
 
-/*
- * Manage the i_flush queue embedded in the inode.  This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
-	wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
-	return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
-	complete(&ip->i_flush);
-}
-
 /*
  * In-core inode flags.
  */
-#define XFS_IRECLAIM		0x0001  /* started reclaiming this inode */
-#define XFS_ISTALE		0x0002	/* inode has been staled */
-#define XFS_IRECLAIMABLE	0x0004	/* inode can be reclaimed */
-#define XFS_INEW		0x0008	/* inode has just been allocated */
-#define XFS_IFILESTREAM		0x0010	/* inode is in a filestream directory */
-#define XFS_ITRUNCATED		0x0020	/* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE	0x0040	/* dirty release already seen */
+#define XFS_IRECLAIM		(1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE		(1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE	(1 << 2) /* inode can be reclaimed */
+#define XFS_INEW		(1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM		(1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED		(1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE	(1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT	7	 /* inode is being flushed right now */
+#define XFS_IFLOCK		(1 << __XFS_IFLOCK_BIT)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -384,6 +378,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 	 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
 	 XFS_IFILESTREAM);
 
+/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+	return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+	if (!xfs_iflock_nowait(ip))
+		__xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+	xfs_iflags_clear(ip, XFS_IFLOCK);
+	wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+	return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
 /*
  * Flags for inode locking.
  * Bit ranges:	1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2b6b4fcef49e..c8d4ce0efd5a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -717,7 +717,7 @@ xfs_inode_item_pushbuf(
 	 * If a flush is not in progress anymore, chances are that the
 	 * inode was taken off the AIL. So, just get out.
 	 */
-	if (completion_done(&ip->i_flush) ||
+	if (!xfs_isiflocked(ip) ||
 	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return true;
@@ -750,7 +750,7 @@ xfs_inode_item_push(
 	struct xfs_inode	*ip = iip->ili_inode;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-	ASSERT(!completion_done(&ip->i_flush));
+	ASSERT(xfs_isiflocked(ip));
 
 	/*
 	 * Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81a..6851fa7b1afa 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -829,13 +829,6 @@ xfs_fs_inode_init_once(
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 	init_waitqueue_head(&ip->i_ipin_wait);
-	/*
-	 * Because we want to use a counting completion, complete
-	 * the flush completion once to allow a single access to
-	 * the flush completion without blocking.
-	 */
-	init_completion(&ip->i_flush);
-	complete(&ip->i_flush);
 
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", ip->i_ino);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e7..40b75eecd2b4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
 		return 1;
 
 	/*
-	 * do some unlocked checks first to avoid unnecessary lock traffic.
-	 * The first is a flush lock check, the second is a already in reclaim
-	 * check. Only do these checks if we are not going to block on locks.
+	 * If we are asked for non-blocking operation, do unlocked checks to
+	 * see if the inode already is being flushed or in reclaim to avoid
+	 * lock traffic.
 	 */
 	if ((flags & SYNC_TRYLOCK) &&
-	    (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
 		return 1;
-	}
 
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
-- 
cgit v1.2.3


From f392e6319a4e9a028b0c8b48f000bb01d660ad53 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:10 +0000
Subject: xfs: replace i_pin_wait with a bit waitqueue

Replace i_pin_wait, which is only used during synchronous inode flushing
with a bit waitqueue.  This trades off a much smaller inode against
slightly slower wakeup performance, and saves 12 (32-bit) or 20 (64-bit)
bytes in the XFS inode.

Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c      | 27 +++++++++++++++++++++------
 fs/xfs/xfs_inode.h      |  3 ++-
 fs/xfs/xfs_inode_item.c |  2 +-
 fs/xfs/xfs_super.c      |  1 -
 4 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index eeb60d31b086..62603369b523 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2037,7 +2037,7 @@ xfs_idestroy_fork(
  * once someone is waiting for it to be unpinned.
  */
 static void
-xfs_iunpin_nowait(
+xfs_iunpin(
 	struct xfs_inode	*ip)
 {
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2049,14 +2049,29 @@ xfs_iunpin_nowait(
 
 }
 
+static void
+__xfs_iunpin_wait(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+	xfs_iunpin(ip);
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_ipincount(ip))
+			io_schedule();
+	} while (xfs_ipincount(ip));
+	finish_wait(wq, &wait.wait);
+}
+
 void
 xfs_iunpin_wait(
 	struct xfs_inode	*ip)
 {
-	if (xfs_ipincount(ip)) {
-		xfs_iunpin_nowait(ip);
-		wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
-	}
+	if (xfs_ipincount(ip))
+		__xfs_iunpin_wait(ip);
 }
 
 /*
@@ -2415,7 +2430,7 @@ xfs_iflush(
 	 * out for us if they occur after the log force completes.
 	 */
 	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
-		xfs_iunpin_nowait(ip);
+		xfs_iunpin(ip);
 		xfs_ifunlock(ip);
 		return EAGAIN;
 	}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 960d2a89b3ac..4acbe740be46 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -238,7 +238,6 @@ typedef struct xfs_inode {
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_iolock;	/* inode IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
-	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
@@ -367,6 +366,8 @@ xfs_set_projid(struct xfs_inode *ip,
 #define XFS_IDIRTY_RELEASE	(1 << 6) /* dirty release already seen */
 #define __XFS_IFLOCK_BIT	7	 /* inode is being flushed right now */
 #define XFS_IFLOCK		(1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */
+#define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c8d4ce0efd5a..91d71dcd4852 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -555,7 +555,7 @@ xfs_inode_item_unpin(
 	trace_xfs_inode_unpin(ip, _RET_IP_);
 	ASSERT(atomic_read(&ip->i_pincount) > 0);
 	if (atomic_dec_and_test(&ip->i_pincount))
-		wake_up(&ip->i_ipin_wait);
+		wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
 }
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 6851fa7b1afa..ee5b695c99a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -828,7 +828,6 @@ xfs_fs_inode_init_once(
 	/* xfs inode */
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
-	init_waitqueue_head(&ip->i_ipin_wait);
 
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", ip->i_ino);
-- 
cgit v1.2.3


From ce7ae151ddada3dbf67301464343c154903166b3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:11 +0000
Subject: xfs: remove the i_size field in struct xfs_inode

There is no fundamental need to keep an in-memory inode size copy in the XFS
inode.  We already have the on-disk value in the dinode, and the separate
in-memory copy that we need for regular files only in the XFS inode.

Remove the xfs_inode i_size field and change the XFS_ISIZE macro to use the
VFS inode i_size field for regular files.  Switch code that was directly
accessing the i_size field in the xfs_inode to XFS_ISIZE, or in cases where
we are limited to regular files direct access of the VFS inode i_size field.

This also allows dropping some fairly complicated code in the write path
which dealt with keeping the xfs_inode i_size uptodate with the VFS i_size
that is getting updated inside ->write_end.

Note that we do not bother resetting the VFS i_size when truncating a file
that gets freed to zero as there is no point in doing so because the VFS inode
is no longer in use at this point.  Just relax the assert in xfs_ifree to
only check the on-disk size instead.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c        |  2 +-
 fs/xfs/xfs_bmap.c        | 15 ++++++---------
 fs/xfs/xfs_file.c        | 45 +++++++++++----------------------------------
 fs/xfs/xfs_fs_subr.c     |  2 +-
 fs/xfs/xfs_iget.c        |  1 -
 fs/xfs/xfs_inode.c       |  8 ++------
 fs/xfs/xfs_inode.h       | 16 ++++++++++++----
 fs/xfs/xfs_iomap.c       | 12 ++++++------
 fs/xfs/xfs_iops.c        |  3 +--
 fs/xfs/xfs_qm_syscalls.c |  1 -
 fs/xfs/xfs_trace.h       |  2 +-
 fs/xfs/xfs_vnodeops.c    | 31 +++++++++++++++----------------
 12 files changed, 56 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b625..4d27ea117e0e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,7 +111,7 @@ xfs_ioend_new_eof(
 	xfs_fsize_t		bsize;
 
 	bsize = ioend->io_offset + ioend->io_size;
-	isize = MAX(ip->i_size, ip->i_new_size);
+	isize = MAX(i_size_read(VFS_I(ip)), ip->i_new_size);
 	isize = MIN(isize, bsize);
 	return isize > ip->i_d.di_size ? isize : 0;
 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7a888ca2f7f6..188ef2fbd628 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3997,11 +3997,8 @@ xfs_bmap_one_block(
 	xfs_bmbt_irec_t	s;		/* internal version of extent */
 
 #ifndef DEBUG
-	if (whichfork == XFS_DATA_FORK) {
-		return S_ISREG(ip->i_d.di_mode) ?
-			(ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
-			(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
-	}
+	if (whichfork == XFS_DATA_FORK)
+		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
 #endif	/* !DEBUG */
 	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
 		return 0;
@@ -4013,7 +4010,7 @@ xfs_bmap_one_block(
 	xfs_bmbt_get_all(ep, &s);
 	rval = s.br_startoff == 0 && s.br_blockcount == 1;
 	if (rval && whichfork == XFS_DATA_FORK)
-		ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
 	return rval;
 }
 
@@ -5427,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
 	if (startblock == HOLESTARTBLOCK) {
 		mp = ip->i_mount;
 		out->bmv_block = -1;
-		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
 		fixlen -= out->bmv_offset;
 		if (prealloced && out->bmv_offset + out->bmv_length == end) {
 			/* Came to hole at EOF. Trim it. */
@@ -5515,7 +5512,7 @@ xfs_getbmap(
 			fixlen = XFS_MAXIOFFSET(mp);
 		} else {
 			prealloced = 0;
-			fixlen = ip->i_size;
+			fixlen = XFS_ISIZE(ip);
 		}
 	}
 
@@ -5544,7 +5541,7 @@ xfs_getbmap(
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-		if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+		if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
 			error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
 			if (error)
 				goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b3..86d5dc260464 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_aio_read(
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 		if ((iocb->ki_pos & target->bt_smask) ||
 		    (size & target->bt_smask)) {
-			if (iocb->ki_pos == ip->i_size)
+			if (iocb->ki_pos == i_size_read(inode))
 				return 0;
 			return -XFS_ERROR(EINVAL);
 		}
@@ -412,30 +412,6 @@ xfs_file_splice_read(
 	return ret;
 }
 
-STATIC void
-xfs_aio_write_isize_update(
-	struct inode	*inode,
-	loff_t		*ppos,
-	ssize_t		bytes_written)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fsize_t		isize = i_size_read(inode);
-
-	if (bytes_written > 0)
-		XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
-	if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
-					*ppos > isize))
-		*ppos = isize;
-
-	if (*ppos > ip->i_size) {
-		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-		if (*ppos > ip->i_size)
-			ip->i_size = *ppos;
-		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-}
-
 /*
  * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
  * part of the I/O may have been written to disk before the error occurred.  In
@@ -451,8 +427,8 @@ xfs_aio_write_newsize_update(
 		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
 		if (new_size == ip->i_new_size)
 			ip->i_new_size = 0;
-		if (ip->i_d.di_size > ip->i_size)
-			ip->i_d.di_size = ip->i_size;
+		if (ip->i_d.di_size > i_size_read(VFS_I(ip)))
+			ip->i_d.di_size = i_size_read(VFS_I(ip));
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 }
@@ -492,15 +468,16 @@ xfs_file_splice_write(
 	new_size = *ppos + count;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (new_size > ip->i_size)
+	if (new_size > i_size_read(inode))
 		ip->i_new_size = new_size;
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_write_bytes, ret);
 
-	xfs_aio_write_isize_update(inode, ppos, ret);
 	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
@@ -728,14 +705,14 @@ restart:
 	 * values are still valid.
 	 */
 	if ((ip->i_new_size && *pos > ip->i_new_size) ||
-	    (!ip->i_new_size && *pos > ip->i_size)) {
+	    (!ip->i_new_size && *pos > i_size_read(inode))) {
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 			goto restart;
 		}
-		error = -xfs_zero_eof(ip, *pos, ip->i_size);
+		error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
 	}
 
 	/*
@@ -744,7 +721,7 @@ restart:
 	 * ip->i_new_size if this IO ends beyond any other in-flight writes.
 	 */
 	new_size = *pos + *count;
-	if (new_size > ip->i_size) {
+	if (new_size > i_size_read(inode)) {
 		if (new_size > ip->i_new_size)
 			ip->i_new_size = new_size;
 		*new_sizep = new_size;
@@ -957,11 +934,11 @@ xfs_file_aio_write(
 		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
 						ocount, &new_size, &iolock);
 
-	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
-
 	if (ret <= 0)
 		goto out_unlock;
 
+	XFS_STATS_ADD(xs_write_bytes, ret);
+
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 		loff_t end = pos + ret - 1;
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..652b875a9d4c 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
 		return -filemap_fdatawait_range(mapping, first,
-					last == -1 ? ip->i_size - 1 : last);
+					last == -1 ? XFS_ISIZE(ip) - 1 : last);
 	}
 	return 0;
 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index a7cf7139f9ad..3b5b78aa3b87 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -94,7 +94,6 @@ xfs_inode_alloc(
 	ip->i_update_core = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-	ip->i_size = 0;
 	ip->i_new_size = 0;
 
 	return ip;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 62603369b523..b21022499c2e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -347,7 +347,6 @@ xfs_iformat(
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 		ip->i_d.di_size = 0;
-		ip->i_size = 0;
 		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
 		break;
 
@@ -853,7 +852,6 @@ xfs_iread(
 	}
 
 	ip->i_delayed_blks = 0;
-	ip->i_size = ip->i_d.di_size;
 
 	/*
 	 * Mark the buffer containing the inode as something to keep
@@ -1043,7 +1041,6 @@ xfs_ialloc(
 	}
 
 	ip->i_d.di_size = 0;
-	ip->i_size = 0;
 	ip->i_d.di_nextents = 0;
 	ASSERT(ip->i_d.di_nblocks == 0);
 
@@ -1198,7 +1195,7 @@ xfs_itruncate_extents(
 	int			done = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-	ASSERT(new_size <= ip->i_size);
+	ASSERT(new_size <= XFS_ISIZE(ip));
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(ip->i_itemp != NULL);
 	ASSERT(ip->i_itemp->ili_lock_flags == 0);
@@ -1712,8 +1709,7 @@ xfs_ifree(
 	ASSERT(ip->i_d.di_nlink == 0);
 	ASSERT(ip->i_d.di_nextents == 0);
 	ASSERT(ip->i_d.di_anextents == 0);
-	ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
-	       (!S_ISREG(ip->i_d.di_mode)));
+	ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
 	ASSERT(ip->i_d.di_nblocks == 0);
 
 	/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4acbe740be46..cd99e43fa8f0 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -246,16 +246,12 @@ typedef struct xfs_inode {
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
 
-	xfs_fsize_t		i_size;		/* in-memory size */
 	xfs_fsize_t		i_new_size;	/* size when write completes */
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
 } xfs_inode_t;
 
-#define XFS_ISIZE(ip)	S_ISREG((ip)->i_d.di_mode) ? \
-				(ip)->i_size : (ip)->i_d.di_size;
-
 /* Convert from vfs inode to xfs inode */
 static inline struct xfs_inode *XFS_I(struct inode *inode)
 {
@@ -268,6 +264,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 	return &ip->i_vnode;
 }
 
+/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk.  Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+	if (S_ISREG(ip->i_d.di_mode))
+		return i_size_read(VFS_I(ip));
+	return ip->i_d.di_size;
+}
+
 /*
  * i_flags helper functions
  */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a27a44659da6..246c7d57c6f9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -74,7 +74,7 @@ xfs_iomap_eof_align_last_fsb(
 		else if (mp->m_dalign)
 			align = mp->m_dalign;
 
-		if (align && ip->i_size >= XFS_FSB_TO_B(mp, align))
+		if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
 			new_last_fsb = roundup_64(*last_fsb, align);
 	}
 
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-	if ((offset + count) > ip->i_size) {
+	if ((offset + count) > XFS_ISIZE(ip)) {
 		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
 		if (error)
 			goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	bmapi_flag = 0;
-	if (offset < ip->i_size || extsz)
+	if (offset < XFS_ISIZE(ip) || extsz)
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
 	int		found_delalloc = 0;
 
 	*prealloc = 0;
-	if ((offset + count) <= ip->i_size)
+	if (offset + count <= XFS_ISIZE(ip))
 		return 0;
 
 	/*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
 		 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
 		 * ensure we always pass in a non-zero value.
 		 */
-		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+		alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
 		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
 					rounddown_pow_of_two(alloc_blocks));
 
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
 			 * back....
 			 */
 			nimaps = 1;
-			end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+			end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
 			error = xfs_bmap_last_offset(NULL, ip, &last_block,
 							XFS_DATA_FORK);
 			if (error)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f02eaa298d3c..ab302539e5b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -778,7 +778,7 @@ xfs_setattr_size(
 		lock_flags |= XFS_IOLOCK_EXCL;
 	xfs_ilock(ip, lock_flags);
 
-	oldsize = ip->i_size;
+	oldsize = inode->i_size;
 	newsize = iattr->ia_size;
 
 	/*
@@ -897,7 +897,6 @@ xfs_setattr_size(
 	 * they get written to.
 	 */
 	ip->i_d.di_size = newsize;
-	ip->i_size = newsize;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	if (newsize <= oldsize) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 27378650b5cb..eafbcff81f3a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -265,7 +265,6 @@ xfs_qm_scall_trunc_qfile(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	ip->i_d.di_size = 0;
-	ip->i_size = 0;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 81efa0416173..2aabcc9c507e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1038,7 +1038,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
-		__entry->isize = ip->i_size;
+		__entry->isize = VFS_I(ip)->i_size;
 		__entry->disize = ip->i_d.di_size;
 		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 96ff03421753..0cf52da9d246 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -175,7 +175,7 @@ xfs_free_eofblocks(
 	 * Figure out if there are any blocks beyond the end
 	 * of the file.  If not, then there is nothing to do.
 	 */
-	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
 	if (last_fsb <= end_fsb)
 		return 0;
@@ -233,7 +233,7 @@ xfs_free_eofblocks(
 		 * may be full of holes (ie NULL files bug).
 		 */
 		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
-					      ip->i_size);
+					      XFS_ISIZE(ip));
 		if (error) {
 			/*
 			 * If we get an error at this point we simply don't
@@ -547,8 +547,8 @@ xfs_release(
 		return 0;
 
 	if ((S_ISREG(ip->i_d.di_mode) &&
-	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-	       ip->i_delayed_blks > 0)) &&
+	     (VFS_I(ip)->i_size > 0 ||
+	      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
 	     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
 	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
@@ -625,7 +625,7 @@ xfs_inactive(
 	 * only one with a reference to the inode.
 	 */
 	truncate = ((ip->i_d.di_nlink == 0) &&
-	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+	    ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
 	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
 	    S_ISREG(ip->i_d.di_mode));
 
@@ -639,12 +639,12 @@ xfs_inactive(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((S_ISREG(ip->i_d.di_mode) &&
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
-		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
-		     (!(ip->i_d.di_flags &
+		    (VFS_I(ip)->i_size > 0 ||
+		     (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
+		    (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+		    (!(ip->i_d.di_flags &
 				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-		      (ip->i_delayed_blks != 0)))) {
+		     ip->i_delayed_blks != 0))) {
 			error = xfs_free_eofblocks(mp, ip, 0);
 			if (error)
 				return VN_INACTIVE_CACHE;
@@ -678,7 +678,6 @@ xfs_inactive(
 		xfs_trans_ijoin(tp, ip, 0);
 
 		ip->i_d.di_size = 0;
-		ip->i_size = 0;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
@@ -1974,11 +1973,11 @@ xfs_zero_remaining_bytes(
 	 * since nothing can read beyond eof.  The space will
 	 * be zeroed when the file is extended anyway.
 	 */
-	if (startoff >= ip->i_size)
+	if (startoff >= XFS_ISIZE(ip))
 		return 0;
 
-	if (endoff > ip->i_size)
-		endoff = ip->i_size;
+	if (endoff > XFS_ISIZE(ip))
+		endoff = XFS_ISIZE(ip);
 
 	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2273,7 +2272,7 @@ xfs_change_file_space(
 		bf->l_start += offset;
 		break;
 	case 2: /*SEEK_END*/
-		bf->l_start += ip->i_size;
+		bf->l_start += XFS_ISIZE(ip);
 		break;
 	default:
 		return XFS_ERROR(EINVAL);
@@ -2290,7 +2289,7 @@ xfs_change_file_space(
 	bf->l_whence = 0;
 
 	startoffset = bf->l_start;
-	fsize = ip->i_size;
+	fsize = XFS_ISIZE(ip);
 
 	/*
 	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
-- 
cgit v1.2.3


From 2813d682e8e6a278f94817429afd46b30875bb6e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:12 +0000
Subject: xfs: remove the i_new_size field in struct xfs_inode

Now that we use the VFS i_size field throughout XFS there is no need for the
i_new_size field any more given that the VFS i_size field gets updated
in ->write_end before unlocking the page, and thus is always uptodate when
writeback could see a page.  Removing i_new_size also has the advantage that
we will never have to trim back di_size during a failed buffered write,
given that it never gets updated past i_size.

Note that currently the generic direct I/O code only updates i_size after
calling our end_io handler, which requires a small workaround to make
sure di_size actually makes it to disk.  I hope to fix this properly in
the generic code.

A downside is that we lose the support for parallel non-overlapping O_DIRECT
appending writes that recently was added.  I don't think keeping the complex
and fragile i_new_size infrastructure for this is a good tradeoff - if we
really care about parallel appending writers we should investigate turning
the iolock into a range lock, which would also allow for parallel
non-overlapping buffered writers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c  | 29 ++++++++++++----------
 fs/xfs/xfs_file.c  | 72 ++++++++----------------------------------------------
 fs/xfs/xfs_iget.c  |  1 -
 fs/xfs/xfs_inode.h |  2 --
 fs/xfs/xfs_trace.h | 18 +++-----------
 5 files changed, 30 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4d27ea117e0e..74b9baf36ac3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
 	xfs_fsize_t		bsize;
 
 	bsize = ioend->io_offset + ioend->io_size;
-	isize = MAX(i_size_read(VFS_I(ip)), ip->i_new_size);
-	isize = MIN(isize, bsize);
+	isize = MIN(i_size_read(VFS_I(ip)), bsize);
 	return isize > ip->i_d.di_size ? isize : 0;
 }
 
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 }
 
 /*
- * Update on-disk file size now that data has been written to disk.  The
- * current in-memory file size is i_size.  If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated.  If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
+ * Update on-disk file size now that data has been written to disk.
  *
  * This function does not block as blocking on the inode lock in IO completion
  * can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1278,6 +1273,15 @@ xfs_end_io_direct_write(
 {
 	struct xfs_ioend	*ioend = iocb->private;
 
+	/*
+	 * While the generic direct I/O code updates the inode size, it does
+	 * so only after the end_io handler is called, which means our
+	 * end_io handler thinks the on-disk size is outside the in-core
+	 * size.  To prevent this just update it a little bit earlier here.
+	 */
+	if (offset + size > i_size_read(ioend->io_inode))
+		i_size_write(ioend->io_inode, offset + size);
+
 	/*
 	 * blockdev_direct_IO can return an error even after the I/O
 	 * completion handler was called.  Thus we need to protect
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
 
 	if (to > inode->i_size) {
 		/*
-		 * punch out the delalloc blocks we have already allocated. We
-		 * don't call xfs_setattr() to do this as we may be in the
-		 * middle of a multi-iovec write and so the vfs inode->i_size
-		 * will not match the xfs ip->i_size and so it will zero too
-		 * much. Hence we jus truncate the page cache to zero what is
-		 * necessary and punch the delalloc blocks directly.
+		 * Punch out the delalloc blocks we have already allocated.
+		 *
+		 * Don't bother with xfs_setattr given that nothing can have
+		 * made it to disk yet as the page is still locked at this
+		 * point.
 		 */
 		struct xfs_inode	*ip = XFS_I(inode);
 		xfs_fileoff_t		start_fsb;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 86d5dc260464..632313926788 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -412,27 +412,6 @@ xfs_file_splice_read(
 	return ret;
 }
 
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred.  In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
-	struct xfs_inode	*ip,
-	xfs_fsize_t		new_size)
-{
-	if (new_size == ip->i_new_size) {
-		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-		if (new_size == ip->i_new_size)
-			ip->i_new_size = 0;
-		if (ip->i_d.di_size > i_size_read(VFS_I(ip)))
-			ip->i_d.di_size = i_size_read(VFS_I(ip));
-		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-}
-
 /*
  * xfs_file_splice_write() does not use xfs_rw_ilock() because
  * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -451,7 +430,6 @@ xfs_file_splice_write(
 {
 	struct inode		*inode = outfilp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fsize_t		new_size;
 	int			ioflags = 0;
 	ssize_t			ret;
 
@@ -465,20 +443,12 @@ xfs_file_splice_write(
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-	new_size = *ppos + count;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (new_size > i_size_read(inode))
-		ip->i_new_size = new_size;
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
 	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_write_bytes, ret);
 
-	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
 }
@@ -673,16 +643,13 @@ xfs_file_aio_write_checks(
 	struct file		*file,
 	loff_t			*pos,
 	size_t			*count,
-	xfs_fsize_t		*new_sizep,
 	int			*iolock)
 {
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fsize_t		new_size;
 	int			error = 0;
 
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-	*new_sizep = 0;
 restart:
 	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
 	if (error) {
@@ -697,15 +664,13 @@ restart:
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
-	 * write. There is no need to issue zeroing if another in-flght IO ends
-	 * at or before this one If zeronig is needed and we are currently
-	 * holding the iolock shared, we need to update it to exclusive which
-	 * involves dropping all locks and relocking to maintain correct locking
-	 * order. If we do this, restart the function to ensure all checks and
-	 * values are still valid.
+	 * write.  If zeroing is needed and we are currently holding the
+	 * iolock shared, we need to update it to exclusive which involves
+	 * dropping all locks and relocking to maintain correct locking order.
+	 * If we do this, restart the function to ensure all checks and values
+	 * are still valid.
 	 */
-	if ((ip->i_new_size && *pos > ip->i_new_size) ||
-	    (!ip->i_new_size && *pos > i_size_read(inode))) {
+	if (*pos > i_size_read(inode)) {
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
@@ -714,19 +679,6 @@ restart:
 		}
 		error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
 	}
-
-	/*
-	 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
-	 * We have already zeroed space beyond EOF (if necessary).  Only update
-	 * ip->i_new_size if this IO ends beyond any other in-flight writes.
-	 */
-	new_size = *pos + *count;
-	if (new_size > i_size_read(inode)) {
-		if (new_size > ip->i_new_size)
-			ip->i_new_size = new_size;
-		*new_sizep = new_size;
-	}
-
 	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error)
 		return error;
@@ -772,7 +724,6 @@ xfs_file_dio_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
-	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -817,7 +768,7 @@ xfs_file_dio_aio_write(
 		xfs_rw_ilock(ip, *iolock);
 	}
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
 	if (ret)
 		return ret;
 
@@ -855,7 +806,6 @@ xfs_file_buffered_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
-	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -869,7 +819,7 @@ xfs_file_buffered_aio_write(
 	*iolock = XFS_IOLOCK_EXCL;
 	xfs_rw_ilock(ip, *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
 	if (ret)
 		return ret;
 
@@ -909,7 +859,6 @@ xfs_file_aio_write(
 	ssize_t			ret;
 	int			iolock;
 	size_t			ocount = 0;
-	xfs_fsize_t		new_size = 0;
 
 	XFS_STATS_INC(xs_write_calls);
 
@@ -929,10 +878,10 @@ xfs_file_aio_write(
 
 	if (unlikely(file->f_flags & O_DIRECT))
 		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &new_size, &iolock);
+						ocount, &iolock);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &new_size, &iolock);
+						ocount, &iolock);
 
 	if (ret <= 0)
 		goto out_unlock;
@@ -953,7 +902,6 @@ xfs_file_aio_write(
 	}
 
 out_unlock:
-	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3b5b78aa3b87..8c3e46394d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -94,7 +94,6 @@ xfs_inode_alloc(
 	ip->i_update_core = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-	ip->i_new_size = 0;
 
 	return ip;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index cd99e43fa8f0..2f27b7454085 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -246,8 +246,6 @@ typedef struct xfs_inode {
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
 
-	xfs_fsize_t		i_new_size;	/* size when write completes */
-
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
 } xfs_inode_t;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2aabcc9c507e..6b6df5802e95 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(xfs_fsize_t, size)
-		__field(xfs_fsize_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
 		__field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->size = ip->i_d.di_size;
-		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
 		__entry->flags = flags;
 	),
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
 		  "offset 0x%llx count 0x%zx ioflags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
-		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count,
 		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(loff_t, size)
-		__field(loff_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
 		__field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->size = ip->i_d.di_size;
-		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
 		__entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
 		__entry->startblock = irec ? irec->br_startblock : 0;
 		__entry->blockcount = irec ? irec->br_blockcount : 0;
 	),
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-		  "offset 0x%llx count %zd type %s "
-		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+		  "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
-		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count,
 		  __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,7 +1024,6 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
 		__field(xfs_ino_t, ino)
 		__field(loff_t, isize)
 		__field(loff_t, disize)
-		__field(loff_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
 	),
@@ -1040,17 +1032,15 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
 		__entry->ino = ip->i_ino;
 		__entry->isize = VFS_I(ip)->i_size;
 		__entry->disize = ip->i_d.di_size;
-		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
 	),
-	TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+	TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
 		  "offset 0x%llx count %zd",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->isize,
 		  __entry->disize,
-		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count)
 );
-- 
cgit v1.2.3


From 5bf1f26227a59b9634e95eb3c7c012b766e5e6a0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:13 +0000
Subject: xfs: always return with the iolock held from
 xfs_file_aio_write_checks

While xfs_iunlock is fine with 0 lockflags the calling conventions are much
cleaner if xfs_file_aio_write_checks never returns without the iolock held.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_file.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 632313926788..134ff2fe4f4d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -636,7 +636,9 @@ out_lock:
 /*
  * Common pre-write limit and setup checks.
  *
- * Returns with iolock held according to @iolock.
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
  */
 STATIC ssize_t
 xfs_file_aio_write_checks(
@@ -653,8 +655,7 @@ xfs_file_aio_write_checks(
 restart:
 	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
 	if (error) {
-		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
-		*iolock = 0;
+		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 		return error;
 	}
 
-- 
cgit v1.2.3


From d060646436233912178e6b9e3a7f30a41214220f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Dec 2011 20:00:14 +0000
Subject: xfs: cleanup xfs_file_aio_write

With all the size field updates out of the way xfs_file_aio_write can
be further simplified by pushing all iolock handling into
xfs_file_dio_aio_write and xfs_file_buffered_aio_write and using
the generic generic_write_sync helper for synchronous writes.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_file.c | 82 +++++++++++++++++++++++++------------------------------
 1 file changed, 37 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 134ff2fe4f4d..7e5bc872f2b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -724,8 +724,7 @@ xfs_file_dio_aio_write(
 	const struct iovec	*iovp,
 	unsigned long		nr_segs,
 	loff_t			pos,
-	size_t			ocount,
-	int			*iolock)
+	size_t			ocount)
 {
 	struct file		*file = iocb->ki_filp;
 	struct address_space	*mapping = file->f_mapping;
@@ -735,10 +734,10 @@ xfs_file_dio_aio_write(
 	ssize_t			ret = 0;
 	size_t			count = ocount;
 	int			unaligned_io = 0;
+	int			iolock;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
-	*iolock = 0;
 	if ((pos & target->bt_smask) || (count & target->bt_smask))
 		return -XFS_ERROR(EINVAL);
 
@@ -753,31 +752,31 @@ xfs_file_dio_aio_write(
 	 * EOF zeroing cases and fill out the new inode size as appropriate.
 	 */
 	if (unaligned_io || mapping->nrpages)
-		*iolock = XFS_IOLOCK_EXCL;
+		iolock = XFS_IOLOCK_EXCL;
 	else
-		*iolock = XFS_IOLOCK_SHARED;
-	xfs_rw_ilock(ip, *iolock);
+		iolock = XFS_IOLOCK_SHARED;
+	xfs_rw_ilock(ip, iolock);
 
 	/*
 	 * Recheck if there are cached pages that need invalidate after we got
 	 * the iolock to protect against other threads adding new pages while
 	 * we were waiting for the iolock.
 	 */
-	if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
-		xfs_rw_iunlock(ip, *iolock);
-		*iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, *iolock);
+	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+		xfs_rw_iunlock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, iolock);
 	}
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
 	if (ret)
-		return ret;
+		goto out;
 
 	if (mapping->nrpages) {
 		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
 							FI_REMAPF_LOCKED);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	/*
@@ -786,15 +785,18 @@ xfs_file_dio_aio_write(
 	 */
 	if (unaligned_io)
 		inode_dio_wait(inode);
-	else if (*iolock == XFS_IOLOCK_EXCL) {
+	else if (iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		*iolock = XFS_IOLOCK_SHARED;
+		iolock = XFS_IOLOCK_SHARED;
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_direct_write(iocb, iovp,
 			&nr_segs, pos, &iocb->ki_pos, count, ocount);
 
+out:
+	xfs_rw_iunlock(ip, iolock);
+
 	/* No fallback to buffered IO on errors for XFS. */
 	ASSERT(ret < 0 || ret == count);
 	return ret;
@@ -806,8 +808,7 @@ xfs_file_buffered_aio_write(
 	const struct iovec	*iovp,
 	unsigned long		nr_segs,
 	loff_t			pos,
-	size_t			ocount,
-	int			*iolock)
+	size_t			ocount)
 {
 	struct file		*file = iocb->ki_filp;
 	struct address_space	*mapping = file->f_mapping;
@@ -815,14 +816,14 @@ xfs_file_buffered_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
 	int			enospc = 0;
+	int			iolock = XFS_IOLOCK_EXCL;
 	size_t			count = ocount;
 
-	*iolock = XFS_IOLOCK_EXCL;
-	xfs_rw_ilock(ip, *iolock);
+	xfs_rw_ilock(ip, iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
 	if (ret)
-		return ret;
+		goto out;
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
@@ -836,13 +837,15 @@ write_retry:
 	 * page locks and retry *once*
 	 */
 	if (ret == -ENOSPC && !enospc) {
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (ret)
-			return ret;
 		enospc = 1;
-		goto write_retry;
+		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+		if (!ret)
+			goto write_retry;
 	}
+
 	current->backing_dev_info = NULL;
+out:
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -858,7 +861,6 @@ xfs_file_aio_write(
 	struct inode		*inode = mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
-	int			iolock;
 	size_t			ocount = 0;
 
 	XFS_STATS_INC(xs_write_calls);
@@ -878,32 +880,22 @@ xfs_file_aio_write(
 		return -EIO;
 
 	if (unlikely(file->f_flags & O_DIRECT))
-		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						  ocount);
 
-	if (ret <= 0)
-		goto out_unlock;
-
-	XFS_STATS_ADD(xs_write_bytes, ret);
+	if (ret > 0) {
+		ssize_t err;
 
-	/* Handle various SYNC-type writes */
-	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-		loff_t end = pos + ret - 1;
-		int error;
+		XFS_STATS_ADD(xs_write_bytes, ret);
 
-		xfs_rw_iunlock(ip, iolock);
-		error = xfs_file_fsync(file, pos, end,
-				      (file->f_flags & __O_SYNC) ? 0 : 1);
-		xfs_rw_ilock(ip, iolock);
-		if (error)
-			ret = error;
+		/* Handle various SYNC-type writes */
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0)
+			ret = err;
 	}
 
-out_unlock:
-	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 0a300be6d5be8f66cd96609334710c268d0bfdce Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: remove task argument to audit_set_loginuid

The function always deals with current.  Don't expose an option
pretending one can use it for something.  You can't.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8173dfd89cb2..e3cbebbabebd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1228,7 +1228,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(current, loginuid);
+	length = audit_set_loginuid(loginuid);
 	if (likely(length == 0))
 		length = count;
 
-- 
cgit v1.2.3


From 633b45454503489209b0d9a45f9e3cd1b852c614 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: only allow tasks to set their loginuid if it is -1

At the moment we allow tasks to set their loginuid if they have
CAP_AUDIT_CONTROL.  In reality we want tasks to set the loginuid when they
log in and it be impossible to ever reset.  We had to make it mutable even
after it was once set (with the CAP) because on update and admin might have
to restart sshd.  Now sshd would get his loginuid and the next user which
logged in using ssh would not be able to set his loginuid.

Systemd has changed how userspace works and allowed us to make the kernel
work the way it should.  With systemd users (even admins) are not supposed
to restart services directly.  The system will restart the service for
them.  Thus since systemd is going to loginuid==-1, sshd would get -1, and
sshd would be allowed to set a new loginuid without special permissions.

If an admin in this system were to manually start an sshd he is inserting
himself into the system chain of trust and thus, logically, it's his
loginuid that should be used!  Since we have old systems I make this a
Kconfig option.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/proc/base.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e3cbebbabebd..482df23036b5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1197,9 +1197,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	ssize_t length;
 	uid_t loginuid;
 
-	if (!capable(CAP_AUDIT_CONTROL))
-		return -EPERM;
-
 	rcu_read_lock();
 	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
 		rcu_read_unlock();
-- 
cgit v1.2.3


From 4043cde8ecf7f7d880eb1133c201a3d392fd68c3 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: do not call audit_getname on error

Just a code cleanup really.  We don't need to make a function call just for
it to return on error.  This also makes the VFS function even easier to follow
and removes a conditional on a hot path.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namei.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008e..208c6aa4a989 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
 
 static char *getname_flags(const char __user *filename, int flags, int *empty)
 {
-	char *tmp, *result;
-
-	result = ERR_PTR(-ENOMEM);
-	tmp = __getname();
-	if (tmp)  {
-		int retval = do_getname(filename, tmp);
-
-		result = tmp;
-		if (retval < 0) {
-			if (retval == -ENOENT && empty)
-				*empty = 1;
-			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
-				__putname(tmp);
-				result = ERR_PTR(retval);
-			}
+	char *result = __getname();
+	int retval;
+
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	retval = do_getname(filename, result);
+	if (retval < 0) {
+		if (retval == -ENOENT && empty)
+			*empty = 1;
+		if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+			__putname(result);
+			return ERR_PTR(retval);
 		}
 	}
 	audit_getname(result);
-- 
cgit v1.2.3


From e1616300a20c80396109c1cf013ba9a36055a3da Mon Sep 17 00:00:00 2001
From: Kazuya Mio <k-mio@sx.jp.nec.com>
Date: Thu, 1 Dec 2011 16:51:07 +0900
Subject: wake up s_wait_unfrozen when ->freeze_fs fails

dd slept infinitely when fsfeeze failed because of EIO.
To fix this problem, if ->freeze_fs fails, freeze_super() wakes up
the tasks waiting for the filesystem to become unfrozen.

When s_frozen isn't SB_UNFROZEN in __generic_file_aio_write(),
the function sleeps until FITHAW ioctl wakes up s_wait_unfrozen.

However, if ->freeze_fs fails, s_frozen is set to SB_UNFROZEN and then
freeze_super() returns an error number. In this case, FITHAW ioctl returns
EINVAL because s_frozen is already SB_UNFROZEN. There is no way to wake up
s_wait_unfrozen, so __generic_file_aio_write() sleeps infinitely.

Signed-off-by: Kazuya Mio <k-mio@sx.jp.nec.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index de41e1e46f09..6015c02296b7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1186,6 +1186,8 @@ int freeze_super(struct super_block *sb)
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
 			sb->s_frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
 		}
-- 
cgit v1.2.3


From 424a5334a5235c2fbb80090b18a065eeceb51d64 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 12 Jan 2012 12:41:36 +0100
Subject: vfs: remove printk from set_nlink()

Don't log a message for set_nlink(0).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 4fa4f0916af9..fb10d86ffad7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -322,9 +322,6 @@ EXPORT_SYMBOL(clear_nlink);
 void set_nlink(struct inode *inode, unsigned int nlink)
 {
 	if (!nlink) {
-		printk_ratelimited(KERN_INFO
-			"set_nlink() clearing i_nlink on %s inode %li\n",
-			inode->i_sb->s_type->name, inode->i_ino);
 		clear_nlink(inode);
 	} else {
 		/* Yes, some filesystems do change nlink from zero to one */
-- 
cgit v1.2.3


From e268337dfe26dfc7efd422a804dbb27977a3cccc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 17 Jan 2012 15:21:19 -0800
Subject: proc: clean up and fix /proc/<pid>/mem handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jüri Aedla reported that the /proc/<pid>/mem handling really isn't very
robust, and it also doesn't match the permission checking of any of the
other related files.

This changes it to do the permission checks at open time, and instead of
tracking the process, it tracks the VM at the time of the open.  That
simplifies the code a lot, but does mean that if you hold the file
descriptor open over an execve(), you'll continue to read from the _old_
VM.

That is different from our previous behavior, but much simpler.  If
somebody actually finds a load where this matters, we'll need to revert
this commit.

I suspect that nobody will ever notice - because the process mapping
addresses will also have changed as part of the execve.  So you cannot
actually usefully access the fd across a VM change simply because all
the offsets for IO would have changed too.

Reported-by: Jüri Aedla <asd@ut.ee>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 145 ++++++++++++++++-----------------------------------------
 1 file changed, 39 insertions(+), 106 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5485a5388ecb..662ddf2ec4f1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
-	struct mm_struct *mm;
-
-	mm = get_task_mm(task);
-	if (!mm)
-		return ERR_PTR(-EINVAL);
-
-	/*
-	 * A task can always look at itself, in case it chooses
-	 * to use system calls instead of load instructions.
-	 */
-	if (task == current)
-		return mm;
-
-	/*
-	 * If current is actively ptrace'ing, and would also be
-	 * permitted to freshly attach with ptrace now, permit it.
-	 */
-	if (task_is_stopped_or_traced(task)) {
-		int match;
-		rcu_read_lock();
-		match = (ptrace_parent(task) == current);
-		rcu_read_unlock();
-		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-			return mm;
-	}
-
-	/*
-	 * No one else is allowed.
-	 */
-	mmput(mm);
-	return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
-	struct mm_struct *mm;
-	int err;
-
-	/*
-	 * Avoid racing if task exec's as we might get a new mm but validate
-	 * against old credentials.
-	 */
-	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-	if (err)
-		return ERR_PTR(err);
-
-	mm = __check_mem_permission(task);
-	mutex_unlock(&task->signal->cred_guard_mutex);
-
-	return mm;
-}
-
-struct mm_struct *mm_for_maps(struct task_struct *task)
+static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 {
 	struct mm_struct *mm;
 	int err;
@@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 
 	mm = get_task_mm(task);
 	if (mm && mm != current->mm &&
-			!ptrace_may_access(task, PTRACE_MODE_READ)) {
+			!ptrace_may_access(task, mode)) {
 		mmput(mm);
 		mm = ERR_PTR(-EACCES);
 	}
@@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 	return mm;
 }
 
+struct mm_struct *mm_for_maps(struct task_struct *task)
+{
+	return mm_access(task, PTRACE_MODE_READ);
+}
+
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 {
 	int res = 0;
@@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = {
 
 static int mem_open(struct inode* inode, struct file* file)
 {
-	file->private_data = (void*)((long)current->self_exec_id);
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct mm_struct *mm;
+
+	if (!task)
+		return -ESRCH;
+
+	mm = mm_access(task, PTRACE_MODE_ATTACH);
+	put_task_struct(task);
+
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+
 	/* OK to pass negative loff_t, we can catch out-of-range */
 	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+	file->private_data = mm;
+
 	return 0;
 }
 
 static ssize_t mem_read(struct file * file, char __user * buf,
 			size_t count, loff_t *ppos)
 {
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	int ret;
 	char *page;
 	unsigned long src = *ppos;
-	int ret = -ESRCH;
-	struct mm_struct *mm;
+	struct mm_struct *mm = file->private_data;
 
-	if (!task)
-		goto out_no_task;
+	if (!mm)
+		return 0;
 
-	ret = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
-		goto out;
-
-	mm = check_mem_permission(task);
-	ret = PTR_ERR(mm);
-	if (IS_ERR(mm))
-		goto out_free;
-
-	ret = -EIO;
- 
-	if (file->private_data != (void*)((long)current->self_exec_id))
-		goto out_put;
+		return -ENOMEM;
 
 	ret = 0;
  
@@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 	}
 	*ppos = src;
 
-out_put:
-	mmput(mm);
-out_free:
 	free_page((unsigned long) page);
-out:
-	put_task_struct(task);
-out_no_task:
 	return ret;
 }
 
@@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 {
 	int copied;
 	char *page;
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 	unsigned long dst = *ppos;
-	struct mm_struct *mm;
+	struct mm_struct *mm = file->private_data;
 
-	copied = -ESRCH;
-	if (!task)
-		goto out_no_task;
+	if (!mm)
+		return 0;
 
-	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
-		goto out_task;
-
-	mm = check_mem_permission(task);
-	copied = PTR_ERR(mm);
-	if (IS_ERR(mm))
-		goto out_free;
-
-	copied = -EIO;
-	if (file->private_data != (void *)((long)current->self_exec_id))
-		goto out_mm;
+		return -ENOMEM;
 
 	copied = 0;
 	while (count > 0) {
@@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 	}
 	*ppos = dst;
 
-out_mm:
-	mmput(mm);
-out_free:
 	free_page((unsigned long) page);
-out_task:
-	put_task_struct(task);
-out_no_task:
 	return copied;
 }
 
@@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 	return file->f_pos;
 }
 
+static int mem_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+
+	mmput(mm);
+	return 0;
+}
+
 static const struct file_operations proc_mem_operations = {
 	.llseek		= mem_lseek,
 	.read		= mem_read,
 	.write		= mem_write,
 	.open		= mem_open,
+	.release	= mem_release,
 };
 
 static ssize_t environ_read(struct file *file, char __user *buf,
-- 
cgit v1.2.3


From f5fffcee27c09143ba80e5257dbd1f381d86342f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 17 Jan 2012 13:49:17 -0500
Subject: cifs: better instrumentation for coalesce_t2

When coalesce_t2 returns an error, have it throw a cFYI message that
explains the reason. Also rename some variables to clarify what they
represent.

Reported-and-Tested-by: Konstantinos Skarlatos <k.skarlatos@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 84 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4666780f315d..5cc15856e4ad 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -225,74 +225,90 @@ static int check2ndT2(struct smb_hdr *pSMB)
 
 static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 {
-	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
+	struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond;
 	struct smb_t2_rsp *pSMBt  = (struct smb_t2_rsp *)pTargetSMB;
-	char *data_area_of_target;
-	char *data_area_of_buf2;
+	char *data_area_of_tgt;
+	char *data_area_of_src;
 	int remaining;
-	unsigned int byte_count, total_in_buf;
-	__u16 total_data_size, total_in_buf2;
+	unsigned int byte_count, total_in_tgt;
+	__u16 tgt_total_cnt, src_total_cnt, total_in_src;
 
-	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+	src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+	tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
 
-	if (total_data_size !=
-	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
-		cFYI(1, "total data size of primary and secondary t2 differ");
+	if (tgt_total_cnt != src_total_cnt)
+		cFYI(1, "total data count of primary and secondary t2 differ "
+			"source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
 
-	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+	total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
 
-	remaining = total_data_size - total_in_buf;
+	remaining = tgt_total_cnt - total_in_tgt;
 
-	if (remaining < 0)
+	if (remaining < 0) {
+		cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
+			"total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
 		return -EPROTO;
+	}
 
-	if (remaining == 0) /* nothing to do, ignore */
+	if (remaining == 0) {
+		/* nothing to do, ignore */
+		cFYI(1, "no more data remains");
 		return 0;
+	}
 
-	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
-	if (remaining < total_in_buf2) {
+	total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+	if (remaining < total_in_src)
 		cFYI(1, "transact2 2nd response contains too much data");
-	}
 
 	/* find end of first SMB data area */
-	data_area_of_target = (char *)&pSMBt->hdr.Protocol +
+	data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
 				get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
-	/* validate target area */
 
-	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
-				get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
+	/* validate target area */
+	data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+				get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
 
-	data_area_of_target += total_in_buf;
+	data_area_of_tgt += total_in_tgt;
 
-	/* copy second buffer into end of first buffer */
-	total_in_buf += total_in_buf2;
+	total_in_tgt += total_in_src;
 	/* is the result too big for the field? */
-	if (total_in_buf > USHRT_MAX)
+	if (total_in_tgt > USHRT_MAX) {
+		cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
 		return -EPROTO;
-	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
+	}
+	put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
 
 	/* fix up the BCC */
 	byte_count = get_bcc(pTargetSMB);
-	byte_count += total_in_buf2;
+	byte_count += total_in_src;
 	/* is the result too big for the field? */
-	if (byte_count > USHRT_MAX)
+	if (byte_count > USHRT_MAX) {
+		cFYI(1, "coalesced BCC too large (%u)", byte_count);
 		return -EPROTO;
+	}
 	put_bcc(byte_count, pTargetSMB);
 
 	byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
-	byte_count += total_in_buf2;
+	byte_count += total_in_src;
 	/* don't allow buffer to overflow */
-	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
+	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
 		return -ENOBUFS;
+	}
 	pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
 
-	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
+	/* copy second buffer into end of first buffer */
+	memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
 
-	if (remaining == total_in_buf2) {
-		cFYI(1, "found the last secondary response");
-		return 0; /* we are done */
-	} else /* more responses to go */
+	if (remaining != total_in_src) {
+		/* more responses to go */
+		cFYI(1, "waiting for more secondary responses");
 		return 1;
+	}
+
+	/* we are done */
+	cFYI(1, "found the last secondary response");
+	return 0;
 }
 
 static void
-- 
cgit v1.2.3


From ce91acb3acae26f4163c5a6f1f695d1a1e8d9009 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 17 Jan 2012 16:08:51 -0500
Subject: cifs: lower default wsize when unix extensions are not used

We've had some reports of servers (namely, the Solaris in-kernel CIFS
server) that don't deal properly with writes that are "too large" even
though they set CAP_LARGE_WRITE_ANDX. Change the default to better
mirror what windows clients do.

Cc: stable@vger.kernel.org
Cc: Pavel Shilovsky <piastry@etersoft.ru>
Reported-by: Nick Davis <phireph0x@yahoo.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5cc15856e4ad..a66dcb52988c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2930,18 +2930,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 #define CIFS_DEFAULT_IOSIZE (1024 * 1024)
 
 /*
- * Windows only supports a max of 60k reads. Default to that when posix
- * extensions aren't in force.
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
  */
 #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
 
 static unsigned int
 cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
 {
 	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
 	struct TCP_Server_Info *server = tcon->ses->server;
-	unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
-				CIFS_DEFAULT_IOSIZE;
+	unsigned int wsize;
+
+	/* start with specified wsize, or default */
+	if (pvolume_info->wsize)
+		wsize = pvolume_info->wsize;
+	else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+		wsize = CIFS_DEFAULT_IOSIZE;
+	else
+		wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
 
 	/* can server support 24-bit write sizes? (via UNIX extensions) */
 	if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
-- 
cgit v1.2.3


From 04febabcf55beeffb8794a0d8c539e571bd2ae29 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 17 Jan 2012 16:09:15 -0500
Subject: cifs: sanitize username handling

Currently, it's not very clear whether you're allowed to have a NULL
vol->username or ses->user_name. Some places check for it and some don't.

Make it clear that a NULL pointer is OK in these fields, and ensure that
all the callers check for that.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifs_spnego.c | 10 +++++++---
 fs/cifs/cifsencrypt.c | 11 ++++++++---
 fs/cifs/connect.c     | 19 ++++++++++++-------
 3 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2272fd5fe5b7..e622863b292f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 		   MAX_MECH_STR_LEN +
 		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
 		   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
-		   USER_KEY_LEN + strlen(sesInfo->user_name) +
 		   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
 
+	if (sesInfo->user_name)
+		desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
 	spnego_key = ERR_PTR(-ENOMEM);
 	description = kzalloc(desc_len, GFP_KERNEL);
 	if (description == NULL)
@@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 	dp = description + strlen(description);
 	sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
 
-	dp = description + strlen(description);
-	sprintf(dp, ";user=%s", sesInfo->user_name);
+	if (sesInfo->user_name) {
+		dp = description + strlen(description);
+		sprintf(dp, ";user=%s", sesInfo->user_name);
+	}
 
 	dp = description + strlen(description);
 	sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5d9b9acc5fce..bce99e6a4950 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 	}
 
 	/* convert ses->user_name to unicode and uppercase */
-	len = strlen(ses->user_name);
+	len = ses->user_name ? strlen(ses->user_name) : 0;
 	user = kmalloc(2 + (len * 2), GFP_KERNEL);
 	if (user == NULL) {
 		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
 		rc = -ENOMEM;
 		return rc;
 	}
-	len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
-	UniStrupr(user);
+
+	if (len) {
+		len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
+		UniStrupr(user);
+	} else {
+		memset(user, '\0', 2);
+	}
 
 	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
 				(char *)user, 2 * len);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a66dcb52988c..b952a21e917b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1997,10 +1997,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 			return 0;
 		break;
 	default:
+		/* NULL username means anonymous session */
+		if (ses->user_name == NULL) {
+			if (!vol->nullauth)
+				return 0;
+			break;
+		}
+
 		/* anything else takes username/password */
-		if (ses->user_name == NULL)
-			return 0;
-		if (strncmp(ses->user_name, vol->username,
+		if (strncmp(ses->user_name,
+			    vol->username ? vol->username : "",
 			    MAX_USERNAME_SIZE))
 			return 0;
 		if (strlen(vol->username) != 0 &&
@@ -3167,10 +3173,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 		return -EINVAL;
 
 	if (volume_info->nullauth) {
-		cFYI(1, "null user");
-		volume_info->username = kzalloc(1, GFP_KERNEL);
-		if (volume_info->username == NULL)
-			return -ENOMEM;
+		cFYI(1, "Anonymous login");
+		kfree(volume_info->username);
+		volume_info->username = NULL;
 	} else if (volume_info->username) {
 		/* BB fixme parse for domain name here */
 		cFYI(1, "Username: %s", volume_info->username);
-- 
cgit v1.2.3


From 8a8798a5ff90977d6459ce1d657cf8fe13a51e97 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 17 Jan 2012 16:09:15 -0500
Subject: cifs: fetch credentials out of keyring for non-krb5 auth multiuser
 mounts

Fix up multiuser mounts to set the secType and set the username and
password from the key payload in the vol info for non-krb5 auth types.

Look for a key of type "secret" with a description of
"cifs:a:<server address>" or "cifs:d:<domainname>". If that's found,
then scrape the username and password out of the key payload and use
that to create a new user session.

Finally, don't have the code enforce krb5 auth on multiuser mounts,
but do require a kernel with keys support.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 165 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b952a21e917b..28f23c03da53 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -38,6 +38,7 @@
 #include <asm/processor.h>
 #include <linux/inet.h>
 #include <linux/module.h>
+#include <keys/user-type.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -1594,11 +1595,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		}
 	}
 
-	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
-		cERROR(1, "Multiuser mounts currently require krb5 "
-			  "authentication!");
+#ifndef CONFIG_KEYS
+	/* Muliuser mounts require CONFIG_KEYS support */
+	if (vol->multiuser) {
+		cERROR(1, "Multiuser mounts require kernels with "
+			  "CONFIG_KEYS enabled.");
 		goto cifs_parse_mount_err;
 	}
+#endif
 
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
@@ -2061,6 +2065,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)
 	cifs_put_tcp_session(server);
 }
 
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	int rc = 0;
+	char *desc, *delim, *payload;
+	ssize_t len;
+	struct key *key;
+	struct TCP_Server_Info *server = ses->server;
+	struct sockaddr_in *sa;
+	struct sockaddr_in6 *sa6;
+	struct user_key_payload *upayload;
+
+	desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	/* try to find an address key first */
+	switch (server->dstaddr.ss_family) {
+	case AF_INET:
+		sa = (struct sockaddr_in *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+		break;
+	case AF_INET6:
+		sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+		break;
+	default:
+		cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+		rc = -EINVAL;
+		goto out_err;
+	}
+
+	cFYI(1, "%s: desc=%s", __func__, desc);
+	key = request_key(&key_type_logon, desc, "");
+	if (IS_ERR(key)) {
+		if (!ses->domainName) {
+			cFYI(1, "domainName is NULL");
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+
+		/* didn't work, try to find a domain key */
+		sprintf(desc, "cifs:d:%s", ses->domainName);
+		cFYI(1, "%s: desc=%s", __func__, desc);
+		key = request_key(&key_type_logon, desc, "");
+		if (IS_ERR(key)) {
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+	}
+
+	down_read(&key->sem);
+	upayload = key->payload.data;
+	if (IS_ERR_OR_NULL(upayload)) {
+		rc = PTR_ERR(key);
+		goto out_key_put;
+	}
+
+	/* find first : in payload */
+	payload = (char *)upayload->data;
+	delim = strnchr(payload, upayload->datalen, ':');
+	cFYI(1, "payload=%s", payload);
+	if (!delim) {
+		cFYI(1, "Unable to find ':' in payload (datalen=%d)",
+				upayload->datalen);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	len = delim - payload;
+	if (len > MAX_USERNAME_SIZE || len <= 0) {
+		cFYI(1, "Bad value from username search (len=%ld)", len);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	vol->username = kstrndup(payload, len, GFP_KERNEL);
+	if (!vol->username) {
+		cFYI(1, "Unable to allocate %ld bytes for username", len);
+		rc = -ENOMEM;
+		goto out_key_put;
+	}
+	cFYI(1, "%s: username=%s", __func__, vol->username);
+
+	len = key->datalen - (len + 1);
+	if (len > MAX_PASSWORD_SIZE || len <= 0) {
+		cFYI(1, "Bad len for password search (len=%ld)", len);
+		rc = -EINVAL;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+	++delim;
+	vol->password = kstrndup(delim, len, GFP_KERNEL);
+	if (!vol->password) {
+		cFYI(1, "Unable to allocate %ld bytes for password", len);
+		rc = -ENOMEM;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+out_key_put:
+	up_read(&key->sem);
+	key_put(key);
+out_err:
+	kfree(desc);
+	cFYI(1, "%s: returning %d", __func__, rc);
+	return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+		   struct cifs_ses *ses __attribute__((unused)))
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
 static bool warned_on_ntlm;  /* globals init to false automatically */
 
 static struct cifs_ses *
@@ -3693,16 +3823,38 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
 	return rc;
 }
 
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	switch (ses->server->secType) {
+	case Kerberos:
+		vol->secFlg = CIFSSEC_MUST_KRB5;
+		return 0;
+	case NTLMv2:
+		vol->secFlg = CIFSSEC_MUST_NTLMV2;
+		break;
+	case NTLM:
+		vol->secFlg = CIFSSEC_MUST_NTLM;
+		break;
+	case RawNTLMSSP:
+		vol->secFlg = CIFSSEC_MUST_NTLMSSP;
+		break;
+	case LANMAN:
+		vol->secFlg = CIFSSEC_MUST_LANMAN;
+		break;
+	}
+
+	return cifs_set_cifscreds(vol, ses);
+}
+
 static struct cifs_tcon *
 cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 {
+	int rc;
 	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
 	struct cifs_ses *ses;
 	struct cifs_tcon *tcon = NULL;
 	struct smb_vol *vol_info;
-	char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
-			   /* We used to have this as MAX_USERNAME which is   */
-			   /* way too big now (256 instead of 32) */
 
 	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
 	if (vol_info == NULL) {
@@ -3710,8 +3862,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 		goto out;
 	}
 
-	snprintf(username, sizeof(username), "krb50x%x", fsuid);
-	vol_info->username = username;
 	vol_info->local_nls = cifs_sb->local_nls;
 	vol_info->linux_uid = fsuid;
 	vol_info->cred_uid = fsuid;
@@ -3721,8 +3871,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	vol_info->local_lease = master_tcon->local_lease;
 	vol_info->no_linux_ext = !master_tcon->unix_ext;
 
-	/* FIXME: allow for other secFlg settings */
-	vol_info->secFlg = CIFSSEC_MUST_KRB5;
+	rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+	if (rc) {
+		tcon = ERR_PTR(rc);
+		goto out;
+	}
 
 	/* get a reference for the same TCP session */
 	spin_lock(&cifs_tcp_ses_lock);
@@ -3745,6 +3898,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	if (ses->capabilities & CAP_UNIX)
 		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
 out:
+	kfree(vol_info->username);
+	kfree(vol_info->password);
 	kfree(vol_info);
 
 	return tcon;
-- 
cgit v1.2.3


From 789b4588da40cf572ef982bdc5d590ec1b0386fe Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 17 Jan 2012 16:09:15 -0500
Subject: cifs: warn about impending deprecation of legacy MultiuserMount code

We'll allow a grace period of 2 releases (3.3 and 3.4) and then remove
the legacy code in 3.5.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifs_debug.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 84e8c0724704..24b3dfc05282 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -676,14 +676,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
 {
 	char c;
 	int rc;
+	static bool warned;
 
 	rc = get_user(c, buffer);
 	if (rc)
 		return rc;
 	if (c == '0' || c == 'n' || c == 'N')
 		multiuser_mount = 0;
-	else if (c == '1' || c == 'y' || c == 'Y')
+	else if (c == '1' || c == 'y' || c == 'Y') {
 		multiuser_mount = 1;
+		if (!warned) {
+			warned = true;
+			printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
+				"mount code is scheduled to be deprecated in "
+				"3.5. Please switch to using the multiuser "
+				"mount option.");
+		}
+	}
 
 	return count;
 }
-- 
cgit v1.2.3


From 88a4412b798236bfdd9284d5c251d76679f944e1 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 18 Jan 2012 17:13:47 -0600
Subject: [CIFS] Fix build break with multiuser patch when LANMAN disabled

CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ba53c1c6c6cc..76e7d8b6da17 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -879,6 +879,8 @@ require use of the stronger protocol */
 #define   CIFSSEC_MASK          0xB70B7 /* current flags supported if weak */
 #endif /* UPCALL */
 #else /* do not allow weak pw hash */
+#define   CIFSSEC_MUST_LANMAN	0
+#define   CIFSSEC_MUST_PLNTXT	0
 #ifdef CONFIG_CIFS_UPCALL
 #define   CIFSSEC_MASK          0x8F08F /* flags supported if no weak allowed */
 #else
-- 
cgit v1.2.3


From c56001879bc091eee0c7a8e6e94ea0bea63c3012 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 18 Jan 2012 17:19:11 -0600
Subject: [CIFS] ACL and FSCACHE support no longer EXPERIMENTAL

CIFS ACL support and FSCACHE support have been in long enough
to be no longer considered experimental.  Remove obsolete Kconfig
dependency.

Signed-off-by: Steve French <sfrench@us.ibm.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
---
 fs/cifs/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f66cc1625150..0554b00a7b33 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -140,7 +140,6 @@ config CIFS_DFS_UPCALL
 
 config CIFS_FSCACHE
 	  bool "Provide CIFS client caching support (EXPERIMENTAL)"
-	  depends on EXPERIMENTAL
 	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
 	  help
 	    Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -149,7 +148,7 @@ config CIFS_FSCACHE
 
 config CIFS_ACL
 	  bool "Provide CIFS ACL support (EXPERIMENTAL)"
-	  depends on EXPERIMENTAL && CIFS_XATTR && KEYS
+	  depends on CIFS_XATTR && KEYS
 	  help
 	    Allows to fetch CIFS/NTFS ACL from the server.  The DACL blob
 	    is handed over to the application/caller.
-- 
cgit v1.2.3


From acbbb76a26648dfae6fed0989879e40d75692bfc Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 18 Jan 2012 22:32:33 -0600
Subject: CIFS: Rename *UCS* functions to *UTF16*

to reflect the unicode encoding used by CIFS protocol.

Signed-off-by: Pavel Shilovsky <piastry@etersoft.ru>
Acked-by: Jeff Layton <jlayton@samba.org>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
---
 fs/cifs/cifs_unicode.c |  41 +++++++------
 fs/cifs/cifs_unicode.h |  20 +++---
 fs/cifs/cifsencrypt.c  |  12 ++--
 fs/cifs/cifssmb.c      | 162 +++++++++++++++++++++++++------------------------
 fs/cifs/connect.c      |   4 +-
 fs/cifs/readdir.c      |   9 +--
 fs/cifs/sess.c         |  34 +++++------
 fs/cifs/smbencrypt.c   |   2 +-
 8 files changed, 146 insertions(+), 138 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 1b2e180b018d..fbb9da951843 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
 #include "cifs_debug.h"
 
 /*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
  * @maxbytes - don't go past this many bytes of input string
  * @codepage - destination codepage
  *
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
  * be after being converted to the given charset, not including any null
  * termination required. Don't walk past maxbytes in the source buffer.
  */
 int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
 		const struct nls_table *codepage)
 {
 	int i;
@@ -122,7 +122,7 @@ cp_convert:
 }
 
 /*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
  * @to - destination buffer
  * @from - source buffer
  * @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
  * @codepage - codepage to which characters should be converted
  * @mapchar - should characters be remapped according to the mapchars option?
  *
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
  * in the provided codepage. The tolen and fromlen parameters are to ensure
  * that the code doesn't walk off of the end of the buffer (which is always
  * a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
  * null terminator).
  *
  * Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF16-2. The linux nls routines however aren't able to
  * deal with those characters properly. In the event that we get some of
  * those characters, they won't be translated properly.
  */
 int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
 		 const struct nls_table *codepage, bool mapchar)
 {
 	int i, charlen, safelen;
@@ -190,13 +190,13 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
 }
 
 /*
- * NAME:	cifs_strtoUCS()
+ * NAME:	cifs_strtoUTF16()
  *
  * FUNCTION:	Convert character string to unicode string
  *
  */
 int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
 	      const struct nls_table *codepage)
 {
 	int charlen;
@@ -206,7 +206,7 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
 		charlen = codepage->char2uni(from, len, &wchar_to);
 		if (charlen < 1) {
-			cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+			cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
 				*from, charlen);
 			/* A question mark */
 			wchar_to = 0x003f;
@@ -220,7 +220,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
 }
 
 /*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
  * @src - source string
  * @maxlen - don't walk past this many bytes in the source string
  * @is_unicode - is this a unicode string?
@@ -231,19 +232,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
  * error.
  */
 char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
-	     const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+			const bool is_unicode, const struct nls_table *codepage)
 {
 	int len;
 	char *dst;
 
 	if (is_unicode) {
-		len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+		len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
 		len += nls_nullsize(codepage);
 		dst = kmalloc(len, GFP_KERNEL);
 		if (!dst)
 			return NULL;
-		cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+		cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
 			       false);
 	} else {
 		len = strnlen(src, maxlen);
@@ -264,7 +265,7 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
  * names are little endian 16 bit Unicode on the wire
  */
 int
-cifsConvertToUCS(__le16 *target, const char *source, int srclen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
 		 const struct nls_table *cp, int mapChars)
 {
 	int i, j, charlen;
@@ -273,7 +274,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
 	wchar_t tmp;
 
 	if (!mapChars)
-		return cifs_strtoUCS(target, source, PATH_MAX, cp);
+		return cifs_strtoUTF16(target, source, PATH_MAX, cp);
 
 	for (i = 0, j = 0; i < srclen; j++) {
 		src_char = source[i];
@@ -281,7 +282,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
 		switch (src_char) {
 		case 0:
 			put_unaligned(0, &target[j]);
-			goto ctoUCS_out;
+			goto ctoUTF16_out;
 		case ':':
 			dst_char = cpu_to_le16(UNI_COLON);
 			break;
@@ -326,7 +327,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
 		put_unaligned(dst_char, &target[j]);
 	}
 
-ctoUCS_out:
+ctoUTF16_out:
 	return i;
 }
 
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 6d02fd560566..a513a546700b 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[];
 #endif				/* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
-		   const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
-		    const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
-			    const bool is_unicode,
-			    const struct nls_table *codepage);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-			const struct nls_table *cp, int mapChars);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+		    const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+		     const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+			      const bool is_unicode,
+			      const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+			      const struct nls_table *cp, int mapChars);
 
 #endif
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index bce99e6a4950..63c460e503b6 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
 	attrptr->length = cpu_to_le16(2 * dlen);
 	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+	cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
 
 	return 0;
 }
@@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
 					kmalloc(attrsize + 1, GFP_KERNEL);
 				if (!ses->domainName)
 						return -ENOMEM;
-				cifs_from_ucs2(ses->domainName,
+				cifs_from_utf16(ses->domainName,
 					(__le16 *)blobptr, attrsize, attrsize,
 					nls_cp, false);
 				break;
@@ -429,7 +429,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 	}
 
 	if (len) {
-		len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
+		len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
 		UniStrupr(user);
 	} else {
 		memset(user, '\0', 2);
@@ -453,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 			rc = -ENOMEM;
 			return rc;
 		}
-		len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
-					nls_cp);
+		len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+				      nls_cp);
 		rc =
 		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
 					(char *)domain, 2 * len);
@@ -473,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 			rc = -ENOMEM;
 			return rc;
 		}
-		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+		len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
 					nls_cp);
 		rc =
 		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6600aa2d2ef3..8b7794c31591 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -821,8 +821,8 @@ PsxDelete:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else { /* BB add path length overrun check */
@@ -893,8 +893,8 @@ DelFileRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {		/* BB improve check for buffer overruns BB */
@@ -938,8 +938,8 @@ RmDirRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
-					 PATH_MAX, nls_codepage, remap);
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+					      PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {		/* BB improve check for buffer overruns BB */
@@ -981,8 +981,8 @@ MkDirRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
-					    PATH_MAX, nls_codepage, remap);
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+					      PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {		/* BB improve check for buffer overruns BB */
@@ -1030,8 +1030,8 @@ PsxCreat:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, name,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -1197,8 +1197,8 @@ OldOpenRetry:
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		count = 1;      /* account for one byte pad to word boundary */
 		name_len =
-		   cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
-				    fileName, PATH_MAX, nls_codepage, remap);
+		   cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+				      fileName, PATH_MAX, nls_codepage, remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 	} else {                /* BB improve check for buffer overruns BB */
@@ -1304,8 +1304,8 @@ openRetry:
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		count = 1;	/* account for one byte pad to word boundary */
 		name_len =
-		    cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
-				     fileName, PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+				       fileName, PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 		pSMB->NameLength = cpu_to_le16(name_len);
@@ -2649,16 +2649,16 @@ renameRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 		pSMB->OldFileName[name_len] = 0x04;	/* pad */
 	/* protocol requires ASCII signature byte on Unicode string */
 		pSMB->OldFileName[name_len + 1] = 0x00;
 		name_len2 =
-		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-				     toName, PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+				       toName, PATH_MAX, nls_codepage, remap);
 		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
 		name_len2 *= 2;	/* convert to bytes */
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -2738,10 +2738,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
 	/* unicode only call */
 	if (target_name == NULL) {
 		sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
-		len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+		len_of_str =
+			cifsConvertToUTF16((__le16 *)rename_info->target_name,
 					dummy_string, 24, nls_codepage, remap);
 	} else {
-		len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+		len_of_str =
+			cifsConvertToUTF16((__le16 *)rename_info->target_name,
 					target_name, PATH_MAX, nls_codepage,
 					remap);
 	}
@@ -2795,17 +2797,17 @@ copyRetry:
 	pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
-					    fromName, PATH_MAX, nls_codepage,
-					    remap);
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+					      fromName, PATH_MAX, nls_codepage,
+					      remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 		pSMB->OldFileName[name_len] = 0x04;     /* pad */
 		/* protocol requires ASCII signature byte on Unicode string */
 		pSMB->OldFileName[name_len + 1] = 0x00;
 		name_len2 =
-		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-				toName, PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+				       toName, PATH_MAX, nls_codepage, remap);
 		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
 		name_len2 *= 2; /* convert to bytes */
 	} else { 	/* BB improve the check for buffer overruns BB */
@@ -2861,9 +2863,9 @@ createSymLinkRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
-				  /* find define for this maxpathcomponent */
-				  , nls_codepage);
+		    cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+				    /* find define for this maxpathcomponent */
+				    PATH_MAX, nls_codepage);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 
@@ -2885,9 +2887,9 @@ createSymLinkRetry:
 	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len_target =
-		    cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
-				  /* find define for this maxpathcomponent */
-				  , nls_codepage);
+		    cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+				    /* find define for this maxpathcomponent */
+				    , nls_codepage);
 		name_len_target++;	/* trailing null */
 		name_len_target *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -2949,8 +2951,8 @@ createHardLinkRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
-					    PATH_MAX, nls_codepage, remap);
+		name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+					      PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 
@@ -2972,8 +2974,8 @@ createHardLinkRetry:
 	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len_target =
-		    cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
-				     nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) data_offset, fromName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len_target++;	/* trailing null */
 		name_len_target *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -3042,8 +3044,8 @@ winCreateHardLinkRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 
@@ -3051,8 +3053,8 @@ winCreateHardLinkRetry:
 		pSMB->OldFileName[name_len] = 0x04;
 		pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
 		name_len2 =
-		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-				     toName, PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+				       toName, PATH_MAX, nls_codepage, remap);
 		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
 		name_len2 *= 2;	/* convert to bytes */
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -3108,8 +3110,8 @@ querySymLinkRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifs_strtoUCS((__le16 *) pSMB->FileName, searchName,
-				  PATH_MAX, nls_codepage);
+			cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
+					PATH_MAX, nls_codepage);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -3166,8 +3168,8 @@ querySymLinkRetry:
 				is_unicode = false;
 
 			/* BB FIXME investigate remapping reserved chars here */
-			*symlinkinfo = cifs_strndup_from_ucs(data_start, count,
-						    is_unicode, nls_codepage);
+			*symlinkinfo = cifs_strndup_from_utf16(data_start,
+					count, is_unicode, nls_codepage);
 			if (!*symlinkinfo)
 				rc = -ENOMEM;
 		}
@@ -3450,8 +3452,9 @@ queryAclRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-					 PATH_MAX, nls_codepage, remap);
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   searchName, PATH_MAX, nls_codepage,
+					   remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 		pSMB->FileName[name_len] = 0;
@@ -3537,8 +3540,8 @@ setAclRetry:
 		return rc;
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-			cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-				      PATH_MAX, nls_codepage, remap);
+			cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+					   PATH_MAX, nls_codepage, remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -3948,8 +3951,9 @@ QInfRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-					PATH_MAX, nls_codepage, remap);
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   searchName, PATH_MAX, nls_codepage,
+					   remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 	} else {
@@ -4086,8 +4090,8 @@ QPathInfoRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -4255,8 +4259,8 @@ UnixQPathInfoRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				  PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -4344,8 +4348,8 @@ findFirstRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				 PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
 		/* We can not add the asterik earlier in case
 		it got remapped to 0xF03A as if it were part of the
 		directory name instead of a wildcard */
@@ -4656,8 +4660,9 @@ GetInodeNumberRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-					 PATH_MAX, nls_codepage, remap);
+			cifsConvertToUTF16((__le16 *) pSMB->FileName,
+					   searchName, PATH_MAX, nls_codepage,
+					   remap);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -4794,9 +4799,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 				rc = -ENOMEM;
 				goto parse_DFS_referrals_exit;
 			}
-			cifsConvertToUCS((__le16 *) tmp, searchName,
-					PATH_MAX, nls_codepage, remap);
-			node->path_consumed = cifs_ucs2_bytes(tmp,
+			cifsConvertToUTF16((__le16 *) tmp, searchName,
+					   PATH_MAX, nls_codepage, remap);
+			node->path_consumed = cifs_utf16_bytes(tmp,
 					le16_to_cpu(pSMBr->PathConsumed),
 					nls_codepage);
 			kfree(tmp);
@@ -4809,8 +4814,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		/* copy DfsPath */
 		temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
 		max_len = data_end - temp;
-		node->path_name = cifs_strndup_from_ucs(temp, max_len,
-						      is_unicode, nls_codepage);
+		node->path_name = cifs_strndup_from_utf16(temp, max_len,
+						is_unicode, nls_codepage);
 		if (!node->path_name) {
 			rc = -ENOMEM;
 			goto parse_DFS_referrals_exit;
@@ -4819,8 +4824,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		/* copy link target UNC */
 		temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
 		max_len = data_end - temp;
-		node->node_name = cifs_strndup_from_ucs(temp, max_len,
-						      is_unicode, nls_codepage);
+		node->node_name = cifs_strndup_from_utf16(temp, max_len,
+						is_unicode, nls_codepage);
 		if (!node->node_name)
 			rc = -ENOMEM;
 	}
@@ -4873,8 +4878,9 @@ getDFSRetry:
 	if (ses->capabilities & CAP_UNICODE) {
 		pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->RequestFileName,
-				     searchName, PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
+				       searchName, PATH_MAX, nls_codepage,
+				       remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -5506,8 +5512,8 @@ SetEOFRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -5796,8 +5802,8 @@ SetTimesRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -5877,8 +5883,8 @@ SetAttrLgcyRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-			ConvertToUCS((__le16 *) pSMB->fileName, fileName,
-				PATH_MAX, nls_codepage);
+			ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+				       PATH_MAX, nls_codepage);
 		name_len++;     /* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -6030,8 +6036,8 @@ setPermsRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -6123,8 +6129,8 @@ QAllEAsRetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		list_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+				       PATH_MAX, nls_codepage, remap);
 		list_len++;	/* trailing null */
 		list_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
@@ -6301,8 +6307,8 @@ SetEARetry:
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
 		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-				     PATH_MAX, nls_codepage, remap);
+		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+				       PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 28f23c03da53..986709a8d903 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3644,7 +3644,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 	if (ses->capabilities & CAP_UNICODE) {
 		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
 		length =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+		    cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
 			6 /* max utf8 char length in bytes */ *
 			(/* server len*/ + 256 /* share len */), nls_codepage);
 		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */
@@ -3699,7 +3699,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 
 		/* mostly informational -- no need to fail on error here */
 		kfree(tcon->nativeFileSystem);
-		tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+		tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
 						      bytes_left, is_unicode,
 						      nls_codepage);
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a090bbe6ee29..e2bbc683e018 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 
 		name.name = scratch_buf;
 		name.len =
-			cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
-				       UNICODE_NAME_MAX,
-				       min(de.namelen, (size_t)max_len), nlt,
-				       cifs_sb->mnt_cifs_flags &
+			cifs_from_utf16((char *)name.name, (__le16 *)de.name,
+					UNICODE_NAME_MAX,
+					min_t(size_t, de.namelen,
+					      (size_t)max_len), nlt,
+					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		name.len -= nls_nullsize(nlt);
 	} else {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4ec3ee9d72cc..d85efad5765f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
 	int bytes_ret = 0;
 
 	/* Copy OS version */
-	bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
-				  nls_cp);
+	bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
+				    nls_cp);
 	bcc_ptr += 2 * bytes_ret;
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
-				  32, nls_cp);
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
+				    32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
 
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				  32, nls_cp);
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+				    32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
 
@@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
 		*(bcc_ptr+1) = 0;
 		bytes_ret = 0;
 	} else
-		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
-					  256, nls_cp);
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
+					    256, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2;  /* account for null terminator */
 
@@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 		*bcc_ptr = 0;
 		*(bcc_ptr+1) = 0;
 	} else {
-		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
-					  MAX_USERNAME_SIZE, nls_cp);
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
+					    MAX_USERNAME_SIZE, nls_cp);
 	}
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* account for null termination */
@@ -287,7 +287,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 	cFYI(1, "bleft %d", bleft);
 
 	kfree(ses->serverOS);
-	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
@@ -296,7 +296,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 		return;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
@@ -305,7 +305,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 		return;
 
 	kfree(ses->serverDomain);
-	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
@@ -502,8 +502,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		tmp += 2;
 	} else {
 		int len;
-		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
-				    MAX_USERNAME_SIZE, nls_cp);
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
+				      MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -518,8 +518,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		tmp += 2;
 	} else {
 		int len;
-		len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
-				    MAX_USERNAME_SIZE, nls_cp);
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
+				      MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 80d850881938..d5cd9aa7eacc 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
 
 	/* Password cannot be longer than 128 characters */
 	if (passwd) /* Password must be converted to NT unicode */
-		len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+		len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
 	else {
 		len = 0;
 		*wpwd = 0; /* Ensure string is null terminated */
-- 
cgit v1.2.3


From 1aab323ea5cd67d2d2572a1f2794978583ff8545 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Jan 2012 13:19:42 -0500
Subject: qnx4: di_fname is an array, for crying out loud...

(struct qnx4_inode_entry *)(bh->b_data + some_offset)->di_fname
is not going to be NULL, TYVM...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/inode.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2bfd987f4853..63e0f7471fb5 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -194,20 +194,18 @@ static const char *qnx4_checkroot(struct super_block *sb)
 			}
 			for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
 				rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
-				if (rootdir->di_fname != NULL) {
-					QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
-					if (!strcmp(rootdir->di_fname,
-						    QNX4_BMNAME)) {
-						found = 1;
-						qnx4_sb(sb)->BitMap = kmemdup(rootdir,
-									      sizeof(struct qnx4_inode_entry),
-									      GFP_KERNEL);
-						if (!qnx4_sb(sb)->BitMap) {
-							brelse (bh);
-							return "not enough memory for bitmap inode";
-						}/* keep bitmap inode known */
-						break;
-					}
+				QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
+				if (!strcmp(rootdir->di_fname,
+					    QNX4_BMNAME)) {
+					found = 1;
+					qnx4_sb(sb)->BitMap = kmemdup(rootdir,
+								      sizeof(struct qnx4_inode_entry),
+								      GFP_KERNEL);
+					if (!qnx4_sb(sb)->BitMap) {
+						brelse (bh);
+						return "not enough memory for bitmap inode";
+					}/* keep bitmap inode known */
+					break;
 				}
 			}
 			brelse(bh);
-- 
cgit v1.2.3


From 4134bf81ffd962f4de9bbeca55130d2238bd3698 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Jan 2012 13:40:57 -0500
Subject: qnx4: reduce the insane nesting in qnx4_checkroot()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/inode.c | 56 ++++++++++++++++++++++----------------------------------
 1 file changed, 22 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 63e0f7471fb5..3fd121c7c30a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -179,45 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
 	struct qnx4_inode_entry *rootdir;
 	int rd, rl;
 	int i, j;
-	int found = 0;
 
-	if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
+	if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
 		return "no qnx4 filesystem (no root dir).";
-	} else {
-		QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
-		rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
-		rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
-		for (j = 0; j < rl; j++) {
-			bh = sb_bread(sb, rd + j);	/* root dir, first block */
-			if (bh == NULL) {
-				return "unable to read root entry.";
-			}
-			for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
-				rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
-				QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
-				if (!strcmp(rootdir->di_fname,
-					    QNX4_BMNAME)) {
-					found = 1;
-					qnx4_sb(sb)->BitMap = kmemdup(rootdir,
-								      sizeof(struct qnx4_inode_entry),
-								      GFP_KERNEL);
-					if (!qnx4_sb(sb)->BitMap) {
-						brelse (bh);
-						return "not enough memory for bitmap inode";
-					}/* keep bitmap inode known */
-					break;
-				}
-			}
+	QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
+	rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
+	rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+	for (j = 0; j < rl; j++) {
+		bh = sb_bread(sb, rd + j);	/* root dir, first block */
+		if (bh == NULL)
+			return "unable to read root entry.";
+		rootdir = (struct qnx4_inode_entry *) bh->b_data;
+		for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
+			QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
+			if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
+				continue;
+			qnx4_sb(sb)->BitMap = kmemdup(rootdir,
+						      sizeof(struct qnx4_inode_entry),
+						      GFP_KERNEL);
 			brelse(bh);
-			if (found != 0) {
-				break;
-			}
-		}
-		if (found == 0) {
-			return "bitmap file not found.";
+			if (!qnx4_sb(sb)->BitMap)
+				return "not enough memory for bitmap inode";
+			/* keep bitmap inode known */
+			return NULL;
 		}
+		brelse(bh);
 	}
-	return NULL;
+	return "bitmap file not found.";
 }
 
 static int qnx4_fill_super(struct super_block *s, void *data, int silent)
-- 
cgit v1.2.3


From 8bc5191b261c4fd9a5e9052cebe04ce2ef05f2e7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Jan 2012 13:54:36 -0500
Subject: qnx4: don't leak ->BitMap on late failure exits

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3fd121c7c30a..6b009548d2e0 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -256,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "qnx4: get inode failed\n");
 		ret = PTR_ERR(root);
- 		goto out;
+ 		goto outb;
  	}
 
 	ret = -ENOMEM;
@@ -269,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 
       outi:
 	iput(root);
+      outb:
+	kfree(qs->BitMap);
       out:
 	brelse(bh);
       outnobh:
-- 
cgit v1.2.3


From 85e72aa5384b1a614563ad63257ded0e91d1a620 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 20 Jan 2012 14:34:09 -0800
Subject: proc: clear_refs: do not clear reserved pages

/proc/pid/clear_refs is used to clear the Referenced and YOUNG bits for
pages and corresponding page table entries of the task with PID pid, which
includes any special mappings inserted into the page tables in order to
provide things like vDSOs and user helper functions.

On ARM this causes a problem because the vectors page is mapped as a
global mapping and since ec706dab ("ARM: add a vma entry for the user
accessible vector page"), a VMA is also inserted into each task for this
page to aid unwinding through signals and syscall restarts.  Since the
vectors page is required for handling faults, clearing the YOUNG bit (and
subsequently writing a faulting pte) means that we lose the vectors page
*globally* and cannot fault it back in.  This results in a system deadlock
on the next exception.

To see this problem in action, just run:

	$ echo 1 > /proc/self/clear_refs

on an ARM platform (as any user) and watch your system hang.  I think this
has been the case since 2.6.37

This patch avoids clearing the aforementioned bits for reserved pages,
therefore leaving the vectors page intact on ARM.  Since reserved pages
are not candidates for swap, this change should not have any impact on the
usefulness of clear_refs.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Reported-by: Moussa Ba <moussaba@micron.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Acked-by: Nicolas Pitre <nico@linaro.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: <stable@vger.kernel.org>		[2.6.37+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0e..7dcd2a250495 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!page)
 			continue;
 
+		if (PageReserved(page))
+			continue;
+
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
 		ClearPageReferenced(page);
-- 
cgit v1.2.3


From b5763accd3b5fc131ee06e26ce56e63ae0322c9b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sat, 21 Jan 2012 11:02:42 -0800
Subject: kernel-doc: fix new warnings in debugfs

Fix new kernel-doc warnings:

Warning(fs/debugfs/file.c:556): No description found for parameter 'nregs'
Warning(fs/debugfs/file.c:556): Excess function parameter 'mregs' description in 'debugfs_print_regs32'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/debugfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f65d4455c5e5..ef023eef0464 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);
  * debugfs_print_regs32 - use seq_print to describe a set of registers
  * @s: the seq_file structure being used to generate output
  * @regs: an array if struct debugfs_reg32 structures
- * @mregs: the length of the above array
+ * @nregs: the length of the above array
  * @base: the base address to be used in reading the registers
  * @prefix: a string to be prefixed to every output line
  *
-- 
cgit v1.2.3


From 803ab977618eae2b292cda0a97eed75f42250ddf Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 24 Jan 2012 11:39:22 +0300
Subject: cifs: NULL dereference on allocation failure

We should just return directly here, the goto causes a NULL dereference.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 986709a8d903..026d6464335b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3857,10 +3857,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	struct smb_vol *vol_info;
 
 	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
-	if (vol_info == NULL) {
-		tcon = ERR_PTR(-ENOMEM);
-		goto out;
-	}
+	if (vol_info == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	vol_info->local_nls = cifs_sb->local_nls;
 	vol_info->linux_uid = fsuid;
-- 
cgit v1.2.3


From 0863b04d1578879173aacbc5c7be749fccb70809 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sat, 21 Jan 2012 11:02:42 -0800
Subject: kernel-doc: fix new warnings in debugfs

Fix new kernel-doc warnings:

Warning(fs/debugfs/file.c:556): No description found for parameter 'nregs'
Warning(fs/debugfs/file.c:556): Excess function parameter 'mregs' description in 'debugfs_print_regs32'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f65d4455c5e5..ef023eef0464 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);
  * debugfs_print_regs32 - use seq_print to describe a set of registers
  * @s: the seq_file structure being used to generate output
  * @regs: an array if struct debugfs_reg32 structures
- * @mregs: the length of the above array
+ * @nregs: the length of the above array
  * @base: the base address to be used in reading the registers
  * @prefix: a string to be prefixed to every output line
  *
-- 
cgit v1.2.3


From ce597919361dcec97341151690e780eade2a9cf4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 13 Jan 2012 21:32:59 -0800
Subject: sysfs: Complain bitterly about attempts to remove files from
 nonexistent directories.

Recently an OOPS was observed from the usb serial io_ti driver when it tried to remove
sysfs directories.  Upon investigation it turns out this driver was always buggy
and that a recent sysfs change had stopped guarding itself against removing attributes
from sysfs directories that had already been removed. :(

Historically we have been silent about attempting to files from nonexistent sysfs
directories and have politely returned error codes.  That has resulted in people writing
broken code that ignores the error codes.

Issue a kernel WARNING and a stack backtrace to make it clear in no uncertain
terms that abusing sysfs is not ok, and the callers need to fix their code.

This change transforms the io_ti OOPS into a more comprehensible error message
and stack backtrace.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Reported-by: Wolfgang Frisch <wfpub@roembden.net>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c  | 6 ++++++
 fs/sysfs/inode.c | 5 ++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 62f4fb37789e..00012e31829d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
 	const void *ns = NULL;
 	int err;
 
+	if (!dir_sd) {
+		WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
+			kobject_name(kobj));
+		return -ENOENT;
+	}
+
 	err = 0;
 	if (!sysfs_ns_type(dir_sd))
 		goto out;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 4a802b4a9056..85eb81683a29 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
 
-	if (!dir_sd)
+	if (!dir_sd) {
+		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
+			name);
 		return -ENOENT;
+	}
 
 	sysfs_addrm_start(&acxt, dir_sd);
 
-- 
cgit v1.2.3


From 9b025eb3a89e041bab6698e3858706be2385d692 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 11 Jan 2012 18:52:10 +0000
Subject: xfs: Fix missing xfs_iunlock() on error recovery path in
 xfs_readlink()

Commit b52a360b forgot to call xfs_iunlock() when it detected corrupted
symplink and bailed out. Fix it by jumping to 'out' instead of doing return.

CC: stable@kernel.org
CC: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Alex Elder <elder@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0cf52da9d246..ebdb88840a47 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -131,7 +131,8 @@ xfs_readlink(
 			 __func__, (unsigned long long) ip->i_ino,
 			 (long long) pathlen);
 		ASSERT(0);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto out;
 	}
 
 
-- 
cgit v1.2.3


From 30373dc0c87ffef68d5628e77d56ffb1fa22e1ee Mon Sep 17 00:00:00 2001
From: Tim Gardner <tim.gardner@canonical.com>
Date: Thu, 12 Jan 2012 16:31:55 +0100
Subject: ecryptfs: Improve metadata read failure logging

Print inode on metadata read failure. The only real
way of dealing with metadata read failures is to delete
the underlying file system file. Having the inode
allows one to 'find . -inum INODE`.

[tyhicks@canonical.com: Removed some minor not-for-stable parts]
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: stable@vger.kernel.org
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/crypto.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2a834255c75d..2bf52033538b 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1620,7 +1620,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
 		if (rc) {
 			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
-			       "file header region or xattr region\n");
+			       "file header region or xattr region, inode %lu\n",
+				ecryptfs_inode->i_ino);
 			rc = -EINVAL;
 			goto out;
 		}
@@ -1629,7 +1630,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 						ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
 		if (rc) {
 			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
-			       "file xattr region either\n");
+			       "file xattr region either, inode %lu\n",
+				ecryptfs_inode->i_ino);
 			rc = -EINVAL;
 		}
 		if (crypt_stat->mount_crypt_stat->flags
@@ -1640,7 +1642,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 			       "crypto metadata only in the extended attribute "
 			       "region, but eCryptfs was mounted without "
 			       "xattr support enabled. eCryptfs will not treat "
-			       "this like an encrypted file.\n");
+			       "this like an encrypted file, inode %lu\n",
+				ecryptfs_inode->i_ino);
 			rc = -EINVAL;
 		}
 	}
-- 
cgit v1.2.3


From bb4503615d95d6826b7907986ad574e3157877e8 Mon Sep 17 00:00:00 2001
From: Tim Gardner <tim.gardner@canonical.com>
Date: Thu, 12 Jan 2012 16:31:55 +0100
Subject: ecryptfs: Remove unnecessary variable initialization

Removes unneeded variable initialization in ecryptfs_read_metadata(). Also adds
a small comment to help explain metadata reading logic.

[tyhicks@canonical.com: Pulled out of for-stable patch and wrote commit msg]
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/crypto.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2bf52033538b..ff981503b3e3 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1590,8 +1590,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
  */
 int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 {
-	int rc = 0;
-	char *page_virt = NULL;
+	int rc;
+	char *page_virt;
 	struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
 	struct ecryptfs_crypt_stat *crypt_stat =
 	    &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -1616,6 +1616,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 						ecryptfs_dentry,
 						ECRYPTFS_VALIDATE_HEADER_SIZE);
 	if (rc) {
+		/* metadata is not in the file header, so try xattrs */
 		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
 		if (rc) {
-- 
cgit v1.2.3


From db10e556518eb9d21ee92ff944530d84349684f4 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Thu, 12 Jan 2012 11:30:44 +0100
Subject: eCryptfs: Sanitize write counts of /dev/ecryptfs

A malicious count value specified when writing to /dev/ecryptfs may
result in a a very large kernel memory allocation.

This patch peeks at the specified packet payload size, adds that to the
size of the packet headers and compares the result with the write count
value. The resulting maximum memory allocation size is approximately 532
bytes.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Cc: <stable@vger.kernel.org>
---
 fs/ecryptfs/miscdev.c | 56 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 940a82e63dc3..0dc5a3d554a4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -409,11 +409,47 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 	ssize_t sz = 0;
 	char *data;
 	uid_t euid = current_euid();
+	unsigned char packet_size_peek[3];
 	int rc;
 
-	if (count == 0)
+	if (count == 0) {
 		goto out;
+	} else if (count == (1 + 4)) {
+		/* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
+		goto memdup;
+	} else if (count < (1 + 4 + 1)
+		   || count > (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4
+			       + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)) {
+		printk(KERN_WARNING "%s: Acceptable packet size range is "
+		       "[%d-%lu], but amount of data written is [%zu].",
+		       __func__, (1 + 4 + 1),
+		       (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4
+			+ ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES), count);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(packet_size_peek, (buf + 1 + 4),
+			   sizeof(packet_size_peek))) {
+		printk(KERN_WARNING "%s: Error while inspecting packet size\n",
+		       __func__);
+		return -EFAULT;
+	}
+
+	rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
+					  &packet_size_length);
+	if (rc) {
+		printk(KERN_WARNING "%s: Error parsing packet length; "
+		       "rc = [%d]\n", __func__, rc);
+		return rc;
+	}
+
+	if ((1 + 4 + packet_size_length + packet_size) != count) {
+		printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
+		       packet_size);
+		return -EINVAL;
+	}
 
+memdup:
 	data = memdup_user(buf, count);
 	if (IS_ERR(data)) {
 		printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
@@ -435,23 +471,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 		}
 		memcpy(&counter_nbo, &data[i], 4);
 		seq = be32_to_cpu(counter_nbo);
-		i += 4;
-		rc = ecryptfs_parse_packet_length(&data[i], &packet_size,
-						  &packet_size_length);
-		if (rc) {
-			printk(KERN_WARNING "%s: Error parsing packet length; "
-			       "rc = [%d]\n", __func__, rc);
-			goto out_free;
-		}
-		i += packet_size_length;
-		if ((1 + 4 + packet_size_length + packet_size) != count) {
-			printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
-			       " + packet_size([%zd]))([%zd]) != "
-			       "count([%zd]). Invalid packet format.\n",
-			       __func__, packet_size_length, packet_size,
-			       (1 + packet_size_length + packet_size), count);
-			goto out_free;
-		}
+		i += 4 + packet_size_length;
 		rc = ecryptfs_miscdev_response(&data[i], packet_size,
 					       euid, current_user_ns(),
 					       task_pid(current), seq);
-- 
cgit v1.2.3


From 7f133504249afa48618becac546ce3c35c9f0185 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Sat, 14 Jan 2012 15:51:37 +0100
Subject: eCryptfs: Report errors in writes to /dev/ecryptfs

Errors in writes to /dev/ecryptfs were being incorrectly reported by
returning 0 or the value of the original write count.

This patch clears up the return code assignment in error paths.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/miscdev.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 0dc5a3d554a4..1145c58103e2 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -406,14 +406,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 	__be32 counter_nbo;
 	u32 seq;
 	size_t packet_size, packet_size_length, i;
-	ssize_t sz = 0;
 	char *data;
 	uid_t euid = current_euid();
 	unsigned char packet_size_peek[3];
-	int rc;
+	ssize_t rc;
 
 	if (count == 0) {
-		goto out;
+		return 0;
 	} else if (count == (1 + 4)) {
 		/* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
 		goto memdup;
@@ -439,7 +438,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 					  &packet_size_length);
 	if (rc) {
 		printk(KERN_WARNING "%s: Error parsing packet length; "
-		       "rc = [%d]\n", __func__, rc);
+		       "rc = [%zd]\n", __func__, rc);
 		return rc;
 	}
 
@@ -454,9 +453,8 @@ memdup:
 	if (IS_ERR(data)) {
 		printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
 		       __func__, PTR_ERR(data));
-		goto out;
+		return PTR_ERR(data);
 	}
-	sz = count;
 	i = 0;
 	switch (data[i++]) {
 	case ECRYPTFS_MSG_RESPONSE:
@@ -467,6 +465,7 @@ memdup:
 			       __func__,
 			       (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
 			       count);
+			rc = -EINVAL;
 			goto out_free;
 		}
 		memcpy(&counter_nbo, &data[i], 4);
@@ -475,10 +474,12 @@ memdup:
 		rc = ecryptfs_miscdev_response(&data[i], packet_size,
 					       euid, current_user_ns(),
 					       task_pid(current), seq);
-		if (rc)
+		if (rc) {
 			printk(KERN_WARNING "%s: Failed to deliver miscdev "
-			       "response to requesting operation; rc = [%d]\n",
+			       "response to requesting operation; rc = [%zd]\n",
 			       __func__, rc);
+			goto out_free;
+		}
 		break;
 	case ECRYPTFS_MSG_HELO:
 	case ECRYPTFS_MSG_QUIT:
@@ -487,12 +488,13 @@ memdup:
 		ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
 				"message of unrecognized type [%d]\n",
 				data[0]);
-		break;
+		rc = -EINVAL;
+		goto out_free;
 	}
+	rc = count;
 out_free:
 	kfree(data);
-out:
-	return sz;
+	return rc;
 }
 
 
-- 
cgit v1.2.3


From 48399c0b0e6172888a2e2e36df1595ab1e049ba8 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Sat, 14 Jan 2012 16:46:46 +0100
Subject: eCryptfs: Replace miscdev read/write magic numbers

ecryptfs_miscdev_read() and ecryptfs_miscdev_write() contained many
magic numbers for specifying packet header field sizes and offsets. This
patch defines those values and replaces the magic values.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/ecryptfs_kernel.h |  5 +++
 fs/ecryptfs/keystore.c        |  5 ++-
 fs/ecryptfs/miscdev.c         | 86 +++++++++++++++++++++++--------------------
 3 files changed, 55 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a9f29b12fbf2..a2362df58ae8 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -151,6 +151,11 @@ ecryptfs_get_key_payload_data(struct key *key)
 					  * dentry name */
 #define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
 					  * metadata */
+#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
+#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
+				     * ecryptfs_parse_packet_length() and
+				     * ecryptfs_write_packet_length()
+				     */
 /* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
  * ECRYPTFS_MAX_IV_BYTES */
 #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ac1ad48c2376..8e3b943e330f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
 		(*size) += ((unsigned char)(data[1]) + 192);
 		(*length_size) = 2;
 	} else if (data[0] == 255) {
-		/* Five-byte length; we're not supposed to see this */
+		/* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
 		ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
 				"supported\n");
 		rc = -EINVAL;
@@ -126,7 +126,7 @@ out:
 /**
  * ecryptfs_write_packet_length
  * @dest: The byte array target into which to write the length. Must
- *        have at least 5 bytes allocated.
+ *        have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
  * @size: The length to write.
  * @packet_size_length: The number of bytes used to encode the packet
  *                      length is written to this address.
@@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
 		dest[1] = ((size - 192) % 256);
 		(*packet_size_length) = 2;
 	} else {
+		/* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
 		rc = -EINVAL;
 		ecryptfs_printk(KERN_WARNING,
 				"Unsupported packet size: [%zd]\n", size);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 1145c58103e2..349209dc6a91 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -218,6 +218,29 @@ out_unlock:
 	return rc;
 }
 
+/*
+ * miscdevfs packet format:
+ *  Octet 0: Type
+ *  Octets 1-4: network byte order msg_ctx->counter
+ *  Octets 5-N0: Size of struct ecryptfs_message to follow
+ *  Octets N0-N1: struct ecryptfs_message (including data)
+ *
+ *  Octets 5-N1 not written if the packet type does not include a message
+ */
+#define PKT_TYPE_SIZE		1
+#define PKT_CTR_SIZE		4
+#define MIN_NON_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE)
+#define MIN_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE \
+				 + ECRYPTFS_MIN_PKT_LEN_SIZE)
+/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
+#define MAX_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE \
+				 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+				 + sizeof(struct ecryptfs_message) \
+				 + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
+#define PKT_TYPE_OFFSET		0
+#define PKT_CTR_OFFSET		PKT_TYPE_SIZE
+#define PKT_LEN_OFFSET		(PKT_TYPE_SIZE + PKT_CTR_SIZE)
+
 /**
  * ecryptfs_miscdev_read - format and send message from queue
  * @file: fs/ecryptfs/euid miscdevfs handle (ignored)
@@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
 	struct ecryptfs_daemon *daemon;
 	struct ecryptfs_msg_ctx *msg_ctx;
 	size_t packet_length_size;
-	char packet_length[3];
+	char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
 	size_t i;
 	size_t total_length;
 	uid_t euid = current_euid();
@@ -305,15 +328,8 @@ check_list:
 		packet_length_size = 0;
 		msg_ctx->msg_size = 0;
 	}
-	/* miscdevfs packet format:
-	 *  Octet 0: Type
-	 *  Octets 1-4: network byte order msg_ctx->counter
-	 *  Octets 5-N0: Size of struct ecryptfs_message to follow
-	 *  Octets N0-N1: struct ecryptfs_message (including data)
-	 *
-	 *  Octets 5-N1 not written if the packet type does not
-	 *  include a message */
-	total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size);
+	total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
+			+ msg_ctx->msg_size);
 	if (count < total_length) {
 		rc = 0;
 		printk(KERN_WARNING "%s: Only given user buffer of "
@@ -324,9 +340,10 @@ check_list:
 	rc = -EFAULT;
 	if (put_user(msg_ctx->type, buf))
 		goto out_unlock_msg_ctx;
-	if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1)))
+	if (put_user(cpu_to_be32(msg_ctx->counter),
+		     (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
 		goto out_unlock_msg_ctx;
-	i = 5;
+	i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
 	if (msg_ctx->msg) {
 		if (copy_to_user(&buf[i], packet_length, packet_length_size))
 			goto out_unlock_msg_ctx;
@@ -391,12 +408,6 @@ out:
  * @count: Amount of data in @buf
  * @ppos: Pointer to offset in file (ignored)
  *
- * miscdevfs packet format:
- *  Octet 0: Type
- *  Octets 1-4: network byte order msg_ctx->counter (0's for non-response)
- *  Octets 5-N0: Size of struct ecryptfs_message to follow
- *  Octets N0-N1: struct ecryptfs_message (including data)
- *
  * Returns the number of bytes read from @buf
  */
 static ssize_t
@@ -405,29 +416,25 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 {
 	__be32 counter_nbo;
 	u32 seq;
-	size_t packet_size, packet_size_length, i;
+	size_t packet_size, packet_size_length;
 	char *data;
 	uid_t euid = current_euid();
-	unsigned char packet_size_peek[3];
+	unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
 	ssize_t rc;
 
 	if (count == 0) {
 		return 0;
-	} else if (count == (1 + 4)) {
+	} else if (count == MIN_NON_MSG_PKT_SIZE) {
 		/* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
 		goto memdup;
-	} else if (count < (1 + 4 + 1)
-		   || count > (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4
-			       + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)) {
+	} else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
 		printk(KERN_WARNING "%s: Acceptable packet size range is "
 		       "[%d-%lu], but amount of data written is [%zu].",
-		       __func__, (1 + 4 + 1),
-		       (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4
-			+ ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES), count);
+		       __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
 		return -EINVAL;
 	}
 
-	if (copy_from_user(packet_size_peek, (buf + 1 + 4),
+	if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
 			   sizeof(packet_size_peek))) {
 		printk(KERN_WARNING "%s: Error while inspecting packet size\n",
 		       __func__);
@@ -442,7 +449,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 		return rc;
 	}
 
-	if ((1 + 4 + packet_size_length + packet_size) != count) {
+	if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
+	    != count) {
 		printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
 		       packet_size);
 		return -EINVAL;
@@ -455,25 +463,25 @@ memdup:
 		       __func__, PTR_ERR(data));
 		return PTR_ERR(data);
 	}
-	i = 0;
-	switch (data[i++]) {
+	switch (data[PKT_TYPE_OFFSET]) {
 	case ECRYPTFS_MSG_RESPONSE:
-		if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
+		if (count < (MIN_MSG_PKT_SIZE
+			     + sizeof(struct ecryptfs_message))) {
 			printk(KERN_WARNING "%s: Minimum acceptable packet "
 			       "size is [%zd], but amount of data written is "
 			       "only [%zd]. Discarding response packet.\n",
 			       __func__,
-			       (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
-			       count);
+			       (MIN_MSG_PKT_SIZE
+				+ sizeof(struct ecryptfs_message)), count);
 			rc = -EINVAL;
 			goto out_free;
 		}
-		memcpy(&counter_nbo, &data[i], 4);
+		memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
 		seq = be32_to_cpu(counter_nbo);
-		i += 4 + packet_size_length;
-		rc = ecryptfs_miscdev_response(&data[i], packet_size,
-					       euid, current_user_ns(),
-					       task_pid(current), seq);
+		rc = ecryptfs_miscdev_response(
+				&data[PKT_LEN_OFFSET + packet_size_length],
+				packet_size, euid, current_user_ns(),
+				task_pid(current), seq);
 		if (rc) {
 			printk(KERN_WARNING "%s: Failed to deliver miscdev "
 			       "response to requesting operation; rc = [%zd]\n",
-- 
cgit v1.2.3


From 684a3ff7e69acc7c678d1a1394fe9e757993fd34 Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@nudt.edu.cn>
Date: Thu, 19 Jan 2012 09:44:36 +0800
Subject: eCryptfs: Infinite loop due to overflow in ecryptfs_write()

ecryptfs_write() can enter an infinite loop when truncating a file to a
size larger than 4G. This only happens on architectures where size_t is
represented by 32 bits.

This was caused by a size_t overflow due to it incorrectly being used to
store the result of a calculation which uses potentially large values of
type loff_t.

[tyhicks@canonical.com: rewrite subject and commit message]
Signed-off-by: Li Wang <liwang@nudt.edu.cn>
Signed-off-by: Yunchuan Wen <wenyunchuan@kylinos.com.cn>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/read_write.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3745f7c2b9c2..ec3d9368dc5b 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -130,13 +130,13 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 		pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
 		size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
 		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
-		size_t total_remaining_bytes = ((offset + size) - pos);
+		loff_t total_remaining_bytes = ((offset + size) - pos);
 
 		if (num_bytes > total_remaining_bytes)
 			num_bytes = total_remaining_bytes;
 		if (pos < offset) {
 			/* remaining zeros to write, up to destination offset */
-			size_t total_remaining_zeros = (offset - pos);
+			loff_t total_remaining_zeros = (offset - pos);
 
 			if (num_bytes > total_remaining_zeros)
 				num_bytes = total_remaining_zeros;
-- 
cgit v1.2.3


From 5e6f0d769017cc49207ef56996e42363ec26c1f0 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Wed, 18 Jan 2012 18:30:04 -0600
Subject: eCryptfs: Make truncate path killable

ecryptfs_write() handles the truncation of eCryptfs inodes. It grabs a
page, zeroes out the appropriate portions, and then encrypts the page
before writing it to the lower filesystem. It was unkillable and due to
the lack of sparse file support could result in tying up a large portion
of system resources, while encrypting pages of zeros, with no way for
the truncate operation to be stopped from userspace.

This patch adds the ability for ecryptfs_write() to detect a pending
fatal signal and return as gracefully as possible. The intent is to
leave the lower file in a useable state, while still allowing a user to
break out of the encryption loop. If a pending fatal signal is detected,
the eCryptfs inode size is updated to reflect the modified inode size
and then -EINTR is returned.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Cc: <stable@vger.kernel.org>
---
 fs/ecryptfs/read_write.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index ec3d9368dc5b..608c1c3fde1b 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -132,6 +132,11 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
 		loff_t total_remaining_bytes = ((offset + size) - pos);
 
+		if (fatal_signal_pending(current)) {
+			rc = -EINTR;
+			break;
+		}
+
 		if (num_bytes > total_remaining_bytes)
 			num_bytes = total_remaining_bytes;
 		if (pos < offset) {
@@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 		}
 		pos += num_bytes;
 	}
-	if ((offset + size) > ecryptfs_file_size) {
-		i_size_write(ecryptfs_inode, (offset + size));
+	if (pos > ecryptfs_file_size) {
+		i_size_write(ecryptfs_inode, pos);
 		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
-			rc = ecryptfs_write_inode_size_to_metadata(
+			int rc2;
+
+			rc2 = ecryptfs_write_inode_size_to_metadata(
 								ecryptfs_inode);
-			if (rc) {
+			if (rc2) {
 				printk(KERN_ERR	"Problem with "
 				       "ecryptfs_write_inode_size_to_metadata; "
-				       "rc = [%d]\n", rc);
+				       "rc = [%d]\n", rc2);
+				if (!rc)
+					rc = rc2;
 				goto out;
 			}
 		}
-- 
cgit v1.2.3


From a261a03904849c3df50bd0300efb7fb3f865137d Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Thu, 19 Jan 2012 20:33:44 -0600
Subject: eCryptfs: Check inode changes in setattr

Most filesystems call inode_change_ok() very early in ->setattr(), but
eCryptfs didn't call it at all. It allowed the lower filesystem to make
the call in its ->setattr() function. Then, eCryptfs would copy the
appropriate inode attributes from the lower inode to the eCryptfs inode.

This patch changes that and actually calls inode_change_ok() on the
eCryptfs inode, fairly early in ecryptfs_setattr(). Ideally, the call
would happen earlier in ecryptfs_setattr(), but there are some possible
inode initialization steps that must happen first.

Since the call was already being made on the lower inode, the change in
functionality should be minimal, except for the case of a file extending
truncate call. In that case, inode_newsize_ok() was never being
called on the eCryptfs inode. Rather than inode_newsize_ok() catching
maximum file size errors early on, eCryptfs would encrypt zeroed pages
and write them to the lower filesystem until the lower filesystem's
write path caught the error in generic_write_checks(). This patch
introduces a new function, called ecryptfs_inode_newsize_ok(), which
checks if the new lower file size is within the appropriate limits when
the truncate operation will be growing the lower file.

In summary this change prevents eCryptfs truncate operations (and the
resulting page encryptions), which would exceed the lower filesystem
limits or FSIZE rlimits, from ever starting.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reviewed-by: Li Wang <liwang@nudt.edu.cn>
Cc: <stable@vger.kernel.org>
---
 fs/ecryptfs/inode.c | 48 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 19a8ca4ab1dd..19892d7d2ed1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -822,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		size_t num_zeros = (PAGE_CACHE_SIZE
 				    - (ia->ia_size & ~PAGE_CACHE_MASK));
 
-
-		/*
-		 * XXX(truncate) this should really happen at the begginning
-		 * of ->setattr.  But the code is too messy to that as part
-		 * of a larger patch.  ecryptfs is also totally missing out
-		 * on the inode_change_ok check at the beginning of
-		 * ->setattr while would include this.
-		 */
-		rc = inode_newsize_ok(inode, ia->ia_size);
-		if (rc)
-			goto out;
-
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 			truncate_setsize(inode, ia->ia_size);
 			lower_ia->ia_size = ia->ia_size;
@@ -883,6 +871,28 @@ out:
 	return rc;
 }
 
+static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
+{
+	struct ecryptfs_crypt_stat *crypt_stat;
+	loff_t lower_oldsize, lower_newsize;
+
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	lower_oldsize = upper_size_to_lower_size(crypt_stat,
+						 i_size_read(inode));
+	lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
+	if (lower_newsize > lower_oldsize) {
+		/*
+		 * The eCryptfs inode and the new *lower* size are mixed here
+		 * because we may not have the lower i_mutex held and/or it may
+		 * not be appropriate to call inode_newsize_ok() with inodes
+		 * from other filesystems.
+		 */
+		return inode_newsize_ok(inode, lower_newsize);
+	}
+
+	return 0;
+}
+
 /**
  * ecryptfs_truncate
  * @dentry: The ecryptfs layer dentry
@@ -899,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 	struct iattr lower_ia = { .ia_valid = 0 };
 	int rc;
 
+	rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
+	if (rc)
+		return rc;
+
 	rc = truncate_upper(dentry, &ia, &lower_ia);
 	if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
 		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -978,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
+
+	rc = inode_change_ok(inode, ia);
+	if (rc)
+		goto out;
+	if (ia->ia_valid & ATTR_SIZE) {
+		rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
+		if (rc)
+			goto out;
+	}
+
 	if (S_ISREG(inode->i_mode)) {
 		rc = filemap_write_and_wait(inode->i_mapping);
 		if (rc)
-- 
cgit v1.2.3


From f2cb933501ebc066bf3c4b1836fd8428f8fe9863 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Wed, 18 Jan 2012 15:09:43 -0600
Subject: eCryptfs: Remove unused ecryptfs_read()

ecryptfs_read() has been ifdef'ed out for years now and it was
apparently unused before then. It is time to get rid of it for good.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/read_write.c | 73 ------------------------------------------------
 1 file changed, 73 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 608c1c3fde1b..5c0106f75775 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -282,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 	flush_dcache_page(page_for_ecryptfs);
 	return rc;
 }
-
-#if 0
-/**
- * ecryptfs_read
- * @data: The virtual address into which to write the data read (and
- *        possibly decrypted) from the lower file
- * @offset: The offset in the decrypted view of the file from which to
- *          read into @data
- * @size: The number of bytes to read into @data
- * @ecryptfs_file: The eCryptfs file from which to read
- *
- * Read an arbitrary amount of data from an arbitrary location in the
- * eCryptfs page cache. This is done on an extent-by-extent basis;
- * individual extents are decrypted and read from the lower page
- * cache (via VFS reads). This function takes care of all the
- * address translation to locations in the lower filesystem.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_read(char *data, loff_t offset, size_t size,
-		  struct file *ecryptfs_file)
-{
-	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
-	struct page *ecryptfs_page;
-	char *ecryptfs_page_virt;
-	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
-	loff_t data_offset = 0;
-	loff_t pos;
-	int rc = 0;
-
-	if ((offset + size) > ecryptfs_file_size) {
-		rc = -EINVAL;
-		printk(KERN_ERR "%s: Attempt to read data past the end of the "
-			"file; offset = [%lld]; size = [%td]; "
-		       "ecryptfs_file_size = [%lld]\n",
-		       __func__, offset, size, ecryptfs_file_size);
-		goto out;
-	}
-	pos = offset;
-	while (pos < (offset + size)) {
-		pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
-		size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
-		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
-		size_t total_remaining_bytes = ((offset + size) - pos);
-
-		if (num_bytes > total_remaining_bytes)
-			num_bytes = total_remaining_bytes;
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
-							 ecryptfs_page_idx);
-		if (IS_ERR(ecryptfs_page)) {
-			rc = PTR_ERR(ecryptfs_page);
-			printk(KERN_ERR "%s: Error getting page at "
-			       "index [%ld] from eCryptfs inode "
-			       "mapping; rc = [%d]\n", __func__,
-			       ecryptfs_page_idx, rc);
-			goto out;
-		}
-		ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
-		memcpy((data + data_offset),
-		       ((char *)ecryptfs_page_virt + start_offset_in_page),
-		       num_bytes);
-		kunmap_atomic(ecryptfs_page_virt, KM_USER0);
-		flush_dcache_page(ecryptfs_page);
-		SetPageUptodate(ecryptfs_page);
-		unlock_page(ecryptfs_page);
-		page_cache_release(ecryptfs_page);
-		pos += num_bytes;
-		data_offset += num_bytes;
-	}
-out:
-	return rc;
-}
-#endif  /*  0  */
-- 
cgit v1.2.3


From 58ded24f0fcb85bddb665baba75892f6ad0f4b8a Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Tue, 24 Jan 2012 10:02:22 -0600
Subject: eCryptfs: Fix oops when printing debug info in extent crypto
 functions

If pages passed to the eCryptfs extent-based crypto functions are not
mapped and the module parameter ecryptfs_verbosity=1 was specified at
loading time, a NULL pointer dereference will occur.

Note that this wouldn't happen on a production system, as you wouldn't
pass ecryptfs_verbosity=1 on a production system. It leaks private
information to the system logs and is for debugging only.

The debugging info printed in these messages is no longer very useful
and rather than doing a kmap() in these debugging paths, it will be
better to simply remove the debugging paths completely.

https://launchpad.net/bugs/913651

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Daniel DeFreez
Cc: <stable@vger.kernel.org>
---
 fs/ecryptfs/crypto.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index ff981503b3e3..63ab24510649 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 			(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
-	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
-				"with iv:\n");
-		ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
-		ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
-				"encryption:\n");
-		ecryptfs_dump_hex((char *)
-				  (page_address(page)
-				   + (extent_offset * crypt_stat->extent_size)),
-				  8);
-	}
 	rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
 					  page, (extent_offset
 						 * crypt_stat->extent_size),
@@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 		goto out;
 	}
 	rc = 0;
-	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
-			"rc = [%d]\n",
-			(unsigned long long)(extent_base + extent_offset), rc);
-		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
-				"encryption:\n");
-		ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
-	}
 out:
 	return rc;
 }
@@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
 			(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
-	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
-				"with iv:\n");
-		ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
-		ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
-				"decryption:\n");
-		ecryptfs_dump_hex((char *)
-				  (page_address(enc_extent_page)
-				   + (extent_offset * crypt_stat->extent_size)),
-				  8);
-	}
 	rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
 					  (extent_offset
 					   * crypt_stat->extent_size),
@@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
 		goto out;
 	}
 	rc = 0;
-	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
-			"rc = [%d]\n",
-			(unsigned long long)(extent_base + extent_offset), rc);
-		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
-				"decryption:\n");
-		ecryptfs_dump_hex((char *)(page_address(page)
-					   + (extent_offset
-					      * crypt_stat->extent_size)), 8);
-	}
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 1589cb1a94c381579a0235ca708d9e2dca6d3a39 Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@nudt.edu.cn>
Date: Wed, 25 Jan 2012 15:40:31 +0800
Subject: eCryptfs: move misleading function comments

 The data encryption was moved from ecryptfs_write_end into
ecryptfs_writepage, this patch moves the corresponding function
comments to be consistent with the modification.

Signed-off-by: Li Wang <liwang@nudt.edu.cn>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/mmap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 6a44148c5fb9..10ec695ccd68 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
  * @page: Page that is locked before this call is made
  *
  * Returns zero on success; non-zero otherwise
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem.  In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
  */
 static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
 {
@@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
  * @copied: The amount of data copied
  * @page: The eCryptfs page
  * @fsdata: The fsdata (unused)
- *
- * This is where we encrypt the data and pass the encrypted data to
- * the lower filesystem.  In OpenPGP-compatible mode, we operate on
- * entire underlying packets.
  */
 static int ecryptfs_write_end(struct file *file,
 			struct address_space *mapping,
-- 
cgit v1.2.3


From b1375d64c539c5b76794be759b62d3f178e67c32 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 26 Jan 2012 15:01:11 -0500
Subject: Btrfs: fix uninit warning in backref.c

Added initialization with the declaration of ret. It isn't set later on the
switch-default branch (which should never be taken).

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/backref.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b9a843226de8..633c701a287d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -297,7 +297,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
 	int sgn;
-	int ret;
+	int ret = 0;
 
 	if (extent_op && extent_op->update_key)
 		btrfs_disk_key_to_cpu(info_key, &extent_op->key);
@@ -392,7 +392,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_key *info_key, int *info_level,
 			     struct list_head *prefs)
 {
-	int ret;
+	int ret = 0;
 	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
-- 
cgit v1.2.3


From 357b9784b79924a31ccded5d9a0c688f48cc28f2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 26 Jan 2012 15:01:11 -0500
Subject: Btrfs: make sure a bitmap has enough bytes

We have only been checking for min_bytes available in bitmap entries, but we
won't successfully setup a bitmap cluster unless it has at least bytes in the
bitmap, so in the common case min_bytes is 4k and we want something like 2MB, so
if there are a bunch of bitmap entries with less than 2mb's in them, we'll
search all them anyway, which is suboptimal.  Fix this check.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index efe20032e4a1..6e7406932341 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2475,7 +2475,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 	}
 
 	list_for_each_entry(entry, bitmaps, list) {
-		if (entry->bytes < min_bytes)
+		if (entry->bytes < bytes)
 			continue;
 		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
 					   bytes, cont1_bytes, min_bytes);
-- 
cgit v1.2.3


From 6dd70ce4eb7429c2ba6dd9fa46f78a0a2a254038 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 26 Jan 2012 15:01:11 -0500
Subject: btrfs: Fix busyloops in transaction waiting code

wait_log_commit() and wait_for_writer() were using slightly different
conditions for deciding whether they should call schedule() and whether they
should continue in the wait loop. Thus it could happen that we busylooped when
the first condition was not true while the second one was. That is burning CPU
cycles needlessly and is deadly on UP machines...

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cb877e0886a7..966cc74f5d6c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
 
 		finish_wait(&root->log_commit_wait[index], &wait);
 		mutex_lock(&root->log_mutex);
-	} while (root->log_transid < transid + 2 &&
+	} while (root->fs_info->last_trans_log_full_commit !=
+		 trans->transid && root->log_transid < transid + 2 &&
 		 atomic_read(&root->log_commit[index]));
 	return 0;
 }
@@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
-	while (atomic_read(&root->log_writers)) {
+	while (root->fs_info->last_trans_log_full_commit !=
+	       trans->transid && atomic_read(&root->log_writers)) {
 		prepare_to_wait(&root->log_writer_wait,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-- 
cgit v1.2.3


From 8bedd51b6121c4607784d75f852828d25d119c52 Mon Sep 17 00:00:00 2001
From: Mitch Harder <mitch.harder@sabayonlinux.org>
Date: Thu, 26 Jan 2012 15:01:11 -0500
Subject: Btrfs: Check for NULL page in extent_range_uptodate

A user has encountered a NULL pointer kernel oops in btrfs when
encountering media errors.  The problem has been identified
as an unhandled NULL pointer returned from find_get_page().
This modification simply checks for a NULL page, and returns
with an error if found (the extent_range_uptodate() function
returns 1 on errors).

After testing this patch, the user reported that the error with
the NULL pointer oops was solved.  However, there is still a
remaining problem with a thread becoming stuck in
wait_on_page_locked(page) in the read_extent_buffer_pages(...)
function in extent_io.c

       for (i = start_i; i < num_pages; i++) {
               page = extent_buffer_page(eb, i);
               wait_on_page_locked(page);
               if (!PageUptodate(page))
                       ret = -EIO;
       }

This patch leaves the issue with the locked page yet to be resolved.

Signed-off-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9d09a4f81875..fcf77e1ded40 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3909,6 +3909,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	while (start <= end) {
 		index = start >> PAGE_CACHE_SHIFT;
 		page = find_get_page(tree->mapping, index);
+		if (!page)
+			return 1;
 		uptodate = PageUptodate(page);
 		page_cache_release(page);
 		if (!uptodate) {
-- 
cgit v1.2.3


From 0b4a9d248f88e6773312f262e8185f23863d984a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 26 Jan 2012 15:01:11 -0500
Subject: Btrfs: use cluster->window_start when allocating from a cluster
 bitmap

We specifically set window_start in the cluster struct to indicate where the
cluster starts in a bitmap, but we've been using min_start to indicate where
we're searching from.  This is usually the start of the blockgroup, so
essentially means we're constantly searching from the start of any bitmap we
find, which completely negates all the trouble we go to in order to setup a
cluster.  So start using window_start to make sure we actually use the area we
found.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6e7406932341..61447a51f645 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2242,7 +2242,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 		if (entry->bitmap) {
 			ret = btrfs_alloc_from_bitmap(block_group,
 						      cluster, entry, bytes,
-						      min_start);
+						      cluster->window_start);
 			if (ret == 0) {
 				node = rb_next(&entry->offset_index);
 				if (!node)
-- 
cgit v1.2.3


From 0b485143d835c019cddc45f46e4b3873dcc9aa4e Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Thu, 26 Jan 2012 15:01:11 -0500
Subject: Btrfs: fix warning for 32-bit build of fs/btrfs/check-integrity.c

There have been 4 warnings on 32-bit build, they are herewith fixed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/check-integrity.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ad0b3ba735b7..b669a7d8e499 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1662,7 +1662,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
 	block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
 					       &state->block_hashtable);
 	if (NULL != block) {
-		u64 bytenr;
+		u64 bytenr = 0;
 		struct list_head *elem_ref_to;
 		struct list_head *tmp_ref_to;
 
@@ -2777,9 +2777,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 			printk(KERN_INFO
 			       "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
 			       " size=%lu, data=%p, bdev=%p)\n",
-			       rw, bh->b_blocknr,
-			       (unsigned long long)dev_bytenr, bh->b_size,
-			       bh->b_data, bh->b_bdev);
+			       rw, (unsigned long)bh->b_blocknr,
+			       (unsigned long long)dev_bytenr,
+			       (unsigned long)bh->b_size, bh->b_data,
+			       bh->b_bdev);
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      bh->b_data, bh->b_size, NULL,
 					      NULL, bh, rw);
@@ -2844,7 +2845,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
 			printk(KERN_INFO
 			       "submit_bio(rw=0x%x, bi_vcnt=%u,"
 			       " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
-			       rw, bio->bi_vcnt, bio->bi_sector,
+			       rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
 			       (unsigned long long)dev_bytenr,
 			       bio->bi_bdev);
 
-- 
cgit v1.2.3


From 7ec31b548a17f773ab6289e795ed3a6820e8b56e Mon Sep 17 00:00:00 2001
From: Liu Bo <liubo2009@cn.fujitsu.com>
Date: Thu, 26 Jan 2012 15:01:12 -0500
Subject: Btrfs: do not defrag a file partially

xfstests 218 complains that btrfs defrags a file partially:
 After: 1
 Write backwards sync, but contiguous - should defrag to 1 extent
 Before: 10
-After: 1
+After: 2

To fix this, we need to set max_to_defrag count properly.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6834be4c8709..0b06a5ca8afc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1066,7 +1066,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index;
+		max_to_defrag = last_index + 1;
 
 	/*
 	 * make writeback starts from i, so the defrag range can be
-- 
cgit v1.2.3


From 9e622d6bea0202e9fe267955362c01918562c09b Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 26 Jan 2012 15:01:12 -0500
Subject: Btrfs: fix enospc error caused by wrong checks of the chunk

When we did sysbench test for inline files, enospc error happened easily though
there was lots of free disk space which could be allocated for new chunks.

Reproduce steps:
 # mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) <test partition>
 # mount <test partition> /mnt
 # ulimit -n 102400
 # cd /mnt
 # sysbench --num-threads=1 --test=fileio --file-num=81920 \
 > --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
 > --file-test-mode=seqwr prepare
 # sysbench --num-threads=1 --test=fileio --file-num=81920 \
 > --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
 > --file-test-mode=seqwr run
 <soon later, BUG_ON() was triggered by enospc error>

The reason of this bug is:
Now, we can reserve space which is larger than the free space in the chunks if
we have enough free disk space which can be used for new chunks. By this way,
the space allocator should allocate a new chunk by force if there is no free
space in the free space cache. But there are two wrong checks which break this
operation.

One is
	if (ret == -ENOSPC && num_bytes > min_alloc_size)
in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk
even we fail to allocate free space by minimum allocable size.

The other is
	if (space_info->force_alloc)
		force = space_info->force_alloc;
in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone
sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen.

Fix these two wrong checks. Especially the second one, we fix it by changing
the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make
CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has
higher priority. And if the value which is passed in by the caller is greater
than ->force_alloc, use the passed value.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 700879ed64cf..283af7a676a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,23 +34,24 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
-/* control flags for do_chunk_alloc's force field
+/*
+ * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  * if we really need one.
  *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  * if we have very few chunks already allocated.  This is
  * used as part of the clustering code to help make sure
  * we have a good pool of storage to cluster in, without
  * filling the FS with empty chunks
  *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
  */
 enum {
 	CHUNK_ALLOC_NO_FORCE = 0,
-	CHUNK_ALLOC_FORCE = 1,
-	CHUNK_ALLOC_LIMITED = 2,
+	CHUNK_ALLOC_LIMITED = 1,
+	CHUNK_ALLOC_FORCE = 2,
 };
 
 /*
@@ -3414,7 +3415,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 
 again:
 	spin_lock(&space_info->lock);
-	if (space_info->force_alloc)
+	if (force < space_info->force_alloc)
 		force = space_info->force_alloc;
 	if (space_info->full) {
 		spin_unlock(&space_info->lock);
@@ -5794,6 +5795,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 			 u64 search_end, struct btrfs_key *ins,
 			 u64 data)
 {
+	bool final_tried = false;
 	int ret;
 	u64 search_start = 0;
 
@@ -5813,22 +5815,25 @@ again:
 			       search_start, search_end, hint_byte,
 			       ins, data);
 
-	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
-		num_bytes = num_bytes >> 1;
-		num_bytes = num_bytes & ~(root->sectorsize - 1);
-		num_bytes = max(num_bytes, min_alloc_size);
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       num_bytes, data, CHUNK_ALLOC_FORCE);
-		goto again;
-	}
-	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
-		struct btrfs_space_info *sinfo;
-
-		sinfo = __find_space_info(root->fs_info, data);
-		printk(KERN_ERR "btrfs allocation failed flags %llu, "
-		       "wanted %llu\n", (unsigned long long)data,
-		       (unsigned long long)num_bytes);
-		dump_space_info(sinfo, num_bytes, 1);
+	if (ret == -ENOSPC) {
+		if (!final_tried) {
+			num_bytes = num_bytes >> 1;
+			num_bytes = num_bytes & ~(root->sectorsize - 1);
+			num_bytes = max(num_bytes, min_alloc_size);
+			do_chunk_alloc(trans, root->fs_info->extent_root,
+				       num_bytes, data, CHUNK_ALLOC_FORCE);
+			if (num_bytes == min_alloc_size)
+				final_tried = true;
+			goto again;
+		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+			struct btrfs_space_info *sinfo;
+
+			sinfo = __find_space_info(root->fs_info, data);
+			printk(KERN_ERR "btrfs allocation failed flags %llu, "
+			       "wanted %llu\n", (unsigned long long)data,
+			       (unsigned long long)num_bytes);
+			dump_space_info(sinfo, num_bytes, 1);
+		}
 	}
 
 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
-- 
cgit v1.2.3


From 0c4e538bccc106872d31b1514570b4dac95fb7f2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 26 Jan 2012 15:01:12 -0500
Subject: btrfs: mask out gfp flags in releasepage

btree_releasepage is a callback and can be passed unknown gfp flags and then
they may end up in kmem_cache_alloc called from alloc_extent_state, slab
allocator will BUG_ON when there is HIGHMEM or DMA32 flag set.

This may happen when btrfs is mounted from a loop device, which masks out
__GFP_IO flag. The check in try_release_extent_state

3399                 if ((mask & GFP_NOFS) == GFP_NOFS)
3400                         mask = GFP_NOFS;

will not work and passes unfiltered flags further resulting in crash at
mm/slab.c:2963

 [<000000000024ae4c>] cache_alloc_refill+0x3b4/0x5c8
 [<000000000024c810>] kmem_cache_alloc+0x204/0x294
 [<00000000001fd3c2>] mempool_alloc+0x52/0x170
 [<000003c000ced0b0>] alloc_extent_state+0x40/0xd4 [btrfs]
 [<000003c000cee5ae>] __clear_extent_bit+0x38a/0x4cc [btrfs]
 [<000003c000cee78c>] try_release_extent_state+0x9c/0xd4 [btrfs]
 [<000003c000cc4c66>] btree_releasepage+0x7e/0xd0 [btrfs]
 [<0000000000210d84>] shrink_page_list+0x6a0/0x724
 [<0000000000211394>] shrink_inactive_list+0x230/0x578
 [<0000000000211bb8>] shrink_list+0x6c/0x120
 [<0000000000211e4e>] shrink_zone+0x1e2/0x228
 [<0000000000211f24>] shrink_zones+0x90/0x254
 [<0000000000213410>] do_try_to_free_pages+0xac/0x420
 [<0000000000213ae0>] try_to_free_pages+0x13c/0x1b0
 [<0000000000204e6c>] __alloc_pages_nodemask+0x5b4/0x9a8
 [<00000000001fb04a>] grab_cache_page_write_begin+0x7e/0xe8

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index da4457f84d78..4c867112b4c8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -961,6 +961,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 
+	/*
+	 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
+	 * slab allocation from alloc_extent_state down the callchain where
+	 * it'd hit a BUG_ON as those flags are not allowed.
+	 */
+	gfp_flags &= ~GFP_SLAB_BUG_MASK;
+
 	ret = try_release_extent_state(map, tree, page, gfp_flags);
 	if (!ret)
 		return 0;
-- 
cgit v1.2.3


From 9b23062840e7c685ef0a0b561285d6e3a3b6811b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 26 Jan 2012 15:01:12 -0500
Subject: Btrfs: advance window_start if we're using a bitmap

If we span a long area in a bitmap we could end up taking a lot of time
searching to the next free area if we're searching from the original
window_start, so advance window_start in order to make sure we don't do any
superficial searching.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 61447a51f645..5802b1473c3d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2251,6 +2251,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 						 offset_index);
 				continue;
 			}
+			cluster->window_start += bytes;
 		} else {
 			ret = entry->offset;
 
-- 
cgit v1.2.3


From 9998eb703490589c3e8f1bf09b15203156776edb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jan 2012 13:47:40 -0500
Subject: Btrfs: fix reservations in btrfs_page_mkwrite

Josef fixed btrfs_page_mkwrite to properly release reserved
extents if there was an error.  But if we fail to get a reservation
and we fail to dirty the inode (for ENOSPC reasons), we'll end up
trying to release a reservation we never had.

This makes sure we only release if we were able to reserve.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5977987abdb1..7405753ec5d7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6401,18 +6401,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	unsigned long zero_start;
 	loff_t size;
 	int ret;
+	int reserved = 0;
 	u64 page_start;
 	u64 page_end;
 
 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-	if (!ret)
+	if (!ret) {
 		ret = btrfs_update_time(vma->vm_file);
+		reserved = 1;
+	}
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
 		else /* -ENOSPC, -EIO, etc */
 			ret = VM_FAULT_SIGBUS;
-		goto out;
+		if (reserved)
+			goto out;
+		goto out_noreserve;
 	}
 
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6495,6 +6500,7 @@ out_unlock:
 	unlock_page(page);
 out:
 	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
 	return ret;
 }
 
-- 
cgit v1.2.3


From 96150606e2fb82d242c9e4a414e4e922849f7bf7 Mon Sep 17 00:00:00 2001
From: Prasad Joshi <prasadjoshi.linux@gmail.com>
Date: Sat, 26 Nov 2011 11:00:47 +0530
Subject: logfs: update page reference count for pined pages

LogFS sets PG_private flag to indicate a pined page. We assumed that
marking a page as private is enough to ensure its existence. But
instead it is necessary to hold a reference count to the page.

The change resolves the following BUG

BUG: Bad page state in process flush-253:16  pfn:6a6d0
page flags: 0x100000000000808(uptodate|private)

Suggested-and-Acked-by: Joern Engel <joern@logfs.org>
Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
---
 fs/logfs/readwrite.c | 29 ++++++++++++++++++++++-------
 fs/logfs/segment.c   | 37 +++++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 2ac4217b7901..6d663e8ea6da 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -560,8 +560,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)
 static void indirect_free_block(struct super_block *sb,
 		struct logfs_block *block)
 {
-	ClearPagePrivate(block->page);
-	block->page->private = 0;
+	struct page *page = block->page;
+
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		page_cache_release(page);
+		set_page_private(page, 0);
+	}
 	__free_block(sb, block);
 }
 
@@ -650,8 +655,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)
 	logfs_unpack_index(page->index, &bix, &level);
 	block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
 	block->page = page;
+
 	SetPagePrivate(page);
-	page->private = (unsigned long)block;
+	page_cache_get(page);
+	set_page_private(page, (unsigned long) block);
+
 	block->ops = &indirect_block_ops;
 }
 
@@ -1901,8 +1909,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
 	li->li_block = block;
 
 	block->page = NULL;
-	page->private = 0;
-	ClearPagePrivate(page);
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		page_cache_release(page);
+		set_page_private(page, 0);
+	}
 }
 
 static void move_inode_to_page(struct page *page, struct inode *inode)
@@ -1918,8 +1929,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
 	BUG_ON(PagePrivate(page));
 	block->ops = &indirect_block_ops;
 	block->page = page;
-	page->private = (unsigned long)block;
-	SetPagePrivate(page);
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, (unsigned long) block);
+	}
 
 	block->inode = NULL;
 	li->li_block = NULL;
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 9d5187353255..6aee6092860d 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		SetPageUptodate(page);
 		memcpy(page_address(page) + offset, buf, copylen);
-		SetPagePrivate(page);
+
+		if (!PagePrivate(page)) {
+			SetPagePrivate(page);
+			page_cache_get(page);
+		}
 		page_cache_release(page);
 
 		buf += copylen;
@@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)
 		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		memset(page_address(page) + offset, 0xff, len);
-		SetPagePrivate(page);
+		if (!PagePrivate(page)) {
+			SetPagePrivate(page);
+			page_cache_get(page);
+		}
 		page_cache_release(page);
 	}
 }
@@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		SetPageUptodate(page);
 		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
-		SetPagePrivate(page);
+		if (!PagePrivate(page)) {
+			SetPagePrivate(page);
+			page_cache_get(page);
+		}
 		page_cache_release(page);
 		index++;
 		no_indizes--;
@@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
 		mempool_free(item, super->s_alias_pool);
 	}
 	block->page = page;
-	SetPagePrivate(page);
-	page->private = (unsigned long)block;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, (unsigned long) block);
+	}
 	block->ops = &indirect_block_ops;
 	initialize_block_counters(page, block, data, 0);
 }
@@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page)
 		list_add(&item->list, &block->item_list);
 	}
 	block->page = NULL;
-	ClearPagePrivate(page);
-	page->private = 0;
+
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		page_cache_release(page);
+		set_page_private(page, 0);
+	}
 	block->ops = &btree_block_ops;
 	err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
 			block);
@@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)
 		page = find_get_page(mapping, ofs >> PAGE_SHIFT);
 		if (!page)
 			continue;
-		ClearPagePrivate(page);
+		if (PagePrivate(page)) {
+			ClearPagePrivate(page);
+			page_cache_release(page);
+		}
 		page_cache_release(page);
 	}
 }
-- 
cgit v1.2.3


From 934eed395d201bf0901ca0c0cc3703b18729d0ce Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sun, 20 Nov 2011 22:29:01 +0530
Subject: logfs: Prevent memory corruption

This is a bad one.  I wonder whether we were so far protected by
no_free_segments(sb) usually being smaller than LOGFS_NO_AREAS.

Found by Dan Carpenter <dan.carpenter@oracle.com> using smatch.

Signed-off-by: Joern Engel <joern@logfs.org>
Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
---
 fs/logfs/gc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index caa4419285dc..d4efb061bdc5 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)
 	int i, max_dist;
 	struct gc_candidate *cand = NULL, *this;
 
-	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
 
 	for (i = max_dist; i >= 0; i--) {
 		this = first_in_list(&super->s_low_list[i]);
-- 
cgit v1.2.3


From 13ced29cb28996a9bc4f68e43ff0c57eafdb1e21 Mon Sep 17 00:00:00 2001
From: Prasad Joshi <prasadjoshi.linux@gmail.com>
Date: Sat, 28 Jan 2012 11:36:06 +0530
Subject: logfs: take write mutex lock during fsync and sync

LogFS uses super->s_write_mutex while writing data to disk. Taking the
same mutex lock in sync and fsync code path solves the following BUG:

------------[ cut here ]------------
kernel BUG at /home/prasad/logfs/dev_bdev.c:134!

Pid: 2387, comm: flush-253:16 Not tainted 3.0.0+ #4 Bochs Bochs
RIP: 0010:[<ffffffffa007deed>]  [<ffffffffa007deed>]
                bdev_writeseg+0x25d/0x270 [logfs]
Call Trace:
[<ffffffffa007c381>] logfs_open_area+0x91/0x150 [logfs]
[<ffffffff8128dcb2>] ? find_level.clone.9+0x62/0x100
[<ffffffffa007c49c>] __logfs_segment_write.clone.20+0x5c/0x190 [logfs]
[<ffffffff810ef005>] ? mempool_kmalloc+0x15/0x20
[<ffffffff810ef383>] ? mempool_alloc+0x53/0x130
[<ffffffffa007c7a4>] logfs_segment_write+0x1d4/0x230 [logfs]
[<ffffffffa0078f8e>] logfs_write_i0+0x12e/0x190 [logfs]
[<ffffffffa0079300>] __logfs_write_rec+0x140/0x220 [logfs]
[<ffffffffa0079444>] logfs_write_rec+0x64/0xd0 [logfs]
[<ffffffffa00795b6>] __logfs_write_buf+0x106/0x110 [logfs]
[<ffffffffa007a13e>] logfs_write_buf+0x4e/0x80 [logfs]
[<ffffffffa0073e33>] __logfs_writepage+0x23/0x80 [logfs]
[<ffffffffa007410c>] logfs_writepage+0xdc/0x110 [logfs]
[<ffffffff810f5ba7>] __writepage+0x17/0x40
[<ffffffff810f6208>] write_cache_pages+0x208/0x4f0
[<ffffffff810f5b90>] ? set_page_dirty+0x70/0x70
[<ffffffff810f653a>] generic_writepages+0x4a/0x70
[<ffffffff810f75d1>] do_writepages+0x21/0x40
[<ffffffff8116b9d1>] writeback_single_inode+0x101/0x250
[<ffffffff8116bdbd>] writeback_sb_inodes+0xed/0x1c0
[<ffffffff8116c5fb>] writeback_inodes_wb+0x7b/0x1e0
[<ffffffff8116cc23>] wb_writeback+0x4c3/0x530
[<ffffffff814d984d>] ? sub_preempt_count+0x9d/0xd0
[<ffffffff8116cd6b>] wb_do_writeback+0xdb/0x290
[<ffffffff814d984d>] ? sub_preempt_count+0x9d/0xd0
[<ffffffff814d6208>] ? _raw_spin_unlock_irqrestore+0x18/0x40
[<ffffffff8105aa5a>] ? del_timer+0x8a/0x120
[<ffffffff8116cfac>] bdi_writeback_thread+0x8c/0x2e0
[<ffffffff8116cf20>] ? wb_do_writeback+0x290/0x290
[<ffffffff8106d2e6>] kthread+0x96/0xa0
[<ffffffff814de514>] kernel_thread_helper+0x4/0x10
[<ffffffff8106d250>] ? kthread_worker_fn+0x190/0x190
[<ffffffff814de510>] ? gs_change+0xb/0xb
RIP  [<ffffffffa007deed>] bdev_writeseg+0x25d/0x270 [logfs]
---[ end trace 0211ad60a57657c4 ]---

Reviewed-by: Joern Engel <joern@logfs.org>
Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
---
 fs/logfs/file.c      | 2 ++
 fs/logfs/inode.c     | 2 ++
 fs/logfs/logfs.h     | 2 ++
 fs/logfs/readwrite.c | 6 ++----
 4 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index b548c87a86f1..3886cded283c 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
+	logfs_get_wblocks(sb, NULL, WF_LOCK);
 	logfs_write_anchor(sb);
+	logfs_put_wblocks(sb, NULL, WF_LOCK);
 	mutex_unlock(&inode->i_mutex);
 
 	return 0;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 7e441ad5f792..388d7c5a7bed 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -364,7 +364,9 @@ static void logfs_init_once(void *_li)
 
 static int logfs_sync_fs(struct super_block *sb, int wait)
 {
+	logfs_get_wblocks(sb, NULL, WF_LOCK);
 	logfs_write_anchor(sb);
+	logfs_put_wblocks(sb, NULL, WF_LOCK);
 	return 0;
 }
 
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 398ecff6e548..bb4340850c1b 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,
 		__be64 *array, int page_is_empty);
 int logfs_exist_block(struct inode *inode, u64 bix);
 int get_page_reserve(struct inode *inode, struct page *page);
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
 extern struct logfs_block_ops indirect_block_ops;
 
 /* segment.c */
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6d663e8ea6da..7b10e8aecced 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)
  * is waiting for s_write_mutex.  We annotate this fact by setting PG_pre_locked
  * in addition to PG_locked.
  */
-static void logfs_get_wblocks(struct super_block *sb, struct page *page,
-		int lock)
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
 {
 	struct logfs_super *super = logfs_super(sb);
 
@@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,
 	}
 }
 
-static void logfs_put_wblocks(struct super_block *sb, struct page *page,
-		int lock)
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
 {
 	struct logfs_super *super = logfs_super(sb);
 
-- 
cgit v1.2.3


From ecfd890991a26e70547e025673580923a004c5e4 Mon Sep 17 00:00:00 2001
From: Prasad Joshi <prasadjoshi.linux@gmail.com>
Date: Sun, 30 Oct 2011 22:15:32 +0530
Subject: logfs: set superblock shutdown flag after generic sb shutdown

While unmounting the file system LogFS calls generic_shutdown_super.
The function does file system independent superblock shutdown.
However, it might result in call file system specific inode eviction.

LogFS marks FS shutting down by setting bit LOGFS_SB_FLAG_SHUTDOWN in
super->s_flags. Since, inode eviction might call truncate on inode,
following BUG is observed when file system is unmounted:

------------[ cut here ]------------
kernel BUG at /home/prasad/logfs/segment.c:362!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU 3
Modules linked in: logfs binfmt_misc ppdev virtio_blk parport_pc lp
	parport psmouse floppy virtio_pci serio_raw virtio_ring virtio

Pid: 1933, comm: umount Not tainted 3.0.0+ #4 Bochs Bochs
RIP: 0010:[<ffffffffa008c841>]  [<ffffffffa008c841>]
		logfs_segment_write+0x211/0x230 [logfs]
RSP: 0018:ffff880062d7b9e8  EFLAGS: 00010202
RAX: 000000000000000e RBX: ffff88006eca9000 RCX: 0000000000000000
RDX: ffff88006fd87c40 RSI: ffffea00014ff468 RDI: ffff88007b68e000
RBP: ffff880062d7ba48 R08: 8000000020451430 R09: 0000000000000000
R10: dead000000100100 R11: 0000000000000000 R12: ffff88006fd87c40
R13: ffffea00014ff468 R14: ffff88005ad0a460 R15: 0000000000000000
FS:  00007f25d50ea760(0000) GS:ffff88007fd80000(0000)
	knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000d05e48 CR3: 0000000062c72000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process umount (pid: 1933, threadinfo ffff880062d7a000,
	task ffff880070b44500)
Stack:
ffff880062d7ba38 ffff88005ad0a508 0000000000001000 0000000000000000
8000000020451430 ffffea00014ff468 ffff880062d7ba48 ffff88005ad0a460
ffff880062d7bad8 ffffea00014ff468 ffff88006fd87c40 0000000000000000
Call Trace:
[<ffffffffa0088fee>] logfs_write_i0+0x12e/0x190 [logfs]
[<ffffffffa0089360>] __logfs_write_rec+0x140/0x220 [logfs]
[<ffffffffa0089312>] __logfs_write_rec+0xf2/0x220 [logfs]
[<ffffffffa00894a4>] logfs_write_rec+0x64/0xd0 [logfs]
[<ffffffffa0089616>] __logfs_write_buf+0x106/0x110 [logfs]
[<ffffffffa008a19e>] logfs_write_buf+0x4e/0x80 [logfs]
[<ffffffffa008a6b8>] __logfs_write_inode+0x98/0x110 [logfs]
[<ffffffffa008a7c4>] logfs_truncate+0x54/0x290 [logfs]
[<ffffffffa008abfc>] logfs_evict_inode+0xdc/0x190 [logfs]
[<ffffffff8115eef5>] evict+0x85/0x170
[<ffffffff8115f126>] iput+0xe6/0x1b0
[<ffffffff8115b4a8>] shrink_dcache_for_umount_subtree+0x218/0x280
[<ffffffff8115ce91>] shrink_dcache_for_umount+0x51/0x90
[<ffffffff8114796c>] generic_shutdown_super+0x2c/0x100
[<ffffffffa008cc47>] logfs_kill_sb+0x57/0xf0 [logfs]
[<ffffffff81147de5>] deactivate_locked_super+0x45/0x70
[<ffffffff811487ea>] deactivate_super+0x4a/0x70
[<ffffffff81163934>] mntput_no_expire+0xa4/0xf0
[<ffffffff8116469f>] sys_umount+0x6f/0x380
[<ffffffff814dd46b>] system_call_fastpath+0x16/0x1b
Code: 55 c8 49 8d b6 a8 00 00 00 45 89 f9 45 89 e8 4c 89 e1 4c 89 55
b8 c7 04 24 00 00 00 00 e8 68 fc ff ff 4c 8b 55 b8 e9 3c ff ff ff <0f>
0b 0f 0b c7 45 c0 00 00 00 00 e9 44 fe ff ff 66 66 66 66 66
RIP  [<ffffffffa008c841>] logfs_segment_write+0x211/0x230 [logfs]
RSP <ffff880062d7b9e8>
---[ end trace fe6b040cea952290 ]---

Therefore, move super->s_flags setting after the fs-indenpendent work
has been finished.

Reviewed-by: Joern Engel <joern@logfs.org>
Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
---
 fs/logfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index e795c234ea33..f9b7a30b00a3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -491,9 +491,9 @@ static void logfs_kill_sb(struct super_block *sb)
 	 * From this point on alias entries are simply dropped - and any
 	 * writes to the object store are considered bugs.
 	 */
-	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
 	log_super("LogFS: Now in shutdown\n");
 	generic_shutdown_super(sb);
+	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
 
 	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
 
-- 
cgit v1.2.3


From 0bd90387ed5a8abbcf43391b480efdc211721cfe Mon Sep 17 00:00:00 2001
From: Prasad Joshi <prasadjoshi.linux@gmail.com>
Date: Sun, 2 Oct 2011 23:46:51 +0530
Subject: logfs: Propagate page parameter to __logfs_write_inode

During GC LogFS has to rewrite each valid block to a separate segment.
Rewrite operation reads data from an old segment and writes it to a
newly allocated segment. Since every write operation changes data
block pointers maintained in inode, inode should also be rewritten.

In GC path to avoid AB-BA deadlock LogFS marks a page with
PG_pre_locked in addition to locking the page (PG_locked). The page
lock is ignored iff the page is pre-locked.

LogFS uses a special file called segment file. The segment file
maintains an 8 bytes entry for every segment. It keeps track of erase
count, level etc. for every segment.

Bad things happen with a segment belonging to the segment file is GCed

 ------------[ cut here ]------------
kernel BUG at /home/prasad/logfs/readwrite.c:297!
invalid opcode: 0000 [#1] SMP
Modules linked in: logfs joydev usbhid hid psmouse e1000 i2c_piix4
		serio_raw [last unloaded: logfs]
Pid: 20161, comm: mount Not tainted 3.1.0-rc3+ #3 innotek GmbH
		VirtualBox
EIP: 0060:[<f809132a>] EFLAGS: 00010292 CPU: 0
EIP is at logfs_lock_write_page+0x6a/0x70 [logfs]
EAX: 00000027 EBX: f73f5b20 ECX: c16007c8 EDX: 00000094
ESI: 00000000 EDI: e59be6e4 EBP: c7337b28 ESP: c7337b18
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
Process mount (pid: 20161, ti=c7336000 task=eb323f70 task.ti=c7336000)
Stack:
f8099a3d c7337b24 f73f5b20 00001002 c7337b50 f8091f6d f8099a4d f80994e4
00000003 00000000 c7337b68 00000000 c67e4400 00001000 c7337b80 f80935e5
00000000 00000000 00000000 00000000 e1fcf000 0000000f e59be618 c70bf900
Call Trace:
[<f8091f6d>] logfs_get_write_page.clone.16+0xdd/0x100 [logfs]
[<f80935e5>] logfs_mod_segment_entry+0x55/0x110 [logfs]
[<f809460d>] logfs_get_segment_entry+0x1d/0x20 [logfs]
[<f8091060>] ? logfs_cleanup_journal+0x50/0x50 [logfs]
[<f809521b>] ostore_get_erase_count+0x1b/0x40 [logfs]
[<f80965b8>] logfs_open_area+0xc8/0x150 [logfs]
[<c141a7ec>] ? kmemleak_alloc+0x2c/0x60
[<f809668e>] __logfs_segment_write.clone.16+0x4e/0x1b0 [logfs]
[<c10dd563>] ? mempool_kmalloc+0x13/0x20
[<c10dd563>] ? mempool_kmalloc+0x13/0x20
[<f809696f>] logfs_segment_write+0x17f/0x1d0 [logfs]
[<f8092e8c>] logfs_write_i0+0x11c/0x180 [logfs]
[<f8092f35>] logfs_write_direct+0x45/0x90 [logfs]
[<f80934cd>] __logfs_write_buf+0xbd/0xf0 [logfs]
[<c102900e>] ? kmap_atomic_prot+0x4e/0xe0
[<f809424b>] logfs_write_buf+0x3b/0x60 [logfs]
[<f80947a9>] __logfs_write_inode+0xa9/0x110 [logfs]
[<f8094cb0>] logfs_rewrite_block+0xc0/0x110 [logfs]
[<f8095300>] ? get_mapping_page+0x10/0x60 [logfs]
[<f8095aa0>] ? logfs_load_object_aliases+0x2e0/0x2f0 [logfs]
[<f808e57d>] logfs_gc_segment+0x2ad/0x310 [logfs]
[<f808e62a>] __logfs_gc_once+0x4a/0x80 [logfs]
[<f808ed43>] logfs_gc_pass+0x683/0x6a0 [logfs]
[<f8097a89>] logfs_mount+0x5a9/0x680 [logfs]
[<c1126b21>] mount_fs+0x21/0xd0
[<c10f6f6f>] ? __alloc_percpu+0xf/0x20
[<c113da41>] ? alloc_vfsmnt+0xb1/0x130
[<c113db4b>] vfs_kern_mount+0x4b/0xa0
[<c113e06e>] do_kern_mount+0x3e/0xe0
[<c113f60d>] do_mount+0x34d/0x670
[<c10f2749>] ? strndup_user+0x49/0x70
[<c113fcab>] sys_mount+0x6b/0xa0
[<c142d87c>] syscall_call+0x7/0xb
Code: f8 e8 8b 93 39 c9 8b 45 f8 3e 0f ba 28 00 19 d2 85 d2 74 ca eb d0 0f 0b 8d 45 fc 89 44 24 04 c7 04 24 3d 9a 09 f8 e8 09 92 39 c9 <0f> 0b 8d 74 26 00 55 89 e5 3e 8d 74 26 00 8b 10 80 e6 01 74 09
EIP: [<f809132a>] logfs_lock_write_page+0x6a/0x70 [logfs] SS:ESP 0068:c7337b18
---[ end trace 96e67d5b3aa3d6ca ]---

The patch passes locked page to __logfs_write_inode. It calls function
logfs_get_wblocks() to pre-lock the page. This ensures any further
attempts to lock the page are ignored (esp from get_erase_count).

Acked-by: Joern Engel <joern@logfs.org>
Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
---
 fs/logfs/dir.c       |  2 +-
 fs/logfs/inode.c     |  2 +-
 fs/logfs/logfs.h     |  2 +-
 fs/logfs/readwrite.c | 12 ++++++------
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b7d7f67cee5a..b6404898da83 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
 
 static int write_inode(struct inode *inode)
 {
-	return __logfs_write_inode(inode, WF_LOCK);
+	return __logfs_write_inode(inode, NULL, WF_LOCK);
 }
 
 static s64 dir_seek_data(struct inode *inode, s64 pos)
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 388d7c5a7bed..7c42c132c177 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -287,7 +287,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
 		return 0;
 
-	ret = __logfs_write_inode(inode, flags);
+	ret = __logfs_write_inode(inode, NULL, flags);
 	LOGFS_BUG_ON(ret, inode->i_sb);
 	return ret;
 }
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index bb4340850c1b..0dec29887a8a 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);
 void logfs_set_blocks(struct inode *inode, u64 no);
 /* these logically belong into inode.c but actually reside in readwrite.c */
 int logfs_read_inode(struct inode *inode);
-int __logfs_write_inode(struct inode *inode, long flags);
+int __logfs_write_inode(struct inode *inode, struct page *, long flags);
 void logfs_evict_inode(struct inode *inode);
 
 /* journal.c */
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7b10e8aecced..88284c67ba97 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -422,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)
 	if (inode->i_ino == LOGFS_INO_MASTER)
 		logfs_write_anchor(inode->i_sb);
 	else {
-		ret = __logfs_write_inode(inode, 0);
+		ret = __logfs_write_inode(inode, NULL, 0);
 		/* see indirect_write_block comment */
 		BUG_ON(ret);
 	}
@@ -1629,7 +1629,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 			if (inode->i_ino == LOGFS_INO_MASTER)
 				logfs_write_anchor(inode->i_sb);
 			else {
-				err = __logfs_write_inode(inode, flags);
+				err = __logfs_write_inode(inode, page, flags);
 			}
 		}
 	}
@@ -1879,7 +1879,7 @@ int logfs_truncate(struct inode *inode, u64 target)
 		logfs_get_wblocks(sb, NULL, 1);
 		err = __logfs_truncate(inode, size);
 		if (!err)
-			err = __logfs_write_inode(inode, 0);
+			err = __logfs_write_inode(inode, NULL, 0);
 		logfs_put_wblocks(sb, NULL, 1);
 	}
 
@@ -2119,14 +2119,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
 			ec_level);
 }
 
-int __logfs_write_inode(struct inode *inode, long flags)
+int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
 
-	logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+	logfs_get_wblocks(sb, page, flags & WF_LOCK);
 	ret = do_write_inode(inode);
-	logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+	logfs_put_wblocks(sb, page, flags & WF_LOCK);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6c69494f6b442834f26377e02d43fc8e1272221d Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Mon, 12 Sep 2011 21:09:16 +0530
Subject: logfs: remove useless BUG_ON

It prevents write sizes >4k.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 9da29706f91c..1e1c369df22b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
 	if (len == 0)
 		return logfs_write_header(super, header, 0, type);
 
-	BUG_ON(len > sb->s_blocksize);
 	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
 	if (compr_len < 0 || type == JE_ANCHOR) {
 		memcpy(data, buf, len);
-- 
cgit v1.2.3


From 1bcceaff8cbe5e5698ccf1015c9a938aa72718c4 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 5 Aug 2011 11:18:19 +0200
Subject: logfs: Free areas before calling generic_shutdown_super()

Or hit an assertion in map_invalidatepage() instead.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/logfs.h   |  1 +
 fs/logfs/segment.c | 14 ++++++++++----
 fs/logfs/super.c   |  1 +
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 0dec29887a8a..59ed32cd62d1 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -596,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);
 void logfs_sync_area(struct logfs_area *area);
 void logfs_sync_segments(struct super_block *sb);
 void freeseg(struct super_block *sb, u32 segno);
+void free_areas(struct super_block *sb);
 
 /* area handling */
 int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 6aee6092860d..ab798ed1cc88 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -862,6 +862,16 @@ static void free_area(struct logfs_area *area)
 	kfree(area);
 }
 
+void free_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	for_each_area(i)
+		free_area(super->s_area[i]);
+	free_area(super->s_journal_area);
+}
+
 static struct logfs_area *alloc_area(struct super_block *sb)
 {
 	struct logfs_area *area;
@@ -944,10 +954,6 @@ err:
 void logfs_cleanup_areas(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
-	int i;
 
 	btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
-	for_each_area(i)
-		free_area(super->s_area[i]);
-	free_area(super->s_journal_area);
 }
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f9b7a30b00a3..c9ee7f5d1caf 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -486,6 +486,7 @@ static void logfs_kill_sb(struct super_block *sb)
 	/* Alias entries slow down mount, so evict as many as possible */
 	sync_filesystem(sb);
 	logfs_write_anchor(sb);
+	free_areas(sb);
 
 	/*
 	 * From this point on alias entries are simply dropped - and any
-- 
cgit v1.2.3


From bbe01387129f76fa4bec17904eb14c4bdc3c179f Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 5 Aug 2011 11:13:30 +0200
Subject: logfs: Grow inode in delete path

Can be necessary if an inode gets deleted (through -ENOSPC) before being
written.  Might be better to move this into logfs_write_rec(), but for
now go with the stupid&safe patch.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/readwrite.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 88284c67ba97..4153e65b0148 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1576,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)
 static int __logfs_delete(struct inode *inode, struct page *page)
 {
 	long flags = WF_DELETE;
+	int err;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 
 	if (page->index < I0_BLOCKS)
 		return logfs_write_direct(inode, page, flags);
+	err = grow_inode(inode, page->index, 0);
+	if (err)
+		return err;
 	return logfs_write_rec(inode, page, page->index, 0, flags);
 }
 
-- 
cgit v1.2.3


From f2933e86ad93a8d1287079d59e67afd6f4166a9d Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 5 Aug 2011 11:09:55 +0200
Subject: Logfs: Allow NULL block_isbad() methods

Not all mtd drivers define block_isbad().  Let's assume no bad blocks
instead of refusing to mount.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/dev_mtd.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 339e17e9133d..d054d7e975ca 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -150,14 +150,13 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
 	filler_t *filler = mtd_readpage;
 	struct mtd_info *mtd = super->s_mtd;
 
-	if (!mtd->block_isbad)
-		return NULL;
-
 	*ofs = 0;
-	while (mtd->block_isbad(mtd, *ofs)) {
-		*ofs += mtd->erasesize;
-		if (*ofs >= mtd->size)
-			return NULL;
+	if (mtd->block_isbad) {
+		while (mtd->block_isbad(mtd, *ofs)) {
+			*ofs += mtd->erasesize;
+			if (*ofs >= mtd->size)
+				return NULL;
+		}
 	}
 	BUG_ON(*ofs & ~PAGE_MASK);
 	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
@@ -170,14 +169,13 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
 	filler_t *filler = mtd_readpage;
 	struct mtd_info *mtd = super->s_mtd;
 
-	if (!mtd->block_isbad)
-		return NULL;
-
 	*ofs = mtd->size - mtd->erasesize;
-	while (mtd->block_isbad(mtd, *ofs)) {
-		*ofs -= mtd->erasesize;
-		if (*ofs <= 0)
-			return NULL;
+	if (mtd->block_isbad) {
+		while (mtd->block_isbad(mtd, *ofs)) {
+			*ofs -= mtd->erasesize;
+			if (*ofs <= 0)
+				return NULL;
+		}
 	}
 	*ofs = *ofs + mtd->erasesize - 0x1000;
 	BUG_ON(*ofs & ~PAGE_MASK);
-- 
cgit v1.2.3


From 4991a5faab7368daac463181e786608b4eb63675 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 31 Jan 2012 11:52:01 +0300
Subject: cifs: check offset in decode_ntlmssp_challenge()

We should check that we're not copying memory from beyond the end of the
blob.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
---
 fs/cifs/sess.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d85efad5765f..eb767412177d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -395,6 +395,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
 	tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
 	tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+	if (tioffset > blob_len || tioffset + tilen > blob_len) {
+		cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen);
+		return -EINVAL;
+	}
 	if (tilen) {
 		ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
 		if (!ses->auth_key.response) {
-- 
cgit v1.2.3


From 000f9bb83968ebd6959ff76870f16fc8f766ebd3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 30 Jan 2012 19:50:01 -0800
Subject: cifs: fix printk format warnings

Fix printk format warnings for ssize_t variables:

fs/cifs/connect.c:2145:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t'
fs/cifs/connect.c:2152:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t'
fs/cifs/connect.c:2160:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t'
fs/cifs/connect.c:2170:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Jeff Layton <jlayton@redhat.com>
Cc:	linux-cifs@vger.kernel.org
---
 fs/cifs/connect.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 026d6464335b..9c288653e6d6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2142,14 +2142,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 
 	len = delim - payload;
 	if (len > MAX_USERNAME_SIZE || len <= 0) {
-		cFYI(1, "Bad value from username search (len=%ld)", len);
+		cFYI(1, "Bad value from username search (len=%zd)", len);
 		rc = -EINVAL;
 		goto out_key_put;
 	}
 
 	vol->username = kstrndup(payload, len, GFP_KERNEL);
 	if (!vol->username) {
-		cFYI(1, "Unable to allocate %ld bytes for username", len);
+		cFYI(1, "Unable to allocate %zd bytes for username", len);
 		rc = -ENOMEM;
 		goto out_key_put;
 	}
@@ -2157,7 +2157,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 
 	len = key->datalen - (len + 1);
 	if (len > MAX_PASSWORD_SIZE || len <= 0) {
-		cFYI(1, "Bad len for password search (len=%ld)", len);
+		cFYI(1, "Bad len for password search (len=%zd)", len);
 		rc = -EINVAL;
 		kfree(vol->username);
 		vol->username = NULL;
@@ -2167,7 +2167,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	++delim;
 	vol->password = kstrndup(delim, len, GFP_KERNEL);
 	if (!vol->password) {
-		cFYI(1, "Unable to allocate %ld bytes for password", len);
+		cFYI(1, "Unable to allocate %zd bytes for password", len);
 		rc = -ENOMEM;
 		kfree(vol->username);
 		vol->username = NULL;
-- 
cgit v1.2.3


From 4505360376637832f79f84f352588b0a045ad113 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Fri, 27 Jan 2012 06:37:26 +0000
Subject: xfs: pass KM_SLEEP flag to kmem_realloc() in
 xlog_recover_add_to_cnt_trans()

The kmem_realloc() in xfs is given KM_* memory allocation flags. And it
allocates memory using kmalloc() after they are converted to gfp_mask
flags. In xlog_recover_add_to_cont_trans(), 0u is passed to kmem_realloc(),
instead of them. I guess it is preferred to use them, and here memory must
be allocated but don't have to be done with GFP_ATOMIC. So, this patch
changes it to KM_SLEEP.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: Ben Myers <bpm@sgi.com>
Cc: Alex Elder <elder@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 541a508adea1..15ff5392fb65 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1489,7 +1489,7 @@ xlog_recover_add_to_cont_trans(
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
 	memcpy(&ptr[old_len], dp, len); /* d, s, l */
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
-- 
cgit v1.2.3


From 2a73ca8208197d03f78d680b3c7953b897e91eb6 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 31 Jan 2012 12:51:24 -0600
Subject: [CIFS] Update cifs Kconfig title to match removal of experimental
 dependency

Removed the dependency on CONFIG_EXPERIMENTAL but forgot to update
the text description to be consistent.

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0554b00a7b33..2b243af70aa3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -139,7 +139,7 @@ config CIFS_DFS_UPCALL
 	    points. If unsure, say N.
 
 config CIFS_FSCACHE
-	  bool "Provide CIFS client caching support (EXPERIMENTAL)"
+	  bool "Provide CIFS client caching support"
 	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
 	  help
 	    Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -147,7 +147,7 @@ config CIFS_FSCACHE
 	    manager. If unsure, say N.
 
 config CIFS_ACL
-	  bool "Provide CIFS ACL support (EXPERIMENTAL)"
+	  bool "Provide CIFS ACL support"
 	  depends on CIFS_XATTR && KEYS
 	  help
 	    Allows to fetch CIFS/NTFS ACL from the server.  The DACL blob
-- 
cgit v1.2.3


From 15eb77a07c714ac80201abd0a9568888bcee6276 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 17 Jan 2012 11:18:56 -0600
Subject: writeback: fix NULL bdi->dev in trace writeback_single_inode

bdi_prune_sb() resets sb->s_bdi to default_backing_dev_info when the
tearing down the original bdi. Fix trace_writeback_single_inode to
use sb->s_bdi=default_backing_dev_info rather than bdi->dev=NULL for a
teared down bdi.

Cc: <stable@kernel.org>
Reported-by: Rabin Vincent <rabin@rab.in>
Tested-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f855916657ba..5b4a9362d5aa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -52,14 +52,6 @@ struct wb_writeback_work {
 	struct completion *done;	/* set if the caller waits */
 };
 
-/*
- * Include the creation of the trace points after defining the
- * wb_writeback_work structure so that the definition remains local to this
- * file.
- */
-#define CREATE_TRACE_POINTS
-#include <trace/events/writeback.h>
-
 /*
  * We don't actually have pdflush, but this one is exported though /proc...
  */
@@ -92,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head)
 	return list_entry(head, struct inode, i_wb_list);
 }
 
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure and inline functions so that the definition
+ * remains local to this file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
 /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
 static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
 {
-- 
cgit v1.2.3


From 7d731019218e49a9811f6d0adec4b1cfcb752bed Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 1 Feb 2012 11:10:24 -0800
Subject: mtd: fix merge conflict resolution breakage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes merge conflict resolution breakage introduced by merge
d3712b9dfcf4 ("Merge tag 'for-linus' of git://github.com/prasad-joshi/logfs_upstream").

The commit changed 'mtd_can_have_bb()' function and made it always
return zero, which is incorrect.  Instead, we need it to return whether
the underlying flash device can have bad eraseblocks or not.  UBI needs
this information because it affects how it handles the underlying flash.
E.g., if the underlying flash is NOR, it cannot have bad blocks and any
write or erase error is fatal, and all we can do is to switch to R/O
mode.  We do not need to reserve a pool of good eraseblocks for bad
eraseblocks handling, and so on.

This patch also removes 'mtd_can_have_bb()' invocations from Logfs to
ensure correct Logfs behavior.

I've tested that with this patch UBI works on top of NOR and NAND
flashes emulated by mtdram and nandsim correspondingly.

This patch is based on patch from Linus Torvalds.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Acked-by: Jörn Engel <joern@logfs.org>
Acked-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/logfs/dev_mtd.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index e97404d611e0..9c501449450d 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -152,9 +152,6 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
 	filler_t *filler = logfs_mtd_readpage;
 	struct mtd_info *mtd = super->s_mtd;
 
-	if (!mtd_can_have_bb(mtd))
-		return NULL;
-
 	*ofs = 0;
 	while (mtd_block_isbad(mtd, *ofs)) {
 		*ofs += mtd->erasesize;
@@ -172,9 +169,6 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
 	filler_t *filler = logfs_mtd_readpage;
 	struct mtd_info *mtd = super->s_mtd;
 
-	if (!mtd_can_have_bb(mtd))
-		return NULL;
-
 	*ofs = mtd->size - mtd->erasesize;
 	while (mtd_block_isbad(mtd, *ofs)) {
 		*ofs -= mtd->erasesize;
-- 
cgit v1.2.3


From 71879d3cb3dd8f2dfdefb252775c1b3ea04a3dd4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 31 Jan 2012 17:14:38 +0100
Subject: proc: mem_release() should check mm != NULL

mem_release() can hit mm == NULL, add the necessary check.

Cc: stable@kernel.org
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9cde9edf9c4d..c3617ea7830b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -822,8 +822,8 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 static int mem_release(struct inode *inode, struct file *file)
 {
 	struct mm_struct *mm = file->private_data;
-
-	mmput(mm);
+	if (mm)
+		mmput(mm);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 572d34b946bae070debd42db1143034d9687e13f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 31 Jan 2012 17:14:54 +0100
Subject: proc: unify mem_read() and mem_write()

No functional changes, cleanup and preparation.

mem_read() and mem_write() are very similar. Move this code into the
new common helper, mem_rw(), which takes the additional "int write"
argument.

Cc: stable@kernel.org
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 90 +++++++++++++++++++++-------------------------------------
 1 file changed, 32 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c3617ea7830b..be1909041685 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -718,57 +718,13 @@ static int mem_open(struct inode* inode, struct file* file)
 	return 0;
 }
 
-static ssize_t mem_read(struct file * file, char __user * buf,
-			size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos, int write)
 {
-	int ret;
-	char *page;
-	unsigned long src = *ppos;
 	struct mm_struct *mm = file->private_data;
-
-	if (!mm)
-		return 0;
-
-	page = (char *)__get_free_page(GFP_TEMPORARY);
-	if (!page)
-		return -ENOMEM;
-
-	ret = 0;
- 
-	while (count > 0) {
-		int this_len, retval;
-
-		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		retval = access_remote_vm(mm, src, page, this_len, 0);
-		if (!retval) {
-			if (!ret)
-				ret = -EIO;
-			break;
-		}
-
-		if (copy_to_user(buf, page, retval)) {
-			ret = -EFAULT;
-			break;
-		}
- 
-		ret += retval;
-		src += retval;
-		buf += retval;
-		count -= retval;
-	}
-	*ppos = src;
-
-	free_page((unsigned long) page);
-	return ret;
-}
-
-static ssize_t mem_write(struct file * file, const char __user *buf,
-			 size_t count, loff_t *ppos)
-{
-	int copied;
+	unsigned long addr = *ppos;
+	ssize_t copied;
 	char *page;
-	unsigned long dst = *ppos;
-	struct mm_struct *mm = file->private_data;
 
 	if (!mm)
 		return 0;
@@ -779,30 +735,48 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 
 	copied = 0;
 	while (count > 0) {
-		int this_len, retval;
+		int this_len = min_t(int, count, PAGE_SIZE);
 
-		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		if (copy_from_user(page, buf, this_len)) {
+		if (write && copy_from_user(page, buf, this_len)) {
 			copied = -EFAULT;
 			break;
 		}
-		retval = access_remote_vm(mm, dst, page, this_len, 1);
-		if (!retval) {
+
+		this_len = access_remote_vm(mm, addr, page, this_len, write);
+		if (!this_len) {
 			if (!copied)
 				copied = -EIO;
 			break;
 		}
-		copied += retval;
-		buf += retval;
-		dst += retval;
-		count -= retval;			
+
+		if (!write && copy_to_user(buf, page, this_len)) {
+			copied = -EFAULT;
+			break;
+		}
+
+		buf += this_len;
+		addr += this_len;
+		copied += this_len;
+		count -= this_len;
 	}
-	*ppos = dst;
+	*ppos = addr;
 
 	free_page((unsigned long) page);
 	return copied;
 }
 
+static ssize_t mem_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
 	switch (orig) {
-- 
cgit v1.2.3


From 6d08f2c7139790c268820a2e590795cb8333181a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 31 Jan 2012 17:15:11 +0100
Subject: proc: make sure mem_open() doesn't pin the target's memory

Once /proc/pid/mem is opened, the memory can't be released until
mem_release() even if its owner exits.

Change mem_open() to do atomic_inc(mm_count) + mmput(), this only
pins mm_struct. Change mem_rw() to do atomic_inc_not_zero(mm_count)
before access_remote_vm(), this verifies that this mm is still alive.

I am not sure what should mem_rw() return if atomic_inc_not_zero()
fails. With this patch it returns zero to match the "mm == NULL" case,
may be it should return -EINVAL like it did before e268337d.

Perhaps it makes sense to add the additional fatal_signal_pending()
check into the main loop, to ensure we do not hold this memory if
the target task was oom-killed.

Cc: stable@kernel.org
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index be1909041685..d9512bd03e6c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -711,6 +711,13 @@ static int mem_open(struct inode* inode, struct file* file)
 	if (IS_ERR(mm))
 		return PTR_ERR(mm);
 
+	if (mm) {
+		/* ensure this mm_struct can't be freed */
+		atomic_inc(&mm->mm_count);
+		/* but do not pin its memory */
+		mmput(mm);
+	}
+
 	/* OK to pass negative loff_t, we can catch out-of-range */
 	file->f_mode |= FMODE_UNSIGNED_OFFSET;
 	file->private_data = mm;
@@ -734,6 +741,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
 		return -ENOMEM;
 
 	copied = 0;
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		goto free;
+
 	while (count > 0) {
 		int this_len = min_t(int, count, PAGE_SIZE);
 
@@ -761,6 +771,8 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
 	}
 	*ppos = addr;
 
+	mmput(mm);
+free:
 	free_page((unsigned long) page);
 	return copied;
 }
@@ -797,7 +809,7 @@ static int mem_release(struct inode *inode, struct file *file)
 {
 	struct mm_struct *mm = file->private_data;
 	if (mm)
-		mmput(mm);
+		mmdrop(mm);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 114fc47492e23d93653e4a16664833e98d62a563 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Wed, 11 Jan 2012 17:41:01 -0800
Subject: ceph: change "ceph.layout" xattr to be "ceph.file.layout"

The virtual extended attribute named "ceph.layout" is meaningful
only for regular files.  Change its name to be "ceph.file.layout" to
more directly reflect that in the ceph xattr namespace.  Preserve
the old "ceph.layout" name for the time being (until we decide it's
safe to get rid of it entirely).

Add a missing initializer for "readonly" in the terminating entry.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a7..9e6734e38c12 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+	{ true, "ceph.file.layout", ceph_vxattrcb_layout},
+	/* The following extended attribute name is deprecated */
 	{ true, "ceph.layout", ceph_vxattrcb_layout},
-	{ NULL, NULL }
+	{ true, NULL, NULL }
 };
 
 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
-- 
cgit v1.2.3


From 32852a81bccd9e3d1953b894966393d1b546576d Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Sat, 14 Jan 2012 22:20:59 -0500
Subject: ceph: fix length validation in parse_reply_info()

"len" is read from network and thus needs validation.  Otherwise, given
a bogus "len" value, p+len could be an out-of-bounds pointer, which is
used in further parsing.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb45..be1415fcaac8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
 	/* trace */
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
+		ceph_decode_need(&p, end, len, bad);
 		err = parse_reply_info_trace(&p, p+len, info, features);
 		if (err < 0)
 			goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
 	/* extra */
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
+		ceph_decode_need(&p, end, len, bad);
 		err = parse_reply_info_extra(&p, p+len, info, features);
 		if (err < 0)
 			goto out_bad;
-- 
cgit v1.2.3


From d8fb02abdc39f92a1066313e2b17047876afa8f9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 12 Jan 2012 17:48:10 -0800
Subject: ceph: create a new session lock to avoid lock inversion

Lockdep was reporting a possible circular lock dependency in
dentry_lease_is_valid().  That function needs to sample the
session's s_cap_gen and and s_cap_ttl fields coherently, but needs
to do so while holding a dentry lock.  The s_cap_lock field was
being used to protect the two fields, but that can't be taken while
holding a lock on a dentry within the session.

In most cases, the s_cap_gen and s_cap_ttl fields only get operated
on separately.  But in three cases they need to be updated together.
Implement a new lock to protect the spots updating both fields
atomically is required.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 4 ++--
 fs/ceph/dir.c        | 4 ++--
 fs/ceph/mds_client.c | 8 +++++---
 fs/ceph/mds_client.h | 7 +++++--
 4 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8b53193e4f7c..90d789df9ce0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
 	unsigned long ttl;
 	u32 gen;
 
-	spin_lock(&cap->session->s_cap_lock);
+	spin_lock(&cap->session->s_gen_ttl_lock);
 	gen = cap->session->s_cap_gen;
 	ttl = cap->session->s_cap_ttl;
-	spin_unlock(&cap->session->s_cap_lock);
+	spin_unlock(&cap->session->s_gen_ttl_lock);
 
 	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
 		dout("__cap_is_valid %p cap %p issued %s "
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 98954003a8d3..63c52f33361b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -975,10 +975,10 @@ static int dentry_lease_is_valid(struct dentry *dentry)
 	di = ceph_dentry(dentry);
 	if (di && di->lease_session) {
 		s = di->lease_session;
-		spin_lock(&s->s_cap_lock);
+		spin_lock(&s->s_gen_ttl_lock);
 		gen = s->s_cap_gen;
 		ttl = s->s_cap_ttl;
-		spin_unlock(&s->s_cap_lock);
+		spin_unlock(&s->s_gen_ttl_lock);
 
 		if (di->lease_gen == gen &&
 		    time_before(jiffies, dentry->d_time) &&
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index be1415fcaac8..a4fdf9397a90 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -400,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
 	s->s_con.peer_name.num = cpu_to_le64(mds);
 
-	spin_lock_init(&s->s_cap_lock);
+	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
 	s->s_cap_ttl = 0;
+
+	spin_lock_init(&s->s_cap_lock);
 	s->s_renew_requested = 0;
 	s->s_renew_seq = 0;
 	INIT_LIST_HEAD(&s->s_caps);
@@ -2328,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session,
 	case CEPH_SESSION_STALE:
 		pr_info("mds%d caps went stale, renewing\n",
 			session->s_mds);
-		spin_lock(&session->s_cap_lock);
+		spin_lock(&session->s_gen_ttl_lock);
 		session->s_cap_gen++;
 		session->s_cap_ttl = 0;
-		spin_unlock(&session->s_cap_lock);
+		spin_unlock(&session->s_gen_ttl_lock);
 		send_renew_caps(mdsc, session);
 		break;
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e39475..8c7c04ebb595 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
 	void             *s_authorizer_buf, *s_authorizer_reply_buf;
 	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
 
-	/* protected by s_cap_lock */
-	spinlock_t        s_cap_lock;
+	/* protected by s_gen_ttl_lock */
+	spinlock_t        s_gen_ttl_lock;
 	u32               s_cap_gen;  /* inc each time we get mds stale msg */
 	unsigned long     s_cap_ttl;  /* when session caps expire */
+
+	/* protected by s_cap_lock */
+	spinlock_t        s_cap_lock;
 	struct list_head  s_caps;     /* all caps issued by this session */
 	int               s_nr_caps, s_trim_caps;
 	int               s_num_cap_releases;
-- 
cgit v1.2.3


From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001
From: Christopher Yeoh <cyeoh@au1.ibm.com>
Date: Thu, 2 Feb 2012 11:34:09 +1030
Subject: Fix race in process_vm_rw_core

This fixes the race in process_vm_core found by Oleg (see

  http://article.gmane.org/gmane.linux.kernel/1235667/

for details).

This has been updated since I last sent it as the creation of the new
mm_access() function did almost exactly the same thing as parts of the
previous version of this patch did.

In order to use mm_access() even when /proc isn't enabled, we move it to
kernel/fork.c where other related process mm access functions already
are.

Signed-off-by: Chris Yeoh <yeohc@au1.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d9512bd03e6c..d4548dd49b02 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
-{
-	struct mm_struct *mm;
-	int err;
-
-	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
-	if (err)
-		return ERR_PTR(err);
-
-	mm = get_task_mm(task);
-	if (mm && mm != current->mm &&
-			!ptrace_may_access(task, mode)) {
-		mmput(mm);
-		mm = ERR_PTR(-EACCES);
-	}
-	mutex_unlock(&task->signal->cred_guard_mutex);
-
-	return mm;
-}
-
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	return mm_access(task, PTRACE_MODE_READ);
-- 
cgit v1.2.3


From de47a4176c532ef5961b8a46a2d541a3517412d3 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 2 Feb 2012 15:28:28 -0600
Subject: cifs: Fix oops in session setup code for null user mounts

For null user mounts, do not invoke string length function
during session setup.

Cc: <stable@kernel.org
Reported-and-Tested-by: Chris Clayton <chris2553@googlemail.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/sess.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index eb767412177d..551d0c2b9736 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 	/* copy user */
 	/* BB what about null user mounts - check that we do this BB */
 	/* copy user */
-	if (ses->user_name != NULL)
+	if (ses->user_name != NULL) {
 		strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
+		bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
+	}
 	/* else null user mount */
-
-	bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
 	*bcc_ptr = 0;
 	bcc_ptr++; /* account for null termination */
 
 	/* copy domain */
-
 	if (ses->domainName != NULL) {
 		strncpy(bcc_ptr, ses->domainName, 256);
 		bcc_ptr += strnlen(ses->domainName, 256);
-- 
cgit v1.2.3


From 331818f1c468a24e581aedcbe52af799366a9dfe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 3 Feb 2012 18:30:53 -0500
Subject: NFSv4: Fix an Oops in the NFSv4 getacl code

Commit bf118a342f10dafe44b14451a1392c3254629a1f (NFSv4: include bitmap
in nfsv4 get acl data) introduces the 'acl_scratch' page for the case
where we may need to decode multi-page data. However it fails to take
into account the fact that the variable may be NULL (for the case where
we're not doing multi-page decode), and it also attaches it to the
encoding xdr_stream rather than the decoding one.

The immediate result is an Oops in nfs4_xdr_enc_getacl due to the
call to page_address() with a NULL page pointer.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Andy Adamson <andros@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4proc.c | 8 ++++----
 fs/nfs/nfs4xdr.c  | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f0c849c98fe4..d202e04aca94 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3575,8 +3575,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 	}
 	if (npages > 1) {
 		/* for decoding across pages */
-		args.acl_scratch = alloc_page(GFP_KERNEL);
-		if (!args.acl_scratch)
+		res.acl_scratch = alloc_page(GFP_KERNEL);
+		if (!res.acl_scratch)
 			goto out_free;
 	}
 	args.acl_len = npages * PAGE_SIZE;
@@ -3612,8 +3612,8 @@ out_free:
 	for (i = 0; i < npages; i++)
 		if (pages[i])
 			__free_page(pages[i]);
-	if (args.acl_scratch)
-		__free_page(args.acl_scratch);
+	if (res.acl_scratch)
+		__free_page(res.acl_scratch);
 	return ret;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 95e92e438407..33bd8d0f745d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2522,7 +2522,6 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
-	xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
 
 	encode_nops(&hdr);
 }
@@ -6032,6 +6031,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	struct compound_hdr hdr;
 	int status;
 
+	if (res->acl_scratch != NULL) {
+		void *p = page_address(res->acl_scratch);
+		xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
+	}
 	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-- 
cgit v1.2.3


From 96e02d1586782eadf051fa3d6bc4132d2447ac2c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 4 Feb 2012 10:47:10 +0100
Subject: exec: fix use-after-free bug in setup_new_exec()

Setting the task name is done within setup_new_exec() by accessing
bprm->filename. However this happens after flush_old_exec().
This may result in a use after free bug, flush_old_exec() may
"complete" vfork_done, which will wake up the parent which in turn
may free the passed in filename.
To fix this add a new tcomm field in struct linux_binprm which
contains the now early generated task name until it is used.

Fixes this bug on s390:

  Unable to handle kernel pointer dereference at virtual kernel address 0000000039768000
  Process kworker/u:3 (pid: 245, task: 000000003a3dc840, ksp: 0000000039453818)
  Krnl PSW : 0704000180000000 0000000000282e94 (setup_new_exec+0xa0/0x374)
  Call Trace:
  ([<0000000000282e2c>] setup_new_exec+0x38/0x374)
   [<00000000002dd12e>] load_elf_binary+0x402/0x1bf4
   [<0000000000280a42>] search_binary_handler+0x38e/0x5bc
   [<0000000000282b6c>] do_execve_common+0x410/0x514
   [<0000000000282cb6>] do_execve+0x46/0x58
   [<00000000005bce58>] kernel_execve+0x28/0x70
   [<000000000014ba2e>] ____call_usermodehelper+0x102/0x140
   [<00000000005bc8da>] kernel_thread_starter+0x6/0xc
   [<00000000005bc8d4>] kernel_thread_starter+0x0/0xc
  Last Breaking-Event-Address:
   [<00000000002830f0>] setup_new_exec+0x2fc/0x374

  Kernel panic - not syncing: Fatal exception: panic_on_oops

Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index aeb135c7ff5c..92ce83a11e90 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1071,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	perf_event_comm(tsk);
 }
 
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+	int i, ch;
+
+	/* Copies the binary name from after last slash */
+	for (i = 0; (ch = *(fn++)) != '\0';) {
+		if (ch == '/')
+			i = 0; /* overwrite what we wrote */
+		else
+			if (i < len - 1)
+				tcomm[i++] = ch;
+	}
+	tcomm[i] = '\0';
+}
+
 int flush_old_exec(struct linux_binprm * bprm)
 {
 	int retval;
@@ -1085,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	set_mm_exe_file(bprm->mm, bprm->file);
 
+	filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
 	/*
 	 * Release all of the old mmap stuff
 	 */
@@ -1116,10 +1132,6 @@ EXPORT_SYMBOL(would_dump);
 
 void setup_new_exec(struct linux_binprm * bprm)
 {
-	int i, ch;
-	const char *name;
-	char tcomm[sizeof(current->comm)];
-
 	arch_pick_mmap_layout(current->mm);
 
 	/* This is the point of no return */
@@ -1130,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	else
 		set_dumpable(current->mm, suid_dumpable);
 
-	name = bprm->filename;
-
-	/* Copies the binary name from after last slash */
-	for (i=0; (ch = *(name++)) != '\0';) {
-		if (ch == '/')
-			i = 0; /* overwrite what we wrote */
-		else
-			if (i < (sizeof(tcomm) - 1))
-				tcomm[i++] = ch;
-	}
-	tcomm[i] = '\0';
-	set_task_comm(current, tcomm);
+	set_task_comm(current, bprm->tcomm);
 
 	/* Set the new mm task size. We have to do that late because it may
 	 * depend on TIF_32BIT which is only updated in flush_thread() on
-- 
cgit v1.2.3


From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 7 Feb 2012 07:51:30 +0100
Subject: block: strip out locking optimization in put_io_context()

put_io_context() performed a complex trylock dancing to avoid
deferring ioc release to workqueue.  It was also broken on UP because
trylock was always assumed to succeed which resulted in unbalanced
preemption count.

While there are ways to fix the UP breakage, even the most
pathological microbench (forced ioc allocation and tight fork/exit
loop) fails to show any appreciable performance benefit of the
optimization.  Strip it out.  If there turns out to be workloads which
are affected by this change, simpler optimization from the discussion
thread can be applied later.

Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <1328514611.21268.66.camel@sli10-conroe>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/ioprio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index f84b380d65e5..0f1b9515213b 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
 		ioc_ioprio_changed(ioc, ioprio);
-		put_io_context(ioc, NULL);
+		put_io_context(ioc);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 4edc53c1f8cdd99d349165d6c61c45aa4e8e2564 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 7 Feb 2012 06:30:51 -0500
Subject: cifs: fix error handling when cifscreds key payload is an error

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9c288653e6d6..940189bd6490 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2125,7 +2125,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	down_read(&key->sem);
 	upayload = key->payload.data;
 	if (IS_ERR_OR_NULL(upayload)) {
-		rc = PTR_ERR(key);
+		rc = upayload ? PTR_ERR(upayload) : -EINVAL;
 		goto out_key_put;
 	}
 
-- 
cgit v1.2.3


From 8b0192a5f478da1c1ae906bf3ffff53f26204f56 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 7 Feb 2012 06:30:52 -0500
Subject: cifs: request oplock when doing open on lookup

Currently, it's always set to 0 (no oplock requested).

Cc: <stable@vger.kernel.org>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index df8fecb5b993..63a196b97d50 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 {
 	int xid;
 	int rc = 0; /* to get around spurious gcc warning, set to zero here */
-	__u32 oplock = 0;
+	__u32 oplock = enable_oplocks ? REQ_OPLOCK : 0;
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
-- 
cgit v1.2.3


From ff4fa4a25a33f92b5653bb43add0c63bea98d464 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 7 Feb 2012 06:31:05 -0500
Subject: cifs: don't return error from standard_receive3 after marking
 response malformed

standard_receive3 will check the validity of the response from the
server (via checkSMB). It'll pass the result of that check to handle_mid
which will dequeue it and mark it with a status of
MID_RESPONSE_MALFORMED if checkSMB returned an error. At that point,
standard_receive3 will also return an error, which will make the
demultiplex thread skip doing the callback for the mid.

This is wrong -- if we were able to identify the request and the
response is marked malformed, then we want the demultiplex thread to do
the callback. Fix this by making standard_receive3 return 0 in this
situation.

Cc: stable@vger.kernel.org
Reported-and-Tested-by: Mark Moseley <moseleymark@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 940189bd6490..602f77c304c9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -773,10 +773,11 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 		cifs_dump_mem("Bad SMB: ", buf,
 			min_t(unsigned int, server->total_read, 48));
 
-	if (mid)
-		handle_mid(mid, server, smb_buffer, length);
+	if (!mid)
+		return length;
 
-	return length;
+	handle_mid(mid, server, smb_buffer, length);
+	return 0;
 }
 
 static int
-- 
cgit v1.2.3


From 5abebfdd02450fa1349daacf242e70b3736581e3 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Wed, 8 Feb 2012 22:07:18 +0100
Subject: bio: don't overflow in bio_get_nr_vecs()

There were two places bio_get_nr_vecs() could overflow:

First, it did a left shift to convert from sectors to bytes immediately
before dividing by PAGE_SIZE.  If PAGE_SIZE ever was less than 512 a great
many things would break, so dividing by PAGE_SIZE >> 9 is safe and will
generate smaller code too.

The nastier overflow was in the DIV_ROUND_UP() (that's what the code was
effectively doing, anyways).  If n + d overflowed, the whole thing would
return 0 which breaks things rather effectively.

bio_get_nr_vecs() doesn't claim to give an exact value anyways, so the
DIV_ROUND_UP() is silly; we could do a straight divide except if a
device's queue_max_sectors was less than PAGE_SIZE we'd return 0.  So we
just add 1; this should always be safe - things will break badly if
bio_get_nr_vecs() returns > BIO_MAX_PAGES (bio_alloc() will suddenly start
failing) but it's queue_max_segments that must guard against this, if
queue_max_sectors is preventing this from happen things are going to
explode on architectures with different PAGE_SIZE.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/bio.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index b1fe82cf88cf..b980ecde026a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone);
 int bio_get_nr_vecs(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-	int nr_pages;
-
-	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > queue_max_segments(q))
-		nr_pages = queue_max_segments(q);
-
-	return nr_pages;
+	return min_t(unsigned,
+		     queue_max_segments(q),
+		     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
 }
 EXPORT_SYMBOL(bio_get_nr_vecs);
 
-- 
cgit v1.2.3


From 1ecd3c7ea76488c63b4b0a2561fd7eaf96cc8028 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Wed, 8 Feb 2012 17:13:37 -0800
Subject: nilfs2: avoid overflowing segment numbers in
 nilfs_ioctl_clean_segments()

nsegs is read from userspace.  Limit its value and avoid overflowing nsegs
* sizeof(__u64) in the subsequent call to memdup_user().

This patch complements 481fe17e973fb9 ("nilfs2: potential integer overflow
in nilfs_ioctl_clean_segments()").

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Cc: Haogang Chen <haogangchen@gmail.com>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 886649627c3d..2a70fce70c65 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	nsegs = argv[4].v_nmembs;
 	if (argv[4].v_size != argsz[4])
 		goto out;
+	if (nsegs > UINT_MAX / sizeof(__u64))
+		goto out;
 
 	/*
 	 * argv[4] points to segment numbers this ioctl cleans.  We
-- 
cgit v1.2.3


From b9f9a03150969e4bd9967c20bce67c4de769058f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 9 Feb 2012 15:31:36 -0500
Subject: NFSv4: Ensure we throw out bad delegation stateids on
 NFS4ERR_BAD_STATEID

To ensure that we don't just reuse the bad delegation when we attempt to
recover the nfs4_state that received the bad stateid error.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4state.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a53f33b4ac3a..45392032e7bd 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1132,6 +1132,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
 {
 	struct nfs_client *clp = server->nfs_client;
 
+	if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags))
+		nfs_async_inode_return_delegation(state->inode, &state->stateid);
 	nfs4_state_mark_reclaim_nograce(clp, state);
 	nfs4_schedule_state_manager(clp);
 }
-- 
cgit v1.2.3


From 04da0c8196ac0b12fb6b84f4b7a51ad2fa56d869 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 1 Feb 2012 13:57:20 +0000
Subject: xfs: use a normal shrinker for the dquot freelist

Stop reusing dquots from the freelist when allocating new ones directly, and
implement a shrinker that actually follows the specifications for the
interface.  The shrinker implementation is still highly suboptimal at this
point, but we can gradually work on it.

This also fixes an bug in the previous lock ordering, where we would take
the hash and dqlist locks inside of the freelist lock against the normal
lock ordering.  This is only solvable by introducing the dispose list,
and thus not when using direct reclaim of unused dquots for new allocations.

As a side-effect the quota upper bound and used to free ratio values in
/proc/fs/xfs/xqm are set to 0 as these values don't make any sense in the
new world order.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/kmem.h         |   6 --
 fs/xfs/xfs_dquot.c    | 103 +++++-------------
 fs/xfs/xfs_qm.c       | 291 +++++++++++++++++++-------------------------------
 fs/xfs/xfs_qm.h       |  14 ---
 fs/xfs/xfs_qm_stats.c |   4 +-
 fs/xfs/xfs_trace.h    |   5 +-
 6 files changed, 141 insertions(+), 282 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 292eff198030..ab7c53fe346e 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone)
 extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
 extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
 
-static inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
-	return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
-}
-
 #endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4ff40b5f918..cbcb7bea38e2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -62,82 +62,6 @@ int xfs_dqerror_mod = 33;
 
 static struct lock_class_key xfs_dquot_other_class;
 
-/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
-	xfs_mount_t  *mp,
-	xfs_dqid_t   id,
-	uint	     type)
-{
-	xfs_dquot_t	*dqp;
-	boolean_t	brandnewdquot;
-
-	brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
-	dqp->dq_flags = type;
-	dqp->q_core.d_id = cpu_to_be32(id);
-	dqp->q_mount = mp;
-
-	/*
-	 * No need to re-initialize these if this is a reclaimed dquot.
-	 */
-	if (brandnewdquot) {
-		INIT_LIST_HEAD(&dqp->q_freelist);
-		mutex_init(&dqp->q_qlock);
-		init_waitqueue_head(&dqp->q_pinwait);
-
-		/*
-		 * Because we want to use a counting completion, complete
-		 * the flush completion once to allow a single access to
-		 * the flush completion without blocking.
-		 */
-		init_completion(&dqp->q_flush);
-		complete(&dqp->q_flush);
-
-		trace_xfs_dqinit(dqp);
-	} else {
-		/*
-		 * Only the q_core portion was zeroed in dqreclaim_one().
-		 * So, we need to reset others.
-		 */
-		dqp->q_nrefs = 0;
-		dqp->q_blkno = 0;
-		INIT_LIST_HEAD(&dqp->q_mplist);
-		INIT_LIST_HEAD(&dqp->q_hashlist);
-		dqp->q_bufoffset = 0;
-		dqp->q_fileoffset = 0;
-		dqp->q_transp = NULL;
-		dqp->q_gdquot = NULL;
-		dqp->q_res_bcount = 0;
-		dqp->q_res_icount = 0;
-		dqp->q_res_rtbcount = 0;
-		atomic_set(&dqp->q_pincount, 0);
-		dqp->q_hash = NULL;
-		ASSERT(list_empty(&dqp->q_freelist));
-
-		trace_xfs_dqreuse(dqp);
-	}
-
-	/*
-	 * In either case we need to make sure group quotas have a different
-	 * lock class than user quotas, to make sure lockdep knows we can
-	 * locks of one of each at the same time.
-	 */
-	if (!(type & XFS_DQ_USER))
-		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
-
-	/*
-	 * log item gets initialized later
-	 */
-	return (dqp);
-}
-
 /*
  * This is called to free all the memory associated with a dquot
  */
@@ -567,7 +491,32 @@ xfs_qm_dqread(
 	int			error;
 	int			cancelflags = 0;
 
-	dqp = xfs_qm_dqinit(mp, id, type);
+
+	dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+
+	dqp->dq_flags = type;
+	dqp->q_core.d_id = cpu_to_be32(id);
+	dqp->q_mount = mp;
+	INIT_LIST_HEAD(&dqp->q_freelist);
+	mutex_init(&dqp->q_qlock);
+	init_waitqueue_head(&dqp->q_pinwait);
+
+	/*
+	 * Because we want to use a counting completion, complete
+	 * the flush completion once to allow a single access to
+	 * the flush completion without blocking.
+	 */
+	init_completion(&dqp->q_flush);
+	complete(&dqp->q_flush);
+
+	/*
+	 * Make sure group quotas have a different lock class than user
+	 * quotas.
+	 */
+	if (!(type & XFS_DQ_USER))
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+	atomic_inc(&xfs_Gqm->qm_totaldquots);
 
 	trace_xfs_dqread(dqp);
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 671f37eae1c7..c436def733bf 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -50,7 +50,6 @@
  */
 struct mutex	xfs_Gqm_lock;
 struct xfs_qm	*xfs_Gqm;
-uint		ndquot;
 
 kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
@@ -93,7 +92,6 @@ xfs_Gqm_init(void)
 		goto out_free_udqhash;
 
 	hsize /= sizeof(xfs_dqhash_t);
-	ndquot = hsize << 8;
 
 	xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
 	xqm->qm_dqhashmask = hsize - 1;
@@ -137,7 +135,6 @@ xfs_Gqm_init(void)
 		xqm->qm_dqtrxzone = qm_dqtrxzone;
 
 	atomic_set(&xqm->qm_totaldquots, 0);
-	xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
 	xqm->qm_nrefs = 0;
 	return xqm;
 
@@ -1600,216 +1597,150 @@ xfs_qm_init_quotainos(
 	return 0;
 }
 
+STATIC void
+xfs_qm_dqfree_one(
+	struct xfs_dquot	*dqp)
+{
+	struct xfs_mount	*mp = dqp->q_mount;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
 
+	mutex_lock(&dqp->q_hash->qh_lock);
+	list_del_init(&dqp->q_hashlist);
+	dqp->q_hash->qh_version++;
+	mutex_unlock(&dqp->q_hash->qh_lock);
 
-/*
- * Pop the least recently used dquot off the freelist and recycle it.
- */
-STATIC struct xfs_dquot *
-xfs_qm_dqreclaim_one(void)
+	mutex_lock(&qi->qi_dqlist_lock);
+	list_del_init(&dqp->q_mplist);
+	qi->qi_dquots--;
+	qi->qi_dqreclaims++;
+	mutex_unlock(&qi->qi_dqlist_lock);
+
+	xfs_qm_dqdestroy(dqp);
+}
+
+STATIC void
+xfs_qm_dqreclaim_one(
+	struct xfs_dquot	*dqp,
+	struct list_head	*dispose_list)
 {
-	struct xfs_dquot	*dqp;
-	int			restarts = 0;
+	struct xfs_mount	*mp = dqp->q_mount;
+	int			error;
 
-	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-restart:
-	list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
-		struct xfs_mount *mp = dqp->q_mount;
+	if (!xfs_dqlock_nowait(dqp))
+		goto out_busy;
 
-		if (!xfs_dqlock_nowait(dqp))
-			continue;
+	/*
+	 * This dquot has acquired a reference in the meantime remove it from
+	 * the freelist and try again.
+	 */
+	if (dqp->q_nrefs) {
+		xfs_dqunlock(dqp);
 
-		/*
-		 * This dquot has already been grabbed by dqlookup.
-		 * Remove it from the freelist and try again.
-		 */
-		if (dqp->q_nrefs) {
-			trace_xfs_dqreclaim_want(dqp);
-			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-
-			list_del_init(&dqp->q_freelist);
-			xfs_Gqm->qm_dqfrlist_cnt--;
-			restarts++;
-			goto dqunlock;
-		}
+		trace_xfs_dqreclaim_want(dqp);
+		XQM_STATS_INC(xqmstats.xs_qm_dqwants);
 
-		ASSERT(dqp->q_hash);
-		ASSERT(!list_empty(&dqp->q_mplist));
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		return;
+	}
 
-		/*
-		 * Try to grab the flush lock. If this dquot is in the process
-		 * of getting flushed to disk, we don't want to reclaim it.
-		 */
-		if (!xfs_dqflock_nowait(dqp))
-			goto dqunlock;
+	ASSERT(dqp->q_hash);
+	ASSERT(!list_empty(&dqp->q_mplist));
 
-		/*
-		 * We have the flush lock so we know that this is not in the
-		 * process of being flushed. So, if this is dirty, flush it
-		 * DELWRI so that we don't get a freelist infested with
-		 * dirty dquots.
-		 */
-		if (XFS_DQ_IS_DIRTY(dqp)) {
-			int	error;
+	/*
+	 * Try to grab the flush lock. If this dquot is in the process of
+	 * getting flushed to disk, we don't want to reclaim it.
+	 */
+	if (!xfs_dqflock_nowait(dqp))
+		goto out_busy;
 
-			trace_xfs_dqreclaim_dirty(dqp);
+	/*
+	 * We have the flush lock so we know that this is not in the
+	 * process of being flushed. So, if this is dirty, flush it
+	 * DELWRI so that we don't get a freelist infested with
+	 * dirty dquots.
+	 */
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		trace_xfs_dqreclaim_dirty(dqp);
 
-			/*
-			 * We flush it delayed write, so don't bother
-			 * releasing the freelist lock.
-			 */
-			error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK);
-			if (error) {
-				xfs_warn(mp, "%s: dquot %p flush failed",
-					__func__, dqp);
-			}
-			goto dqunlock;
+		/*
+		 * We flush it delayed write, so don't bother releasing the
+		 * freelist lock.
+		 */
+		error = xfs_qm_dqflush(dqp, 0);
+		if (error) {
+			xfs_warn(mp, "%s: dquot %p flush failed",
+				 __func__, dqp);
 		}
-		xfs_dqfunlock(dqp);
 
 		/*
-		 * Prevent lookup now that we are going to reclaim the dquot.
-		 * Once XFS_DQ_FREEING is set lookup won't touch the dquot,
-		 * thus we can drop the lock now.
+		 * Give the dquot another try on the freelist, as the
+		 * flushing will take some time.
 		 */
-		dqp->dq_flags |= XFS_DQ_FREEING;
-		xfs_dqunlock(dqp);
-
-		mutex_lock(&dqp->q_hash->qh_lock);
-		list_del_init(&dqp->q_hashlist);
-		dqp->q_hash->qh_version++;
-		mutex_unlock(&dqp->q_hash->qh_lock);
-
-		mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-		list_del_init(&dqp->q_mplist);
-		mp->m_quotainfo->qi_dquots--;
-		mp->m_quotainfo->qi_dqreclaims++;
-		mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+		goto out_busy;
+	}
+	xfs_dqfunlock(dqp);
 
-		ASSERT(dqp->q_nrefs == 0);
-		list_del_init(&dqp->q_freelist);
-		xfs_Gqm->qm_dqfrlist_cnt--;
+	/*
+	 * Prevent lookups now that we are past the point of no return.
+	 */
+	dqp->dq_flags |= XFS_DQ_FREEING;
+	xfs_dqunlock(dqp);
 
-		mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-		return dqp;
-dqunlock:
-		xfs_dqunlock(dqp);
-		if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-			break;
-		goto restart;
-	}
+	ASSERT(dqp->q_nrefs == 0);
+	list_move_tail(&dqp->q_freelist, dispose_list);
+	xfs_Gqm->qm_dqfrlist_cnt--;
 
-	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-	return NULL;
-}
+	trace_xfs_dqreclaim_done(dqp);
+	XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
+	return;
 
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- */
-STATIC int
-xfs_qm_shake_freelist(
-	int	howmany)
-{
-	int		nreclaimed = 0;
-	xfs_dquot_t	*dqp;
+out_busy:
+	xfs_dqunlock(dqp);
 
-	if (howmany <= 0)
-		return 0;
+	/*
+	 * Move the dquot to the tail of the list so that we don't spin on it.
+	 */
+	list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
 
-	while (nreclaimed < howmany) {
-		dqp = xfs_qm_dqreclaim_one();
-		if (!dqp)
-			return nreclaimed;
-		xfs_qm_dqdestroy(dqp);
-		nreclaimed++;
-	}
-	return nreclaimed;
+	trace_xfs_dqreclaim_busy(dqp);
+	XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
 }
 
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
 STATIC int
 xfs_qm_shake(
-	struct shrinker	*shrink,
-	struct shrink_control *sc)
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
 {
-	int	ndqused, nfree, n;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (!kmem_shake_allow(gfp_mask))
-		return 0;
-	if (!xfs_Gqm)
-		return 0;
-
-	nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
-	/* incore dquots in all f/s's */
-	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
-	ASSERT(ndqused >= 0);
+	int			nr_to_scan = sc->nr_to_scan;
+	LIST_HEAD		(dispose_list);
+	struct xfs_dquot	*dqp;
 
-	if (nfree <= ndqused && nfree < ndquot)
+	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
 		return 0;
+	if (!nr_to_scan)
+		goto out;
 
-	ndqused *= xfs_Gqm->qm_dqfree_ratio;	/* target # of free dquots */
-	n = nfree - ndqused - ndquot;		/* # over target */
-
-	return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
-	xfs_dquot_t **O_dqpp)
-{
-	xfs_dquot_t	*dqp;
-
-	/*
-	 * Check against high water mark to see if we want to pop
-	 * a nincompoop dquot off the freelist.
-	 */
-	if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
-		/*
-		 * Try to recycle a dquot from the freelist.
-		 */
-		if ((dqp = xfs_qm_dqreclaim_one())) {
-			XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
-			/*
-			 * Just zero the core here. The rest will get
-			 * reinitialized by caller. XXX we shouldn't even
-			 * do this zero ...
-			 */
-			memset(&dqp->q_core, 0, sizeof(dqp->q_core));
-			*O_dqpp = dqp;
-			return B_FALSE;
-		}
-		XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+	while (!list_empty(&xfs_Gqm->qm_dqfrlist)) {
+		if (nr_to_scan-- <= 0)
+			break;
+		dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot,
+				       q_freelist);
+		xfs_qm_dqreclaim_one(dqp, &dispose_list);
 	}
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	/*
-	 * Allocate a brand new dquot on the kernel heap and return it
-	 * to the caller to initialize.
-	 */
-	ASSERT(xfs_Gqm->qm_dqzone != NULL);
-	*O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
-	atomic_inc(&xfs_Gqm->qm_totaldquots);
-
-	return B_TRUE;
+	while (!list_empty(&dispose_list)) {
+		dqp = list_first_entry(&dispose_list, struct xfs_dquot,
+				       q_freelist);
+		list_del_init(&dqp->q_freelist);
+		xfs_qm_dqfree_one(dqp);
+	}
+out:
+	return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure;
 }
 
-
 /*
  * Start a transaction and write the incore superblock changes to
  * disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9b4f3adefbc5..9a9b997e1a0a 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -26,23 +26,11 @@
 struct xfs_qm;
 struct xfs_inode;
 
-extern uint		ndquot;
 extern struct mutex	xfs_Gqm_lock;
 extern struct xfs_qm	*xfs_Gqm;
 extern kmem_zone_t	*qm_dqzone;
 extern kmem_zone_t	*qm_dqtrxzone;
 
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS	4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO		2
-
 /*
  * Dquot hashtable constants/threshold values.
  */
@@ -74,7 +62,6 @@ typedef struct xfs_qm {
 	int		 qm_dqfrlist_cnt;
 	atomic_t	 qm_totaldquots; /* total incore dquots */
 	uint		 qm_nrefs;	 /* file systems with quota on */
-	int		 qm_dqfree_ratio;/* ratio of free to inuse dquots */
 	kmem_zone_t	*qm_dqzone;	 /* dquot mem-alloc zone */
 	kmem_zone_t	*qm_dqtrxzone;	 /* t_dqinfo of transactions */
 } xfs_qm_t;
@@ -143,7 +130,6 @@ extern int		xfs_qm_quotacheck(xfs_mount_t *);
 extern int		xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
 
 /* dquot stuff */
-extern boolean_t	xfs_qm_dqalloc_incore(xfs_dquot_t **);
 extern int		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
index 8671a0b32644..5729ba570877 100644
--- a/fs/xfs/xfs_qm_stats.c
+++ b/fs/xfs/xfs_qm_stats.c
@@ -42,9 +42,9 @@ static int xqm_proc_show(struct seq_file *m, void *v)
 {
 	/* maximum; incore; ratio free to inuse; freelist */
 	seq_printf(m, "%d\t%d\t%d\t%u\n",
-			ndquot,
+			0,
 			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
-			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+			0,
 			xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
 	return 0;
 }
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6b6df5802e95..bb134a819930 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -733,11 +733,10 @@ DEFINE_EVENT(xfs_dquot_class, name, \
 DEFINE_DQUOT_EVENT(xfs_dqadjust);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
 DEFINE_DQUOT_EVENT(xfs_dqattach_found);
 DEFINE_DQUOT_EVENT(xfs_dqattach_get);
-DEFINE_DQUOT_EVENT(xfs_dqinit);
-DEFINE_DQUOT_EVENT(xfs_dqreuse);
 DEFINE_DQUOT_EVENT(xfs_dqalloc);
 DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
 DEFINE_DQUOT_EVENT(xfs_dqread);
-- 
cgit v1.2.3


From 05293485a0b6b1f803e8a3c0ff188c38f6969985 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 13 Feb 2012 20:51:05 +0000
Subject: XFS: xfs_trans_add_item() - don't assign in ASSERT() when compare is
 intended

It looks to me like the two ASSERT()s in xfs_trans_add_item() really
want to do a compare (==) rather than assignment (=).
This patch changes it from the latter to the former.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_trans.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 329b06aba1c2..7adcdf15ae0c 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1151,8 +1151,8 @@ xfs_trans_add_item(
 {
 	struct xfs_log_item_desc *lidp;
 
-	ASSERT(lip->li_mountp = tp->t_mountp);
-	ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+	ASSERT(lip->li_mountp == tp->t_mountp);
+	ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
 
 	lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
 
-- 
cgit v1.2.3


From e188dc02d3a9c911be56eca5aa114fe7e9822d53 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 3 Feb 2012 14:25:18 +0100
Subject: vfs: fix d_inode_lookup() dentry ref leak

d_inode_lookup() leaks a dentry reference on IS_DEADDIR().

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 208c6aa4a989..a780ea515c47 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1095,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr
 	struct dentry *old;
 
 	/* Don't create child dentry for a dead directory. */
-	if (unlikely(IS_DEADDIR(inode)))
+	if (unlikely(IS_DEADDIR(inode))) {
+		dput(dentry);
 		return ERR_PTR(-ENOENT);
+	}
 
 	old = inode->i_op->lookup(inode, dentry, nd);
 	if (unlikely(old)) {
-- 
cgit v1.2.3


From 1d6f2097865e64963e90cce04980dce2f9fc023f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 22 Aug 2011 11:52:28 +0800
Subject: autofs4 - fix lockdep splat in autofs

When recursing down the locks when traversing a tree/list in
get_next_positive_dentry() or get_next_positive_subdir() a lock can
change from being nested to being a parent which breaks lockdep. This
patch tells lockdep about what we did.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/expire.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 450f529a4eae..1feb68ecef95 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -124,6 +124,7 @@ start:
 	/* Negative dentry - try next */
 	if (!simple_positive(q)) {
 		spin_unlock(&p->d_lock);
+		lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
 		p = q;
 		goto again;
 	}
@@ -186,6 +187,7 @@ again:
 	/* Negative dentry - try next */
 	if (!simple_positive(ret)) {
 		spin_unlock(&p->d_lock);
+		lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
 		p = ret;
 		goto again;
 	}
-- 
cgit v1.2.3


From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 8 Feb 2012 12:39:07 -0800
Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts

When the number of dentry cache hash table entries gets too high
(2147483648 entries), as happens by default on a 16TB system, use of a
signed integer in the dcache_init() initialization loop prevents the
dentry_hashtable from getting initialized, causing a panic in
__d_lookup().  Fix this in dcache_init() and similar areas.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 8 ++++----
 fs/inode.c  | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 16a53cc2cc02..fe19ac13f75f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries);
 
 static void __init dcache_init_early(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* If hashes are distributed across NUMA nodes, defer
 	 * hash allocation until vmalloc space is available.
@@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void)
 					&d_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << d_hash_shift); loop++)
+	for (loop = 0; loop < (1U << d_hash_shift); loop++)
 		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
 static void __init dcache_init(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* 
 	 * A constructor could be added for stable state like the lists,
@@ -3016,7 +3016,7 @@ static void __init dcache_init(void)
 					&d_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << d_hash_shift); loop++)
+	for (loop = 0; loop < (1U << d_hash_shift); loop++)
 		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index fb10d86ffad7..d3ebdbe723d0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries);
  */
 void __init inode_init_early(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* If hashes are distributed across NUMA nodes, defer
 	 * hash allocation until vmalloc space is available.
@@ -1669,13 +1669,13 @@ void __init inode_init_early(void)
 					&i_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << i_hash_shift); loop++)
+	for (loop = 0; loop < (1U << i_hash_shift); loop++)
 		INIT_HLIST_HEAD(&inode_hashtable[loop]);
 }
 
 void __init inode_init(void)
 {
-	int loop;
+	unsigned int loop;
 
 	/* inode slab cache */
 	inode_cachep = kmem_cache_create("inode_cache",
@@ -1699,7 +1699,7 @@ void __init inode_init(void)
 					&i_hash_mask,
 					0);
 
-	for (loop = 0; loop < (1 << i_hash_shift); loop++)
+	for (loop = 0; loop < (1U << i_hash_shift); loop++)
 		INIT_HLIST_HEAD(&inode_hashtable[loop]);
 }
 
-- 
cgit v1.2.3


From 6b6dc836a195e077e76977b6c020a73de411b46d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 10 Feb 2012 11:03:00 +0100
Subject: vfs: Provide function to get superblock and wait for it to thaw

In quota code we need to find a superblock corresponding to a device and wait
for superblock to be unfrozen. However this waiting has to happen without
s_umount semaphore because that is required for superblock to thaw. So provide
a function in VFS for this to keep dances with s_umount where they belong.

[AV: implementation switched to saner variant]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 6015c02296b7..6277ec6cb60a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -633,6 +633,28 @@ rescan:
 
 EXPORT_SYMBOL(get_super);
 
+/**
+ *	get_super_thawed - get thawed superblock of a device
+ *	@bdev: device to get the superblock for
+ *
+ *	Scans the superblock list and finds the superblock of the file system
+ *	mounted on the device. The superblock is returned once it is thawed
+ *	(or immediately if it was not frozen). %NULL is returned if no match
+ *	is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+	while (1) {
+		struct super_block *s = get_super(bdev);
+		if (!s || s->s_frozen == SB_UNFROZEN)
+			return s;
+		up_read(&s->s_umount);
+		vfs_check_frozen(s, SB_FREEZE_WRITE);
+		put_super(s);
+	}
+}
+EXPORT_SYMBOL(get_super_thawed);
+
 /**
  * get_active_super - get an active reference to the superblock of a device
  * @bdev: device to get the superblock for
-- 
cgit v1.2.3


From dcdbed853d9fbb0547b781ba676049b87f54129a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 10 Feb 2012 11:03:01 +0100
Subject: quota: Fix deadlock with suspend and quotas

This script causes a kernel deadlock:
set -e
DEVICE=/dev/vg1/linear
lvchange -ay $DEVICE
mkfs.ext3 $DEVICE
mount -t ext3 -o usrquota,grpquota $DEVICE /mnt/test
quotacheck -gu /mnt/test
umount /mnt/test
mount -t ext3 -o usrquota,grpquota $DEVICE /mnt/test
quotaon /mnt/test
dmsetup suspend $DEVICE
setquota -u root 1 2 3 4 /mnt/test &
sleep 1
dmsetup resume $DEVICE

setquota acquired semaphore s_umount for read and then tried to perform a
transaction (and waits because the device is suspended).  dmsetup resume tries
to acquire s_umount for write before resuming the device (and waits for
setquota).

Fix the deadlock by grabbing a thawed superblock for quota commands which need
it.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/quota/quota.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 7898cd688a00..fc2c4388d126 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -292,11 +292,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	}
 }
 
+/* Return 1 if 'cmd' will block on frozen filesystem */
+static int quotactl_cmd_write(int cmd)
+{
+	switch (cmd) {
+	case Q_GETFMT:
+	case Q_GETINFO:
+	case Q_SYNC:
+	case Q_XGETQSTAT:
+	case Q_XGETQUOTA:
+	case Q_XQUOTASYNC:
+		return 0;
+	}
+	return 1;
+}
+
 /*
  * look up a superblock on which quota ops will be performed
  * - use the name of a block device to find the superblock thereon
  */
-static struct super_block *quotactl_block(const char __user *special)
+static struct super_block *quotactl_block(const char __user *special, int cmd)
 {
 #ifdef CONFIG_BLOCK
 	struct block_device *bdev;
@@ -309,7 +324,10 @@ static struct super_block *quotactl_block(const char __user *special)
 	putname(tmp);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
-	sb = get_super(bdev);
+	if (quotactl_cmd_write(cmd))
+		sb = get_super_thawed(bdev);
+	else
+		sb = get_super(bdev);
 	bdput(bdev);
 	if (!sb)
 		return ERR_PTR(-ENODEV);
@@ -361,7 +379,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 			pathp = &path;
 	}
 
-	sb = quotactl_block(special);
+	sb = quotactl_block(special, cmds);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		goto out;
-- 
cgit v1.2.3


From fcf83067bf6eb101a35620d752bd559d473cfbaa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Feb 2012 20:56:29 -0500
Subject: vfs: fix compat_sys_stat() handling of overflows in st_nlink

Massaged cp_compat_stat() into form closer to cp_new_stat(); the only
real issue had been in handling of st_nlink overflows - native 32bit
stat(2) returns -EOVERFLOW in such situations, compat one silently
loses upper bits.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 56 +++++++++++++++++++++++++-------------------------------
 1 file changed, 25 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index fa9d721ecfee..07880bae28a9 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -131,41 +131,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim
 
 static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 {
-	compat_ino_t ino = stat->ino;
-	typeof(ubuf->st_uid) uid = 0;
-	typeof(ubuf->st_gid) gid = 0;
-	int err;
+	struct compat_stat tmp;
 
-	SET_UID(uid, stat->uid);
-	SET_GID(gid, stat->gid);
+	if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+		return -EOVERFLOW;
 
-	if ((u64) stat->size > MAX_NON_LFS ||
-	    !old_valid_dev(stat->dev) ||
-	    !old_valid_dev(stat->rdev))
+	memset(&tmp, 0, sizeof(tmp));
+	tmp.st_dev = old_encode_dev(stat->dev);
+	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
 		return -EOVERFLOW;
-	if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+	tmp.st_mode = stat->mode;
+	tmp.st_nlink = stat->nlink;
+	if (tmp.st_nlink != stat->nlink)
 		return -EOVERFLOW;
-
-	if (clear_user(ubuf, sizeof(*ubuf)))
-		return -EFAULT;
-
-	err  = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
-	err |= __put_user(ino, &ubuf->st_ino);
-	err |= __put_user(stat->mode, &ubuf->st_mode);
-	err |= __put_user(stat->nlink, &ubuf->st_nlink);
-	err |= __put_user(uid, &ubuf->st_uid);
-	err |= __put_user(gid, &ubuf->st_gid);
-	err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
-	err |= __put_user(stat->size, &ubuf->st_size);
-	err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
-	err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
-	err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
-	err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
-	err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
-	err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
-	err |= __put_user(stat->blksize, &ubuf->st_blksize);
-	err |= __put_user(stat->blocks, &ubuf->st_blocks);
-	return err;
+	SET_UID(tmp.st_uid, stat->uid);
+	SET_GID(tmp.st_gid, stat->gid);
+	tmp.st_rdev = old_encode_dev(stat->rdev);
+	if ((u64) stat->size > MAX_NON_LFS)
+		return -EOVERFLOW;
+	tmp.st_size = stat->size;
+	tmp.st_atime = stat->atime.tv_sec;
+	tmp.st_atime_nsec = stat->atime.tv_nsec;
+	tmp.st_mtime = stat->mtime.tv_sec;
+	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+	tmp.st_ctime = stat->ctime.tv_sec;
+	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+	tmp.st_blocks = stat->blocks;
+	tmp.st_blksize = stat->blksize;
+	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
 
 asmlinkage long compat_sys_newstat(const char __user * filename,
-- 
cgit v1.2.3


From 847c9db5cb50841589b8ebd3da0769b1b02fb3b2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Feb 2012 21:00:05 -0500
Subject: ocfs2: deal with wraparounds of i_nlink in ocfs2_rename()

unfortunately, nlink_t may be smaller than 32 bits and ->i_nlink
on ocfs2 can grow up to 0xffffffff; storing it in nlink_t variable
will lose upper bits on such architectures.  Needs to be made u32,
until we get kernel-side nlink_t uniformly 32bit...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be244692550d..a9856e3eaaf0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
-	nlink_t old_dir_nlink = old_dir->i_nlink;
+	u32 old_dir_nlink = old_dir->i_nlink;
 	struct ocfs2_dinode *old_di;
 	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
 	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
-- 
cgit v1.2.3


From 4a26620df451ad46151ad21d711ed43e963c004e Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Sat, 5 Nov 2011 13:45:08 -0400
Subject: eCryptfs: Improve statfs reporting

statfs() calls on eCryptfs files returned the wrong filesystem type and,
when using filename encryption, the wrong maximum filename length.

If mount-wide filename encryption is enabled, the cipher block size and
the lower filesystem's max filename length will determine the max
eCryptfs filename length. Pre-tested, known good lengths are used when
the lower filesystem's namelen is 255 and a cipher with 8 or 16 byte
block sizes is used. In other, less common cases, we fall back to a safe
rounded-down estimate when determining the eCryptfs namelen.

https://launchpad.net/bugs/885744

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: John Johansen <john.johansen@canonical.com>
---
 fs/ecryptfs/crypto.c          | 68 ++++++++++++++++++++++++++++++++++++++-----
 fs/ecryptfs/ecryptfs_kernel.h |  6 ++++
 fs/ecryptfs/keystore.c        |  9 ++----
 fs/ecryptfs/super.c           | 14 ++++++++-
 4 files changed, 83 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 63ab24510649..ea9931281557 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1990,6 +1990,17 @@ out:
 	return;
 }
 
+static size_t ecryptfs_max_decoded_size(size_t encoded_size)
+{
+	/* Not exact; conservatively long. Every block of 4
+	 * encoded characters decodes into a block of 3
+	 * decoded characters. This segment of code provides
+	 * the caller with the maximum amount of allocated
+	 * space that @dst will need to point to in a
+	 * subsequent call. */
+	return ((encoded_size + 1) * 3) / 4;
+}
+
 /**
  * ecryptfs_decode_from_filename
  * @dst: If NULL, this function only sets @dst_size and returns. If
@@ -2008,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
 	size_t dst_byte_offset = 0;
 
 	if (dst == NULL) {
-		/* Not exact; conservatively long. Every block of 4
-		 * encoded characters decodes into a block of 3
-		 * decoded characters. This segment of code provides
-		 * the caller with the maximum amount of allocated
-		 * space that @dst will need to point to in a
-		 * subsequent call. */
-		(*dst_size) = (((src_size + 1) * 3) / 4);
+		(*dst_size) = ecryptfs_max_decoded_size(src_size);
 		goto out;
 	}
 	while (src_byte_offset < src_size) {
@@ -2239,3 +2244,52 @@ out_free:
 out:
 	return rc;
 }
+
+#define ENC_NAME_MAX_BLOCKLEN_8_OR_16	143
+
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+			   struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	struct blkcipher_desc desc;
+	struct mutex *tfm_mutex;
+	size_t cipher_blocksize;
+	int rc;
+
+	if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+		(*namelen) = lower_namelen;
+		return 0;
+	}
+
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+			mount_crypt_stat->global_default_fn_cipher_name);
+	if (unlikely(rc)) {
+		(*namelen) = 0;
+		return rc;
+	}
+
+	mutex_lock(tfm_mutex);
+	cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+	mutex_unlock(tfm_mutex);
+
+	/* Return an exact amount for the common cases */
+	if (lower_namelen == NAME_MAX
+	    && (cipher_blocksize == 8 || cipher_blocksize == 16)) {
+		(*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16;
+		return 0;
+	}
+
+	/* Return a safe estimate for the uncommon cases */
+	(*namelen) = lower_namelen;
+	(*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+	/* Since this is the max decoded size, subtract 1 "decoded block" len */
+	(*namelen) = ecryptfs_max_decoded_size(*namelen) - 3;
+	(*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE;
+	(*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES;
+	/* Worst case is that the filename is padded nearly a full block size */
+	(*namelen) -= cipher_blocksize - 1;
+
+	if ((*namelen) < 0)
+		(*namelen) = 0;
+
+	return 0;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a2362df58ae8..867b64c5d84f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -162,6 +162,10 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
 #define MD5_DIGEST_SIZE 16
 #define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+					   + ECRYPTFS_SIG_SIZE + 1 + 1)
+#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+					   + ECRYPTFS_SIG_SIZE + 1 + 1)
 #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
 #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
 #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
@@ -701,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 			     size_t *packet_size,
 			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 			     char *data, size_t max_packet_size);
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+			   struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
 int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
 		       loff_t offset);
 
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 8e3b943e330f..2333203a120b 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -679,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	 * Octets N3-N4: Block-aligned encrypted filename
 	 *  - Consists of a minimum number of random characters, a \0
 	 *    separator, and then the filename */
-	s->max_packet_size = (1                   /* Tag 70 identifier */
-			      + 3                 /* Max Tag 70 packet size */
-			      + ECRYPTFS_SIG_SIZE /* FNEK sig */
-			      + 1                 /* Cipher identifier */
+	s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE
 			      + s->block_aligned_filename_size);
 	if (dest == NULL) {
 		(*packet_size) = s->max_packet_size;
@@ -934,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-	if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+	if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
 		printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
 		       "at least [%d]\n", __func__, max_packet_size,
-			(1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+		       ECRYPTFS_TAG_70_MIN_METADATA_SIZE);
 		rc = -EINVAL;
 		goto out;
 	}
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 9df7fd6e0c39..cf152823bbf4 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -30,6 +30,8 @@
 #include <linux/seq_file.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
 struct kmem_cache *ecryptfs_inode_info_cache;
@@ -102,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	int rc;
 
 	if (!lower_dentry->d_sb->s_op->statfs)
 		return -ENOSYS;
-	return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+
+	rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+	if (rc)
+		return rc;
+
+	buf->f_type = ECRYPTFS_SUPER_MAGIC;
+	rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen,
+	       &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat);
+
+	return rc;
 }
 
 /**
-- 
cgit v1.2.3


From 545d680938be1e86a6c5250701ce9abaf360c495 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Tue, 7 Feb 2012 17:55:40 -0600
Subject: eCryptfs: Copy up lower inode attrs after setting lower xattr

After passing through a ->setxattr() call, eCryptfs needs to copy the
inode attributes from the lower inode to the eCryptfs inode, as they
may have changed in the lower filesystem's ->setxattr() path.

One example is if an extended attribute containing a POSIX Access
Control List is being set. The new ACL may cause the lower filesystem to
modify the mode of the lower inode and the eCryptfs inode would need to
be updated to reflect the new mode.

https://launchpad.net/bugs/926292

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Sebastien Bacher <seb128@ubuntu.com>
Cc: John Johansen <john.johansen@canonical.com>
Cc: <stable@vger.kernel.org>
---
 fs/ecryptfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 19892d7d2ed1..ab35b113003b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1085,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	}
 
 	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+	if (!rc)
+		fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 465c9343c5b746ec2325a220fa3e50cc647d2db7 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Fri, 10 Feb 2012 13:39:50 +0800
Subject: ecryptfs: remove the second argument of k[un]map_atomic()

Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/mmap.c       | 4 ++--
 fs/ecryptfs/read_write.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 10ec695ccd68..a46b3a8fee1e 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -150,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 			/* This is a header extent */
 			char *page_virt;
 
-			page_virt = kmap_atomic(page, KM_USER0);
+			page_virt = kmap_atomic(page);
 			memset(page_virt, 0, PAGE_CACHE_SIZE);
 			/* TODO: Support more than one header extent */
 			if (view_extent_num == 0) {
@@ -163,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 							       crypt_stat,
 							       &written);
 			}
-			kunmap_atomic(page_virt, KM_USER0);
+			kunmap_atomic(page_virt);
 			flush_dcache_page(page);
 			if (rc) {
 				printk(KERN_ERR "%s: Error reading xattr "
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 5c0106f75775..b2a34a192f4f 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -156,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 			       ecryptfs_page_idx, rc);
 			goto out;
 		}
-		ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
+		ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
 
 		/*
 		 * pos: where we're now writing, offset: where the request was
@@ -179,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 			       (data + data_offset), num_bytes);
 			data_offset += num_bytes;
 		}
-		kunmap_atomic(ecryptfs_page_virt, KM_USER0);
+		kunmap_atomic(ecryptfs_page_virt);
 		flush_dcache_page(ecryptfs_page);
 		SetPageUptodate(ecryptfs_page);
 		unlock_page(ecryptfs_page);
-- 
cgit v1.2.3


From f86f36a6ae625eda87a13e1ea102a908e08f491b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 14 Feb 2012 20:33:19 -0500
Subject: NFSv4.1: Fix a NFSv4.1 session initialisation regression

Commit aacd553 (NFSv4.1: cleanup init and reset of session slot tables)
introduces a regression in the session initialisation code. New tables
now find their sequence ids initialised to 0, rather than the mandated
value of 1 (see RFC5661).

Fix the problem by merging nfs4_reset_slot_table() and nfs4_init_slot_table().
Since the tbl->max_slots is initialised to 0, the test in
nfs4_reset_slot_table for max_reqs != tbl->max_slots will automatically
pass for an empty table.

Reported-by: Vitaliy Gusev <gusev.vitaliy@nexenta.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 107 +++++++++++++++++++++---------------------------------
 1 file changed, 42 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d202e04aca94..b4d67feab90b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5008,37 +5008,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 	return status;
 }
 
+static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
+{
+	return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
+}
+
+static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *new,
+		u32 max_slots,
+		u32 ivalue)
+{
+	struct nfs4_slot *old = NULL;
+	u32 i;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	if (new) {
+		old = tbl->slots;
+		tbl->slots = new;
+		tbl->max_slots = max_slots;
+	}
+	tbl->highest_used_slotid = -1;	/* no slot is currently used */
+	for (i = 0; i < tbl->max_slots; i++)
+		tbl->slots[i].seq_nr = ivalue;
+	spin_unlock(&tbl->slot_tbl_lock);
+	kfree(old);
+}
+
 /*
- * Reset a slot table
+ * (re)Initialise a slot table
  */
-static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
-				 int ivalue)
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
+				 u32 ivalue)
 {
 	struct nfs4_slot *new = NULL;
-	int i;
-	int ret = 0;
+	int ret = -ENOMEM;
 
 	dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
 		max_reqs, tbl->max_slots);
 
 	/* Does the newly negotiated max_reqs match the existing slot table? */
 	if (max_reqs != tbl->max_slots) {
-		ret = -ENOMEM;
-		new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
-			      GFP_NOFS);
+		new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
 		if (!new)
 			goto out;
-		ret = 0;
-		kfree(tbl->slots);
-	}
-	spin_lock(&tbl->slot_tbl_lock);
-	if (new) {
-		tbl->slots = new;
-		tbl->max_slots = max_reqs;
 	}
-	for (i = 0; i < tbl->max_slots; ++i)
-		tbl->slots[i].seq_nr = ivalue;
-	spin_unlock(&tbl->slot_tbl_lock);
+	ret = 0;
+
+	nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
 	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
 		tbl, tbl->slots, tbl->max_slots);
 out:
@@ -5060,36 +5076,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
 	return;
 }
 
-/*
- * Initialize slot table
- */
-static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
-		int max_slots, int ivalue)
-{
-	struct nfs4_slot *slot;
-	int ret = -ENOMEM;
-
-	BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
-
-	dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
-
-	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
-	if (!slot)
-		goto out;
-	ret = 0;
-
-	spin_lock(&tbl->slot_tbl_lock);
-	tbl->max_slots = max_slots;
-	tbl->slots = slot;
-	tbl->highest_used_slotid = -1;  /* no slot is currently used */
-	spin_unlock(&tbl->slot_tbl_lock);
-	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
-		tbl, tbl->slots, tbl->max_slots);
-out:
-	dprintk("<-- %s: return %d\n", __func__, ret);
-	return ret;
-}
-
 /*
  * Initialize or reset the forechannel and backchannel tables
  */
@@ -5101,25 +5087,16 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 	dprintk("--> %s\n", __func__);
 	/* Fore channel */
 	tbl = &ses->fc_slot_table;
-	if (tbl->slots == NULL) {
-		status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-		if (status) /* -ENOMEM */
-			return status;
-	} else {
-		status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-		if (status)
-			return status;
-	}
+	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+	if (status) /* -ENOMEM */
+		return status;
 	/* Back channel */
 	tbl = &ses->bc_slot_table;
-	if (tbl->slots == NULL) {
-		status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
-		if (status)
-			/* Fore and back channel share a connection so get
-			 * both slot tables or neither */
-			nfs4_destroy_slot_tables(ses);
-	} else
-		status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+	status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+	if (status && tbl->slots == NULL)
+		/* Fore and back channel share a connection so get
+		 * both slot tables or neither */
+		nfs4_destroy_slot_tables(ses);
 	return status;
 }
 
-- 
cgit v1.2.3


From abe9a6d57b4544ac208401f9c0a4262814db2be4 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Thu, 16 Feb 2012 11:17:05 -0500
Subject: NFSv4: fix server_scope memory leak

server_scope would never be freed if nfs4_check_cl_exchange_flags() returned
non-zero

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b4d67feab90b..ec9f6ef6c5dd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4883,8 +4883,10 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 				clp->cl_rpcclient->cl_auth->au_flavor);
 
 	res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
-	if (unlikely(!res.server_scope))
-		return -ENOMEM;
+	if (unlikely(!res.server_scope)) {
+		status = -ENOMEM;
+		goto out;
+	}
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	if (!status)
@@ -4901,12 +4903,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 			clp->server_scope = NULL;
 		}
 
-		if (!clp->server_scope)
+		if (!clp->server_scope) {
 			clp->server_scope = res.server_scope;
-		else
-			kfree(res.server_scope);
+			goto out;
+		}
 	}
-
+	kfree(res.server_scope);
+out:
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
-- 
cgit v1.2.3


From 20f12d8ac01917d96860f352f67eddd912df0afb Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Mon, 6 Feb 2012 12:50:07 +0000
Subject: xfs: change available ranges of softlimit and hardlimit in quota
 check

In general, quota allows us to use disk blocks and inodes up to each
limit, that is, they are available if they don't exceed their limitations.
Current xfs sets their available ranges to lower than them except disk
inode quota check. So, this patch changes the ranges to not beyond them.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: Ben Myers <bpm@sgi.com>
Cc: Alex Elder <elder@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dquot.c       | 24 ++++++++++++------------
 fs/xfs/xfs_log_recover.c |  6 +++---
 fs/xfs/xfs_qm_syscalls.c |  4 ++--
 fs/xfs/xfs_trans_dquot.c |  4 ++--
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index cbcb7bea38e2..53db20ee3e77 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -139,10 +139,10 @@ xfs_qm_adjust_dqtimers(
 
 	if (!d->d_btimer) {
 		if ((d->d_blk_softlimit &&
-		     (be64_to_cpu(d->d_bcount) >=
+		     (be64_to_cpu(d->d_bcount) >
 		      be64_to_cpu(d->d_blk_softlimit))) ||
 		    (d->d_blk_hardlimit &&
-		     (be64_to_cpu(d->d_bcount) >=
+		     (be64_to_cpu(d->d_bcount) >
 		      be64_to_cpu(d->d_blk_hardlimit)))) {
 			d->d_btimer = cpu_to_be32(get_seconds() +
 					mp->m_quotainfo->qi_btimelimit);
@@ -151,10 +151,10 @@ xfs_qm_adjust_dqtimers(
 		}
 	} else {
 		if ((!d->d_blk_softlimit ||
-		     (be64_to_cpu(d->d_bcount) <
+		     (be64_to_cpu(d->d_bcount) <=
 		      be64_to_cpu(d->d_blk_softlimit))) &&
 		    (!d->d_blk_hardlimit ||
-		    (be64_to_cpu(d->d_bcount) <
+		    (be64_to_cpu(d->d_bcount) <=
 		     be64_to_cpu(d->d_blk_hardlimit)))) {
 			d->d_btimer = 0;
 		}
@@ -162,10 +162,10 @@ xfs_qm_adjust_dqtimers(
 
 	if (!d->d_itimer) {
 		if ((d->d_ino_softlimit &&
-		     (be64_to_cpu(d->d_icount) >=
+		     (be64_to_cpu(d->d_icount) >
 		      be64_to_cpu(d->d_ino_softlimit))) ||
 		    (d->d_ino_hardlimit &&
-		     (be64_to_cpu(d->d_icount) >=
+		     (be64_to_cpu(d->d_icount) >
 		      be64_to_cpu(d->d_ino_hardlimit)))) {
 			d->d_itimer = cpu_to_be32(get_seconds() +
 					mp->m_quotainfo->qi_itimelimit);
@@ -174,10 +174,10 @@ xfs_qm_adjust_dqtimers(
 		}
 	} else {
 		if ((!d->d_ino_softlimit ||
-		     (be64_to_cpu(d->d_icount) <
+		     (be64_to_cpu(d->d_icount) <=
 		      be64_to_cpu(d->d_ino_softlimit)))  &&
 		    (!d->d_ino_hardlimit ||
-		     (be64_to_cpu(d->d_icount) <
+		     (be64_to_cpu(d->d_icount) <=
 		      be64_to_cpu(d->d_ino_hardlimit)))) {
 			d->d_itimer = 0;
 		}
@@ -185,10 +185,10 @@ xfs_qm_adjust_dqtimers(
 
 	if (!d->d_rtbtimer) {
 		if ((d->d_rtb_softlimit &&
-		     (be64_to_cpu(d->d_rtbcount) >=
+		     (be64_to_cpu(d->d_rtbcount) >
 		      be64_to_cpu(d->d_rtb_softlimit))) ||
 		    (d->d_rtb_hardlimit &&
-		     (be64_to_cpu(d->d_rtbcount) >=
+		     (be64_to_cpu(d->d_rtbcount) >
 		      be64_to_cpu(d->d_rtb_hardlimit)))) {
 			d->d_rtbtimer = cpu_to_be32(get_seconds() +
 					mp->m_quotainfo->qi_rtbtimelimit);
@@ -197,10 +197,10 @@ xfs_qm_adjust_dqtimers(
 		}
 	} else {
 		if ((!d->d_rtb_softlimit ||
-		     (be64_to_cpu(d->d_rtbcount) <
+		     (be64_to_cpu(d->d_rtbcount) <=
 		      be64_to_cpu(d->d_rtb_softlimit))) &&
 		    (!d->d_rtb_hardlimit ||
-		     (be64_to_cpu(d->d_rtbcount) <
+		     (be64_to_cpu(d->d_rtbcount) <=
 		      be64_to_cpu(d->d_rtb_hardlimit)))) {
 			d->d_rtbtimer = 0;
 		}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 15ff5392fb65..0ed9ee77937c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1981,7 +1981,7 @@ xfs_qm_dqcheck(
 
 	if (!errs && ddq->d_id) {
 		if (ddq->d_blk_softlimit &&
-		    be64_to_cpu(ddq->d_bcount) >=
+		    be64_to_cpu(ddq->d_bcount) >
 				be64_to_cpu(ddq->d_blk_softlimit)) {
 			if (!ddq->d_btimer) {
 				if (flags & XFS_QMOPT_DOWARN)
@@ -1992,7 +1992,7 @@ xfs_qm_dqcheck(
 			}
 		}
 		if (ddq->d_ino_softlimit &&
-		    be64_to_cpu(ddq->d_icount) >=
+		    be64_to_cpu(ddq->d_icount) >
 				be64_to_cpu(ddq->d_ino_softlimit)) {
 			if (!ddq->d_itimer) {
 				if (flags & XFS_QMOPT_DOWARN)
@@ -2003,7 +2003,7 @@ xfs_qm_dqcheck(
 			}
 		}
 		if (ddq->d_rtb_softlimit &&
-		    be64_to_cpu(ddq->d_rtbcount) >=
+		    be64_to_cpu(ddq->d_rtbcount) >
 				be64_to_cpu(ddq->d_rtb_softlimit)) {
 			if (!ddq->d_rtbtimer) {
 				if (flags & XFS_QMOPT_DOWARN)
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index eafbcff81f3a..711a86e39ff0 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -813,11 +813,11 @@ xfs_qm_export_dquot(
 	     (XFS_IS_OQUOTA_ENFORCED(mp) &&
 			(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
 	    dst->d_id != 0) {
-		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+		if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
 		    (dst->d_blk_softlimit > 0)) {
 			ASSERT(dst->d_btimer != 0);
 		}
-		if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+		if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
 		    (dst->d_ino_softlimit > 0)) {
 			ASSERT(dst->d_itimer != 0);
 		}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792d..85255536b4b6 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -649,12 +649,12 @@ xfs_trans_dqresv(
 			 * nblks.
 			 */
 			if (hardlimit > 0ULL &&
-			    hardlimit <= nblks + *resbcountp) {
+			    hardlimit < nblks + *resbcountp) {
 				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
 				goto error_return;
 			}
 			if (softlimit > 0ULL &&
-			    softlimit <= nblks + *resbcountp) {
+			    softlimit < nblks + *resbcountp) {
 				if ((timer != 0 && get_seconds() > timer) ||
 				    (warns != 0 && warns >= warnlimit)) {
 					xfs_quota_warn(mp, dqp,
-- 
cgit v1.2.3


From c922bbc819324558e61402a7a76c10c550ca61bc Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Mon, 6 Feb 2012 12:50:30 +0000
Subject: xfs: make inode quota check more general

The xfs checks quota when reserving disk blocks and inodes. In the block
reservation, it checks if the total number of blocks including current
usage and new reservation exceed quota. In the inode reservation,
it checks using the total number of inodes including only current usage
without new reservation. However, this inode quota check works well
since the caller of xfs_trans_dquot() always sets the argument of the
number of new inode reservation to 1 or 0 and inode is reserved one by
one in current xfs.

To make it more general, this patch changes it to the same way as the
block quota check.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: Ben Myers <bpm@sgi.com>
Cc: Alex Elder <elder@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_trans_dquot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 85255536b4b6..c4ba366d24e6 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -677,11 +677,13 @@ xfs_trans_dqresv(
 			if (!softlimit)
 				softlimit = q->qi_isoftlimit;
 
-			if (hardlimit > 0ULL && count >= hardlimit) {
+			if (hardlimit > 0ULL &&
+			    hardlimit < ninos + count) {
 				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
 				goto error_return;
 			}
-			if (softlimit > 0ULL && count >= softlimit) {
+			if (softlimit > 0ULL &&
+			    softlimit < ninos + count) {
 				if  ((timer != 0 && get_seconds() > timer) ||
 				     (warns != 0 && warns >= warnlimit)) {
 					xfs_quota_warn(mp, dqp,
-- 
cgit v1.2.3


From faf309009e2e18d30c032b7d9479f29b91677c37 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 21 Feb 2012 17:24:20 -0800
Subject: sys_poll: fix incorrect type for 'timeout' parameter

The 'poll()' system call timeout parameter is supposed to be 'int', not
'long'.

Now, the reason this matters is that right now 32-bit compat mode is
broken on at least x86-64, because the 32-bit code just calls
'sys_poll()' directly on x86-64, and the 32-bit argument will have been
zero-extended, turning a signed 'int' into a large unsigned 'long'
value.

We could just introduce a 'compat_sys_poll()' function for this, and
that may eventually be what we have to do, but since the actual standard
poll() semantics is *supposed* to be 'int', and since at least on x86-64
glibc sign-extends the argument before invocing the system call (so
nobody can actually use a 64-bit timeout value in user space _anyway_,
even in 64-bit binaries), the simpler solution would seem to be to just
fix the definition of the system call to match what it should have been
from the very start.

If it turns out that somebody somehow circumvents the user-level libc
64-bit sign extension and actually uses a large unsigned 64-bit timeout
despite that not being how poll() is supposed to work, we will need to
do the compat_sys_poll() approach.

Reported-by: Thomas Meyer <thomas@m3y3r.de>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/select.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index d33418fdc858..e782258d0de3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block)
 }
 
 SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
-		long, timeout_msecs)
+		int, timeout_msecs)
 {
 	struct timespec end_time, *to = NULL;
 	int ret;
-- 
cgit v1.2.3