From 892c4467e335e9050c95e0d8409c136c4dadaca2 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Jan 2009 16:48:52 -0600
Subject: dlm: fix seq_file usage in debugfs lock dump

The old code would leak iterators and leave reference counts on
rsbs because it was ignoring the "stop" seq callback.  The code
followed an example that used the seq operations differently.
This new code is based on actually understanding how the seq
operations work.  It also improves things by saving the hash bucket
in the position to avoid cycling through completed buckets in start.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c | 696 ++++++++++++++++++++++++++----------------------------
 1 file changed, 337 insertions(+), 359 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 2f107d1a6a45..bc4af3ef65a3 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -25,19 +25,6 @@ static struct mutex debug_buf_lock;
 
 static struct dentry *dlm_root;
 
-struct rsb_iter {
-	int entry;
-	int format;
-	int header;
-	struct dlm_ls *ls;
-	struct list_head *next;
-	struct dlm_rsb *rsb;
-};
-
-/*
- * dump all rsb's in the lockspace hash table
- */
-
 static char *print_lockmode(int mode)
 {
 	switch (mode) {
@@ -60,13 +47,13 @@ static char *print_lockmode(int mode)
 	}
 }
 
-static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
-			       struct dlm_rsb *res)
+static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      struct dlm_rsb *res)
 {
 	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
 
-	if (lkb->lkb_status == DLM_LKSTS_CONVERT
-	    || lkb->lkb_status == DLM_LKSTS_WAITING)
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT ||
+	    lkb->lkb_status == DLM_LKSTS_WAITING)
 		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
 
 	if (lkb->lkb_nodeid) {
@@ -80,33 +67,42 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	if (lkb->lkb_wait_type)
 		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
 
-	seq_printf(s, "\n");
+	return seq_printf(s, "\n");
 }
 
 static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
 	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+	int rv;
 
 	lock_rsb(res);
 
-	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+	rv = seq_printf(s, "\nResource %p Name (len=%d) \"",
+			res, res->res_length);
+	if (rv)
+		goto out;
+
 	for (i = 0; i < res->res_length; i++) {
 		if (isprint(res->res_name[i]))
 			seq_printf(s, "%c", res->res_name[i]);
 		else
 			seq_printf(s, "%c", '.');
 	}
+
 	if (res->res_nodeid > 0)
-		seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
-			   res->res_nodeid);
+		rv = seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
+				res->res_nodeid);
 	else if (res->res_nodeid == 0)
-		seq_printf(s, "\"  \nMaster Copy\n");
+		rv = seq_printf(s, "\"  \nMaster Copy\n");
 	else if (res->res_nodeid == -1)
-		seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
-			   res->res_first_lkid);
+		rv = seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
+			   	res->res_first_lkid);
 	else
-		seq_printf(s, "\"  \nInvalid master %d\n", res->res_nodeid);
+		rv = seq_printf(s, "\"  \nInvalid master %d\n",
+				res->res_nodeid);
+	if (rv)
+		goto out;
 
 	/* Print the LVB: */
 	if (res->res_lvbptr) {
@@ -119,52 +115,66 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 		}
 		if (rsb_flag(res, RSB_VALNOTVALID))
 			seq_printf(s, " (INVALID)");
-		seq_printf(s, "\n");
+		rv = seq_printf(s, "\n");
+		if (rv)
+			goto out;
 	}
 
 	root_list = !list_empty(&res->res_root_list);
 	recover_list = !list_empty(&res->res_recover_list);
 
 	if (root_list || recover_list) {
-		seq_printf(s, "Recovery: root %d recover %d flags %lx "
-			   "count %d\n", root_list, recover_list,
-			   res->res_flags, res->res_recover_locks_count);
+		rv = seq_printf(s, "Recovery: root %d recover %d flags %lx "
+				"count %d\n", root_list, recover_list,
+			   	res->res_flags, res->res_recover_locks_count);
+		if (rv)
+			goto out;
 	}
 
 	/* Print the locks attached to this resource */
 	seq_printf(s, "Granted Queue\n");
-	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
-		print_format1_lock(s, lkb, res);
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
+		rv = print_format1_lock(s, lkb, res);
+		if (rv)
+			goto out;
+	}
 
 	seq_printf(s, "Conversion Queue\n");
-	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
-		print_format1_lock(s, lkb, res);
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
+		rv = print_format1_lock(s, lkb, res);
+		if (rv)
+			goto out;
+	}
 
 	seq_printf(s, "Waiting Queue\n");
-	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
-		print_format1_lock(s, lkb, res);
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
+		rv = print_format1_lock(s, lkb, res);
+		if (rv)
+			goto out;
+	}
 
 	if (list_empty(&res->res_lookup))
 		goto out;
 
 	seq_printf(s, "Lookup Queue\n");
 	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
-		seq_printf(s, "%08x %s", lkb->lkb_id,
-			   print_lockmode(lkb->lkb_rqmode));
+		rv = seq_printf(s, "%08x %s", lkb->lkb_id,
+				print_lockmode(lkb->lkb_rqmode));
 		if (lkb->lkb_wait_type)
 			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
-		seq_printf(s, "\n");
+		rv = seq_printf(s, "\n");
 	}
  out:
 	unlock_rsb(res);
-	return 0;
+	return rv;
 }
 
-static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
-			       struct dlm_rsb *r)
+static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      struct dlm_rsb *r)
 {
 	u64 xid = 0;
 	u64 us;
+	int rv;
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		if (lkb->lkb_ua)
@@ -177,69 +187,82 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
 	   r_nodeid r_len r_name */
 
-	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
-		   lkb->lkb_id,
-		   lkb->lkb_nodeid,
-		   lkb->lkb_remid,
-		   lkb->lkb_ownpid,
-		   (unsigned long long)xid,
-		   lkb->lkb_exflags,
-		   lkb->lkb_flags,
-		   lkb->lkb_status,
-		   lkb->lkb_grmode,
-		   lkb->lkb_rqmode,
-		   (unsigned long long)us,
-		   r->res_nodeid,
-		   r->res_length,
-		   r->res_name);
+	rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
+			lkb->lkb_id,
+			lkb->lkb_nodeid,
+			lkb->lkb_remid,
+			lkb->lkb_ownpid,
+			(unsigned long long)xid,
+			lkb->lkb_exflags,
+			lkb->lkb_flags,
+			lkb->lkb_status,
+			lkb->lkb_grmode,
+			lkb->lkb_rqmode,
+			(unsigned long long)us,
+			r->res_nodeid,
+			r->res_length,
+			r->res_name);
+	return rv;
 }
 
 static int print_format2(struct dlm_rsb *r, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
+	int rv = 0;
 
 	lock_rsb(r);
 
-	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
-		print_format2_lock(s, lkb, r);
-
-	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
-		print_format2_lock(s, lkb, r);
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		rv = print_format2_lock(s, lkb, r);
+		if (rv)
+			goto out;
+	}
 
-	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
-		print_format2_lock(s, lkb, r);
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		rv = print_format2_lock(s, lkb, r);
+		if (rv)
+			goto out;
+	}
 
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+		rv = print_format2_lock(s, lkb, r);
+		if (rv)
+			goto out;
+	}
+ out:
 	unlock_rsb(r);
-	return 0;
+	return rv;
 }
 
-static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
-			       int rsb_lookup)
+static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      int rsb_lookup)
 {
 	u64 xid = 0;
+	int rv;
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
 
-	seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
-		   lkb->lkb_id,
-		   lkb->lkb_nodeid,
-		   lkb->lkb_remid,
-		   lkb->lkb_ownpid,
-		   (unsigned long long)xid,
-		   lkb->lkb_exflags,
-		   lkb->lkb_flags,
-		   lkb->lkb_status,
-		   lkb->lkb_grmode,
-		   lkb->lkb_rqmode,
-		   lkb->lkb_highbast,
-		   rsb_lookup,
-		   lkb->lkb_wait_type,
-		   lkb->lkb_lvbseq,
-		   (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
-		   (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+	rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+			lkb->lkb_id,
+			lkb->lkb_nodeid,
+			lkb->lkb_remid,
+			lkb->lkb_ownpid,
+			(unsigned long long)xid,
+			lkb->lkb_exflags,
+			lkb->lkb_flags,
+			lkb->lkb_status,
+			lkb->lkb_grmode,
+			lkb->lkb_rqmode,
+			lkb->lkb_highbast,
+			rsb_lookup,
+			lkb->lkb_wait_type,
+			lkb->lkb_lvbseq,
+			(unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+			(unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+	return rv;
 }
 
 static int print_format3(struct dlm_rsb *r, struct seq_file *s)
@@ -247,18 +270,21 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 	struct dlm_lkb *lkb;
 	int i, lvblen = r->res_ls->ls_lvblen;
 	int print_name = 1;
+	int rv;
 
 	lock_rsb(r);
 
-	seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
-		   r,
-		   r->res_nodeid,
-		   r->res_first_lkid,
-		   r->res_flags,
-		   !list_empty(&r->res_root_list),
-		   !list_empty(&r->res_recover_list),
-		   r->res_recover_locks_count,
-		   r->res_length);
+	rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+			r,
+			r->res_nodeid,
+			r->res_first_lkid,
+			r->res_flags,
+			!list_empty(&r->res_root_list),
+			!list_empty(&r->res_recover_list),
+			r->res_recover_locks_count,
+			r->res_length);
+	if (rv)
+		goto out;
 
 	for (i = 0; i < r->res_length; i++) {
 		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
@@ -273,7 +299,9 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 		else
 			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
 	}
-	seq_printf(s, "\n");
+	rv = seq_printf(s, "\n");
+	if (rv)
+		goto out;
 
 	if (!r->res_lvbptr)
 		goto do_locks;
@@ -282,344 +310,294 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 
 	for (i = 0; i < lvblen; i++)
 		seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
-	seq_printf(s, "\n");
+	rv = seq_printf(s, "\n");
+	if (rv)
+		goto out;
 
  do_locks:
-	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
-		print_format3_lock(s, lkb, 0);
-
-	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
-		print_format3_lock(s, lkb, 0);
-
-	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
-		print_format3_lock(s, lkb, 0);
-
-	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
-		print_format3_lock(s, lkb, 1);
-
-	unlock_rsb(r);
-	return 0;
-}
-
-static int rsb_iter_next(struct rsb_iter *ri)
-{
-	struct dlm_ls *ls = ri->ls;
-	int i;
-
-	if (!ri->next) {
- top:
-		/* Find the next non-empty hash bucket */
-		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
-			read_lock(&ls->ls_rsbtbl[i].lock);
-			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
-				ri->next = ls->ls_rsbtbl[i].list.next;
-				ri->rsb = list_entry(ri->next, struct dlm_rsb,
-							res_hashchain);
-				dlm_hold_rsb(ri->rsb);
-				read_unlock(&ls->ls_rsbtbl[i].lock);
-				break;
-			}
-			read_unlock(&ls->ls_rsbtbl[i].lock);
-		}
-		ri->entry = i;
-
-		if (ri->entry >= ls->ls_rsbtbl_size)
-			return 1;
-	} else {
-		struct dlm_rsb *old = ri->rsb;
-		i = ri->entry;
-		read_lock(&ls->ls_rsbtbl[i].lock);
-		ri->next = ri->next->next;
-		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
-			/* End of list - move to next bucket */
-			ri->next = NULL;
-			ri->entry++;
-			read_unlock(&ls->ls_rsbtbl[i].lock);
-			dlm_put_rsb(old);
-			goto top;
-		}
-		ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
-		dlm_hold_rsb(ri->rsb);
-		read_unlock(&ls->ls_rsbtbl[i].lock);
-		dlm_put_rsb(old);
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		rv = print_format3_lock(s, lkb, 0);
+		if (rv)
+			goto out;
 	}
 
-	return 0;
-}
-
-static void rsb_iter_free(struct rsb_iter *ri)
-{
-	kfree(ri);
-}
-
-static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
-{
-	struct rsb_iter *ri;
-
-	ri = kzalloc(sizeof *ri, GFP_KERNEL);
-	if (!ri)
-		return NULL;
-
-	ri->ls = ls;
-	ri->entry = 0;
-	ri->next = NULL;
-	ri->format = 1;
-
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		rv = print_format3_lock(s, lkb, 0);
+		if (rv)
+			goto out;
 	}
 
-	return ri;
-}
-
-static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
-{
-	struct rsb_iter *ri;
-	loff_t n = *pos;
-
-	ri = rsb_iter_init(file->private);
-	if (!ri)
-		return NULL;
-
-	while (n--) {
-		if (rsb_iter_next(ri)) {
-			rsb_iter_free(ri);
-			return NULL;
-		}
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+		rv = print_format3_lock(s, lkb, 0);
+		if (rv)
+			goto out;
 	}
 
-	return ri;
-}
-
-static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
-{
-	struct rsb_iter *ri = iter_ptr;
-
-	(*pos)++;
-
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
+		rv = print_format3_lock(s, lkb, 1);
+		if (rv)
+			goto out;
 	}
-
-	return ri;
+ out:
+	unlock_rsb(r);
+	return rv;
 }
 
-static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
-{
-	/* nothing for now */
-}
+struct rsbtbl_iter {
+	struct dlm_rsb *rsb;
+	unsigned bucket;
+	int format;
+	int header;
+};
 
-static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+/* seq_printf returns -1 if the buffer is full, and 0 otherwise.
+   If the buffer is full, seq_printf can be called again, but it
+   does nothing and just returns -1.  So, these printing routines
+   periodically check the return value to avoid wasting too much time
+   trying to print to a full buffer. */
+
+static int table_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-	struct rsb_iter *ri = iter_ptr;
+	struct rsbtbl_iter *ri = iter_ptr;
+	int rv = 0;
 
 	switch (ri->format) {
 	case 1:
-		print_format1(ri->rsb, file);
+		rv = print_format1(ri->rsb, seq);
 		break;
 	case 2:
 		if (ri->header) {
-			seq_printf(file, "id nodeid remid pid xid exflags "
-					 "flags sts grmode rqmode time_ms "
-					 "r_nodeid r_len r_name\n");
+			seq_printf(seq, "id nodeid remid pid xid exflags "
+					"flags sts grmode rqmode time_ms "
+					"r_nodeid r_len r_name\n");
 			ri->header = 0;
 		}
-		print_format2(ri->rsb, file);
+		rv = print_format2(ri->rsb, seq);
 		break;
 	case 3:
 		if (ri->header) {
-			seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+			seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
 			ri->header = 0;
 		}
-		print_format3(ri->rsb, file);
+		rv = print_format3(ri->rsb, seq);
 		break;
 	}
 
-	return 0;
+	return rv;
 }
 
-static struct seq_operations rsb_seq_ops = {
-	.start = rsb_seq_start,
-	.next  = rsb_seq_next,
-	.stop  = rsb_seq_stop,
-	.show  = rsb_seq_show,
-};
+static struct seq_operations format1_seq_ops;
+static struct seq_operations format2_seq_ops;
+static struct seq_operations format3_seq_ops;
 
-static int rsb_open(struct inode *inode, struct file *file)
+static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct seq_file *seq;
-	int ret;
-
-	ret = seq_open(file, &rsb_seq_ops);
-	if (ret)
-		return ret;
-
-	seq = file->private_data;
-	seq->private = inode->i_private;
-
-	return 0;
-}
-
-static const struct file_operations rsb_fops = {
-	.owner   = THIS_MODULE,
-	.open    = rsb_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
+	struct dlm_ls *ls = seq->private;
+	struct rsbtbl_iter *ri;
+	struct dlm_rsb *r;
+	loff_t n = *pos;
+	unsigned bucket, entry;
 
-/*
- * Dump state in compact per-lock listing
- */
+	bucket = n >> 32;
+	entry = n & ((1LL << 32) - 1);
 
-static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
-{
-	struct rsb_iter *ri;
+	if (bucket >= ls->ls_rsbtbl_size)
+		return NULL;
 
-	ri = kzalloc(sizeof *ri, GFP_KERNEL);
+	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
 	if (!ri)
 		return NULL;
-
-	ri->ls = ls;
-	ri->entry = 0;
-	ri->next = NULL;
-	ri->format = 2;
-
-	if (*pos == 0)
+	if (n == 0)
 		ri->header = 1;
-
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+	if (seq->op == &format1_seq_ops)
+		ri->format = 1;
+	if (seq->op == &format2_seq_ops)
+		ri->format = 2;
+	if (seq->op == &format3_seq_ops)
+		ri->format = 3;
+
+	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+		list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
+				    res_hashchain) {
+			if (!entry--) {
+				dlm_hold_rsb(r);
+				ri->rsb = r;
+				ri->bucket = bucket;
+				read_unlock(&ls->ls_rsbtbl[bucket].lock);
+				return ri;
+			}
+		}
 	}
+	read_unlock(&ls->ls_rsbtbl[bucket].lock);
 
-	return ri;
-}
+	/*
+	 * move to the first rsb in the next non-empty bucket
+	 */
 
-static void *locks_seq_start(struct seq_file *file, loff_t *pos)
-{
-	struct rsb_iter *ri;
-	loff_t n = *pos;
+	/* zero the entry */
+	n &= ~((1LL << 32) - 1);
 
-	ri = locks_iter_init(file->private, pos);
-	if (!ri)
-		return NULL;
+	while (1) {
+		bucket++;
+		n += 1LL << 32;
 
-	while (n--) {
-		if (rsb_iter_next(ri)) {
-			rsb_iter_free(ri);
+		if (bucket >= ls->ls_rsbtbl_size) {
+			kfree(ri);
 			return NULL;
 		}
-	}
 
-	return ri;
+		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
+					     struct dlm_rsb, res_hashchain);
+			dlm_hold_rsb(r);
+			ri->rsb = r;
+			ri->bucket = bucket;
+			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			*pos = n;
+			return ri;
+		}
+		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	}
 }
 
-static struct seq_operations locks_seq_ops = {
-	.start = locks_seq_start,
-	.next  = rsb_seq_next,
-	.stop  = rsb_seq_stop,
-	.show  = rsb_seq_show,
-};
-
-static int locks_open(struct inode *inode, struct file *file)
+static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 {
-	struct seq_file *seq;
-	int ret;
-
-	ret = seq_open(file, &locks_seq_ops);
-	if (ret)
-		return ret;
-
-	seq = file->private_data;
-	seq->private = inode->i_private;
-
-	return 0;
-}
-
-static const struct file_operations locks_fops = {
-	.owner   = THIS_MODULE,
-	.open    = locks_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
-/*
- * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
- * This can replace both formats 1 and 2 eventually.
- */
+	struct dlm_ls *ls = seq->private;
+	struct rsbtbl_iter *ri = iter_ptr;
+	struct list_head *next;
+	struct dlm_rsb *r, *rp;
+	loff_t n = *pos;
+	unsigned bucket;
+
+	bucket = n >> 32;
+
+	/*
+	 * move to the next rsb in the same bucket
+	 */
+
+	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	rp = ri->rsb;
+	next = rp->res_hashchain.next;
+
+	if (next != &ls->ls_rsbtbl[bucket].list) {
+		r = list_entry(next, struct dlm_rsb, res_hashchain);
+		dlm_hold_rsb(r);
+		ri->rsb = r;
+		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		dlm_put_rsb(rp);
+		++*pos;
+		return ri;
+	}
+	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	dlm_put_rsb(rp);
 
-static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
-{
-	struct rsb_iter *ri;
+	/*
+	 * move to the first rsb in the next non-empty bucket
+	 */
 
-	ri = kzalloc(sizeof *ri, GFP_KERNEL);
-	if (!ri)
-		return NULL;
+	/* zero the entry */
+	n &= ~((1LL << 32) - 1);
 
-	ri->ls = ls;
-	ri->entry = 0;
-	ri->next = NULL;
-	ri->format = 3;
+	while (1) {
+		bucket++;
+		n += 1LL << 32;
 
-	if (*pos == 0)
-		ri->header = 1;
+		if (bucket >= ls->ls_rsbtbl_size) {
+			kfree(ri);
+			return NULL;
+		}
 
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
+					     struct dlm_rsb, res_hashchain);
+			dlm_hold_rsb(r);
+			ri->rsb = r;
+			ri->bucket = bucket;
+			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			*pos = n;
+			return ri;
+		}
+		read_unlock(&ls->ls_rsbtbl[bucket].lock);
 	}
-
-	return ri;
 }
 
-static void *all_seq_start(struct seq_file *file, loff_t *pos)
+static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
-	struct rsb_iter *ri;
-	loff_t n = *pos;
-
-	ri = all_iter_init(file->private, pos);
-	if (!ri)
-		return NULL;
+	struct rsbtbl_iter *ri = iter_ptr;
 
-	while (n--) {
-		if (rsb_iter_next(ri)) {
-			rsb_iter_free(ri);
-			return NULL;
-		}
+	if (ri) {
+		dlm_put_rsb(ri->rsb);
+		kfree(ri);
 	}
-
-	return ri;
 }
 
-static struct seq_operations all_seq_ops = {
-	.start = all_seq_start,
-	.next  = rsb_seq_next,
-	.stop  = rsb_seq_stop,
-	.show  = rsb_seq_show,
+static struct seq_operations format1_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
 };
 
-static int all_open(struct inode *inode, struct file *file)
+static struct seq_operations format2_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static struct seq_operations format3_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static const struct file_operations format1_fops;
+static const struct file_operations format2_fops;
+static const struct file_operations format3_fops;
+
+static int table_open(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq;
-	int ret;
+	int ret = -1;
+
+	if (file->f_op == &format1_fops)
+		ret = seq_open(file, &format1_seq_ops);
+	else if (file->f_op == &format2_fops)
+		ret = seq_open(file, &format2_seq_ops);
+	else if (file->f_op == &format3_fops)
+		ret = seq_open(file, &format3_seq_ops);
 
-	ret = seq_open(file, &all_seq_ops);
 	if (ret)
 		return ret;
 
 	seq = file->private_data;
-	seq->private = inode->i_private;
-
+	seq->private = inode->i_private; /* the dlm_ls */
 	return 0;
 }
 
-static const struct file_operations all_fops = {
+static const struct file_operations format1_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format2_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format3_fops = {
 	.owner   = THIS_MODULE,
-	.open    = all_open,
+	.open    = table_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release
@@ -689,7 +667,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 						      S_IFREG | S_IRUGO,
 						      dlm_root,
 						      ls,
-						      &rsb_fops);
+						      &format1_fops);
 	if (!ls->ls_debug_rsb_dentry)
 		goto fail;
 
@@ -702,7 +680,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 							S_IFREG | S_IRUGO,
 							dlm_root,
 							ls,
-							&locks_fops);
+							&format2_fops);
 	if (!ls->ls_debug_locks_dentry)
 		goto fail;
 
@@ -715,7 +693,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 						      S_IFREG | S_IRUGO,
 						      dlm_root,
 						      ls,
-						      &all_fops);
+						      &format3_fops);
 	if (!ls->ls_debug_all_dentry)
 		goto fail;
 
-- 
cgit v1.2.3


From c7be761a8163d2f1ac0b606c21e4316b7abc5af7 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Jan 2009 16:50:41 -0600
Subject: dlm: change rsbtbl rwlock to spinlock

The rwlock is almost always used in write mode, so there's no reason
to not use a spinlock instead.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c     | 24 ++++++++++++------------
 fs/dlm/dlm_internal.h |  2 +-
 fs/dlm/lock.c         | 26 +++++++++++++-------------
 fs/dlm/lockspace.c    |  2 +-
 fs/dlm/recover.c      | 10 +++++-----
 5 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index bc4af3ef65a3..1d1d27442235 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -416,7 +416,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 	if (seq->op == &format3_seq_ops)
 		ri->format = 3;
 
-	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
 		list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
 				    res_hashchain) {
@@ -424,12 +424,12 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 				dlm_hold_rsb(r);
 				ri->rsb = r;
 				ri->bucket = bucket;
-				read_unlock(&ls->ls_rsbtbl[bucket].lock);
+				spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 				return ri;
 			}
 		}
 	}
-	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 
 	/*
 	 * move to the first rsb in the next non-empty bucket
@@ -447,18 +447,18 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 			return NULL;
 		}
 
-		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		spin_lock(&ls->ls_rsbtbl[bucket].lock);
 		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
 			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
 					     struct dlm_rsb, res_hashchain);
 			dlm_hold_rsb(r);
 			ri->rsb = r;
 			ri->bucket = bucket;
-			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 			*pos = n;
 			return ri;
 		}
-		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	}
 }
 
@@ -477,7 +477,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 	 * move to the next rsb in the same bucket
 	 */
 
-	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	rp = ri->rsb;
 	next = rp->res_hashchain.next;
 
@@ -485,12 +485,12 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 		r = list_entry(next, struct dlm_rsb, res_hashchain);
 		dlm_hold_rsb(r);
 		ri->rsb = r;
-		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 		dlm_put_rsb(rp);
 		++*pos;
 		return ri;
 	}
-	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	dlm_put_rsb(rp);
 
 	/*
@@ -509,18 +509,18 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 			return NULL;
 		}
 
-		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		spin_lock(&ls->ls_rsbtbl[bucket].lock);
 		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
 			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
 					     struct dlm_rsb, res_hashchain);
 			dlm_hold_rsb(r);
 			ri->rsb = r;
 			ri->bucket = bucket;
-			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 			*pos = n;
 			return ri;
 		}
-		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	}
 }
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index ef2f1e353966..076e86f38bc8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -105,7 +105,7 @@ struct dlm_dirtable {
 struct dlm_rsbtable {
 	struct list_head	list;
 	struct list_head	toss;
-	rwlock_t		lock;
+	spinlock_t		lock;
 };
 
 struct dlm_lkbtable {
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6cfe65bbf4a2..01e7d39c5fba 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -412,9 +412,9 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
 		      unsigned int flags, struct dlm_rsb **r_ret)
 {
 	int error;
-	write_lock(&ls->ls_rsbtbl[b].lock);
+	spin_lock(&ls->ls_rsbtbl[b].lock);
 	error = _search_rsb(ls, name, len, b, flags, r_ret);
-	write_unlock(&ls->ls_rsbtbl[b].lock);
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
 	return error;
 }
 
@@ -478,16 +478,16 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
 		r->res_nodeid = nodeid;
 	}
 
-	write_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
 	if (!error) {
-		write_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 		dlm_free_rsb(r);
 		r = tmp;
 		goto out;
 	}
 	list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
-	write_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	error = 0;
  out:
 	*r_ret = r;
@@ -530,9 +530,9 @@ static void put_rsb(struct dlm_rsb *r)
 	struct dlm_ls *ls = r->res_ls;
 	uint32_t bucket = r->res_bucket;
 
-	write_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	kref_put(&r->res_ref, toss_rsb);
-	write_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 }
 
 void dlm_put_rsb(struct dlm_rsb *r)
@@ -967,7 +967,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
 
 	for (;;) {
 		found = 0;
-		write_lock(&ls->ls_rsbtbl[b].lock);
+		spin_lock(&ls->ls_rsbtbl[b].lock);
 		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
 					    res_hashchain) {
 			if (!time_after_eq(jiffies, r->res_toss_time +
@@ -978,20 +978,20 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
 		}
 
 		if (!found) {
-			write_unlock(&ls->ls_rsbtbl[b].lock);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
 			break;
 		}
 
 		if (kref_put(&r->res_ref, kill_rsb)) {
 			list_del(&r->res_hashchain);
-			write_unlock(&ls->ls_rsbtbl[b].lock);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
 
 			if (is_master(r))
 				dir_remove(r);
 			dlm_free_rsb(r);
 			count++;
 		} else {
-			write_unlock(&ls->ls_rsbtbl[b].lock);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
 			log_error(ls, "tossed rsb in use %s", r->res_name);
 		}
 	}
@@ -4224,7 +4224,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
 {
 	struct dlm_rsb *r, *r_ret = NULL;
 
-	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
 		if (!rsb_flag(r, RSB_LOCKS_PURGED))
 			continue;
@@ -4233,7 +4233,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
 		r_ret = r;
 		break;
 	}
-	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	return r_ret;
 }
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8d86b7960f0d..aa32e5f02493 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -464,7 +464,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	for (i = 0; i < size; i++) {
 		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
 		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
-		rwlock_init(&ls->ls_rsbtbl[i].lock);
+		spin_lock_init(&ls->ls_rsbtbl[i].lock);
 	}
 
 	size = dlm_config.ci_lkbtbl_size;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 80aba5bdd4a4..eda43f362616 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -726,7 +726,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
 	}
 
 	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
-		read_lock(&ls->ls_rsbtbl[i].lock);
+		spin_lock(&ls->ls_rsbtbl[i].lock);
 		list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
 			list_add(&r->res_root_list, &ls->ls_root_list);
 			dlm_hold_rsb(r);
@@ -737,7 +737,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
 		   but no other recovery steps should do anything with them. */
 
 		if (dlm_no_directory(ls)) {
-			read_unlock(&ls->ls_rsbtbl[i].lock);
+			spin_unlock(&ls->ls_rsbtbl[i].lock);
 			continue;
 		}
 
@@ -745,7 +745,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
 			list_add(&r->res_root_list, &ls->ls_root_list);
 			dlm_hold_rsb(r);
 		}
-		read_unlock(&ls->ls_rsbtbl[i].lock);
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
 	}
  out:
 	up_write(&ls->ls_root_sem);
@@ -775,7 +775,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
 	int i;
 
 	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
-		write_lock(&ls->ls_rsbtbl[i].lock);
+		spin_lock(&ls->ls_rsbtbl[i].lock);
 		list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
 					 res_hashchain) {
 			if (dlm_no_directory(ls) || !is_master(r)) {
@@ -783,7 +783,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
 				dlm_free_rsb(r);
 			}
 		}
-		write_unlock(&ls->ls_rsbtbl[i].lock);
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
 	}
 }
 
-- 
cgit v1.2.3


From c9a98553d513dfc82cdce869970d5662c1f22c68 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 1 Jan 2009 14:21:16 -0500
Subject: [XFS] pass XFS_IGET_BULKSTAT to xfs_iget for handle operations

NFS clients or users of the handle ioctls can pass us arbitrary inode
numbers through the exportfs interface.  Make sure we use the
XFS_IGET_BULKSTAT so that these don't cause shutdowns due to the corruption
checks.  Also translate the EINVAL we get back for invalid inode clusters
into an ESTALE which is more appropriate, and remove the useless check
for a NULL inode on a successful xfs_iget return.

I have a testcase to reproduce this using the handle interface which
I will submit to xfsqa.

Reported-by: Mario Becroft <mb@gem.win.co.nz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_export.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 595751f78350..87b8cbd23d4b 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -126,11 +126,26 @@ xfs_nfs_get_inode(
 	if (ino == 0)
 		return ERR_PTR(-ESTALE);
 
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
+	/*
+	 * The XFS_IGET_BULKSTAT means that an invalid inode number is just
+	 * fine and not an indication of a corrupted filesystem.  Because
+	 * clients can send any kind of invalid file handle, e.g. after
+	 * a restore on the server we have to deal with this case gracefully.
+	 */
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT,
+			 XFS_ILOCK_SHARED, &ip, 0);
+	if (error) {
+		/*
+		 * EINVAL means the inode cluster doesn't exist anymore.
+		 * This implies the filehandle is stale, so we should
+		 * translate it here.
+		 * We don't use ESTALE directly down the chain to not
+		 * confuse applications using bulkstat that expect EINVAL.
+		 */
+		if (error == EINVAL)
+			error = ESTALE;
 		return ERR_PTR(-error);
-	if (!ip)
-		return ERR_PTR(-EIO);
+	}
 
 	if (ip->i_d.di_gen != generation) {
 		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-- 
cgit v1.2.3


From 9800b550355e99c9bcaba7ec6540751dce0823d7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 1 Jan 2009 16:40:10 -0600
Subject: [XFS] Remove several unused typedefs.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.h | 2 --
 fs/xfs/xfs_acl.h            | 1 -
 fs/xfs/xfs_types.h          | 2 --
 3 files changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 7b26f5ff9692..1dd528849755 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -21,8 +21,6 @@
 extern struct workqueue_struct *xfsdatad_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
-typedef void (*xfs_ioend_func_t)(void *);
-
 /*
  * xfs_ioend struct manages large extent writes for XFS.
  * It can manage several multi-page bio's at once.
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index a4e293b93efa..642f1db4def4 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,7 +22,6 @@
  * Access Control Lists
  */
 typedef __uint16_t	xfs_acl_perm_t;
-typedef __int32_t	xfs_acl_type_t;
 typedef __int32_t	xfs_acl_tag_t;
 typedef __int32_t	xfs_acl_id_t;
 
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 0f5191644ab2..baedbd14dc21 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -111,8 +111,6 @@ typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
 typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
 typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
 
-typedef __uint8_t	xfs_arch_t;	/* architecture of an xfs fs */
-
 /*
  * Null values for the types.
  */
-- 
cgit v1.2.3


From c9fb86a917640d66ba2e0613a12f3a76eda8a30f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 1 Jan 2009 16:40:11 -0600
Subject: [XFS] Remove macro-to-function indirections in attr code

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_attr_leaf.c | 72 +++++++++++++++++++++++++-------------------------
 fs/xfs/xfs_attr_leaf.h | 12 ---------
 2 files changed, 36 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 79da6b2ea99e..6c323f8a4cd1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -736,7 +736,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
 			continue;		/* don't copy partial entries */
 		if (!(entry->flags & XFS_ATTR_LOCAL))
 			return(0);
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		name_loc = xfs_attr_leaf_name_local(leaf, i);
 		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
 			return(0);
 		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
@@ -823,7 +823,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 		if (!entry->nameidx)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		name_loc = xfs_attr_leaf_name_local(leaf, i);
 		nargs.name = (char *)name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
 		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
@@ -1141,14 +1141,14 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 	 * as part of this transaction (a split operation for example).
 	 */
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
 		name_loc->namelen = args->namelen;
 		name_loc->valuelen = cpu_to_be16(args->valuelen);
 		memcpy((char *)name_loc->nameval, args->name, args->namelen);
 		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
 				   be16_to_cpu(name_loc->valuelen));
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		name_rmt->namelen = args->namelen;
 		memcpy((char *)name_rmt->name, args->name, args->namelen);
 		entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1159,7 +1159,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 		args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
 	}
 	xfs_da_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+	     XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
 				   xfs_attr_leaf_entsize(leaf, args->index)));
 
 	/*
@@ -1749,10 +1749,10 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
 	/*
 	 * Compress the remaining entries and zero out the removed stuff.
 	 */
-	memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
+	memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
 	be16_add_cpu(&hdr->usedbytes, -entsize);
 	xfs_da_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+	     XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
 				   entsize));
 
 	tmp = (be16_to_cpu(hdr->count) - args->index)
@@ -1985,7 +1985,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 			continue;
 		}
 		if (entry->flags & XFS_ATTR_LOCAL) {
-			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
+			name_loc = xfs_attr_leaf_name_local(leaf, probe);
 			if (name_loc->namelen != args->namelen)
 				continue;
 			if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
@@ -1995,7 +1995,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 			args->index = probe;
 			return(XFS_ERROR(EEXIST));
 		} else {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
+			name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
 			if (name_rmt->namelen != args->namelen)
 				continue;
 			if (memcmp(args->name, (char *)name_rmt->name,
@@ -2035,7 +2035,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
 
 	entry = &leaf->entries[args->index];
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
 		valuelen = be16_to_cpu(name_loc->valuelen);
@@ -2050,7 +2050,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
 		args->valuelen = valuelen;
 		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		ASSERT(name_rmt->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
 		valuelen = be32_to_cpu(name_rmt->valuelen);
@@ -2143,7 +2143,7 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 		 * off for 6.2, should be revisited later.
 		 */
 		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
-			memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
+			memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
 			be16_add_cpu(&hdr_s->usedbytes, -tmp);
 			be16_add_cpu(&hdr_s->count, -1);
 			entry_d--;	/* to compensate for ++ in loop hdr */
@@ -2160,11 +2160,11 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 			entry_d->flags = entry_s->flags;
 			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
-			memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti),
-				XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			memmove(xfs_attr_leaf_name(leaf_d, desti),
+				xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
 			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
-			memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
+			memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
 			be16_add_cpu(&hdr_s->usedbytes, -tmp);
 			be16_add_cpu(&hdr_d->usedbytes, tmp);
 			be16_add_cpu(&hdr_s->count, -1);
@@ -2276,12 +2276,12 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
 
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
 	if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
-		size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
+		name_loc = xfs_attr_leaf_name_local(leaf, index);
+		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
 						   be16_to_cpu(name_loc->valuelen));
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
-		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, index);
+		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
 	}
 	return(size);
 }
@@ -2297,13 +2297,13 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
 {
 	int size;
 
-	size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(namelen, valuelen);
-	if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
+	size = xfs_attr_leaf_entsize_local(namelen, valuelen);
+	if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
 		if (local) {
 			*local = 1;
 		}
 	} else {
-		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(namelen);
+		size = xfs_attr_leaf_entsize_remote(namelen);
 		if (local) {
 			*local = 0;
 		}
@@ -2372,7 +2372,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 		if (entry->flags & XFS_ATTR_LOCAL) {
 			xfs_attr_leaf_name_local_t *name_loc =
-				XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+				xfs_attr_leaf_name_local(leaf, i);
 
 			retval = context->put_listent(context,
 						entry->flags,
@@ -2384,7 +2384,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 				return retval;
 		} else {
 			xfs_attr_leaf_name_remote_t *name_rmt =
-				XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+				xfs_attr_leaf_name_remote(leaf, i);
 
 			int valuelen = be32_to_cpu(name_rmt->valuelen);
 
@@ -2468,11 +2468,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 
 #ifdef DEBUG
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
 		namelen = name_loc->namelen;
 		name = (char *)name_loc->nameval;
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		namelen = name_rmt->namelen;
 		name = (char *)name_rmt->name;
 	}
@@ -2487,7 +2487,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 
 	if (args->rmtblkno) {
 		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
 		name_rmt->valuelen = cpu_to_be32(args->valuelen);
 		xfs_da_log_buf(args->trans, bp,
@@ -2534,7 +2534,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
 	xfs_da_log_buf(args->trans, bp,
 			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
 	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = 0;
 		name_rmt->valuelen = 0;
 		xfs_da_log_buf(args->trans, bp,
@@ -2607,20 +2607,20 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 
 #ifdef DEBUG
 	if (entry1->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
 		namelen1 = name_loc->namelen;
 		name1 = (char *)name_loc->nameval;
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
 		namelen1 = name_rmt->namelen;
 		name1 = (char *)name_rmt->name;
 	}
 	if (entry2->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+		name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
 		namelen2 = name_loc->namelen;
 		name2 = (char *)name_loc->nameval;
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
 		namelen2 = name_rmt->namelen;
 		name2 = (char *)name_rmt->name;
 	}
@@ -2637,7 +2637,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
 	if (args->rmtblkno) {
 		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
 		name_rmt->valuelen = cpu_to_be32(args->valuelen);
 		xfs_da_log_buf(args->trans, bp1,
@@ -2648,7 +2648,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	xfs_da_log_buf(args->trans, bp2,
 			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
 	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
 		name_rmt->valueblk = 0;
 		name_rmt->valuelen = 0;
 		xfs_da_log_buf(args->trans, bp2,
@@ -2855,7 +2855,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
 	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 		if (be16_to_cpu(entry->nameidx) &&
 		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			name_rmt = xfs_attr_leaf_name_remote(leaf, i);
 			if (name_rmt->valueblk)
 				count++;
 		}
@@ -2883,7 +2883,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
 	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 		if (be16_to_cpu(entry->nameidx) &&
 		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			name_rmt = xfs_attr_leaf_name_remote(leaf, i);
 			if (name_rmt->valueblk) {
 				lp->valueblk = be32_to_cpu(name_rmt->valueblk);
 				lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 83e9af417ca2..9c7d22fdcf4d 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -151,8 +151,6 @@ typedef struct xfs_attr_leafblock {
 /*
  * Cast typed pointers for "local" and "remote" name/value structs.
  */
-#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	\
-	xfs_attr_leaf_name_remote(leafp,idx)
 static inline xfs_attr_leaf_name_remote_t *
 xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
 {
@@ -160,8 +158,6 @@ xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
 		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
 }
 
-#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	\
-	xfs_attr_leaf_name_local(leafp,idx)
 static inline xfs_attr_leaf_name_local_t *
 xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
 {
@@ -169,8 +165,6 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
 		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
 }
 
-#define XFS_ATTR_LEAF_NAME(leafp,idx)		\
-	xfs_attr_leaf_name(leafp,idx)
 static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
 {
 	return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
@@ -181,24 +175,18 @@ static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
  * a "local" name/value structure, a "remote" name/value structure, and
  * a pointer which might be either.
  */
-#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	\
-	xfs_attr_leaf_entsize_remote(nlen)
 static inline int xfs_attr_leaf_entsize_remote(int nlen)
 {
 	return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
 		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
 }
 
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	\
-	xfs_attr_leaf_entsize_local(nlen,vlen)
 static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
 {
 	return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
 		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
 }
 
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	\
-	xfs_attr_leaf_entsize_local_max(bsize)
 static inline int xfs_attr_leaf_entsize_local_max(int bsize)
 {
 	return (((bsize) >> 1) + ((bsize) >> 2));
-- 
cgit v1.2.3


From fb82557f16f3700ae4961a4ce599bdaff6a10b1c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 9 Jan 2009 15:53:54 +1100
Subject: [XFS] Remove macro-to-function indirections in the mask code

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_ag.h         |  2 +-
 fs/xfs/xfs_bit.h        | 10 +-----
 fs/xfs/xfs_bmap_btree.c | 84 ++++++++++++++++++++++++-------------------------
 3 files changed, 44 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2e21817a226..d3b3cf742999 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -231,7 +231,7 @@ typedef struct xfs_perag
 #define	XFS_FSB_TO_AGNO(mp,fsbno)	\
 	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
 #define	XFS_FSB_TO_AGBNO(mp,fsbno)	\
-	((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+	((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
 #define	XFS_AGB_TO_DADDR(mp,agno,agbno)	\
 	((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
 		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index bca7b243c319..f1e3c907044d 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -23,24 +23,16 @@
  */
 
 /*
- * masks with n high/low bits set, 32-bit values & 64-bit values
+ * masks with n high/low bits set, 64-bit values
  */
-#define	XFS_MASK32HI(n)		xfs_mask32hi(n)
-static inline __uint32_t xfs_mask32hi(int n)
-{
-	return (__uint32_t)-1 << (32 - (n));
-}
-#define	XFS_MASK64HI(n)		xfs_mask64hi(n)
 static inline __uint64_t xfs_mask64hi(int n)
 {
 	return (__uint64_t)-1 << (64 - (n));
 }
-#define	XFS_MASK32LO(n)		xfs_mask32lo(n)
 static inline __uint32_t xfs_mask32lo(int n)
 {
 	return ((__uint32_t)1 << (n)) - 1;
 }
-#define	XFS_MASK64LO(n)		xfs_mask64lo(n)
 static inline __uint64_t xfs_mask64lo(int n)
 {
 	return ((__uint64_t)1 << (n)) - 1;
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 8f1ec73725d3..ba6b08c2fb02 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -110,16 +110,16 @@ __xfs_bmbt_get_all(
 
 	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
 	s->br_startoff = ((xfs_fileoff_t)l0 &
-			   XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 #if XFS_BIG_BLKNOS
-	s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
+	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
 			   (((xfs_fsblock_t)l1) >> 21);
 #else
 #ifdef DEBUG
 	{
 		xfs_dfsbno_t	b;
 
-		b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
+		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
 		    (((xfs_dfsbno_t)l1) >> 21);
 		ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
 		s->br_startblock = (xfs_fsblock_t)b;
@@ -128,7 +128,7 @@ __xfs_bmbt_get_all(
 	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
 #endif	/* DEBUG */
 #endif	/* XFS_BIG_BLKNOS */
-	s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
+	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
 	/* This is xfs_extent_state() in-line */
 	if (ext_flag) {
 		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
@@ -153,7 +153,7 @@ xfs_filblks_t
 xfs_bmbt_get_blockcount(
 	xfs_bmbt_rec_host_t	*r)
 {
-	return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21));
+	return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
 }
 
 /*
@@ -164,13 +164,13 @@ xfs_bmbt_get_startblock(
 	xfs_bmbt_rec_host_t	*r)
 {
 #if XFS_BIG_BLKNOS
-	return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	       (((xfs_fsblock_t)r->l1) >> 21);
 #else
 #ifdef DEBUG
 	xfs_dfsbno_t	b;
 
-	b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	    (((xfs_dfsbno_t)r->l1) >> 21);
 	ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
 	return (xfs_fsblock_t)b;
@@ -188,7 +188,7 @@ xfs_bmbt_get_startoff(
 	xfs_bmbt_rec_host_t	*r)
 {
 	return ((xfs_fileoff_t)r->l0 &
-		 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 
 xfs_exntst_t
@@ -219,7 +219,7 @@ xfs_filblks_t
 xfs_bmbt_disk_get_blockcount(
 	xfs_bmbt_rec_t	*r)
 {
-	return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
+	return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
 }
 
 /*
@@ -230,7 +230,7 @@ xfs_bmbt_disk_get_startoff(
 	xfs_bmbt_rec_t	*r)
 {
 	return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-		 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 
 
@@ -248,33 +248,33 @@ xfs_bmbt_set_allf(
 	int		extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
 
 	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-	ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
-	ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 
 #if XFS_BIG_BLKNOS
-	ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
 
 	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 		((xfs_bmbt_rec_base_t)startoff << 9) |
 		((xfs_bmbt_rec_base_t)startblock >> 43);
 	r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
 		((xfs_bmbt_rec_base_t)blockcount &
-		(xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 #else	/* !XFS_BIG_BLKNOS */
 	if (ISNULLSTARTBLOCK(startblock)) {
 		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			((xfs_bmbt_rec_base_t)startoff << 9) |
-			 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
-		r->l1 = XFS_MASK64HI(11) |
+			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)startblock << 21) |
 			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	} else {
 		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			((xfs_bmbt_rec_base_t)startoff << 9);
 		r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
 			 ((xfs_bmbt_rec_base_t)blockcount &
-			 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
@@ -306,11 +306,11 @@ xfs_bmbt_disk_set_allf(
 	int			extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
 
 	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-	ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
-	ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 
 #if XFS_BIG_BLKNOS
-	ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
 
 	r->l0 = cpu_to_be64(
 		((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -319,17 +319,17 @@ xfs_bmbt_disk_set_allf(
 	r->l1 = cpu_to_be64(
 		((xfs_bmbt_rec_base_t)startblock << 21) |
 		 ((xfs_bmbt_rec_base_t)blockcount &
-		  (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 #else	/* !XFS_BIG_BLKNOS */
 	if (ISNULLSTARTBLOCK(startblock)) {
 		r->l0 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			 ((xfs_bmbt_rec_base_t)startoff << 9) |
-			  (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
-		r->l1 = cpu_to_be64(XFS_MASK64HI(11) |
+			  (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+		r->l1 = cpu_to_be64(xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)startblock << 21) |
 			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 	} else {
 		r->l0 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -337,7 +337,7 @@ xfs_bmbt_disk_set_allf(
 		r->l1 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)startblock << 21) |
 			 ((xfs_bmbt_rec_base_t)blockcount &
-			  (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+			  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
@@ -362,9 +362,9 @@ xfs_bmbt_set_blockcount(
 	xfs_bmbt_rec_host_t *r,
 	xfs_filblks_t	v)
 {
-	ASSERT((v & XFS_MASK64HI(43)) == 0);
-	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
-		  (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21));
+	ASSERT((v & xfs_mask64hi(43)) == 0);
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+		  (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
 }
 
 /*
@@ -376,21 +376,21 @@ xfs_bmbt_set_startblock(
 	xfs_fsblock_t	v)
 {
 #if XFS_BIG_BLKNOS
-	ASSERT((v & XFS_MASK64HI(12)) == 0);
-	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
+	ASSERT((v & xfs_mask64hi(12)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
 		  (xfs_bmbt_rec_base_t)(v >> 43);
-	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
 		  (xfs_bmbt_rec_base_t)(v << 21);
 #else	/* !XFS_BIG_BLKNOS */
 	if (ISNULLSTARTBLOCK(v)) {
-		r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
-		r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
+		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	} else {
-		r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
+		r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
 		r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
@@ -403,10 +403,10 @@ xfs_bmbt_set_startoff(
 	xfs_bmbt_rec_host_t *r,
 	xfs_fileoff_t	v)
 {
-	ASSERT((v & XFS_MASK64HI(9)) == 0);
-	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
+	ASSERT((v & xfs_mask64hi(9)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
 		((xfs_bmbt_rec_base_t)v << 9) |
-		  (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+		  (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
 }
 
 /*
@@ -419,9 +419,9 @@ xfs_bmbt_set_state(
 {
 	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
 	if (v == XFS_EXT_NORM)
-		r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN);
+		r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
 	else
-		r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN);
+		r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
 }
 
 /*
-- 
cgit v1.2.3


From e6edbd1c1cbef278d58cdd8b046599ba8ac90cfc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 8 Jan 2009 13:42:23 -0500
Subject: [XFS] fix compile of xfs_btree_readahead_lblock on m68k

Change the left/right variables to the proper always 64bit xfs_dfsbno_t
type because otherwise compilation fails for Geert on m68k without
CONFIG_LBD:

| fs/xfs/xfs_btree.c: In function 'xfs_btree_readahead_lblock':
| fs/xfs/xfs_btree.c:736: warning: comparison is always true due to limited range of data type
| fs/xfs/xfs_btree.c:741: warning: comparison is always true due to limited range of data type

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_btree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7ed59267420d..2c3ef20f8842 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -730,8 +730,8 @@ xfs_btree_readahead_lblock(
 	struct xfs_btree_block	*block)
 {
 	int			rval = 0;
-	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+	xfs_dfsbno_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
 		xfs_btree_reada_bufl(cur->bc_mp, left, 1);
-- 
cgit v1.2.3


From 15440319767942a363f282d6585303d3d75088ba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 8 Jan 2009 14:00:00 -0500
Subject: [XFS] truncate readdir offsets to signed 32 bit values

John Stanley reported EOVERFLOW errors in readdir from his self-built
glibc.  I traced this down to glibc enabling d_off overflow checks
in one of the about five million different getdents implementations.

In 2.6.28 Dave Woodhouse moved our readdir double buffering required
for NFS4 readdirplus into nfsd and at that point we lost the capping
of the directory offsets to 32 bit signed values.  John's glibc used
getdents64 to even implement readdir for normal 32 bit offset dirents,
and failed with EOVERFLOW only if this happens on the first dirent in
a getdents call.  I managed to come up with a testcase that uses
raw getdents and does the EOVERFLOW check manually.  We always hit
it with our last entry due to the special end of directory marker.

The patch below is a dumb version of just putting back the masking,
to make sure we have the same behavior as in 2.6.27 and earlier.

I will work on a better and cleaner fix for 2.6.30.

Reported-by: John Stanley <jpsinthemix@verizon.net>
Tested-by: John Stanley <jpsinthemix@verizon.net>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_dir2_block.c |  7 ++++---
 fs/xfs/xfs_dir2_leaf.c  |  6 +++---
 fs/xfs/xfs_dir2_sf.c    | 15 ++++++++-------
 3 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e2fa0a1d8e96..e1f0a06aaf04 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -517,9 +517,9 @@ xfs_dir2_block_getdents(
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, dep->name, dep->namelen, cook,
+		if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
 			    ino, DT_UNKNOWN)) {
-			*offset = cook;
+			*offset = cook & 0x7fffffff;
 			xfs_da_brelse(NULL, bp);
 			return 0;
 		}
@@ -529,7 +529,8 @@ xfs_dir2_block_getdents(
 	 * Reached the end of the block.
 	 * Set the offset to a non-existent block 1 and return.
 	 */
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
+	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+			0x7fffffff;
 	xfs_da_brelse(NULL, bp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 93535992cb60..ef805a374eec 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1092,7 +1092,7 @@ xfs_dir2_leaf_getdents(
 		 * Won't fit.  Return to caller.
 		 */
 		if (filldir(dirent, dep->name, dep->namelen,
-			    xfs_dir2_byte_to_dataptr(mp, curoff),
+			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
 			    ino, DT_UNKNOWN))
 			break;
 
@@ -1108,9 +1108,9 @@ xfs_dir2_leaf_getdents(
 	 * All done.  Set output offset value to current offset.
 	 */
 	if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-		*offset = XFS_DIR2_MAX_DATAPTR;
+		*offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
-		*offset = xfs_dir2_byte_to_dataptr(mp, curoff);
+		*offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
 	kmem_free(map);
 	if (bp)
 		xfs_da_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index b46af0013ec9..a8a8a6efad5b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,8 +752,8 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
 #endif
-		if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) {
-			*offset = dot_offset;
+		if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
+			*offset = dot_offset & 0x7fffffff;
 			return 0;
 		}
 	}
@@ -766,8 +766,8 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
 #endif
-		if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) {
-			*offset = dotdot_offset;
+		if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
+			*offset = dotdot_offset & 0x7fffffff;
 			return 0;
 		}
 	}
@@ -791,14 +791,15 @@ xfs_dir2_sf_getdents(
 #endif
 
 		if (filldir(dirent, sfep->name, sfep->namelen,
-					    off, ino, DT_UNKNOWN)) {
-			*offset = off;
+			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
+			*offset = off & 0x7fffffff;
 			return 0;
 		}
 		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 	}
 
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
+	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+			0x7fffffff;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 058652a37dd9eac18d6b8c1a311137c679de9dae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 8 Jan 2009 13:42:25 -0500
Subject: [XFS] make xfs_ino_t an unsigned long long

Currently xfs_ino_t is defined as a u64 which can either be an unsigned
long long or on some 64 bit platforms an unsigned long.  Just making
it an unsigned long long means it's still always 64 bits wide, but we
don't need to resort to casts to print it.

Fixes a warning regression on 64 bit powerpc in current git.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index baedbd14dc21..b2f724502f1b 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -45,7 +45,7 @@ typedef __uint32_t		prid_t;		/* project ID */
 typedef __uint32_t		inst_t;		/* an instruction */
 
 typedef __s64			xfs_off_t;	/* <file offset> type */
-typedef __u64			xfs_ino_t;	/* <inode> type */
+typedef unsigned long long	xfs_ino_t;	/* <inode> type */
 typedef __s64			xfs_daddr_t;	/* <disk address> type */
 typedef char *			xfs_caddr_t;	/* <core address> type */
 typedef __u32			xfs_dev_t;
-- 
cgit v1.2.3


From 958f8c0e4fc311e23a40635a530c01aec366a6e8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:40:44 +1100
Subject: [XFS] remove old vmap cache

XFS's vmap batching simply defers a number (up to 64) of vunmaps, and keeps
track of them in a list. To purge the batch, it just goes through the list and
calls vunmap on each one. This is pretty poor: a global TLB flush is generally
still performed on each vunmap, with the most expensive parts of the operation
being the broadcast IPIs and locking involved in the SMP callouts, and the
locking involved in the vmap management -- none of these are avoided by just
batching up the calls. I'm actually surprised it ever made much difference.
(Now that the lazy vmap allocator is upstream, this description is not quite
right, but the vunmap batching still doesn't seem to do much)

Rip all this logic out of XFS completely. I will improve vmap performance
and scalability directly in a subsequent patch.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 75 +---------------------------------------------
 1 file changed, 1 insertion(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index cb329edc925b..0b2177a9fbdc 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -165,75 +165,6 @@ test_page_region(
 	return (mask && (page_private(page) & mask) == mask);
 }
 
-/*
- *	Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-static a_list_t		*as_free_head;
-static int		as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- *	Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-#ifdef CONFIG_XEN
-	/*
-	 * Xen needs to be able to make sure it can get an exclusive
-	 * RO mapping of pages it wants to turn into a pagetable.  If
-	 * a newly allocated page is also still being vmap()ed by xfs,
-	 * it will cause pagetable construction to fail.  This is a
-	 * quick workaround to always eagerly unmap pages so that Xen
-	 * is happy.
-	 */
-	vunmap(addr);
-	return;
-#endif
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
 /*
  *	Internal xfs_buf_t object manipulation
  */
@@ -333,7 +264,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-			free_address(bp->b_addr - bp->b_offset);
+                       vunmap(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -455,8 +386,6 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
 		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
 					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
@@ -1743,8 +1672,6 @@ xfsbufd(
 			count++;
 		}
 
-		if (as_list_len > 0)
-			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
-- 
cgit v1.2.3


From 0087167c9d5b1273e7e6bbe39a9ab13bdb9a39bb Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:43:09 +1100
Subject: [XFS] use scalable vmap API

Implement XFS's large buffer support with the new vmap APIs. See the vmap
rewrite (db64fe02) for some numbers. The biggest improvement that comes from
using the new APIs is avoiding the global KVA allocation lock on every call.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 0b2177a9fbdc..d71dc44e21ed 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -264,7 +264,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vunmap(bp->b_addr - bp->b_offset);
+                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -386,8 +386,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
+               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+                                       -1, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
-- 
cgit v1.2.3


From c225aa57ff4ffe715df4692676b77c815a337236 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Holm=20Th=C3=B8gersen?= <odie@cs.aau.dk>
Date: Sun, 11 Jan 2009 22:34:01 -0500
Subject: ext4: fix wrong use of do_div
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

the following warning:

fs/jbd2/journal.c: In function ‘jbd2_seq_info_show’:
fs/jbd2/journal.c:850: warning: format ‘%lu’ expects type ‘long
unsigned int’, but argument 3 has type ‘uint32_t’

is caused by wrong usage of do_div that modifies the dividend in-place
and returns the quotient. So not only would an incorrect value be
displayed, but s->journal->j_average_commit_time would also be changed
to a wrong value!

Fix it by using div_u64 instead.

Signed-off-by: Simon Holm Thøgersen <odie@cs.aau.dk>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 56675306ed81..eb343008eded 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -37,10 +37,10 @@
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/math64.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
-#include <asm/div64.h>
 
 EXPORT_SYMBOL(jbd2_journal_start);
 EXPORT_SYMBOL(jbd2_journal_restart);
@@ -846,8 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 	    jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
 	seq_printf(seq, "  %ums logging transaction\n",
 	    jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
-	seq_printf(seq, "  %luus average transaction commit time\n",
-		   do_div(s->journal->j_average_commit_time, 1000));
+	seq_printf(seq, "  %lluus average transaction commit time\n",
+		   div_u64(s->journal->j_average_commit_time, 1000));
 	seq_printf(seq, "  %lu handles per transaction\n",
 	    s->stats->u.run.rs_handle_count / s->stats->ts_tid);
 	seq_printf(seq, "  %lu blocks per transaction\n",
-- 
cgit v1.2.3


From 62568510b8e2679cbc331d7de10ea9ba81ae8b3d Mon Sep 17 00:00:00 2001
From: Bernd Schmidt <bernds_cb1@t-online.de>
Date: Tue, 13 Jan 2009 22:14:48 +0100
Subject: Fix timeouts in sys_pselect7

Since we (Analog Devices) updated our Blackfin kernel to 2.6.28, we've
seen occasional 5-second hangs from telnet.  telnetd calls select with a
NULL timeout, but with the new kernel, the system call occasionally
returns 0, which causes telnet to call sleep (5).  This did not happen
with earlier kernels.

The code in sys_pselect7 looks a bit strange, in particular the variable
"to" is initialized to NULL, then changed if a non-null timeout was
passed in, but not used further.  It needs to be passed to
core_sys_select instead of &end_time.

This bug was introduced by 8ff3e8e85fa6c312051134b3953e397feb639f51
("select: switch select() and poll() over to hrtimers").

Signed-off-by: Bernd Schmidt <bernd.schmidt@analog.com>
Reviewed-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Robin Getz <rgetz@blackfin.uclinux.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/select.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 08b91beed806..b0cf1f0896d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -610,7 +610,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &end_time);
+	ret = core_sys_select(n, inp, outp, exp, to);
 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
-- 
cgit v1.2.3


From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:54 +0100
Subject: [CVE-2009-0029] Convert all system calls to return a long

Convert all system calls to return a long. This should be a NOP since all
converted types should have the same size anyway.
With the exception of sys_exit_group which returned void. But that doesn't
matter since the system call doesn't return.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/read_write.c | 18 +++++++++---------
 fs/xattr.c      | 12 ++++++------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 5cc6924eb158..940367f51f2a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
+asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
 {
 	off_t retval;
 	struct file * file;
@@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 	file->f_pos = pos;
 }
 
-asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
+asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -386,7 +386,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
 	return ret;
 }
 
-asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
+asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -403,7 +403,7 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co
 	return ret;
 }
 
-asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
+asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
 			     size_t count, loff_t pos)
 {
 	struct file *file;
@@ -424,7 +424,7 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
 	return ret;
 }
 
-asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
+asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
 			      size_t count, loff_t pos)
 {
 	struct file *file;
@@ -672,7 +672,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 
 EXPORT_SYMBOL(vfs_writev);
 
-asmlinkage ssize_t
+asmlinkage long
 sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 {
 	struct file *file;
@@ -693,7 +693,7 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 	return ret;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 {
 	struct file *file;
@@ -812,7 +812,7 @@ out:
 	return retval;
 }
 
-asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
+asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
 {
 	loff_t pos;
 	off_t off;
@@ -831,7 +831,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
 
-asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
+asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
 {
 	loff_t pos;
 	ssize_t ret;
diff --git a/fs/xattr.c b/fs/xattr.c
index 237804cd6b56..d049ae27aae7 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -349,7 +349,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_getxattr(const char __user *pathname, const char __user *name,
 	     void __user *value, size_t size)
 {
@@ -364,7 +364,7 @@ sys_getxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
 	      size_t size)
 {
@@ -379,7 +379,7 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size)
 {
 	struct file *f;
@@ -424,7 +424,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 {
 	struct path path;
@@ -438,7 +438,7 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 {
 	struct path path;
@@ -452,7 +452,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_flistxattr(int fd, char __user *list, size_t size)
 {
 	struct file *f;
-- 
cgit v1.2.3


From e55380edf68796d75bf41391a781c68ee678587d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:55 +0100
Subject: [CVE-2009-0029] Rename old_readdir to sys_old_readdir

This way it matches the generic system call name convention.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/readdir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/readdir.c b/fs/readdir.c
index b318d9b5af2e..8b4c2a0051a6 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,7 +102,7 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
+asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
 {
 	int error;
 	struct file * file;
-- 
cgit v1.2.3


From 1134723e96f6e2abcf8bfd7a2d1c96fcc323ef35 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:56 +0100
Subject: [CVE-2009-0029] Remove __attribute__((weak)) from sys_pipe/sys_pipe2

Remove __attribute__((weak)) from common code sys_pipe implementation.
IA64, ALPHA, SUPERH (32bit) and SPARC (32bit) have their own implementations
with the same name. Just rename them.
For sys_pipe2 there is no architecture specific implementation.

Cc: Richard Henderson <rth@twiddle.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/pipe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 891697112f66..0c64db86c919 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1043,7 +1043,7 @@ int do_pipe(int *fd)
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
  */
-asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
+asmlinkage long sys_pipe2(int __user *fildes, int flags)
 {
 	int fd[2];
 	int error;
@@ -1059,7 +1059,7 @@ asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
 	return error;
 }
 
-asmlinkage long __weak sys_pipe(int __user *fildes)
+asmlinkage long sys_pipe(int __user *fildes)
 {
 	return sys_pipe2(fildes, 0);
 }
-- 
cgit v1.2.3


From c9da9f2129d6a421c32e334a83770a9e67f7feac Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:57 +0100
Subject: [CVE-2009-0029] Make sys_pselect7 static

Not a single architecture has wired up sys_pselect7 plus it is the
only system call with seven parameters. Just make it static and
rename it to do_pselect which will do the work for sys_pselect6.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/compat.c | 6 +++---
 fs/select.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 30f2faa22f5c..65a070e705ab 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1709,7 +1709,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 }
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize)
@@ -1775,8 +1775,8 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
 				(compat_size_t __user *)(sig+sizeof(up))))
 			return -EFAULT;
 	}
-	return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up),
-					sigsetsize);
+	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+				 sigsetsize);
 }
 
 asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
diff --git a/fs/select.c b/fs/select.c
index b0cf1f0896d9..d1651648be11 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -582,9 +582,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 }
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
-		fd_set __user *exp, struct timespec __user *tsp,
-		const sigset_t __user *sigmask, size_t sigsetsize)
+static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
+		       fd_set __user *exp, struct timespec __user *tsp,
+		       const sigset_t __user *sigmask, size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
 	struct timespec ts, end_time, *to = NULL;
@@ -650,7 +650,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
 			return -EFAULT;
 	}
 
-	return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
+	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
-- 
cgit v1.2.3


From 6673e0c3fbeaed2cd08e2fd4a4aa97382d6fedb0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:02 +0100
Subject: [CVE-2009-0029] System call wrapper special cases

System calls with an unsigned long long argument can't be converted with
the standard wrappers since that would include a cast to long, which in
turn means that we would lose the upper 32 bit on 32 bit architectures.
Also semctl can't use the standard wrapper since it has a 'union'
parameter.

So we handle them as special case and add some extra wrappers instead.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/dcookies.c   | 10 ++++++++--
 fs/open.c       | 27 ++++++++++++++++++++++++---
 fs/read_write.c | 24 ++++++++++++++++++++----
 fs/sync.c       | 26 ++++++++++++++++++++++----
 4 files changed, 74 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/dcookies.c b/fs/dcookies.c
index 180e9fec4ad8..a21cabdbd87b 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -145,7 +145,7 @@ out:
 /* And here is where the userspace process can look up the cookie value
  * to retrieve the path.
  */
-asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user * buf, size_t len)
+SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
 {
 	unsigned long cookie = (unsigned long)cookie64;
 	int err = -EINVAL;
@@ -198,7 +198,13 @@ out:
 	mutex_unlock(&dcookie_mutex);
 	return err;
 }
-
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len)
+{
+	return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len);
+}
+SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie);
+#endif
 
 static int dcookie_init(void)
 {
diff --git a/fs/open.c b/fs/open.c
index d882fd2351d6..e349013fc790 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -351,21 +351,35 @@ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
 
 /* LFS versions of truncate are only needed on 32 bit machines */
 #if BITS_PER_LONG == 32
-asmlinkage long sys_truncate64(const char __user * path, loff_t length)
+SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
 {
 	return do_sys_truncate(path, length);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_truncate64(long path, loff_t length)
+{
+	return SYSC_truncate64((const char __user *) path, length);
+}
+SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
+#endif
 
-asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
+SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
 {
 	long ret = do_sys_ftruncate(fd, length, 0);
 	/* avoid REGPARM breakage on x86: */
 	asmlinkage_protect(2, ret, fd, length);
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_ftruncate64(long fd, loff_t length)
+{
+	return SYSC_ftruncate64((unsigned int) fd, length);
+}
+SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
 #endif
+#endif /* BITS_PER_LONG == 32 */
 
-asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 {
 	struct file *file;
 	struct inode *inode;
@@ -422,6 +436,13 @@ out_fput:
 out:
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
+{
+	return SYSC_fallocate((int)fd, (int)mode, offset, len);
+}
+SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
+#endif
 
 /*
  * access() needs to use the real uid/gid, not the effective uid/gid.
diff --git a/fs/read_write.c b/fs/read_write.c
index 940367f51f2a..7a8326bc5903 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -403,8 +403,8 @@ asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count
 	return ret;
 }
 
-asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
-			     size_t count, loff_t pos)
+SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
+			size_t count, loff_t pos)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -423,9 +423,17 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
 
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
+{
+	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
+			    (size_t) count, pos);
+}
+SYSCALL_ALIAS(sys_pread64, SyS_pread64);
+#endif
 
-asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
-			      size_t count, loff_t pos)
+SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
+			 size_t count, loff_t pos)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -444,6 +452,14 @@ asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
 
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
+{
+	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
+			     (size_t) count, pos);
+}
+SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
+#endif
 
 /*
  * Reduce an iovec's length in-place.  Return the resulting number of segments
diff --git a/fs/sync.c b/fs/sync.c
index ac02b56548bc..23ebbd72ecc9 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -201,8 +201,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
  * already-instantiated disk blocks, there are no guarantees here that the data
  * will be available after a crash.
  */
-asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-					unsigned int flags)
+SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
+				unsigned int flags)
 {
 	int ret;
 	struct file *file;
@@ -262,14 +262,32 @@ out_put:
 out:
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
+				    long flags)
+{
+	return SYSC_sync_file_range((int) fd, offset, nbytes,
+				    (unsigned int) flags);
+}
+SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
+#endif
 
 /* It would be nice if people remember that not all the world's an i386
    when they introduce new system calls */
-asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
-				     loff_t offset, loff_t nbytes)
+SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
+				 loff_t offset, loff_t nbytes)
 {
 	return sys_sync_file_range(fd, offset, nbytes, flags);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_sync_file_range2(long fd, long flags,
+				     loff_t offset, loff_t nbytes)
+{
+	return SYSC_sync_file_range2((int) fd, (unsigned int) flags,
+				     offset, nbytes);
+}
+SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
+#endif
 
 /*
  * `endbyte' is inclusive
-- 
cgit v1.2.3


From a5f8fa9e9ba5ef3305e147f41ad6e1e84ac1f0bd Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:11 +0100
Subject: [CVE-2009-0029] System call wrappers part 09

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/sync.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index 23ebbd72ecc9..a16d53e5fe9d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -36,7 +36,7 @@ static void do_sync(unsigned long wait)
 		laptop_sync_completion();
 }
 
-asmlinkage long sys_sync(void)
+SYSCALL_DEFINE0(sync)
 {
 	do_sync(1);
 	return 0;
@@ -144,12 +144,12 @@ static int do_fsync(unsigned int fd, int datasync)
 	return ret;
 }
 
-asmlinkage long sys_fsync(unsigned int fd)
+SYSCALL_DEFINE1(fsync, unsigned int, fd)
 {
 	return do_fsync(fd, 0);
 }
 
-asmlinkage long sys_fdatasync(unsigned int fd)
+SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 {
 	return do_fsync(fd, 1);
 }
-- 
cgit v1.2.3


From bdc480e3bef6eb0e7071770834cbdda7e30a5436 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:12 +0100
Subject: [CVE-2009-0029] System call wrappers part 10

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/buffer.c    |  2 +-
 fs/namespace.c |  9 ++++-----
 fs/open.c      | 12 +++++-------
 fs/stat.c      |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index b6e8b8632e2f..b58208f1640a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3243,7 +3243,7 @@ void block_sync_page(struct page *page)
  * Use of bdflush() is deprecated and will be removed in a future kernel.
  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
  */
-asmlinkage long sys_bdflush(int func, long data)
+SYSCALL_DEFINE2(bdflush, int, func, long, data)
 {
 	static int msg_count;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index a40685d800a8..3876a0fbaa60 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1128,7 +1128,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
  */
 
-asmlinkage long sys_umount(char __user * name, int flags)
+SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
 	struct path path;
 	int retval;
@@ -1160,7 +1160,7 @@ out:
 /*
  *	The 2.0 compatible umount. No flags.
  */
-asmlinkage long sys_oldumount(char __user * name)
+SYSCALL_DEFINE1(oldumount, char __user *, name)
 {
 	return sys_umount(name, 0);
 }
@@ -2045,9 +2045,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	return new_ns;
 }
 
-asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
-			  char __user * type, unsigned long flags,
-			  void __user * data)
+SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
+		char __user *, type, unsigned long, flags, void __user *, data)
 {
 	int retval;
 	unsigned long data_page;
diff --git a/fs/open.c b/fs/open.c
index e349013fc790..f6c2f5673ed5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -122,7 +122,7 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 	return 0;
 }
 
-asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf)
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
 	struct path path;
 	int error;
@@ -138,8 +138,7 @@ asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * b
 	return error;
 }
 
-
-asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf)
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 {
 	struct path path;
 	long error;
@@ -157,8 +156,7 @@ asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct stat
 	return error;
 }
 
-
-asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 {
 	struct file * file;
 	struct statfs tmp;
@@ -289,7 +287,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_truncate(const char __user * path, unsigned long length)
+SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length)
 {
 	/* on 32-bit boxen it will cut the range 2^31--2^32-1 off */
 	return do_sys_truncate(path, (long)length);
@@ -341,7 +339,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
 {
 	long ret = do_sys_ftruncate(fd, length, 1);
 	/* avoid REGPARM breakage on x86: */
diff --git a/fs/stat.c b/fs/stat.c
index 7e12a6f82795..a1411648048a 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -152,7 +152,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
+SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
-- 
cgit v1.2.3


From 257ac264d69017270fbc3cf5536953525db4076c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:13 +0100
Subject: [CVE-2009-0029] System call wrappers part 11

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/open.c  |  2 +-
 fs/stat.c  | 20 ++++++++++++--------
 fs/super.c |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index f6c2f5673ed5..322bb60d168c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -174,7 +174,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf)
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
 	struct file * file;
 	struct statfs64 tmp;
diff --git a/fs/stat.c b/fs/stat.c
index a1411648048a..f29c5fe4f8b6 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -162,7 +162,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
 
 	return error;
 }
-asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
+
+SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -172,7 +173,8 @@ asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __use
 
 	return error;
 }
-asmlinkage long sys_fstat(unsigned int fd, struct __old_kernel_stat __user * statbuf)
+
+SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -235,7 +237,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
@@ -246,7 +248,7 @@ asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
 	return error;
 }
 
-asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -280,7 +282,7 @@ out:
 }
 #endif
 
-asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -365,7 +367,7 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbuf)
+SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -375,7 +377,8 @@ asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbu
 
 	return error;
 }
-asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statbuf)
+
+SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
@@ -385,7 +388,8 @@ asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statb
 
 	return error;
 }
-asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
+
+SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
diff --git a/fs/super.c b/fs/super.c
index ed080c417167..645e5403f2a0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -544,7 +544,7 @@ rescan:
 	return NULL;
 }
 
-asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 {
         struct super_block *s;
         struct ustat tmp;
-- 
cgit v1.2.3


From 64fd1de3d821659ac0a3004fd5ee1de59e64af30 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:14 +0100
Subject: [CVE-2009-0029] System call wrappers part 12

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/xattr.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index d049ae27aae7..0367a5dae2b8 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -251,9 +251,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 	return error;
 }
 
-asmlinkage long
-sys_setxattr(const char __user *pathname, const char __user *name,
-	     const void __user *value, size_t size, int flags)
+SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
+		const char __user *, name, const void __user *, value,
+		size_t, size, int, flags)
 {
 	struct path path;
 	int error;
@@ -270,9 +270,9 @@ sys_setxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage long
-sys_lsetxattr(const char __user *pathname, const char __user *name,
-	      const void __user *value, size_t size, int flags)
+SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
+		const char __user *, name, const void __user *, value,
+		size_t, size, int, flags)
 {
 	struct path path;
 	int error;
@@ -289,9 +289,8 @@ sys_lsetxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage long
-sys_fsetxattr(int fd, const char __user *name, const void __user *value,
-	      size_t size, int flags)
+SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
+		const void __user *,value, size_t, size, int, flags)
 {
 	struct file *f;
 	struct dentry *dentry;
@@ -349,9 +348,8 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 	return error;
 }
 
-asmlinkage long
-sys_getxattr(const char __user *pathname, const char __user *name,
-	     void __user *value, size_t size)
+SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
+		const char __user *, name, void __user *, value, size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -364,9 +362,8 @@ sys_getxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage long
-sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
-	      size_t size)
+SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
+		const char __user *, name, void __user *, value, size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -379,8 +376,8 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user
 	return error;
 }
 
-asmlinkage long
-sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size)
+SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
+		void __user *, value, size_t, size)
 {
 	struct file *f;
 	ssize_t error = -EBADF;
@@ -424,8 +421,8 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage long
-sys_listxattr(const char __user *pathname, char __user *list, size_t size)
+SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
+		size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -438,8 +435,8 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage long
-sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
+SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
+		size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -452,8 +449,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage long
-sys_flistxattr(int fd, char __user *list, size_t size)
+SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
 	struct file *f;
 	ssize_t error = -EBADF;
@@ -485,8 +481,8 @@ removexattr(struct dentry *d, const char __user *name)
 	return vfs_removexattr(d, kname);
 }
 
-asmlinkage long
-sys_removexattr(const char __user *pathname, const char __user *name)
+SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
+		const char __user *, name)
 {
 	struct path path;
 	int error;
-- 
cgit v1.2.3


From 6a6160a7b5c27b3c38651baef92a14fa7072b3c1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:15 +0100
Subject: [CVE-2009-0029] System call wrappers part 13

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/xattr.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index 0367a5dae2b8..197c4fcac032 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -499,8 +499,8 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
 	return error;
 }
 
-asmlinkage long
-sys_lremovexattr(const char __user *pathname, const char __user *name)
+SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
+		const char __user *, name)
 {
 	struct path path;
 	int error;
@@ -517,8 +517,7 @@ sys_lremovexattr(const char __user *pathname, const char __user *name)
 	return error;
 }
 
-asmlinkage long
-sys_fremovexattr(int fd, const char __user *name)
+SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
 	struct file *f;
 	struct dentry *dentry;
-- 
cgit v1.2.3


From 3480b25743cb7404928d57efeaa3d085708b04c2 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:16 +0100
Subject: [CVE-2009-0029] System call wrappers part 14

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/namei.c     | 8 ++++----
 fs/namespace.c | 4 ++--
 fs/open.c      | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f05bed242422..43fa25259728 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2017,7 +2017,7 @@ out_unlock:
 	return error;
 }
 
-asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
 {
 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
 }
@@ -2302,7 +2302,7 @@ asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
 	return do_unlinkat(dfd, pathname);
 }
 
-asmlinkage long sys_unlink(const char __user *pathname)
+SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 {
 	return do_unlinkat(AT_FDCWD, pathname);
 }
@@ -2370,7 +2370,7 @@ out_putname:
 	return error;
 }
 
-asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
 {
 	return sys_symlinkat(oldname, AT_FDCWD, newname);
 }
@@ -2473,7 +2473,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
 {
 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 3876a0fbaa60..228d8c4bfd18 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2171,8 +2171,8 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root)
  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
  *    first.
  */
-asmlinkage long sys_pivot_root(const char __user * new_root,
-			       const char __user * put_old)
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
+		const char __user *, put_old)
 {
 	struct vfsmount *tmp;
 	struct path new, old, parent_path, root_parent, root;
diff --git a/fs/open.c b/fs/open.c
index 322bb60d168c..9b926de6ed9c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -569,7 +569,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_chroot(const char __user * filename)
+SYSCALL_DEFINE1(chroot, const char __user *, filename)
 {
 	struct path path;
 	int error;
-- 
cgit v1.2.3


From a26eab2400f0477bfac0255600552394855016f7 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:17 +0100
Subject: [CVE-2009-0029] System call wrappers part 15

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/fcntl.c | 11 ++++++-----
 fs/ioctl.c |  2 +-
 fs/namei.c |  2 +-
 fs/open.c  |  4 ++--
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index cdc141946724..bd215cc791da 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -50,7 +50,7 @@ static int get_close_on_exec(unsigned int fd)
 	return res;
 }
 
-asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 {
 	int err = -EBADF;
 	struct file * file, *tofree;
@@ -113,7 +113,7 @@ out_unlock:
 	return err;
 }
 
-asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
@@ -126,7 +126,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 	return sys_dup3(oldfd, newfd, 0);
 }
 
-asmlinkage long sys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
 	struct file *file = fget(fildes);
@@ -335,7 +335,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	return err;
 }
 
-asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
 	struct file *filp;
 	long err = -EBADF;
@@ -358,7 +358,8 @@ out:
 }
 
 #if BITS_PER_LONG == 32
-asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		unsigned long, arg)
 {	
 	struct file * filp;
 	long err;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 20b0a8a24c6b..240ec63984cb 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -542,7 +542,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 	return error;
 }
 
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {
 	struct file *filp;
 	int error = -EBADF;
diff --git a/fs/namei.c b/fs/namei.c
index 43fa25259728..00c4f37a0391 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2718,7 +2718,7 @@ exit:
 	return error;
 }
 
-asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 {
 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
 }
diff --git a/fs/open.c b/fs/open.c
index 9b926de6ed9c..ecc75a2c262e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -594,7 +594,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
+SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 {
 	struct inode * inode;
 	struct dentry * dentry;
@@ -658,7 +658,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
+SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
 {
 	return sys_fchmodat(AT_FDCWD, filename, mode);
 }
-- 
cgit v1.2.3


From 002c8976ee537724b20a5e179d9b349309438836 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:18 +0100
Subject: [CVE-2009-0029] System call wrappers part 16

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/aio.c        | 22 +++++++++++-----------
 fs/locks.c      |  2 +-
 fs/open.c       |  2 +-
 fs/read_write.c |  4 ++--
 fs/stat.c       |  4 ++--
 5 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index d6f89d3c15e8..8fa77e233944 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1270,7 +1270,7 @@ static void io_destroy(struct kioctx *ioctx)
  *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
  *	implemented.
  */
-asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
+SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
 	struct kioctx *ioctx = NULL;
 	unsigned long ctx;
@@ -1308,7 +1308,7 @@ out:
  *	implemented.  May fail with -EFAULT if the context pointed to
  *	is invalid.
  */
-asmlinkage long sys_io_destroy(aio_context_t ctx)
+SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
@@ -1662,8 +1662,8 @@ out_put_req:
  *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
  *	fail with -ENOSYS if not implemented.
  */
-asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
-			      struct iocb __user * __user *iocbpp)
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1737,8 +1737,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
  *	invalid.  May fail with -EAGAIN if the iocb specified was not
  *	cancelled.  Will fail with -ENOSYS if not implemented.
  */
-asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
-			      struct io_event __user *result)
+SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
+		struct io_event __user *, result)
 {
 	int (*cancel)(struct kiocb *iocb, struct io_event *res);
 	struct kioctx *ctx;
@@ -1799,11 +1799,11 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
  *	will be updated if not NULL and the operation blocks.  Will fail
  *	with -ENOSYS if not implemented.
  */
-asmlinkage long sys_io_getevents(aio_context_t ctx_id,
-				 long min_nr,
-				 long nr,
-				 struct io_event __user *events,
-				 struct timespec __user *timeout)
+SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
+		long, min_nr,
+		long, nr,
+		struct io_event __user *, events,
+		struct timespec __user *, timeout)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx_id);
 	long ret = -EINVAL;
diff --git a/fs/locks.c b/fs/locks.c
index 46a2e12f7d42..ec3deea29e37 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1564,7 +1564,7 @@ EXPORT_SYMBOL(flock_lock_file_wait);
  *	%LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
  *	processes read and write access respectively.
  */
-asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
+SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
 	struct file *filp;
 	struct file_lock *lock;
diff --git a/fs/open.c b/fs/open.c
index ecc75a2c262e..293408b1c165 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1081,7 +1081,7 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
  * For backward compatibility?  Maybe this should be moved
  * into arch/i386 instead?
  */
-asmlinkage long sys_creat(const char __user * pathname, int mode)
+SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode)
 {
 	return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
 }
diff --git a/fs/read_write.c b/fs/read_write.c
index 7a8326bc5903..0671aa016b6f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -828,7 +828,7 @@ out:
 	return retval;
 }
 
-asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
+SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 {
 	loff_t pos;
 	off_t off;
@@ -847,7 +847,7 @@ asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
 
-asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
+SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
 {
 	loff_t pos;
 	ssize_t ret;
diff --git a/fs/stat.c b/fs/stat.c
index f29c5fe4f8b6..d712a0dfb50f 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -320,8 +320,8 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
 	return error;
 }
 
-asmlinkage long sys_readlink(const char __user *path, char __user *buf,
-				int bufsiz)
+SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
+		int, bufsiz)
 {
 	return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
 }
-- 
cgit v1.2.3


From ca013e945b1ba5828b151ee646946f1297b67a4c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:19 +0100
Subject: [CVE-2009-0029] System call wrappers part 17

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/open.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 293408b1c165..4a6d80064746 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -517,7 +517,7 @@ out:
 	return res;
 }
 
-asmlinkage long sys_access(const char __user *filename, int mode)
+SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 {
 	return sys_faccessat(AT_FDCWD, filename, mode);
 }
@@ -688,7 +688,7 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 	return error;
 }
 
-asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
+SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 {
 	struct path path;
 	int error;
@@ -732,7 +732,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
+SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 {
 	struct path path;
 	int error;
@@ -751,8 +751,7 @@ out:
 	return error;
 }
 
-
-asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
+SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 {
 	struct file * file;
 	int error = -EBADF;
@@ -1048,7 +1047,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 	return fd;
 }
 
-asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
 {
 	long ret;
 
@@ -1117,7 +1116,7 @@ EXPORT_SYMBOL(filp_close);
  * releasing the fd. This ensures that one clone task can't release
  * an fd while another clone is opening it.
  */
-asmlinkage long sys_close(unsigned int fd)
+SYSCALL_DEFINE1(close, unsigned int, fd)
 {
 	struct file * filp;
 	struct files_struct *files = current->files;
@@ -1150,14 +1149,13 @@ out_unlock:
 	spin_unlock(&files->file_lock);
 	return -EBADF;
 }
-
 EXPORT_SYMBOL(sys_close);
 
 /*
  * This routine simulates a hangup on the tty, to arrange that users
  * are given clean terminals at login time.
  */
-asmlinkage long sys_vhangup(void)
+SYSCALL_DEFINE0(vhangup)
 {
 	if (capable(CAP_SYS_TTY_CONFIG)) {
 		tty_vhangup_self();
-- 
cgit v1.2.3


From 003d7ab479168132a2b2c6700fe682b08f08ab0c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:21 +0100
Subject: [CVE-2009-0029] System call wrappers part 19

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/read_write.c | 8 ++++----
 fs/utimes.c     | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 0671aa016b6f..fad10af59d95 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 {
 	off_t retval;
 	struct file * file;
@@ -171,9 +171,9 @@ bad:
 }
 
 #ifdef __ARCH_WANT_SYS_LLSEEK
-asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
-			   unsigned long offset_low, loff_t __user * result,
-			   unsigned int origin)
+SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
+		unsigned long, offset_low, loff_t __user *, result,
+		unsigned int, origin)
 {
 	int retval;
 	struct file * file;
diff --git a/fs/utimes.c b/fs/utimes.c
index 6929e3e91d05..ee853615798a 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -24,7 +24,7 @@
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
+SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
 {
 	struct timespec tv[2];
 
@@ -214,7 +214,8 @@ asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __u
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
 }
 
-asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
+SYSCALL_DEFINE2(utimes, char __user *, filename,
+		struct timeval __user *, utimes)
 {
 	return sys_futimesat(AT_FDCWD, filename, utimes);
 }
-- 
cgit v1.2.3


From 3cdad42884bbd95d5aa01297e8236ea1bad70053 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:22 +0100
Subject: [CVE-2009-0029] System call wrappers part 20

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/dcache.c     |  2 +-
 fs/namei.c      |  4 ++--
 fs/open.c       |  4 ++--
 fs/quota.c      |  3 ++-
 fs/read_write.c | 13 +++++++------
 5 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 4547f66884a0..937df0fb0da5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2092,7 +2092,7 @@ Elong:
  *		return NULL;
  *	}
  */
-asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
+SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 {
 	int error;
 	struct path pwd, root;
diff --git a/fs/namei.c b/fs/namei.c
index 00c4f37a0391..90520f05f997 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2081,7 +2081,7 @@ out_err:
 	return error;
 }
 
-asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 {
 	return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
@@ -2195,7 +2195,7 @@ exit1:
 	return error;
 }
 
-asmlinkage long sys_rmdir(const char __user *pathname)
+SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 {
 	return do_rmdir(AT_FDCWD, pathname);
 }
diff --git a/fs/open.c b/fs/open.c
index 4a6d80064746..bc49e3c388d9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -522,7 +522,7 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 	return sys_faccessat(AT_FDCWD, filename, mode);
 }
 
-asmlinkage long sys_chdir(const char __user * filename)
+SYSCALL_DEFINE1(chdir, const char __user *, filename)
 {
 	struct path path;
 	int error;
@@ -543,7 +543,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fchdir(unsigned int fd)
+SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 {
 	struct file *file;
 	struct inode *inode;
diff --git a/fs/quota.c b/fs/quota.c
index 4a8c94f05f76..d76ada914f98 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -371,7 +371,8 @@ static inline struct super_block *quotactl_block(const char __user *special)
  * calls. Maybe we need to add the process quotas etc. in the future,
  * but we probably should use rlimits for that.
  */
-asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr)
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+		qid_t, id, void __user *, addr)
 {
 	uint cmds, type;
 	struct super_block *sb = NULL;
diff --git a/fs/read_write.c b/fs/read_write.c
index fad10af59d95..400fe81c973e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 	file->f_pos = pos;
 }
 
-asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
+SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -386,7 +386,8 @@ asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
 	return ret;
 }
 
-asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count)
+SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
+		size_t, count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -688,8 +689,8 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 
 EXPORT_SYMBOL(vfs_writev);
 
-asmlinkage long
-sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -709,8 +710,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 	return ret;
 }
 
-asmlinkage long
-sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
-- 
cgit v1.2.3


From 20f37034fb966a1c35894f9fe529fda0b6440101 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:23 +0100
Subject: [CVE-2009-0029] System call wrappers part 21

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/readdir.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/readdir.c b/fs/readdir.c
index 8b4c2a0051a6..cf6a0e39819a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -187,7 +187,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct linux_dirent __user *, dirent, unsigned int, count)
 {
 	struct file * file;
 	struct linux_dirent __user * lastdirent;
@@ -268,7 +269,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
 	struct file * file;
 	struct linux_dirent64 __user * lastdirent;
-- 
cgit v1.2.3


From 5a8a82b1d306a325d899b67715618413657efda4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:25 +0100
Subject: [CVE-2009-0029] System call wrappers part 23

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/eventpoll.c | 18 +++++++++---------
 fs/select.c    |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 96355d505347..ba2f9ec71192 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1110,7 +1110,7 @@ retry:
 /*
  * Open an eventpoll file descriptor.
  */
-asmlinkage long sys_epoll_create1(int flags)
+SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
 	int error, fd = -1;
 	struct eventpoll *ep;
@@ -1150,7 +1150,7 @@ error_return:
 	return fd;
 }
 
-asmlinkage long sys_epoll_create(int size)
+SYSCALL_DEFINE1(epoll_create, int, size)
 {
 	if (size < 0)
 		return -EINVAL;
@@ -1163,8 +1163,8 @@ asmlinkage long sys_epoll_create(int size)
  * the eventpoll file that enables the insertion/removal/change of
  * file descriptors inside the interest set.
  */
-asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
-			      struct epoll_event __user *event)
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+		struct epoll_event __user *, event)
 {
 	int error;
 	struct file *file, *tfile;
@@ -1261,8 +1261,8 @@ error_return:
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
  */
-asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
-			       int maxevents, int timeout)
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
 {
 	int error;
 	struct file *file;
@@ -1319,9 +1319,9 @@ error_return:
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
  */
-asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
-		int maxevents, int timeout, const sigset_t __user *sigmask,
-		size_t sigsetsize)
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
 {
 	int error;
 	sigset_t ksigmask, sigsaved;
diff --git a/fs/select.c b/fs/select.c
index d1651648be11..338f703403af 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -557,8 +557,8 @@ out_nofds:
 	return ret;
 }
 
-asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			fd_set __user *exp, struct timeval __user *tvp)
+SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct timeval __user *, tvp)
 {
 	struct timespec end_time, *to = NULL;
 	struct timeval tv;
@@ -854,8 +854,8 @@ static long do_restart_poll(struct restart_block *restart_block)
 	return ret;
 }
 
-asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
-			long timeout_msecs)
+SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
+		long, timeout_msecs)
 {
 	struct timespec end_time, *to = NULL;
 	int ret;
-- 
cgit v1.2.3


From 1e7bfb2134dfec37ce04fb3a4ca89299e892d10c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:29 +0100
Subject: [CVE-2009-0029] System call wrappers part 27

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/exec.c        | 2 +-
 fs/filesystems.c | 2 +-
 fs/nfsctl.c      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 71a6efe5d8bd..0dd60a01f1b4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -99,7 +99,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
  *
  * Also note that we take the address to load from from the file itself.
  */
-asmlinkage long sys_uselib(const char __user * library)
+SYSCALL_DEFINE1(uselib, const char __user *, library)
 {
 	struct file *file;
 	struct nameidata nd;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d488dcd7f2bb..1aa70260e6d1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -179,7 +179,7 @@ static int fs_maxindex(void)
 /*
  * Whee.. Weird sysv syscall. 
  */
-asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
+SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 {
 	int retval = -EINVAL;
 
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b27451909dff..8f9a20556f79 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -86,8 +86,8 @@ static struct {
 	},
 };
 
-long
-asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *res)
+SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg,
+		void __user *, res)
 {
 	struct file *file;
 	void __user *p = &arg->u;
-- 
cgit v1.2.3


From 938bb9f5e840eddbf54e4f62f6c5ba9b3ae12c9d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:30 +0100
Subject: [CVE-2009-0029] System call wrappers part 28

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/ioprio.c                      | 5 ++---
 fs/notify/inotify/inotify_user.c | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 1a39ac370942..c7c0b28d7d21 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -72,7 +72,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 }
 EXPORT_SYMBOL_GPL(set_task_ioprio);
 
-asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
+SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 {
 	int class = IOPRIO_PRIO_CLASS(ioprio);
 	int data = IOPRIO_PRIO_DATA(ioprio);
@@ -188,7 +188,7 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
 		return aprio;
 }
 
-asmlinkage long sys_ioprio_get(int which, int who)
+SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -252,4 +252,3 @@ asmlinkage long sys_ioprio_get(int which, int who)
 	read_unlock(&tasklist_lock);
 	return ret;
 }
-
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 81b8644b0136..efef1ffca77b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -576,7 +576,7 @@ static const struct inotify_operations inotify_user_ops = {
 	.destroy_watch	= free_inotify_user_watch,
 };
 
-asmlinkage long sys_inotify_init1(int flags)
+SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
 	struct inotify_device *dev;
 	struct inotify_handle *ih;
@@ -655,7 +655,7 @@ out_put_fd:
 	return ret;
 }
 
-asmlinkage long sys_inotify_init(void)
+SYSCALL_DEFINE0(inotify_init)
 {
 	return sys_inotify_init1(0);
 }
-- 
cgit v1.2.3


From 2e4d0924eb0c403ce4014fa139d1d61bf2c44fee Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:31 +0100
Subject: [CVE-2009-0029] System call wrappers part 29

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/namei.c                       | 21 ++++++++++-----------
 fs/notify/inotify/inotify_user.c |  5 +++--
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 90520f05f997..bbc15c237558 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1962,8 +1962,8 @@ static int may_mknod(mode_t mode)
 	}
 }
 
-asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
-				unsigned dev)
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
+		unsigned, dev)
 {
 	int error;
 	char *tmp;
@@ -2044,7 +2044,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return error;
 }
 
-asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
 {
 	int error = 0;
 	char * tmp;
@@ -2291,7 +2291,7 @@ slashes:
 	goto exit2;
 }
 
-asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
+SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
 {
 	if ((flag & ~AT_REMOVEDIR) != 0)
 		return -EINVAL;
@@ -2328,8 +2328,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	return error;
 }
 
-asmlinkage long sys_symlinkat(const char __user *oldname,
-			      int newdfd, const char __user *newname)
+SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
 {
 	int error;
 	char *from;
@@ -2422,9 +2422,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
  * with linux 2.0, and to avoid hard-linking to directories
  * and other special files.  --ADM
  */
-asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
-			   int newdfd, const char __user *newname,
-			   int flags)
+SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname, int, flags)
 {
 	struct dentry *new_dentry;
 	struct nameidata nd;
@@ -2624,8 +2623,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return error;
 }
 
-asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
-			     int newdfd, const char __user *newname)
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
 {
 	struct dentry *old_dir, *new_dir;
 	struct dentry *old_dentry, *new_dentry;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index efef1ffca77b..d53a1838d6e8 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -660,7 +660,8 @@ SYSCALL_DEFINE0(inotify_init)
 	return sys_inotify_init1(0);
 }
 
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
+SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
+		u32, mask)
 {
 	struct inode *inode;
 	struct inotify_device *dev;
@@ -704,7 +705,7 @@ fput_and_out:
 	return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
+SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
 	struct file *filp;
 	struct inotify_device *dev;
-- 
cgit v1.2.3


From 6559eed8ca7db0531a207cd80be5e28cd6f213c5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:32 +0100
Subject: [CVE-2009-0029] System call wrappers part 30

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/open.c   | 13 ++++++-------
 fs/stat.c   | 12 ++++++------
 fs/utimes.c |  6 ++++--
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index bc49e3c388d9..a3a78ceb2a2b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -447,7 +447,7 @@ SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
  * We do this by temporarily clearing all FS-related capabilities and
  * switching the fsuid/fsgid around to the real ones.
  */
-asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
+SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 {
 	const struct cred *old_cred;
 	struct cred *override_cred;
@@ -628,8 +628,7 @@ out:
 	return err;
 }
 
-asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
-			     mode_t mode)
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 {
 	struct path path;
 	struct inode *inode;
@@ -707,8 +706,8 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
-			     gid_t group, int flag)
+SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
+		gid_t, group, int, flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -1060,8 +1059,8 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
 	return ret;
 }
 
-asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
-			   int mode)
+SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
+		int, mode)
 {
 	long ret;
 
diff --git a/fs/stat.c b/fs/stat.c
index d712a0dfb50f..2db740a0cfb5 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -260,8 +260,8 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-asmlinkage long sys_newfstatat(int dfd, char __user *filename,
-				struct stat __user *statbuf, int flag)
+SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error = -EINVAL;
@@ -293,8 +293,8 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 	return error;
 }
 
-asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
-				char __user *buf, int bufsiz)
+SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
+		char __user *, buf, int, bufsiz)
 {
 	struct path path;
 	int error;
@@ -400,8 +400,8 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	return error;
 }
 
-asmlinkage long sys_fstatat64(int dfd, char __user *filename,
-			       struct stat64 __user *statbuf, int flag)
+SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error = -EINVAL;
diff --git a/fs/utimes.c b/fs/utimes.c
index ee853615798a..e4c75db5d373 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -170,7 +170,8 @@ out:
 	return error;
 }
 
-asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags)
+SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+		struct timespec __user *, utimes, int, flags)
 {
 	struct timespec tstimes[2];
 
@@ -187,7 +188,8 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
 
-asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
+SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+		struct timeval __user *, utimes)
 {
 	struct timeval times[2];
 	struct timespec tstimes[2];
-- 
cgit v1.2.3


From 836f92adf121f806e9beb5b6b88bd5c9c4ea3f24 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:33 +0100
Subject: [CVE-2009-0029] System call wrappers part 31

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/signalfd.c |  8 ++++----
 fs/splice.c   | 12 ++++++------
 fs/timerfd.c  |  8 ++++----
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9c39bc7f8431..b07565c94386 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,8 +205,8 @@ static const struct file_operations signalfd_fops = {
 	.read		= signalfd_read,
 };
 
-asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
-			      size_t sizemask, int flags)
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask, int, flags)
 {
 	sigset_t sigmask;
 	struct signalfd_ctx *ctx;
@@ -259,8 +259,8 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
 	return ufd;
 }
 
-asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
-			     size_t sizemask)
+SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask)
 {
 	return sys_signalfd4(ufd, user_mask, sizemask, 0);
 }
diff --git a/fs/splice.c b/fs/splice.c
index a54b3e3f10a7..4ed0ba44a966 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1435,8 +1435,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
  * Currently we punt and implement it as a normal copy, see pipe_to_user().
  *
  */
-asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
-			     unsigned long nr_segs, unsigned int flags)
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
+		unsigned long, nr_segs, unsigned int, flags)
 {
 	struct file *file;
 	long error;
@@ -1461,9 +1461,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
 	return error;
 }
 
-asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
-			   int fd_out, loff_t __user *off_out,
-			   size_t len, unsigned int flags)
+SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
+		int, fd_out, loff_t __user *, off_out,
+		size_t, len, unsigned int, flags)
 {
 	long error;
 	struct file *in, *out;
@@ -1685,7 +1685,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
 	return ret;
 }
 
-asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
+SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 {
 	struct file *in;
 	int error, fput_in;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0862f0e49d0c..c8c14f58b96f 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -177,7 +177,7 @@ static struct file *timerfd_fget(int fd)
 	return file;
 }
 
-asmlinkage long sys_timerfd_create(int clockid, int flags)
+SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 {
 	int ufd;
 	struct timerfd_ctx *ctx;
@@ -208,9 +208,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
 	return ufd;
 }
 
-asmlinkage long sys_timerfd_settime(int ufd, int flags,
-				    const struct itimerspec __user *utmr,
-				    struct itimerspec __user *otmr)
+SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+		const struct itimerspec __user *, utmr,
+		struct itimerspec __user *, otmr)
 {
 	struct file *file;
 	struct timerfd_ctx *ctx;
-- 
cgit v1.2.3


From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:34 +0100
Subject: [CVE-2009-0029] System call wrappers part 32

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/eventfd.c |  5 ++---
 fs/pipe.c    |  2 +-
 fs/readdir.c |  3 ++-
 fs/select.c  | 11 ++++++-----
 fs/timerfd.c |  2 +-
 5 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08bf558d0408..5de2c2db3aa2 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd)
 	return file;
 }
 
-asmlinkage long sys_eventfd2(unsigned int count, int flags)
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
 	struct eventfd_ctx *ctx;
@@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags)
 	return fd;
 }
 
-asmlinkage long sys_eventfd(unsigned int count)
+SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
 	return sys_eventfd2(count, 0);
 }
-
diff --git a/fs/pipe.c b/fs/pipe.c
index 0c64db86c919..b89c878588a9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1043,7 +1043,7 @@ int do_pipe(int *fd)
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
  */
-asmlinkage long sys_pipe2(int __user *fildes, int flags)
+SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 {
 	int fd[2];
 	int error;
diff --git a/fs/readdir.c b/fs/readdir.c
index cf6a0e39819a..7723401f8d8b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,7 +102,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
 	struct file * file;
diff --git a/fs/select.c b/fs/select.c
index 338f703403af..0fe0e1469df3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -636,8 +636,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
  * which has a pointer to the sigset_t itself followed by a size_t containing
  * the sigset size.
  */
-asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
-	fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
+SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct timespec __user *, tsp,
+		void __user *, sig)
 {
 	size_t sigsetsize = 0;
 	sigset_t __user *up = NULL;
@@ -889,9 +890,9 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 }
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
-	struct timespec __user *tsp, const sigset_t __user *sigmask,
-	size_t sigsetsize)
+SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
+		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
 	struct timespec ts, end_time, *to = NULL;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c8c14f58b96f..6a123b8ff3f5 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -265,7 +265,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	return 0;
 }
 
-asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr)
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 {
 	struct file *file;
 	struct timerfd_ctx *ctx;
-- 
cgit v1.2.3


From 2b66421995d2e93c9d1a0111acf2581f8529c6e5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:35 +0100
Subject: [CVE-2009-0029] System call wrappers part 33

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index b89c878588a9..3a48ba5179d5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1059,7 +1059,7 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 	return error;
 }
 
-asmlinkage long sys_pipe(int __user *fildes)
+SYSCALL_DEFINE1(pipe, int __user *, fildes)
 {
 	return sys_pipe2(fildes, 0);
 }
-- 
cgit v1.2.3


From 1bcbf31337391a2f54ef6c1e8871c2de5944a7dc Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Thu, 15 Jan 2009 13:51:03 -0800
Subject: btrfs & squashfs: Move btrfs and squashfs magic numbers to
 <linux/magic.h>

Use the standard magic.h for btrfs and squashfs.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Cc: Phillip Lougher <phillip@lougher.demon.co.uk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/super.c          | 2 +-
 fs/squashfs/squashfs_fs.h | 1 -
 fs/squashfs/super.c       | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a14b495532f..7256cf242eb0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
 #include <linux/version.h>
+#include <linux/magic.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -51,7 +52,6 @@
 #include "export.h"
 #include "compression.h"
 
-#define BTRFS_SUPER_MAGIC 0x9123683E
 
 static struct super_operations btrfs_super_ops;
 
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 6840da1bf21e..283daafc568e 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -26,7 +26,6 @@
 #define SQUASHFS_CACHED_FRAGMENTS	CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
 #define SQUASHFS_MAJOR			4
 #define SQUASHFS_MINOR			0
-#define SQUASHFS_MAGIC			0x73717368
 #define SQUASHFS_START			0
 
 /* size of metadata (inode and directory) blocks */
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index a0466d7467b2..071df5b5b491 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/zlib.h>
+#include <linux/magic.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-- 
cgit v1.2.3


From 6b7021ef7e1a703c7092daeceda063951b22b4f6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 15 Jan 2009 13:51:29 -0800
Subject: ext2: also update the inode on disk when dir is IS_DIRSYNC

We used to just write the changed page for IS_DIRSYNC inodes.  But we also
have to update the directory inode itself, in case we've allocated a new
block and changed i_size.

[akpm@linux-foundation.org: still sync the data page]
Signed-off-by: Jan Kara <jack@suse.cz>
Tested-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/dir.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 9a0fc400f91c..2999d72153b7 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -95,10 +95,13 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 		mark_inode_dirty(dir);
 	}
 
-	if (IS_DIRSYNC(dir))
+	if (IS_DIRSYNC(dir)) {
 		err = write_one_page(page, 1);
-	else
+		if (!err)
+			err = ext2_sync_inode(dir);
+	} else {
 		unlock_page(page);
+	}
 
 	return err;
 }
-- 
cgit v1.2.3


From 06a279d636734da32bb62dd2f7b0ade666f65d7c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 17 Jan 2009 18:41:37 -0500
Subject: ext4: only use i_size_high for regular files

Directories are not allowed to be bigger than 2GB, so don't use
i_size_high for anything other than regular files.  E2fsck should
complain about these inodes, but the simplest thing to do for the
kernel is to only use i_size_high for regular files.

This prevents an intentionally corrupted filesystem from causing the
kernel to burn a huge amount of CPU and issuing error messages such
as:

EXT4-fs warning (device loop0): ext4_block_to_path: block 135090028 > max

Thanks to David Maciejak from Fortinet's FortiGuard Global Security
Research Team for reporting this issue.

http://bugzilla.kernel.org/show_bug.cgi?id=12375

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h  | 7 +++++--
 fs/ext4/inode.c | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c668e4377d76..aafc9eba1c25 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1206,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
 
 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
 {
-	return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
-		le32_to_cpu(raw_inode->i_size_lo);
+	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+			le32_to_cpu(raw_inode->i_size_lo);
+	else
+		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
 }
 
 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a6444cee0c7e..49484ba801c9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -360,9 +360,9 @@ static int ext4_block_to_path(struct inode *inode,
 		final = ptrs;
 	} else {
 		ext4_warning(inode->i_sb, "ext4_block_to_path",
-				"block %lu > max",
+				"block %lu > max in inode %lu",
 				i_block + direct_blocks +
-				indirect_blocks + double_blocks);
+				indirect_blocks + double_blocks, inode->i_ino);
 	}
 	if (boundary)
 		*boundary = final - 1 - (i_block & (ptrs - 1));
-- 
cgit v1.2.3


From e6b8bc09ba2075cd91fbffefcd2778b1a00bd76f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 16 Jan 2009 11:13:40 -0500
Subject: ext4: Add sanity check to make_indexed_dir

Make sure the rec_len field in the '..' entry is sane, lest we overrun
the directory block and cause a kernel oops on a purposefully
corrupted filesystem.

Thanks to Sami Liedes for reporting this bug.

http://bugzilla.kernel.org/show_bug.cgi?id=12430

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/namei.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fec0b4c2f5f1..ba702bd7910d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1368,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk(KERN_DEBUG "Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
 	retval = ext4_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext4_std_error(dir->i_sb, retval);
@@ -1377,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	}
 	root = (struct dx_root *) bh->b_data;
 
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext4_dir_entry_2 *)((char *)fde +
+		ext4_rec_len_from_disk(fde->rec_len));
+	if ((char *) de >= (((char *) root) + blocksize)) {
+		ext4_error(dir->i_sb, __func__,
+			   "invalid rec_len for '..' in inode %lu",
+			   dir->i_ino);
+		brelse(bh);
+		return -EIO;
+	}
+	len = ((char *) root) + blocksize - (char *) de;
+
+	/* Allocate new block for the 0th block's dirents */
 	bh2 = ext4_append(handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
@@ -1385,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
 	data1 = bh2->b_data;
 
-	/* The 0th block becomes the root, move the dirents out */
-	fde = &root->dotdot;
-	de = (struct ext4_dir_entry_2 *)((char *)fde +
-		ext4_rec_len_from_disk(fde->rec_len));
-	len = ((char *) root) + blocksize - (char *) de;
 	memcpy (data1, de, len);
 	de = (struct ext4_dir_entry_2 *) data1;
 	top = data1 + len;
-- 
cgit v1.2.3


From a21102b55c4f8dfd3adb4a15a34cd62237b46039 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 16 Jan 2009 11:13:47 -0500
Subject: ext3: Add sanity check to make_indexed_dir

Make sure the rec_len field in the '..' entry is sane, lest we overrun
the directory block and cause a kernel oops on a purposefully
corrupted filesystem.

This fixes a bug related to a bug originally reported by Sami Liedes
for ext4 at:

http://bugzilla.kernel.org/show_bug.cgi?id=12430

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext3/namei.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 69a3d19ca9fd..4db4ffa1edad 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1358,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk("Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
 	retval = ext3_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext3_std_error(dir->i_sb, retval);
@@ -1367,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	}
 	root = (struct dx_root *) bh->b_data;
 
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext3_dir_entry_2 *)((char *)fde +
+			ext3_rec_len_from_disk(fde->rec_len));
+	if ((char *) de >= (((char *) root) + blocksize)) {
+		ext3_error(dir->i_sb, __func__,
+			   "invalid rec_len for '..' in inode %lu",
+			   dir->i_ino);
+		brelse(bh);
+		return -EIO;
+	}
+	len = ((char *) root) + blocksize - (char *) de;
+
 	bh2 = ext3_append (handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
@@ -1375,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
 	data1 = bh2->b_data;
 
-	/* The 0th block becomes the root, move the dirents out */
-	fde = &root->dotdot;
-	de = (struct ext3_dir_entry_2 *)((char *)fde +
-			ext3_rec_len_from_disk(fde->rec_len));
-	len = ((char *) root) + blocksize - (char *) de;
 	memcpy (data1, de, len);
 	de = (struct ext3_dir_entry_2 *) data1;
 	top = data1 + len;
-- 
cgit v1.2.3


From 1d9e2ae949411c2f329f30e01ea0355cd02c4296 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:58:19 -0500
Subject: Btrfs: Clear the device->running_pending flag before bailing on
 congestion

Btrfs maintains a queue of async bio submissions so the checksumming
threads don't have to wait on get_request_wait.  In order to avoid
extra wakeups, this code has a running_pending flag that is used
to tell new submissions they don't need to wake the thread.

When the threads notice congestion on a single device, they
may decide to requeue the job and move on to other devices.  This
makes sure the running_pending flag is cleared before the
job is requeued.

It should help avoid IO stalls by making sure the task is woken up
when new submissions come in.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b187b537888e..3451e1cca2b5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -220,6 +220,7 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
+			device->running_pending = 0;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
-- 
cgit v1.2.3


From c071fcfdb60e7abbe95e02460005d6bca165bf24 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:59:08 -0500
Subject: Btrfs: fix ioctl arg size (userland incompatible change!)

The structure used to send device in btrfs ioctl calls was not
properly aligned, and so 32 bit ioctls would not work properly on
64 bit kernels.

We could fix this with compat ioctls, but we're just one byte away
and it doesn't make sense to carry around the compat ioctls forever
at this stage in the project.

This patch brings the ioctl arg up to an evenly aligned 4k.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.h | 14 ++++++++------
 fs/btrfs/super.c |  3 ++-
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 78049ea208db..b320b103fa13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,13 +22,20 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 3072
+#define BTRFS_PATH_NAME_MAX 4087
 
+/* this should be 4k */
 struct btrfs_ioctl_vol_args {
 	__s64 fd;
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+struct btrfs_ioctl_clone_range_args {
+  __s64 src_fd;
+  __u64 src_offset, src_length;
+  __u64 dest_offset;
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -52,11 +59,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
 				   struct btrfs_ioctl_vol_args)
-struct btrfs_ioctl_clone_range_args {
-  __s64 src_fd;
-  __u64 src_offset, src_length;
-  __u64 dest_offset;
-};
 
 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
 				  struct btrfs_ioctl_clone_range_args)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b4c101d9322c..92c9b543deff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -582,7 +582,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 {
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
-	int ret = 0;
+	int ret = -ENOTTY;
 	int len;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -594,6 +594,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		goto out;
 	}
 	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
+
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
 		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
-- 
cgit v1.2.3


From cc33412fb1f11613e20f9dfc2919a77ecd63fbc4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 17:23:05 +0100
Subject: quota: Improve locking

We implement dqget() and dqput() that need neither dqonoff_mutex nor dqptr_sem.
Then move dqget() and dqput() calls so that they are not called from under
dqptr_sem. This is important because filesystem callbacks aren't called from
under dqptr_sem which used to cause *lots* of problems with lock ranking
(and with OCFS2 they became close to unsolvable).

The patch also removes two functions which were introduced solely because OCFS2
needed them to cope with the old locking scheme. As time showed, they were not
enough for OCFS2 anyway and it would be unnecessary work to adapt them to the
new locking scheme in which they aren't needed.  As a result OCFS2 needs the
following patch to compile properly with quotas.  Apologies in advance to
anyone who hits this while bisecting.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/dquot.c | 218 ++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 122 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 48c0571f831d..bca3cac4bee7 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -87,14 +87,17 @@
 #define __DQUOT_PARANOIA
 
 /*
- * There are two quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats and also dqstats structure containing statistics about the
- * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures
- * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
+ * There are three quota SMP locks. dq_list_lock protects all lists with quotas
+ * and quota formats, dqstats structure containing statistics about the lists
+ * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
+ * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
  * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
- * in inode_add_bytes() and inode_sub_bytes().
+ * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
+ * modifications of quota state (on quotaon and quotaoff) and readers who care
+ * about latest values take it as well.
  *
- * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock
+ * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
+ *   dq_list_lock > dq_state_lock
  *
  * Note that some things (eg. sb pointer, type, id) doesn't change during
  * the life of the dquot structure and so needn't to be protected by a lock
@@ -103,12 +106,7 @@
  * operation is just reading pointers from inode (or not using them at all) the
  * read lock is enough. If pointers are altered function must hold write lock
  * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_mutex is also needed).  If operation is holding
- * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
- * dqonoff_mutex.
- * This locking assures that:
- *   a) update/access to dquot pointers in inode is serialized
- *   b) everyone is guarded against invalidate_dquots()
+ * for altering the flag i_mutex is also needed).
  *
  * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
  * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -122,10 +120,17 @@
  * Lock ordering (including related VFS locks) is the following:
  *   i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
  *   dqio_mutex
+ * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
+ * dqptr_sem. But filesystem has to count with the fact that functions such as
+ * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
+ * from inside a transaction to keep filesystem consistency after a crash. Also
+ * filesystems usually want to do some IO on dquot from ->mark_dirty which is
+ * called with dqptr_sem held.
  * i_mutex on quota files is special (it's below dqio_mutex)
  */
 
 static DEFINE_SPINLOCK(dq_list_lock);
+static DEFINE_SPINLOCK(dq_state_lock);
 DEFINE_SPINLOCK(dq_data_lock);
 
 static char *quotatypes[] = INITQFNAMES;
@@ -428,7 +433,7 @@ static inline void do_destroy_dquot(struct dquot *dquot)
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota users. There can still be some users of quotas due to inodes being
  * just deleted or pruned by prune_icache() (those are not attached to any
- * list). We have to wait for such users.
+ * list) or parallel quotactl call. We have to wait for such users.
  */
 static void invalidate_dquots(struct super_block *sb, int type)
 {
@@ -600,7 +605,6 @@ static struct shrinker dqcache_shrinker = {
 /*
  * Put reference to dquot
  * NOTE: If you change this function please check whether dqput_blocks() works right...
- * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
 void dqput(struct dquot *dquot)
 {
@@ -696,37 +700,31 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 	return dquot;
 }
 
-/*
- * Check whether dquot is in memory.
- * MUST be called with either dqptr_sem or dqonoff_mutex held
- */
-int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
-{
-	unsigned int hashent = hashfn(sb, id, type);
-	int ret = 0;
-
-        if (!sb_has_quota_active(sb, type))
-		return 0;
-	spin_lock(&dq_list_lock);
-	if (find_dquot(hashent, sb, id, type) != NODQUOT)
-		ret = 1;
-	spin_unlock(&dq_list_lock);
-	return ret;
-}
-
 /*
  * Get reference to dquot
- * MUST be called with either dqptr_sem or dqonoff_mutex held
+ *
+ * Locking is slightly tricky here. We are guarded from parallel quotaoff()
+ * destroying our dquot by:
+ *   a) checking for quota flags under dq_list_lock and
+ *   b) getting a reference to dquot before we release dq_list_lock
  */
 struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 	unsigned int hashent = hashfn(sb, id, type);
-	struct dquot *dquot, *empty = NODQUOT;
+	struct dquot *dquot = NODQUOT, *empty = NODQUOT;
 
         if (!sb_has_quota_active(sb, type))
 		return NODQUOT;
 we_slept:
 	spin_lock(&dq_list_lock);
+	spin_lock(&dq_state_lock);
+	if (!sb_has_quota_active(sb, type)) {
+		spin_unlock(&dq_state_lock);
+		spin_unlock(&dq_list_lock);
+		goto out;
+	}
+	spin_unlock(&dq_state_lock);
+
 	if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
 		if (empty == NODQUOT) {
 			spin_unlock(&dq_list_lock);
@@ -735,6 +733,7 @@ we_slept:
 			goto we_slept;
 		}
 		dquot = empty;
+		empty = NODQUOT;
 		dquot->dq_id = id;
 		/* all dquots go on the inuse_list */
 		put_inuse(dquot);
@@ -749,8 +748,6 @@ we_slept:
 		dqstats.cache_hits++;
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
-		if (empty)
-			do_destroy_dquot(empty);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is already
 	 * finished or it will be canceled due to dq_count > 1 test */
@@ -758,11 +755,15 @@ we_slept:
 	/* Read the dquot and instantiate it (everything done only if needed) */
 	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
 		dqput(dquot);
-		return NODQUOT;
+		dquot = NODQUOT;
+		goto out;
 	}
 #ifdef __DQUOT_PARANOIA
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
+out:
+	if (empty)
+		do_destroy_dquot(empty);
 
 	return dquot;
 }
@@ -1198,63 +1199,76 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
 }
 /*
  *	Initialize quota pointers in inode
- *	Transaction must be started at entry
+ *	We do things in a bit complicated way but by that we avoid calling
+ *	dqget() and thus filesystem callbacks under dqptr_sem.
  */
 int dquot_initialize(struct inode *inode, int type)
 {
 	unsigned int id = 0;
 	int cnt, ret = 0;
+	struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
+	struct super_block *sb = inode->i_sb;
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return 0;
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+	/* First get references to structures we might need. */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		switch (cnt) {
+		case USRQUOTA:
+			id = inode->i_uid;
+			break;
+		case GRPQUOTA:
+			id = inode->i_gid;
+			break;
+		}
+		got[cnt] = dqget(sb, id, cnt);
+	}
+
+	down_write(&sb_dqopt(sb)->dqptr_sem);
 	/* Having dqptr_sem we know NOQUOTA flags can't be altered... */
 	if (IS_NOQUOTA(inode))
 		goto out_err;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
+		/* Avoid races with quotaoff() */
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
 		if (inode->i_dquot[cnt] == NODQUOT) {
-			switch (cnt) {
-				case USRQUOTA:
-					id = inode->i_uid;
-					break;
-				case GRPQUOTA:
-					id = inode->i_gid;
-					break;
-			}
-			inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt);
+			inode->i_dquot[cnt] = got[cnt];
+			got[cnt] = NODQUOT;
 		}
 	}
 out_err:
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+	/* Drop unused references */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		dqput(got[cnt]);
 	return ret;
 }
 
 /*
  * 	Release all quotas referenced by inode
- *	Transaction must be started at an entry
  */
-int dquot_drop_locked(struct inode *inode)
+int dquot_drop(struct inode *inode)
 {
 	int cnt;
+	struct dquot *put[MAXQUOTAS];
 
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] != NODQUOT) {
-			dqput(inode->i_dquot[cnt]);
-			inode->i_dquot[cnt] = NODQUOT;
-		}
+		put[cnt] = inode->i_dquot[cnt];
+		inode->i_dquot[cnt] = NODQUOT;
 	}
-	return 0;
-}
-
-int dquot_drop(struct inode *inode)
-{
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	dquot_drop_locked(inode);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		dqput(put[cnt]);
 	return 0;
 }
 
@@ -1470,8 +1484,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	qsize_t space;
 	struct dquot *transfer_from[MAXQUOTAS];
 	struct dquot *transfer_to[MAXQUOTAS];
-	int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid,
-	    chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
+	int cnt, ret = QUOTA_OK;
+	int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
+	    chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
 	char warntype_to[MAXQUOTAS];
 	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
@@ -1479,21 +1494,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
-	/* Clear the arrays */
+	/* Initialize the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
+		transfer_from[cnt] = NODQUOT;
+		transfer_to[cnt] = NODQUOT;
 		warntype_to[cnt] = QUOTA_NL_NOWARN;
-	}
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	/* Now recheck reliably when holding dqptr_sem */
-	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
-		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		return QUOTA_OK;
-	}
-	/* First build the transfer_to list - here we can block on
-	 * reading/instantiating of dquots.  We know that the transaction for
-	 * us was already started so we don't violate lock ranking here */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		switch (cnt) {
 			case USRQUOTA:
 				if (!chuid)
@@ -1507,6 +1512,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 				break;
 		}
 	}
+
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
+	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
+		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		goto put_all;
+	}
 	spin_lock(&dq_data_lock);
 	space = inode_get_bytes(inode);
 	/* Build the transfer_from list and check the limits */
@@ -1517,7 +1529,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
 		    NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
 		    warntype_to + cnt) == NO_QUOTA)
-			goto warn_put_all;
+			goto over_quota;
 	}
 
 	/*
@@ -1545,28 +1557,37 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 
 		inode->i_dquot[cnt] = transfer_to[cnt];
 	}
-	ret = QUOTA_OK;
-warn_put_all:
 	spin_unlock(&dq_data_lock);
+	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
 	/* Dirtify all the dquots - this can block when journalling */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (transfer_from[cnt])
 			mark_dquot_dirty(transfer_from[cnt]);
-		if (transfer_to[cnt])
+		if (transfer_to[cnt]) {
 			mark_dquot_dirty(transfer_to[cnt]);
+			/* The reference we got is transferred to the inode */
+			transfer_to[cnt] = NODQUOT;
+		}
 	}
+warn_put_all:
 	flush_warnings(transfer_to, warntype_to);
 	flush_warnings(transfer_from, warntype_from_inodes);
 	flush_warnings(transfer_from, warntype_from_space);
-	
+put_all:
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT)
-			dqput(transfer_from[cnt]);
-		if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
-			dqput(transfer_to[cnt]);
+		dqput(transfer_from[cnt]);
+		dqput(transfer_to[cnt]);
 	}
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return ret;
+over_quota:
+	spin_unlock(&dq_data_lock);
+	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Clear dquot pointers we don't want to dqput() */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		transfer_from[cnt] = NODQUOT;
+	ret = NO_QUOTA;
+	goto warn_put_all;
 }
 
 /* Wrapper for transferring ownership of an inode */
@@ -1651,19 +1672,24 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 			continue;
 
 		if (flags & DQUOT_SUSPENDED) {
+			spin_lock(&dq_state_lock);
 			dqopt->flags |=
 				dquot_state_flag(DQUOT_SUSPENDED, cnt);
+			spin_unlock(&dq_state_lock);
 		} else {
+			spin_lock(&dq_state_lock);
 			dqopt->flags &= ~dquot_state_flag(flags, cnt);
 			/* Turning off suspended quotas? */
 			if (!sb_has_quota_loaded(sb, cnt) &&
 			    sb_has_quota_suspended(sb, cnt)) {
 				dqopt->flags &=	~dquot_state_flag(
 							DQUOT_SUSPENDED, cnt);
+				spin_unlock(&dq_state_lock);
 				iput(dqopt->files[cnt]);
 				dqopt->files[cnt] = NULL;
 				continue;
 			}
+			spin_unlock(&dq_state_lock);
 		}
 
 		/* We still have to keep quota loaded? */
@@ -1830,7 +1856,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&inode->i_mutex);
+	spin_lock(&dq_state_lock);
 	dqopt->flags |= dquot_state_flag(flags, type);
+	spin_unlock(&dq_state_lock);
 
 	add_dquot_ref(sb, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1872,9 +1900,11 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
 	}
 	inode = dqopt->files[type];
 	dqopt->files[type] = NULL;
+	spin_lock(&dq_state_lock);
 	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
 						DQUOT_LIMITS_ENABLED, type);
 	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
+	spin_unlock(&dq_state_lock);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
 	flags = dquot_generic_flag(flags, type);
@@ -1952,7 +1982,9 @@ int vfs_quota_enable(struct inode *inode, int type, int format_id,
 			ret = -EBUSY;
 			goto out_lock;
 		}
+		spin_lock(&dq_state_lock);
 		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+		spin_unlock(&dq_state_lock);
 out_lock:
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return ret;
@@ -2039,14 +2071,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!(dquot = dqget(sb, id, type))) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	dquot = dqget(sb, id, type);
+	if (dquot == NODQUOT)
 		return -ESRCH;
-	}
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
-	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+
 	return 0;
 }
 
@@ -2130,7 +2160,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	struct dquot *dquot;
 	int rc;
 
-	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	dquot = dqget(sb, id, type);
 	if (!dquot) {
 		rc = -ESRCH;
@@ -2139,7 +2168,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	rc = do_set_dqblk(dquot, di);
 	dqput(dquot);
 out:
-	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return rc;
 }
 
@@ -2370,11 +2398,9 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
-EXPORT_SYMBOL(dquot_drop_locked);
 EXPORT_SYMBOL(vfs_dq_drop);
 EXPORT_SYMBOL(dqget);
 EXPORT_SYMBOL(dqput);
-EXPORT_SYMBOL(dquot_is_cached);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
-- 
cgit v1.2.3


From dedb0d48a9d4d57086526b94a4b64da789a646e4 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 9 Jan 2009 21:02:37 +0200
Subject: UBIFS: do not commit twice

VFS calls '->sync_fs()' twice - first time with @wait = 0, second
time with @wait = 1. As a result, we may commit and synchronize
write-buffers twice. Avoid doing this by returning immediately if
@wait = 0.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 89556ee72518..a7fc97f4d9de 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -432,18 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
 	struct writeback_control wbc = {
-		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sync_mode   = WB_SYNC_ALL,
 		.range_start = 0,
 		.range_end   = LLONG_MAX,
 		.nr_to_write = LONG_MAX,
 	};
 
 	/*
-	 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
-	 * advisory thing to help the file system shove lots of data into the
-	 * queues. If some gets missed then it'll be picked up on the second
+	 * Zero @wait is just an advisory thing to help the file system shove
+	 * lots of data into the queues, and there will be the second
 	 * '->sync_fs()' call, with non-zero @wait.
 	 */
+	if (!wait)
+		return 0;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
-- 
cgit v1.2.3


From e8b815663b1bfd9c255af5176604ec0eafdf6ed7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 15 Jan 2009 17:43:23 +0200
Subject: UBIFS: constify operations

Mark super, file, and inode operation structures with 'const'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/dir.c   |  4 ++--
 fs/ubifs/file.c  |  8 ++++----
 fs/ubifs/super.c |  2 +-
 fs/ubifs/ubifs.h | 14 +++++++-------
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c38..d29b771cce45 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1199,7 +1199,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-struct inode_operations ubifs_dir_inode_operations = {
+const struct inode_operations ubifs_dir_inode_operations = {
 	.lookup      = ubifs_lookup,
 	.create      = ubifs_create,
 	.link        = ubifs_link,
@@ -1219,7 +1219,7 @@ struct inode_operations ubifs_dir_inode_operations = {
 #endif
 };
 
-struct file_operations ubifs_dir_operations = {
+const struct file_operations ubifs_dir_operations = {
 	.llseek         = ubifs_dir_llseek,
 	.release        = ubifs_dir_release,
 	.read           = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf37374567fa..17443d97e6f1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1541,7 +1541,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-struct address_space_operations ubifs_file_address_operations = {
+const struct address_space_operations ubifs_file_address_operations = {
 	.readpage       = ubifs_readpage,
 	.writepage      = ubifs_writepage,
 	.write_begin    = ubifs_write_begin,
@@ -1551,7 +1551,7 @@ struct address_space_operations ubifs_file_address_operations = {
 	.releasepage    = ubifs_releasepage,
 };
 
-struct inode_operations ubifs_file_inode_operations = {
+const struct inode_operations ubifs_file_inode_operations = {
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 #ifdef CONFIG_UBIFS_FS_XATTR
@@ -1562,14 +1562,14 @@ struct inode_operations ubifs_file_inode_operations = {
 #endif
 };
 
-struct inode_operations ubifs_symlink_inode_operations = {
+const struct inode_operations ubifs_symlink_inode_operations = {
 	.readlink    = generic_readlink,
 	.follow_link = ubifs_follow_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 };
 
-struct file_operations ubifs_file_operations = {
+const struct file_operations ubifs_file_operations = {
 	.llseek         = generic_file_llseek,
 	.read           = do_sync_read,
 	.write          = do_sync_write,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a7fc97f4d9de..53811e567a69 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1778,7 +1778,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-struct super_operations ubifs_super_operations = {
+const struct super_operations ubifs_super_operations = {
 	.alloc_inode   = ubifs_alloc_inode,
 	.destroy_inode = ubifs_destroy_inode,
 	.put_super     = ubifs_put_super,
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2a4cc66d03..0881897a4208 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1405,13 +1405,13 @@ extern struct list_head ubifs_infos;
 extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
-extern struct super_operations ubifs_super_operations;
-extern struct address_space_operations ubifs_file_address_operations;
-extern struct file_operations ubifs_file_operations;
-extern struct inode_operations ubifs_file_inode_operations;
-extern struct file_operations ubifs_dir_operations;
-extern struct inode_operations ubifs_dir_inode_operations;
-extern struct inode_operations ubifs_symlink_inode_operations;
+extern const struct super_operations ubifs_super_operations;
+extern const struct address_space_operations ubifs_file_address_operations;
+extern const struct file_operations ubifs_file_operations;
+extern const struct inode_operations ubifs_file_inode_operations;
+extern const struct file_operations ubifs_dir_operations;
+extern const struct inode_operations ubifs_dir_inode_operations;
+extern const struct inode_operations ubifs_symlink_inode_operations;
 extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
-- 
cgit v1.2.3


From ab596ad8972f314ace538799734c7e1bdd1da2ff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:02:57 +0100
Subject: xfs: fix dentry aliasing issues in open_by_handle

Open by handle just grabs an inode by handle and then creates itself
a dentry for it.  While this works for regular files it is horribly
broken for directories, where the VFS locking relies on the fact that
there is only just one single dentry for a given inode, and that
these are always connected to the root of the filesystem so that
its locking algorithms work (see Documentation/filesystems/Locking)

Remove all the existing open by handle code and replace it with a small
wrapper around the exportfs code which deals with all these issues.
At the same time we also make the checks for a valid handle strict
enough to reject all not perfectly well formed handles - given that
we never hand out others that's okay and simplifies the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/Kconfig                 |   1 +
 fs/xfs/linux-2.6/xfs_ioctl.c   | 305 ++++++++++++++++++-----------------------
 fs/xfs/linux-2.6/xfs_ioctl.h   |  15 +-
 fs/xfs/linux-2.6/xfs_ioctl32.c | 175 +++++++----------------
 4 files changed, 196 insertions(+), 300 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 3f53dd101f99..29228f5899cd 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
 config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
+	select EXPORTFS
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index e5be1e0be802..4bd112313f33 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -50,12 +50,14 @@
 #include "xfs_vnodeops.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
+#include "xfs_export.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/exportfs.h>
 
 /*
  * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -164,97 +166,69 @@ xfs_find_handle(
 	return 0;
 }
 
-
 /*
- * Convert userspace handle data into inode.
- *
- * We use the fact that all the fsop_handlereq ioctl calls have a data
- * structure argument whose first component is always a xfs_fsop_handlereq_t,
- * so we can pass that sub structure into this handy, shared routine.
- *
- * If no error, caller must always iput the returned inode.
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
  */
 STATIC int
-xfs_vget_fsop_handlereq(
-	xfs_mount_t		*mp,
-	struct inode		*parinode,	/* parent inode pointer    */
-	xfs_fsop_handlereq_t	*hreq,
-	struct inode		**inode)
+xfs_handle_acceptable(
+	void			*context,
+	struct dentry		*dentry)
+{
+	return 1;
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+	struct file		*parfilp,
+	void __user		*uhandle,
+	u32			hlen)
 {
-	void			__user *hanp;
-	size_t			hlen;
-	xfs_fid_t		*xfid;
-	xfs_handle_t		*handlep;
 	xfs_handle_t		handle;
-	xfs_inode_t		*ip;
-	xfs_ino_t		ino;
-	__u32			igen;
-	int			error;
+	struct xfs_fid64	fid;
 
 	/*
 	 * Only allow handle opens under a directory.
 	 */
-	if (!S_ISDIR(parinode->i_mode))
-		return XFS_ERROR(ENOTDIR);
-
-	hanp = hreq->ihandle;
-	hlen = hreq->ihandlen;
-	handlep = &handle;
-
-	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
-		return XFS_ERROR(EINVAL);
-	if (copy_from_user(handlep, hanp, hlen))
-		return XFS_ERROR(EFAULT);
-	if (hlen < sizeof(*handlep))
-		memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
-	if (hlen > sizeof(handlep->ha_fsid)) {
-		if (handlep->ha_fid.fid_len !=
-		    (hlen - sizeof(handlep->ha_fsid) -
-		            sizeof(handlep->ha_fid.fid_len)) ||
-		    handlep->ha_fid.fid_pad)
-			return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Crack the handle, obtain the inode # & generation #
-	 */
-	xfid = (struct xfs_fid *)&handlep->ha_fid;
-	if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
-		ino  = xfid->fid_ino;
-		igen = xfid->fid_gen;
-	} else {
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Get the XFS inode, building a Linux inode to go with it.
-	 */
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
-		return error;
-	if (ip == NULL)
-		return XFS_ERROR(EIO);
-	if (ip->i_d.di_gen != igen) {
-		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-		return XFS_ERROR(ENOENT);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+		return ERR_PTR(-ENOTDIR);
+
+	if (hlen != sizeof(xfs_handle_t))
+		return ERR_PTR(-EINVAL);
+	if (copy_from_user(&handle, uhandle, hlen))
+		return ERR_PTR(-EFAULT);
+	if (handle.ha_fid.fid_len !=
+	    sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+		return ERR_PTR(-EINVAL);
+
+	memset(&fid, 0, sizeof(struct fid));
+	fid.ino = handle.ha_fid.fid_ino;
+	fid.gen = handle.ha_fid.fid_gen;
+
+	return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+			FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+			xfs_handle_acceptable, NULL);
+}
 
-	*inode = VFS_I(ip);
-	return 0;
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
 }
 
 int
 xfs_open_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
 	struct file		*parfilp,
-	struct inode		*parinode)
+	xfs_fsop_handlereq_t	*hreq)
 {
 	const struct cred	*cred = current_cred();
 	int			error;
-	int			new_fd;
+	int			fd;
 	int			permflag;
 	struct file		*filp;
 	struct inode		*inode;
@@ -263,19 +237,21 @@ xfs_open_by_handle(
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
-	if (error)
-		return -error;
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	inode = dentry->d_inode;
 
 	/* Restrict xfs_open_by_handle to directories & regular files. */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-		iput(inode);
-		return -XFS_ERROR(EINVAL);
+		error = -XFS_ERROR(EPERM);
+		goto out_dput;
 	}
 
 #if BITS_PER_LONG != 32
 	hreq->oflags |= O_LARGEFILE;
 #endif
+
 	/* Put open permission in namei format. */
 	permflag = hreq->oflags;
 	if ((permflag+1) & O_ACCMODE)
@@ -285,50 +261,45 @@ xfs_open_by_handle(
 
 	if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
 	    (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
-		iput(inode);
-		return -XFS_ERROR(EPERM);
+		error = -XFS_ERROR(EPERM);
+		goto out_dput;
 	}
 
 	if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-		iput(inode);
-		return -XFS_ERROR(EACCES);
+		error = -XFS_ERROR(EACCES);
+		goto out_dput;
 	}
 
 	/* Can't write directories. */
-	if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
-		iput(inode);
-		return -XFS_ERROR(EISDIR);
+	if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+		error = -XFS_ERROR(EISDIR);
+		goto out_dput;
 	}
 
-	if ((new_fd = get_unused_fd()) < 0) {
-		iput(inode);
-		return new_fd;
+	fd = get_unused_fd();
+	if (fd < 0) {
+		error = fd;
+		goto out_dput;
 	}
 
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		put_unused_fd(new_fd);
-		return PTR_ERR(dentry);
-	}
-
-	/* Ensure umount returns EBUSY on umounts while this file is open. */
-	mntget(parfilp->f_path.mnt);
-
-	/* Create file pointer. */
-	filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
+	filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
+			   hreq->oflags, cred);
 	if (IS_ERR(filp)) {
-		put_unused_fd(new_fd);
-		return -XFS_ERROR(-PTR_ERR(filp));
+		put_unused_fd(fd);
+		return PTR_ERR(filp);
 	}
 
 	if (inode->i_mode & S_IFREG) {
-		/* invisible operation should not change atime */
 		filp->f_flags |= O_NOATIME;
 		filp->f_mode |= FMODE_NOCMTIME;
 	}
 
-	fd_install(new_fd, filp);
-	return new_fd;
+	fd_install(fd, filp);
+	return fd;
+
+ out_dput:
+	dput(dentry);
+	return error;
 }
 
 /*
@@ -359,11 +330,10 @@ do_readlink(
 
 int
 xfs_readlink_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
 {
-	struct inode		*inode;
+	struct dentry		*dentry;
 	__u32			olen;
 	void			*link;
 	int			error;
@@ -371,26 +341,28 @@ xfs_readlink_by_handle(
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
-	if (error)
-		return -error;
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	/* Restrict this handle operation to symlinks only. */
-	if (!S_ISLNK(inode->i_mode)) {
+	if (!S_ISLNK(dentry->d_inode->i_mode)) {
 		error = -XFS_ERROR(EINVAL);
-		goto out_iput;
+		goto out_dput;
 	}
 
 	if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
 		error = -XFS_ERROR(EFAULT);
-		goto out_iput;
+		goto out_dput;
 	}
 
 	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-	if (!link)
-		goto out_iput;
+	if (!link) {
+		error = -XFS_ERROR(ENOMEM);
+		goto out_dput;
+	}
 
-	error = -xfs_readlink(XFS_I(inode), link);
+	error = -xfs_readlink(XFS_I(dentry->d_inode), link);
 	if (error)
 		goto out_kfree;
 	error = do_readlink(hreq->ohandle, olen, link);
@@ -399,32 +371,31 @@ xfs_readlink_by_handle(
 
  out_kfree:
 	kfree(link);
- out_iput:
-	iput(inode);
+ out_dput:
+	dput(dentry);
 	return error;
 }
 
 STATIC int
 xfs_fssetdm_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
 	int			error;
 	struct fsdmidata	fsd;
 	xfs_fsop_setdm_handlereq_t dmhreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
 		return -XFS_ERROR(EPERM);
 	if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode);
-	if (error)
-		return -error;
+	dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
 		error = -XFS_ERROR(EPERM);
 		goto out;
 	}
@@ -434,24 +405,23 @@ xfs_fssetdm_by_handle(
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
 				 fsd.fsd_dmstate);
 
  out:
-	iput(inode);
+	dput(dentry);
 	return error;
 }
 
 STATIC int
 xfs_attrlist_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
-	int			error;
+	int			error = -ENOMEM;
 	attrlist_cursor_kern_t	*cursor;
 	xfs_fsop_attrlist_handlereq_t al_hreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -467,16 +437,16 @@ xfs_attrlist_by_handle(
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
 		return -XFS_ERROR(EINVAL);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode);
-	if (error)
-		goto out;
+	dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
 	if (!kbuf)
-		goto out_vn_rele;
+		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
 					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -486,10 +456,9 @@ xfs_attrlist_by_handle(
 
  out_kfree:
 	kfree(kbuf);
- out_vn_rele:
-	iput(inode);
- out:
-	return -error;
+ out_dput:
+	dput(dentry);
+	return error;
 }
 
 int
@@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove(
 
 STATIC int
 xfs_attrmulti_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
 	struct file		*parfilp,
-	struct inode		*parinode)
+	void			__user *arg)
 {
 	int			error;
 	xfs_attr_multiop_t	*ops;
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 	unsigned int		i, size;
 	char			*attr_name;
 
@@ -581,19 +548,19 @@ xfs_attrmulti_by_handle(
 	if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode);
-	if (error)
-		goto out;
+	dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	error = E2BIG;
 	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = ENOMEM;
 	ops = kmalloc(size, GFP_KERNEL);
 	if (!ops)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = EFAULT;
 	if (copy_from_user(ops, am_hreq.ops, size))
@@ -615,25 +582,28 @@ xfs_attrmulti_by_handle(
 
 		switch (ops[i].am_opcode) {
 		case ATTR_OP_GET:
-			ops[i].am_error = xfs_attrmulti_attr_get(inode,
-					attr_name, ops[i].am_attrvalue,
-					&ops[i].am_length, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_get(
+					dentry->d_inode, attr_name,
+					ops[i].am_attrvalue, &ops[i].am_length,
+					ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
 			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
 			if (ops[i].am_error)
 				break;
-			ops[i].am_error = xfs_attrmulti_attr_set(inode,
-					attr_name, ops[i].am_attrvalue,
-					ops[i].am_length, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_set(
+					dentry->d_inode, attr_name,
+					ops[i].am_attrvalue, ops[i].am_length,
+					ops[i].am_flags);
 			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		case ATTR_OP_REMOVE:
 			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
 			if (ops[i].am_error)
 				break;
-			ops[i].am_error = xfs_attrmulti_attr_remove(inode,
-					attr_name, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_remove(
+					dentry->d_inode, attr_name,
+					ops[i].am_flags);
 			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		default:
@@ -647,9 +617,8 @@ xfs_attrmulti_by_handle(
 	kfree(attr_name);
  out_kfree_ops:
 	kfree(ops);
- out_vn_rele:
-	iput(inode);
- out:
+ out_dput:
+	dput(dentry);
 	return -error;
 }
 
@@ -1440,23 +1409,23 @@ xfs_file_ioctl(
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
 			return -XFS_ERROR(EFAULT);
-		return xfs_open_by_handle(mp, &hreq, filp, inode);
+		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_FSSETDM_BY_HANDLE:
-		return xfs_fssetdm_by_handle(mp, arg, inode);
+		return xfs_fssetdm_by_handle(filp, arg);
 
 	case XFS_IOC_READLINK_BY_HANDLE: {
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
 			return -XFS_ERROR(EFAULT);
-		return xfs_readlink_by_handle(mp, &hreq, inode);
+		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE:
-		return xfs_attrlist_by_handle(mp, arg, inode);
+		return xfs_attrlist_by_handle(filp, arg);
 
 	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-		return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+		return xfs_attrmulti_by_handle(filp, arg);
 
 	case XFS_IOC_SWAPEXT: {
 		struct xfs_swapext	sxp;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 8c16bf2d7e03..7bd7c6afc1eb 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -34,16 +34,13 @@ xfs_find_handle(
 
 extern int
 xfs_open_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
 	struct file		*parfilp,
-	struct inode		*parinode);
+	xfs_fsop_handlereq_t	*hreq);
 
 extern int
 xfs_readlink_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
-	struct inode		*parinode);
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq);
 
 extern int
 xfs_attrmulti_attr_get(
@@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove(
 	char			*name,
 	__uint32_t		flags);
 
+extern struct dentry *
+xfs_handle_to_dentry(
+	struct file		*parfilp,
+	void __user		*uhandle,
+	u32			hlen);
+
 extern long
 xfs_file_ioctl(
 	struct file		*filp,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 50903ad3182e..fd4362063f25 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -340,96 +340,24 @@ xfs_compat_handlereq_copyin(
 	return 0;
 }
 
-/*
- * Convert userspace handle data into inode.
- *
- * We use the fact that all the fsop_handlereq ioctl calls have a data
- * structure argument whose first component is always a xfs_fsop_handlereq_t,
- * so we can pass that sub structure into this handy, shared routine.
- *
- * If no error, caller must always iput the returned inode.
- */
-STATIC int
-xfs_vget_fsop_handlereq_compat(
-	xfs_mount_t		*mp,
-	struct inode		*parinode,	/* parent inode pointer    */
-	compat_xfs_fsop_handlereq_t	*hreq,
-	struct inode		**inode)
+STATIC struct dentry *
+xfs_compat_handlereq_to_dentry(
+	struct file		*parfilp,
+	compat_xfs_fsop_handlereq_t *hreq)
 {
-	void			__user *hanp;
-	size_t			hlen;
-	xfs_fid_t		*xfid;
-	xfs_handle_t		*handlep;
-	xfs_handle_t		handle;
-	xfs_inode_t		*ip;
-	xfs_ino_t		ino;
-	__u32			igen;
-	int			error;
-
-	/*
-	 * Only allow handle opens under a directory.
-	 */
-	if (!S_ISDIR(parinode->i_mode))
-		return XFS_ERROR(ENOTDIR);
-
-	hanp = compat_ptr(hreq->ihandle);
-	hlen = hreq->ihandlen;
-	handlep = &handle;
-
-	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
-		return XFS_ERROR(EINVAL);
-	if (copy_from_user(handlep, hanp, hlen))
-		return XFS_ERROR(EFAULT);
-	if (hlen < sizeof(*handlep))
-		memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
-	if (hlen > sizeof(handlep->ha_fsid)) {
-		if (handlep->ha_fid.fid_len !=
-		    (hlen - sizeof(handlep->ha_fsid) -
-			    sizeof(handlep->ha_fid.fid_len)) ||
-		    handlep->ha_fid.fid_pad)
-			return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Crack the handle, obtain the inode # & generation #
-	 */
-	xfid = (struct xfs_fid *)&handlep->ha_fid;
-	if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
-		ino  = xfid->fid_ino;
-		igen = xfid->fid_gen;
-	} else {
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Get the XFS inode, building a Linux inode to go with it.
-	 */
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
-		return error;
-	if (ip == NULL)
-		return XFS_ERROR(EIO);
-	if (ip->i_d.di_gen != igen) {
-		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-		return XFS_ERROR(ENOENT);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	*inode = VFS_I(ip);
-	return 0;
+	return xfs_handle_to_dentry(parfilp,
+			compat_ptr(hreq->ihandle), hreq->ihandlen);
 }
 
 STATIC int
 xfs_compat_attrlist_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
 	int			error;
 	attrlist_cursor_kern_t	*cursor;
 	compat_xfs_fsop_attrlist_handlereq_t al_hreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -446,17 +374,17 @@ xfs_compat_attrlist_by_handle(
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
 		return -XFS_ERROR(EINVAL);
 
-	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
-					       &inode);
-	if (error)
-		goto out;
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
+	error = -ENOMEM;
 	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
 	if (!kbuf)
-		goto out_vn_rele;
+		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
 					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -466,22 +394,20 @@ xfs_compat_attrlist_by_handle(
 
  out_kfree:
 	kfree(kbuf);
- out_vn_rele:
-	iput(inode);
- out:
-	return -error;
+ out_dput:
+	dput(dentry);
+	return error;
 }
 
 STATIC int
 xfs_compat_attrmulti_by_handle(
-	xfs_mount_t				*mp,
-	void					__user *arg,
-	struct inode				*parinode)
+	struct file				*parfilp,
+	void					__user *arg)
 {
 	int					error;
 	compat_xfs_attr_multiop_t		*ops;
 	compat_xfs_fsop_attrmulti_handlereq_t	am_hreq;
-	struct inode				*inode;
+	struct dentry				*dentry;
 	unsigned int				i, size;
 	char					*attr_name;
 
@@ -491,20 +417,19 @@ xfs_compat_attrmulti_by_handle(
 			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
-					       &inode);
-	if (error)
-		goto out;
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	error = E2BIG;
 	size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = ENOMEM;
 	ops = kmalloc(size, GFP_KERNEL);
 	if (!ops)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = EFAULT;
 	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
@@ -527,20 +452,21 @@ xfs_compat_attrmulti_by_handle(
 
 		switch (ops[i].am_opcode) {
 		case ATTR_OP_GET:
-			ops[i].am_error = xfs_attrmulti_attr_get(inode,
-					attr_name,
+			ops[i].am_error = xfs_attrmulti_attr_get(
+					dentry->d_inode, attr_name,
 					compat_ptr(ops[i].am_attrvalue),
 					&ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
-			ops[i].am_error = xfs_attrmulti_attr_set(inode,
-					attr_name,
+			ops[i].am_error = xfs_attrmulti_attr_set(
+					dentry->d_inode, attr_name,
 					compat_ptr(ops[i].am_attrvalue),
 					ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_REMOVE:
-			ops[i].am_error = xfs_attrmulti_attr_remove(inode,
-					attr_name, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_remove(
+					dentry->d_inode, attr_name,
+					ops[i].am_flags);
 			break;
 		default:
 			ops[i].am_error = EINVAL;
@@ -553,22 +479,20 @@ xfs_compat_attrmulti_by_handle(
 	kfree(attr_name);
  out_kfree_ops:
 	kfree(ops);
- out_vn_rele:
-	iput(inode);
- out:
+ out_dput:
+	dput(dentry);
 	return -error;
 }
 
 STATIC int
 xfs_compat_fssetdm_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
 	int			error;
 	struct fsdmidata	fsd;
 	compat_xfs_fsop_setdm_handlereq_t dmhreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
 		return -XFS_ERROR(EPERM);
@@ -576,12 +500,11 @@ xfs_compat_fssetdm_by_handle(
 			   sizeof(compat_xfs_fsop_setdm_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
-					       &inode);
-	if (error)
-		return -error;
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
 		error = -XFS_ERROR(EPERM);
 		goto out;
 	}
@@ -591,11 +514,11 @@ xfs_compat_fssetdm_by_handle(
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
 				 fsd.fsd_dmstate);
 
 out:
-	iput(inode);
+	dput(dentry);
 	return error;
 }
 
@@ -722,21 +645,21 @@ xfs_file_compat_ioctl(
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
 			return -XFS_ERROR(EFAULT);
-		return xfs_open_by_handle(mp, &hreq, filp, inode);
+		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_READLINK_BY_HANDLE_32: {
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
 			return -XFS_ERROR(EFAULT);
-		return xfs_readlink_by_handle(mp, &hreq, inode);
+		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE_32:
-		return xfs_compat_attrlist_by_handle(mp, arg, inode);
+		return xfs_compat_attrlist_by_handle(filp, arg);
 	case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
-		return xfs_compat_attrmulti_by_handle(mp, arg, inode);
+		return xfs_compat_attrmulti_by_handle(filp, arg);
 	case XFS_IOC_FSSETDM_BY_HANDLE_32:
-		return xfs_compat_fssetdm_by_handle(mp, arg, inode);
+		return xfs_compat_fssetdm_by_handle(filp, arg);
 	default:
 		return -XFS_ERROR(ENOIOCTLCMD);
 	}
-- 
cgit v1.2.3


From 62e194ecdaf8a1935991c1f8704886328d96a391 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:03 +0100
Subject: xfs: use mnt_want_write in compat_attrmulti ioctl

The compat version of the attrmulti ioctl needs to ask for and then
later release write access to the mount just like the native version,
otherwise we could potentially write to read-only mounts.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_ioctl32.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index fd4362063f25..c70c4e3db790 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -17,6 +17,7 @@
  */
 #include <linux/compat.h>
 #include <linux/ioctl.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -458,15 +459,23 @@ xfs_compat_attrmulti_by_handle(
 					&ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
+			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+			if (ops[i].am_error)
+				break;
 			ops[i].am_error = xfs_attrmulti_attr_set(
 					dentry->d_inode, attr_name,
 					compat_ptr(ops[i].am_attrvalue),
 					ops[i].am_length, ops[i].am_flags);
+			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		case ATTR_OP_REMOVE:
+			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+			if (ops[i].am_error)
+				break;
 			ops[i].am_error = xfs_attrmulti_attr_remove(
 					dentry->d_inode, attr_name,
 					ops[i].am_flags);
+			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		default:
 			ops[i].am_error = EINVAL;
-- 
cgit v1.2.3


From 080dda7f5e8e8df95bcd17a5345c276e365a2054 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:11 +0100
Subject: xfs: add a separate lock class for the per-mount list of dquots

We can have both a quota hash chain and the per-mount list locked at
the same time.  But given that both use the same struct dqhash as list
head we have to tell lockdep that they are different lock classes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/quota/xfs_qm.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6b13960cf318..7a2beb64314f 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1070,6 +1070,13 @@ xfs_qm_sync(
 	return 0;
 }
 
+/*
+ * The hash chains and the mplist use the same xfs_dqhash structure as
+ * their list head, but we can take the mplist qh_lock and one of the
+ * hash qh_locks at the same time without any problem as they aren't
+ * related.
+ */
+static struct lock_class_key xfs_quota_mplist_class;
 
 /*
  * This initializes all the quota information that's kept in the
@@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo(
 	}
 
 	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
+	lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
+
 	qinf->qi_dqreclaims = 0;
 
 	/* mutex used to serialize quotaoffs */
-- 
cgit v1.2.3


From 4f2d4ac6e5eb7d72e8df7f3fbf67a78dab8b91cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:19 +0100
Subject: xfs: lockdep annotations for xfs_dqlock2

xfs_dqlock2 locks two xfs_dquots, which is fine as it always locks the
dquot with the lower id first.  Use mutex_lock_nested to tell lockdep
about this fact.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/quota/xfs_dquot.c | 24 ++++++++++++++----------
 fs/xfs/quota/xfs_dquot.h | 10 ++++++++++
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 591ca6602bfb..36d1bb6140d1 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1383,6 +1383,12 @@ xfs_dqunlock_nonotify(
 	mutex_unlock(&(dqp->q_qlock));
 }
 
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lower id first.
+ */
 void
 xfs_dqlock2(
 	xfs_dquot_t	*d1,
@@ -1392,18 +1398,16 @@ xfs_dqlock2(
 		ASSERT(d1 != d2);
 		if (be32_to_cpu(d1->q_core.d_id) >
 		    be32_to_cpu(d2->q_core.d_id)) {
-			xfs_dqlock(d2);
-			xfs_dqlock(d1);
+			mutex_lock(&d2->q_qlock);
+			mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
 		} else {
-			xfs_dqlock(d1);
-			xfs_dqlock(d2);
-		}
-	} else {
-		if (d1) {
-			xfs_dqlock(d1);
-		} else if (d2) {
-			xfs_dqlock(d2);
+			mutex_lock(&d1->q_qlock);
+			mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
 		}
+	} else if (d1) {
+		mutex_lock(&d1->q_qlock);
+	} else if (d2) {
+		mutex_lock(&d2->q_qlock);
 	}
 }
 
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 7e455337e2ba..d443e93b4331 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -97,6 +97,16 @@ typedef struct xfs_dquot {
 #define dq_hashlist	q_lists.dqm_hashlist
 #define dq_flags	q_lists.dqm_flags
 
+/*
+ * Lock hierarchy for q_qlock:
+ *	XFS_QLOCK_NORMAL is the implicit default,
+ * 	XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+	XFS_QLOCK_NORMAL = 0,
+	XFS_QLOCK_NESTED,
+};
+
 #define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
 
 #ifdef DEBUG
-- 
cgit v1.2.3


From 5aa2dc0a0697c762874241fa9ddbecd2d878b934 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:25 +0100
Subject: xfs: add a lock class for group/project dquots

We can have both a user and a group/project dquot locked at the same time,
as long as the user dquot is locked first.  Tell lockdep about that fact
by making the group/project dquots a different lock class.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/quota/xfs_dquot.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 36d1bb6140d1..f0bc7846580b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -73,6 +73,8 @@ int xfs_dqreq_num;
 int xfs_dqerror_mod = 33;
 #endif
 
+static struct lock_class_key xfs_dquot_other_class;
+
 /*
  * Allocate and initialize a dquot. We don't always allocate fresh memory;
  * we try to reclaim a free dquot if the number of incore dquots are above
@@ -139,7 +141,15 @@ xfs_qm_dqinit(
 		 ASSERT(dqp->q_trace);
 		 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
 #endif
-	 }
+	}
+
+	/*
+	 * In either case we need to make sure group quotas have a different
+	 * lock class than user quotas, to make sure lockdep knows we can
+	 * lock one of each at the same time.
+	 */
+	if (!(type & XFS_DQ_USER))
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
 
 	/*
 	 * log item gets initialized later
-- 
cgit v1.2.3


From 49739140e57a65114d9e1976c4c158d2145595fb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:04:07 +0100
Subject: xfs: fix bad_features2 fixups for the root filesystem

Currently the bad_features2 fixup and the alignment updates in the superblock
are skipped if we mount a filesystem read-only.  But for the root filesystem
the typical case is to mount read-only first and only later remount writeable
so we'll never perform this update at all.  It's not a big problem but means
the logs of people needing the fixup get spammed at every boot because the
fixups never make it to disk.

Reported-by: Arkadiusz Miskiewicz <arekm@maven.pl>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 17 ++++++++++++++++-
 fs/xfs/xfs_mount.c           | 26 +++++++++++++-------------
 fs/xfs/xfs_mount.h           |  3 +++
 3 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 95a971080368..c71e226da7f5 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
 	struct xfs_mount	*mp = XFS_M(sb);
 	substring_t		args[MAX_OPT_ARGS];
 	char			*p;
+	int			error;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -1247,11 +1248,25 @@ xfs_fs_remount(
 		}
 	}
 
-	/* rw/ro -> rw */
+	/* ro -> rw */
 	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
 		mp->m_flags &= ~XFS_MOUNT_RDONLY;
 		if (mp->m_flags & XFS_MOUNT_BARRIER)
 			xfs_mountfs_check_barriers(mp);
+
+		/*
+		 * If this is the first remount to writeable state we
+		 * might have some superblock changes to update.
+		 */
+		if (mp->m_update_flags) {
+			error = xfs_mount_log_sb(mp, mp->m_update_flags);
+			if (error) {
+				cmn_err(CE_WARN,
+					"XFS: failed to write sb changes");
+				return error;
+			}
+			mp->m_update_flags = 0;
+		}
 	}
 
 	/* rw -> ro */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3c97c6463a4e..35300250e86d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 
-STATIC int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
@@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
  * Update alignment values based on mount options and sb values
  */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
+xfs_update_alignment(xfs_mount_t *mp)
 {
 	xfs_sb_t	*sbp = &(mp->m_sb);
 
@@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
 		if (xfs_sb_version_hasdalign(sbp)) {
 			if (sbp->sb_unit != mp->m_dalign) {
 				sbp->sb_unit = mp->m_dalign;
-				*update_flags |= XFS_SB_UNIT;
+				mp->m_update_flags |= XFS_SB_UNIT;
 			}
 			if (sbp->sb_width != mp->m_swidth) {
 				sbp->sb_width = mp->m_swidth;
-				*update_flags |= XFS_SB_WIDTH;
+				mp->m_update_flags |= XFS_SB_WIDTH;
 			}
 		}
 	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
@@ -905,7 +904,6 @@ xfs_mountfs(
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
-	__int64_t	update_flags = 0LL;
 	uint		quotamount, quotaflags;
 	int		uuid_mounted = 0;
 	int		error = 0;
@@ -933,7 +931,7 @@ xfs_mountfs(
 			"XFS: correcting sb_features alignment problem");
 		sbp->sb_features2 |= sbp->sb_bad_features2;
 		sbp->sb_bad_features2 = sbp->sb_features2;
-		update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
+		mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
 
 		/*
 		 * Re-check for ATTR2 in case it was found in bad_features2
@@ -947,11 +945,11 @@ xfs_mountfs(
 	if (xfs_sb_version_hasattr2(&mp->m_sb) &&
 	   (mp->m_flags & XFS_MOUNT_NOATTR2)) {
 		xfs_sb_version_removeattr2(&mp->m_sb);
-		update_flags |= XFS_SB_FEATURES2;
+		mp->m_update_flags |= XFS_SB_FEATURES2;
 
 		/* update sb_versionnum for the clearing of the morebits */
 		if (!sbp->sb_features2)
-			update_flags |= XFS_SB_VERSIONNUM;
+			mp->m_update_flags |= XFS_SB_VERSIONNUM;
 	}
 
 	/*
@@ -960,7 +958,7 @@ xfs_mountfs(
 	 * allocator alignment is within an ag, therefore ag has
 	 * to be aligned at stripe boundary.
 	 */
-	error = xfs_update_alignment(mp, &update_flags);
+	error = xfs_update_alignment(mp);
 	if (error)
 		goto error1;
 
@@ -1137,10 +1135,12 @@ xfs_mountfs(
 	}
 
 	/*
-	 * If fs is not mounted readonly, then update the superblock changes.
+	 * If this is a read-only mount defer the superblock updates until
+	 * the next remount into writeable mode.  Otherwise we would never
+	 * perform the update e.g. for the root filesystem.
 	 */
-	if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		error = xfs_mount_log_sb(mp, update_flags);
+	if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		error = xfs_mount_log_sb(mp, mp->m_update_flags);
 		if (error) {
 			cmn_err(CE_WARN, "XFS: failed to write sb changes");
 			goto error4;
@@ -1820,7 +1820,7 @@ xfs_uuid_mount(
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
  */
-STATIC int
+int
 xfs_mount_log_sb(
 	xfs_mount_t	*mp,
 	__int64_t	fields)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c1e028467327..e37eff6761eb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,6 +327,8 @@ typedef struct xfs_mount {
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
 	wait_queue_head_t	m_wait_single_sync_task;
+	__int64_t		m_update_flags;	/* sb flags we need to update
+						   on the next remount,rw */
 } xfs_mount_t;
 
 /*
@@ -514,6 +516,7 @@ extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
 			int64_t, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
+extern int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int	xfs_readsb(xfs_mount_t *, int);
 extern void	xfs_freesb(xfs_mount_t *);
-- 
cgit v1.2.3


From b828d8c33867dd6479644c06500975570bfd525c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:04:16 +0100
Subject: xfs: sanity check attr fork size

Recently we have quite a few kerneloops reports about dereferencing a NULL
if_data in the attribute fork.  From looking over the code this can only
happen if we pass a 0 size argument to xfs_iformat_local.  This implies some
sort of corruption and in fact the only mailing list report about this from
earlier this year was after a powerfail presumably on a system with write
cache and without barriers.

Add a quick sanity check for the attr fork size in xfs_iformat to catch
these early and without an oops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5a5e035e5d38..323ecd76a12c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -424,6 +424,19 @@ xfs_iformat(
 	case XFS_DINODE_FMT_LOCAL:
 		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 		size = be16_to_cpu(atp->hdr.totsize);
+
+		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+				"corrupt inode %Lu "
+				"(bad attr fork size %Ld).",
+				(unsigned long long) ip->i_ino,
+				(long long) size);
+			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+					     XFS_ERRLEVEL_LOW,
+					     ip->i_mount, dip);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
-- 
cgit v1.2.3


From b6e3222732a3551e786aa47b90a8eab2a517711c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 14 Jan 2009 23:22:07 -0600
Subject: [XFS] Remove the rest of the macro-to-function indirections.

Remove the last of the macros-defined-to-static-functions.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_dquot.c  |   2 +-
 fs/xfs/xfs_ag.h           |   6 +-
 fs/xfs/xfs_alloc_btree.c  |   2 +-
 fs/xfs/xfs_attr.c         |  26 ++++----
 fs/xfs/xfs_bmap.c         | 166 +++++++++++++++++++++++-----------------------
 fs/xfs/xfs_bmap.h         |   2 -
 fs/xfs/xfs_bmap_btree.c   |  10 +--
 fs/xfs/xfs_bmap_btree.h   |   4 --
 fs/xfs/xfs_btree.c        |   6 +-
 fs/xfs/xfs_da_btree.c     |   8 +--
 fs/xfs/xfs_ialloc.c       |   6 +-
 fs/xfs/xfs_ialloc.h       |   2 -
 fs/xfs/xfs_ialloc_btree.h |   1 -
 fs/xfs/xfs_inode.c        |   6 +-
 fs/xfs/xfs_inode_item.h   |   4 --
 fs/xfs/xfs_iomap.c        |  10 +--
 fs/xfs/xfs_itable.c       |   6 +-
 fs/xfs/xfs_mount.h        |   6 +-
 fs/xfs/xfs_rename.c       |   2 +-
 fs/xfs/xfs_rtalloc.c      |   2 +-
 fs/xfs/xfs_rw.h           |   1 -
 fs/xfs/xfs_sb.h           |   2 +-
 fs/xfs/xfs_vnodeops.c     |  20 +++---
 23 files changed, 142 insertions(+), 158 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f0bc7846580b..6543c0b29753 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -431,7 +431,7 @@ xfs_qm_dqalloc(
 	/*
 	 * Initialize the bmap freelist prior to calling bmapi code.
 	 */
-	XFS_BMAP_INIT(&flist, &firstblock);
+	xfs_bmap_init(&flist, &firstblock);
 	xfs_ilock(quotip, XFS_ILOCK_EXCL);
 	/*
 	 * Return if this type of quotas is turned off while we didn't
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index d3b3cf742999..143d63ecb20a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -244,8 +244,8 @@ typedef struct xfs_perag
 #define	XFS_AG_CHECK_DADDR(mp,d,len)	\
 	((len) == 1 ? \
 	    ASSERT((d) == XFS_SB_DADDR || \
-		   XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
-	    ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
-		   XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+		   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+	    ASSERT(xfs_daddr_to_agno(mp, d) == \
+		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
 
 #endif	/* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 733cb75a8c5d..c10c3a292d30 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -115,7 +115,7 @@ xfs_allocbt_free_block(
 	xfs_agblock_t		bno;
 	int			error;
 
-	bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
+	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
 	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index f7cdc28aff41..5fde1654b430 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 		 * It won't fit in the shortform, transform to a leaf block.
 		 * GROT: another possible req'mt for a double-split btree op.
 		 */
-		XFS_BMAP_INIT(args.flist, args.firstblock);
+		xfs_bmap_init(args.flist, args.firstblock);
 		error = xfs_attr_shortform_to_leaf(&args);
 		if (!error) {
 			error = xfs_bmap_finish(&args.trans, args.flist,
@@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Commit that transaction so that the node_addname() call
 		 * can manage its own transactions.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_attr_leaf_to_node(args);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * If the result is small enough, shrink it all into the inode.
 		 */
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
 			if (!error) {
@@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 	 * If the result is small enough, shrink it all into the inode.
 	 */
 	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
 		/* bp is gone due to xfs_da_shrink_inode */
 		if (!error) {
@@ -1290,7 +1290,7 @@ restart:
 			 * have been a b-tree.
 			 */
 			xfs_da_state_free(state);
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr_leaf_to_node(args);
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
@@ -1331,7 +1331,7 @@ restart:
 		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
 		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_da_split(state);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1443,7 +1443,7 @@ restart:
 		 * Check to see if the tree needs to be collapsed.
 		 */
 		if (retval && (state->path.active > 1)) {
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_da_join(state);
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
@@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	 * Check to see if the tree needs to be collapsed.
 	 */
 	if (retval && (state->path.active > 1)) {
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_da_join(state);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 						       == XFS_ATTR_LEAF_MAGIC);
 
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
 			if (!error) {
@@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		/*
 		 * Allocate a single extent, up to the size of the value.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
 				  blkcnt,
@@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		/*
 		 * Try to remember where we decided to put the value.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
@@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		/*
 		 * Try to remember where we decided to put the value.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
 					args->rmtblkcnt,
@@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	blkcnt = args->rmtblkcnt;
 	done = 0;
 	while (!done) {
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				    1, args->firstblock, args->flist,
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 138308e70d14..c852cd65aaea 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -595,9 +595,9 @@ xfs_bmap_add_extent(
 		xfs_iext_insert(ifp, 0, 1, new);
 		ASSERT(cur == NULL);
 		ifp->if_lastex = 0;
-		if (!ISNULLSTARTBLOCK(new->br_startblock)) {
+		if (!isnullstartblock(new->br_startblock)) {
 			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-			logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else
 			logflags = 0;
 		/* DELTA: single new extent */
@@ -613,7 +613,7 @@ xfs_bmap_add_extent(
 	/*
 	 * Any kind of new delayed allocation goes here.
 	 */
-	else if (ISNULLSTARTBLOCK(new->br_startblock)) {
+	else if (isnullstartblock(new->br_startblock)) {
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
@@ -644,11 +644,11 @@ xfs_bmap_add_extent(
 		 * in a delayed or unwritten allocation with a real one, or
 		 * converting real back to unwritten.
 		 */
-		if (!ISNULLSTARTBLOCK(new->br_startblock) &&
+		if (!isnullstartblock(new->br_startblock) &&
 		    new->br_startoff + new->br_blockcount > prev.br_startoff) {
 			if (prev.br_state != XFS_EXT_UNWRITTEN &&
-			    ISNULLSTARTBLOCK(prev.br_startblock)) {
-				da_old = STARTBLOCKVAL(prev.br_startblock);
+			    isnullstartblock(prev.br_startblock)) {
+				da_old = startblockval(prev.br_startblock);
 				if (cur)
 					ASSERT(cur->bc_private.b.flags &
 						XFS_BTCUR_BPRV_WASDEL);
@@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real(
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
 	}
 	STATE_SET(LEFT_CONTIG,
 		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real(
 			idx <
 			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
 	}
 	STATE_SET(RIGHT_CONTIG,
 		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
@@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock) -
+			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx + 1);
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
@@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
@@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock) -
+			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx);
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
@@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real(
 		}
 		temp = xfs_bmap_worst_indlen(ip, temp);
 		temp2 = xfs_bmap_worst_indlen(ip, temp2);
-		diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
+		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		if (diff > 0 &&
 		    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
@@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real(
 			}
 		}
 		ep = xfs_iext_get_ext(ifp, idx);
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
 		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
-			NULLSTARTBLOCK((int)temp2));
+			nullstartblock((int)temp2));
 		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
 		*dnew = temp + temp2;
 		/* DELTA: One in-core extent is split in three. */
@@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real(
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
 	}
 	STATE_SET(LEFT_CONTIG,
 		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real(
 			idx <
 			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
 	}
 	STATE_SET(RIGHT_CONTIG,
 		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay(
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
-	ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
+	ASSERT(isnullstartblock(new->br_startblock));
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
 	}
 	/*
 	 * Check and set flags if the current (right) segment exists.
@@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay(
 			   idx <
 			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
 	}
 	/*
 	 * Set contiguity flags on the left and right neighbors.
@@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay(
 		XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
 			XFS_DATA_FORK);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
-		oldlen = STARTBLOCKVAL(left.br_startblock) +
-			STARTBLOCKVAL(new->br_startblock) +
-			STARTBLOCKVAL(right.br_startblock);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
-			NULLSTARTBLOCK((int)newlen));
+			nullstartblock((int)newlen));
 		XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
 			XFS_DATA_FORK);
 		XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
@@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay(
 		XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
 			XFS_DATA_FORK);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
-		oldlen = STARTBLOCKVAL(left.br_startblock) +
-			STARTBLOCKVAL(new->br_startblock);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
-			NULLSTARTBLOCK((int)newlen));
+			nullstartblock((int)newlen));
 		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
 			XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx - 1;
@@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay(
 		 */
 		XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
 		temp = new->br_blockcount + right.br_blockcount;
-		oldlen = STARTBLOCKVAL(new->br_startblock) +
-			STARTBLOCKVAL(right.br_startblock);
+		oldlen = startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_allf(ep, new->br_startoff,
-			NULLSTARTBLOCK((int)newlen), temp, right.br_state);
+			nullstartblock((int)newlen), temp, right.br_state);
 		XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx;
 		/* DELTA: One in-core extent grew into a hole. */
@@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real(
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
 	}
 	/*
 	 * Check and set flags if this segment has a current value.
@@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real(
 			   idx <
 			   ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
 	}
 	/*
 	 * We're inserting a real allocation between "left" and "right".
@@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL) {
-			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
 		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
-			rval = XFS_ILOG_FEXT(whichfork);
+			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
 		ifp->if_lastex = idx;
 		if (cur == NULL) {
-			rval = XFS_ILOG_FEXT(whichfork);
+			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL) {
-			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2482,7 +2482,7 @@ xfs_bmap_adjacent(
 	 * try to use it's last block as our starting point.
 	 */
 	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
-	    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+	    !isnullstartblock(ap->prevp->br_startblock) &&
 	    ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
 		    ap->prevp->br_startblock)) {
 		ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
@@ -2511,7 +2511,7 @@ xfs_bmap_adjacent(
 		 * start block based on it.
 		 */
 		if (ap->prevp->br_startoff != NULLFILEOFF &&
-		    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+		    !isnullstartblock(ap->prevp->br_startblock) &&
 		    (prevbno = ap->prevp->br_startblock +
 			       ap->prevp->br_blockcount) &&
 		    ISVALID(prevbno, ap->prevp->br_startblock)) {
@@ -2552,7 +2552,7 @@ xfs_bmap_adjacent(
 		 * If there's a following (right) block, select a requested
 		 * start block based on it.
 		 */
-		if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
+		if (!isnullstartblock(ap->gotp->br_startblock)) {
 			/*
 			 * Calculate gap to start of next block.
 			 */
@@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents(
 	ASSERT(ifp->if_broot == NULL);
 	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 	return 0;
 }
 
@@ -3136,8 +3136,8 @@ xfs_bmap_del_extent(
 	del_endoff = del->br_startoff + del->br_blockcount;
 	got_endoff = got.br_startoff + got.br_blockcount;
 	ASSERT(got_endoff >= del_endoff);
-	delay = ISNULLSTARTBLOCK(got.br_startblock);
-	ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
 	flags = 0;
 	qfield = 0;
 	error = 0;
@@ -3189,7 +3189,7 @@ xfs_bmap_del_extent(
 		}
 		da_old = da_new = 0;
 	} else {
-		da_old = STARTBLOCKVAL(got.br_startblock);
+		da_old = startblockval(got.br_startblock);
 		da_new = 0;
 		nblks = 0;
 		do_fx = 0;
@@ -3213,7 +3213,7 @@ xfs_bmap_del_extent(
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		flags |= XFS_ILOG_CORE;
 		if (!cur) {
-			flags |= XFS_ILOG_FEXT(whichfork);
+			flags |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_btree_delete(cur, &i)))
@@ -3233,7 +3233,7 @@ xfs_bmap_del_extent(
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
-			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 			XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
 				whichfork);
 			da_new = temp;
@@ -3242,7 +3242,7 @@ xfs_bmap_del_extent(
 		xfs_bmbt_set_startblock(ep, del_endblock);
 		XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
 		if (!cur) {
-			flags |= XFS_ILOG_FEXT(whichfork);
+			flags |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
@@ -3262,7 +3262,7 @@ xfs_bmap_del_extent(
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
-			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 			XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
 				whichfork);
 			da_new = temp;
@@ -3270,7 +3270,7 @@ xfs_bmap_del_extent(
 		}
 		XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
 		if (!cur) {
-			flags |= XFS_ILOG_FEXT(whichfork);
+			flags |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_bmbt_update(cur, got.br_startoff,
@@ -3345,22 +3345,22 @@ xfs_bmap_del_extent(
 				}
 				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			} else
-				flags |= XFS_ILOG_FEXT(whichfork);
+				flags |= xfs_ilog_fext(whichfork);
 			XFS_IFORK_NEXT_SET(ip, whichfork,
 				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		} else {
 			ASSERT(whichfork == XFS_DATA_FORK);
 			temp = xfs_bmap_worst_indlen(ip, temp);
-			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 			temp2 = xfs_bmap_worst_indlen(ip, temp2);
-			new.br_startblock = NULLSTARTBLOCK((int)temp2);
+			new.br_startblock = nullstartblock((int)temp2);
 			da_new = temp + temp2;
 			while (da_new > da_old) {
 				if (temp) {
 					temp--;
 					da_new--;
 					xfs_bmbt_set_startblock(ep,
-						NULLSTARTBLOCK((int)temp));
+						nullstartblock((int)temp));
 				}
 				if (da_new == da_old)
 					break;
@@ -3368,7 +3368,7 @@ xfs_bmap_del_extent(
 					temp2--;
 					da_new--;
 					new.br_startblock =
-						NULLSTARTBLOCK((int)temp2);
+						nullstartblock((int)temp2);
 				}
 			}
 		}
@@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree(
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	for (cnt = i = 0; i < nextents; i++) {
 		ep = xfs_iext_get_ext(ifp, i);
-		if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
 			arp->l0 = cpu_to_be64(ep->l0);
 			arp->l1 = cpu_to_be64(ep->l1);
 			arp++; cnt++;
@@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree(
 	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
 	ASSERT(*curp == NULL);
 	*curp = cur;
-	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
 	return 0;
 }
 
@@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents(
 		ip->i_d.di_nblocks = 1;
 		XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
-		flags |= XFS_ILOG_FEXT(whichfork);
+		flags |= xfs_ilog_fext(whichfork);
 	} else {
 		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
 		xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
@@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork(
 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
-	XFS_BMAP_INIT(&flist, &firstblock);
+	xfs_bmap_init(&flist, &firstblock);
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_LOCAL:
 		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
@@ -4162,7 +4162,7 @@ xfs_bmap_add_free(
 	ASSERT(bno != NULLFSBLOCK);
 	ASSERT(len > 0);
 	ASSERT(len <= MAXEXTLEN);
-	ASSERT(!ISNULLSTARTBLOCK(bno));
+	ASSERT(!isnullstartblock(bno));
 	agno = XFS_FSB_TO_AGNO(mp, bno);
 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
 	ASSERT(agno < mp->m_sb.sb_agcount);
@@ -4909,7 +4909,7 @@ xfs_bmapi(
 			got.br_startoff = end;
 		inhole = eof || got.br_startoff > bno;
 		wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
-			ISNULLSTARTBLOCK(got.br_startblock);
+			isnullstartblock(got.br_startblock);
 		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
@@ -5028,7 +5028,7 @@ xfs_bmapi(
 				}
 
 				ip->i_delayed_blks += alen;
-				abno = NULLSTARTBLOCK(indlen);
+				abno = nullstartblock(indlen);
 			} else {
 				/*
 				 * If first time, allocate and fill in
@@ -5144,8 +5144,8 @@ xfs_bmapi(
 				aoff + alen);
 #ifdef DEBUG
 			if (flags & XFS_BMAPI_DELAY) {
-				ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
-				ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
+				ASSERT(isnullstartblock(got.br_startblock));
+				ASSERT(startblockval(got.br_startblock) > 0);
 			}
 			ASSERT(got.br_state == XFS_EXT_NORM ||
 			       got.br_state == XFS_EXT_UNWRITTEN);
@@ -5179,7 +5179,7 @@ xfs_bmapi(
 			ASSERT((bno >= obno) || (n == 0));
 			ASSERT(bno < end);
 			mval->br_startoff = bno;
-			if (ISNULLSTARTBLOCK(got.br_startblock)) {
+			if (isnullstartblock(got.br_startblock)) {
 				ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
 				mval->br_startblock = DELAYSTARTBLOCK;
 			} else
@@ -5201,7 +5201,7 @@ xfs_bmapi(
 			ASSERT(mval->br_blockcount <= len);
 		} else {
 			*mval = got;
-			if (ISNULLSTARTBLOCK(mval->br_startblock)) {
+			if (isnullstartblock(mval->br_startblock)) {
 				ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
 				mval->br_startblock = DELAYSTARTBLOCK;
 			}
@@ -5329,12 +5329,12 @@ error0:
 	 * Log everything.  Do this after conversion, there's no point in
 	 * logging the extent records if we've converted to btree format.
 	 */
-	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	if ((logflags & xfs_ilog_fext(whichfork)) &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~XFS_ILOG_FEXT(whichfork);
-	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
 		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+		logflags &= ~xfs_ilog_fbroot(whichfork);
 	/*
 	 * Log whatever the flags say, even if error.  Otherwise we might miss
 	 * detecting a case where the data is changed, there's an error,
@@ -5411,7 +5411,7 @@ xfs_bmapi_single(
 		*fsb = NULLFSBLOCK;
 		return 0;
 	}
-	ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
+	ASSERT(!isnullstartblock(got.br_startblock));
 	ASSERT(bno < got.br_startoff + got.br_blockcount);
 	*fsb = got.br_startblock + (bno - got.br_startoff);
 	ifp->if_lastex = lastx;
@@ -5543,7 +5543,7 @@ xfs_bunmapi(
 		 */
 		ASSERT(ep != NULL);
 		del = got;
-		wasdel = ISNULLSTARTBLOCK(del.br_startblock);
+		wasdel = isnullstartblock(del.br_startblock);
 		if (got.br_startoff < start) {
 			del.br_startoff = start;
 			del.br_blockcount -= start - got.br_startoff;
@@ -5638,7 +5638,7 @@ xfs_bunmapi(
 				xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
 						lastx - 1), &prev);
 				ASSERT(prev.br_state == XFS_EXT_NORM);
-				ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
+				ASSERT(!isnullstartblock(prev.br_startblock));
 				ASSERT(del.br_startblock ==
 				       prev.br_startblock + prev.br_blockcount);
 				if (prev.br_startoff < start) {
@@ -5666,7 +5666,7 @@ xfs_bunmapi(
 			}
 		}
 		if (wasdel) {
-			ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
+			ASSERT(startblockval(del.br_startblock) > 0);
 			/* Update realtime/data freespace, unreserve quota */
 			if (isrt) {
 				xfs_filblks_t rtexts;
@@ -5782,12 +5782,12 @@ error0:
 	 * Log everything.  Do this after conversion, there's no point in
 	 * logging the extent records if we've converted to btree format.
 	 */
-	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	if ((logflags & xfs_ilog_fext(whichfork)) &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~XFS_ILOG_FEXT(whichfork);
-	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
 		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+		logflags &= ~xfs_ilog_fbroot(whichfork);
 	/*
 	 * Log inode even in the error case, if the transaction
 	 * is dirty we'll need to shut down the filesystem.
@@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole(
 		if (startblock == DELAYSTARTBLOCK)
 			out->bmv_block = -2;
 		else
-			out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+			out->bmv_block = xfs_fsb_to_db(ip, startblock);
 		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
 		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
@@ -5979,7 +5979,7 @@ xfs_getbmap(
 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 
-	bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
+	bmapi_flags = xfs_bmapi_aflag(whichfork) |
 			((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
 
 	/*
@@ -6098,7 +6098,7 @@ xfs_bmap_isaeof(
 	 */
 	*aeof = (off >= s.br_startoff &&
 		 off < s.br_startoff + s.br_blockcount &&
-		 ISNULLSTARTBLOCK(s.br_startblock)) ||
+		 isnullstartblock(s.br_startblock)) ||
 		off >= s.br_startoff + s.br_blockcount;
 	return 0;
 }
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 284571c05ed0..be2979d88d32 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,7 +95,6 @@ typedef	struct xfs_bmap_free
 					/* need write cache flushing and no */
 					/* additional allocation alignments */
 
-#define	XFS_BMAPI_AFLAG(w)	xfs_bmapi_aflag(w)
 static inline int xfs_bmapi_aflag(int w)
 {
 	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w)
 #define	DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
 #define	HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
 
-#define	XFS_BMAP_INIT(flp,fbp)	xfs_bmap_init(flp,fbp)
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
 	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index ba6b08c2fb02..0760d352586f 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -121,7 +121,7 @@ __xfs_bmbt_get_all(
 
 		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
 		    (((xfs_dfsbno_t)l1) >> 21);
-		ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+		ASSERT((b >> 32) == 0 || isnulldstartblock(b));
 		s->br_startblock = (xfs_fsblock_t)b;
 	}
 #else	/* !DEBUG */
@@ -172,7 +172,7 @@ xfs_bmbt_get_startblock(
 
 	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	    (((xfs_dfsbno_t)r->l1) >> 21);
-	ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+	ASSERT((b >> 32) == 0 || isnulldstartblock(b));
 	return (xfs_fsblock_t)b;
 #else	/* !DEBUG */
 	return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
@@ -261,7 +261,7 @@ xfs_bmbt_set_allf(
 		((xfs_bmbt_rec_base_t)blockcount &
 		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 #else	/* !XFS_BIG_BLKNOS */
-	if (ISNULLSTARTBLOCK(startblock)) {
+	if (isnullstartblock(startblock)) {
 		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			((xfs_bmbt_rec_base_t)startoff << 9) |
 			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
@@ -321,7 +321,7 @@ xfs_bmbt_disk_set_allf(
 		 ((xfs_bmbt_rec_base_t)blockcount &
 		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 #else	/* !XFS_BIG_BLKNOS */
-	if (ISNULLSTARTBLOCK(startblock)) {
+	if (isnullstartblock(startblock)) {
 		r->l0 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			 ((xfs_bmbt_rec_base_t)startoff << 9) |
@@ -382,7 +382,7 @@ xfs_bmbt_set_startblock(
 	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
 		  (xfs_bmbt_rec_base_t)(v << 21);
 #else	/* !XFS_BIG_BLKNOS */
-	if (ISNULLSTARTBLOCK(v)) {
+	if (isnullstartblock(v)) {
 		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
 		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)v << 21) |
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index a4555abb6622..0e8df007615e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host {
 #define DSTARTBLOCKMASK		\
 	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
 
-#define ISNULLSTARTBLOCK(x)	isnullstartblock(x)
 static inline int isnullstartblock(xfs_fsblock_t x)
 {
 	return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
 }
 
-#define ISNULLDSTARTBLOCK(x)	isnulldstartblock(x)
 static inline int isnulldstartblock(xfs_dfsbno_t x)
 {
 	return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
 }
 
-#define NULLSTARTBLOCK(k)	nullstartblock(k)
 static inline xfs_fsblock_t nullstartblock(int k)
 {
 	ASSERT(k < (1 << STARTBLOCKVALBITS));
 	return STARTBLOCKMASK | (k);
 }
 
-#define STARTBLOCKVAL(x)	startblockval(x)
 static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
 {
 	return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 2c3ef20f8842..4681519ded91 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr(
 		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
 					XFS_BUF_ADDR(bp)));
 	else {
-		ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
+		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
 					XFS_BUF_ADDR(bp)));
 	}
 }
@@ -2454,7 +2454,7 @@ xfs_btree_new_iroot(
 	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
 
 	*logflags |=
-		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
 	*stat = 1;
 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
 	return 0;
@@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot(
 	cur->bc_bufs[level - 1] = NULL;
 	be16_add_cpu(&block->bb_level, -1);
 	xfs_trans_log_inode(cur->bc_tp, ip,
-		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
 	cur->bc_nlevels--;
 out0:
 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index a11a8390bf6c..c45f74ff1a5b 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	nmap = 1;
 	ASSERT(args->firstblock != NULL);
 	if ((error = xfs_bmapi(tp, dp, bno, count,
-			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+			xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
 			XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
 			args->flist, NULL))) {
@@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
 			c = (int)(bno + count - b);
 			if ((error = xfs_bmapi(tp, dp, b, c,
-					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
+					xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
 					XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
 					&mapp[mapi], &nmap, args->flist,
@@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 		 * the last block to the place we want to kill.
 		 */
 		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
-				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
+				xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
 				0, args->firstblock, args->flist, NULL,
 				&done)) == ENOSPC) {
 			if (w != XFS_DATA_FORK)
@@ -1987,7 +1987,7 @@ xfs_da_do_buf(
 			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
 					nfsb,
 					XFS_BMAPI_METADATA |
-						XFS_BMAPI_AFLAG(whichfork),
+						xfs_bmapi_aflag(whichfork),
 					NULL, 0, mapp, &nmap, NULL, NULL)))
 				goto exit0;
 		}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e6ebbaeb4dc6..ab016e5ae7be 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc(
 			int	ioffset = i << args.mp->m_sb.sb_inodelog;
 			uint	isize = sizeof(struct xfs_dinode);
 
-			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
+			free = xfs_make_iptr(args.mp, fbuf, i);
 			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 			free->di_version = version;
 			free->di_gen = cpu_to_be32(gen);
@@ -937,7 +937,7 @@ nextag:
 			}
 		}
 	}
-	offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+	offset = xfs_ialloc_find_free(&rec.ir_free);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1279,7 +1279,7 @@ xfs_imap(
 		offset = XFS_INO_TO_OFFSET(mp, ino);
 		ASSERT(offset < mp->m_sb.sb_inopblock);
 
-		cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
+		cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
 		offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
 
 		imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 50f558a4e0a8..aeee8278f92c 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -39,7 +39,6 @@ struct xfs_trans;
 /*
  * Make an inode pointer out of the buffer/offset.
  */
-#define	XFS_MAKE_IPTR(mp,b,o)		xfs_make_iptr(mp,b,o)
 static inline struct xfs_dinode *
 xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 {
@@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 /*
  * Find a free (set) bit in the inode bitmask.
  */
-#define	XFS_IALLOC_FIND_FREE(fp)	xfs_ialloc_find_free(fp)
 static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
 {
 	return xfs_lowbit64(*fp);
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 37e5dd01a577..5580e255ff06 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -36,7 +36,6 @@ typedef	__uint64_t	xfs_inofree_t;
 #define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
 #define	XFS_INOBT_ALL_FREE	((xfs_inofree_t)-1)
 
-#define	XFS_INOBT_MASKN(i,n)		xfs_inobt_maskn(i,n)
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
 	return (((n) >= XFS_INODES_PER_CHUNK ? \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 323ecd76a12c..e7ae08d1df48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1614,10 +1614,10 @@ xfs_itruncate_finish(
 		 * in this file with garbage in them once recovery
 		 * runs.
 		 */
-		XFS_BMAP_INIT(&free_list, &first_block);
+		xfs_bmap_init(&free_list, &first_block);
 		error = xfs_bunmapi(ntp, ip,
 				    first_unmap_block, unmap_len,
-				    XFS_BMAPI_AFLAG(fork) |
+				    xfs_bmapi_aflag(fork) |
 				      (sync ? 0 : XFS_BMAPI_ASYNC),
 				    XFS_ITRUNC_MAX_EXTENTS,
 				    &first_block, &free_list,
@@ -2570,7 +2570,7 @@ xfs_iextents_copy(
 	for (i = 0; i < nrecs; i++) {
 		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 		start_block = xfs_bmbt_get_startblock(ep);
-		if (ISNULLSTARTBLOCK(start_block)) {
+		if (isnullstartblock(start_block)) {
 			/*
 			 * It's a delayed allocation extent, so skip it.
 			 */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 1ff04cc323ad..9957d0602d54 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 {
 
 #define	XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
 
-
-#define	XFS_ILOG_FBROOT(w)	xfs_ilog_fbroot(w)
 static inline int xfs_ilog_fbroot(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
 }
 
-#define	XFS_ILOG_FEXT(w)	xfs_ilog_fext(w)
 static inline int xfs_ilog_fext(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
 }
 
-#define	XFS_ILOG_FDATA(w)	xfs_ilog_fdata(w)
 static inline int xfs_ilog_fdata(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 911062cf73a6..08ce72316bfe 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -155,7 +155,7 @@ xfs_imap_to_bmap(
 			iomapp->iomap_bn = IOMAP_DADDR_NULL;
 			iomapp->iomap_flags |= IOMAP_DELAY;
 		} else {
-			iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block);
+			iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
 			if (ISUNWRITTEN(imap))
 				iomapp->iomap_flags |= IOMAP_UNWRITTEN;
 		}
@@ -261,7 +261,7 @@ xfs_iomap(
 		xfs_iunlock(ip, lockmode);
 		lockmode = 0;
 
-		if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
+		if (nimaps && !isnullstartblock(imap.br_startblock)) {
 			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
 					offset, count, iomapp, &imap, flags);
 			break;
@@ -491,7 +491,7 @@ xfs_iomap_write_direct(
 	/*
 	 * Issue the xfs_bmapi() call to allocate the blocks
 	 */
-	XFS_BMAP_INIT(&free_list, &firstfsb);
+	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
 		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
@@ -751,7 +751,7 @@ xfs_iomap_write_allocate(
 			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 			xfs_trans_ihold(tp, ip);
 
-			XFS_BMAP_INIT(&free_list, &first_block);
+			xfs_bmap_init(&free_list, &first_block);
 
 			/*
 			 * it is possible that the extents have changed since
@@ -911,7 +911,7 @@ xfs_iomap_write_unwritten(
 		/*
 		 * Modify the unwritten extent state of the buffer.
 		 */
-		XFS_BMAP_INIT(&free_list, &firstfsb);
+		xfs_bmap_init(&free_list, &firstfsb);
 		nimaps = 1;
 		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
 				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e19d0a8d5618..cf98a805ec90 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -453,7 +453,7 @@ xfs_bulkstat(
 			    (chunkidx = agino - gino + 1) <
 				    XFS_INODES_PER_CHUNK &&
 					/* there are some left allocated */
-			    XFS_INOBT_MASKN(chunkidx,
+			    xfs_inobt_maskn(chunkidx,
 				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
 				/*
 				 * Grab the chunk record.  Mark all the
@@ -464,7 +464,7 @@ xfs_bulkstat(
 					if (XFS_INOBT_MASK(i) & ~gfree)
 						gcnt++;
 				}
-				gfree |= XFS_INOBT_MASKN(0, chunkidx);
+				gfree |= xfs_inobt_maskn(0, chunkidx);
 				irbp->ir_startino = gino;
 				irbp->ir_freecount = gcnt;
 				irbp->ir_free = gfree;
@@ -535,7 +535,7 @@ xfs_bulkstat(
 				     chunkidx < XFS_INODES_PER_CHUNK;
 				     chunkidx += nicluster,
 				     agbno += nbcluster) {
-					if (XFS_INOBT_MASKN(chunkidx,
+					if (xfs_inobt_maskn(chunkidx,
 							    nicluster) & ~gfree)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e37eff6761eb..f5e9937f9bdb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations {
 
 #ifndef __KERNEL__
 
-#define XFS_DADDR_TO_AGNO(mp,d) \
+#define xfs_daddr_to_agno(mp,d) \
 	((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
-#define XFS_DADDR_TO_AGBNO(mp,d) \
+#define xfs_daddr_to_agbno(mp,d) \
 	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
 
 #else /* __KERNEL__ */
@@ -441,7 +441,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
  */
 #define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
 
-#define XFS_DADDR_TO_AGNO(mp,d)         xfs_daddr_to_agno(mp,d)
 static inline xfs_agnumber_t
 xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 {
@@ -450,7 +449,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 	return (xfs_agnumber_t) ld;
 }
 
-#define XFS_DADDR_TO_AGBNO(mp,d)        xfs_daddr_to_agbno(mp,d)
 static inline xfs_agblock_t
 xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 {
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 86471bb40fd4..58f85e9cd11d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -147,7 +147,7 @@ xfs_rename(
 	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
 				inodes, &num_inodes);
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index edf12c7b834c..c5bb86f3ec05 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -120,7 +120,7 @@ xfs_growfs_rt_alloc(
 		if ((error = xfs_trans_iget(mp, tp, ino, 0,
 						XFS_ILOCK_EXCL, &ip)))
 			goto error_cancel;
-		XFS_BMAP_INIT(&flist, &firstblock);
+		xfs_bmap_init(&flist, &firstblock);
 		/*
 		 * Allocate blocks to the bitmap file.
 		 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f87db5344ce6..f76c003ec55d 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -28,7 +28,6 @@ struct xfs_mount;
  * file is a real time file or not, because the bmap code
  * does.
  */
-#define	XFS_FSB_TO_DB(ip,fsb)	xfs_fsb_to_db(ip,fsb)
 static inline xfs_daddr_t
 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1ed71916e4c9..1b017c657494 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
 
 #define	XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
 #define	XFS_DADDR_TO_FSB(mp,d)	XFS_AGB_TO_FSB(mp, \
-			XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
+			xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
 #define	XFS_FSB_TO_DADDR(mp,fsbno)	XFS_AGB_TO_DADDR(mp, \
 			XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f07bf8768c3a..0e55c5d7db5f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt(
 	 * Find the block(s) so we can inval and unmap them.
 	 */
 	done = 0;
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
@@ -1288,7 +1288,7 @@ xfs_inactive(
 	/*
 	 * Free the inode.
 	 */
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	error = xfs_ifree(tp, ip, &free_list);
 	if (error) {
 		/*
@@ -1461,7 +1461,7 @@ xfs_create(
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = B_TRUE;
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	ASSERT(ip == NULL);
 
@@ -1879,7 +1879,7 @@ xfs_remove(
 		}
 	}
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error) {
@@ -2059,7 +2059,7 @@ xfs_link(
 	if (error)
 		goto error_return;
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
 					&first_block, &free_list, resblks);
@@ -2231,7 +2231,7 @@ xfs_mkdir(
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
 					&first_block, &free_list, resblks ?
@@ -2438,7 +2438,7 @@ xfs_symlink(
 	 * Initialize the bmap freelist prior to calling either
 	 * bmapi or the directory create code.
 	 */
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	/*
 	 * Allocate an inode for the symlink.
@@ -2860,7 +2860,7 @@ retry:
 		/*
 		 * Issue the xfs_bmapi() call to allocate the blocks
 		 */
-		XFS_BMAP_INIT(&free_list, &firstfsb);
+		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
@@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_UNWRITE(bp);
 		XFS_BUF_READ(bp);
-		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
+		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
 		error = xfs_iowait(bp);
 		if (error) {
@@ -3186,7 +3186,7 @@ xfs_free_file_space(
 		/*
 		 * issue the bunmapi() call to free the blocks
 		 */
-		XFS_BMAP_INIT(&free_list, &firstfsb);
+		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
 				  0, 2, &firstfsb, &free_list, NULL, &done);
-- 
cgit v1.2.3


From a50412e3f8ce95d7ed558370d7dde5171fd04283 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 6 Jan 2009 19:54:02 +0200
Subject: UBIFS: do not treat all data as short term

UBIFS wrongly tells UBI that all data is short term. Use proper
hints instead. Thanks to Xiaochuan-Xu for noticing this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 9b7c54e0cd2a..a11ca0958a23 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -208,7 +208,7 @@ again:
 	offs = 0;
 
 out:
-	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
+	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
 	if (err)
 		goto out_unlock;
 
-- 
cgit v1.2.3


From 7078202e55b565582fcbd831a8dd3069bdc72610 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 19 Jan 2009 19:57:27 +0200
Subject: UBIFS: document dark_wm and dead_wm better

Just add more comments. Also fix some comments for the
lprops flags.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c    | 20 ++++++++++++++++++++
 fs/ubifs/super.c | 11 ++---------
 fs/ubifs/ubifs.h |  4 ++--
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9832f9abe28e..b2e5f1133377 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
  * to be reused. Garbage collection will cause the number of dirty index nodes
  * to grow, however sufficient space is reserved for the index to ensure the
  * commit will never run out of space.
+ *
+ * Notes about dead watermark. In the current UBIFS implementation we assume
+ * that LEBs which have less than @c->dead_wm bytes of free + dirty space are
+ * full and not worth garbage-collecting. The dead watermark is one min. I/O
+ * unit size, or min. UBIFS node size, whichever is greater. Indeed, the UBIFS
+ * Garbage Collector has to synchronize the GC head's write buffer before
+ * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
+ * actually reclaim even very small pieces of dirty space by garbage collecting
+ * enough dirty LEBs, but we do not bother doing this in this implementation.
+ *
+ * Notes about dark watermark. The results of GC work depend on how big the
+ * UBIFS nodes GC deals with are. Large nodes make GC waste more space. Indeed,
+ * if GC moves data from LEB A to LEB B and nodes in LEB A are large, GC would
+ * have to waste large pieces of free space at the end of LEB B, because nodes
+ * from LEB A would not fit. And the worst situation is when all nodes are of
+ * maximum size. So dark watermark is the amount of free + dirty space in LEB
+ * which is guaranteed to be reclaimable. If LEB has less space, the GC might
+ * be unable to reclaim it. So, LEBs with free + dirty greater than dark
+ * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
+ * good, and GC takes extra care when moving them.
  */
 
 #include <linux/pagemap.h>
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 53811e567a69..da99da098efd 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -573,15 +573,8 @@ static int init_constants_early(struct ubifs_info *c)
 	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
 
 	/*
-	 * Initialize dead and dark LEB space watermarks.
-	 *
-	 * Dead space is the space which cannot be used. Its watermark is
-	 * equivalent to min. I/O unit or minimum node size if it is greater
-	 * then min. I/O unit.
-	 *
-	 * Dark space is the space which might be used, or might not, depending
-	 * on which node should be written to the LEB. Its watermark is
-	 * equivalent to maximum UBIFS node size.
+	 * Initialize dead and dark LEB space watermarks. See gc.c for comments
+	 * about these values.
 	 */
 	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
 	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0881897a4208..2e78d6ac007e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -426,9 +426,9 @@ struct ubifs_unclean_leb {
  * LEB properties flags.
  *
  * LPROPS_UNCAT: not categorized
- * LPROPS_DIRTY: dirty > 0, not index
+ * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
  * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
- * LPROPS_FREE: free > 0, not empty, not index
+ * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
  * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
  * LPROPS_EMPTY: LEB is empty, not taken
  * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
-- 
cgit v1.2.3


From e7f07968c16bdd9480001c0a9de013ba56889cf9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 20 Jan 2009 09:50:19 -0500
Subject: ext4: Fix ext4_free_blocks() w/o a journal when files have indirect
 blocks

When trying to unlink a file with indirect blocks on a filesystem
without a journal, the "circular indirect block" sanity test was
getting falsely triggered.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 49484ba801c9..b4386dafeb0c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3622,7 +3622,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		 * block pointed to itself, it would have been detached when
 		 * the block was cleared. Check for this instead of OOPSing.
 		 */
-		if (bh2jh(this_bh))
+		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
 			ext4_error(inode->i_sb, __func__,
-- 
cgit v1.2.3


From 4503efd0891c40e30928afb4b23dc3f99c62a6b2 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 20 Jan 2009 15:51:16 -0800
Subject: sysfs: fix problems with binary files

Some sysfs binary files don't like having 0 passed to them as a size.
Fix this up at the root by just returning to the vfs if userspace asks
us for a zero sized buffer.

Thanks to Pavel Roskin for pointing this out.

Reported-by: Pavel Roskin <proski@gnu.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 66f6e58a7e4b..f2c478c3424e 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -63,6 +63,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
 
+	if (!bytes)
+		return 0;
+
 	if (size) {
 		if (offs > size)
 			return 0;
@@ -131,6 +134,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
 
+	if (!bytes)
+		return 0;
+
 	if (size) {
 		if (offs > size)
 			return 0;
-- 
cgit v1.2.3


From c475146d8f3b97e79f9ef88521e28ad40ac07de6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 17:44:34 +0100
Subject: ocfs2: Remove ocfs2_dquot_initialize() and ocfs2_dquot_drop()

Since ->acquire_dquot and ->release_dquot callbacks aren't called under
dqptr_sem anymore, we don't have to start a transaction and obtain locks
so early. So we can just remove all this complicated stuff.

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/ocfs2/quota_global.c | 169 +-----------------------------------------------
 1 file changed, 2 insertions(+), 167 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 6aff8f2d3e49..f4efa89baee5 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -810,171 +810,6 @@ out:
 	return status;
 }
 
-/* This is difficult. We have to lock quota inode and start transaction
- * in this function but we don't want to take the penalty of exlusive
- * quota file lock when we are just going to use cached structures. So
- * we just take read lock check whether we have dquot cached and if so,
- * we don't have to take the write lock... */
-static int ocfs2_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle = NULL;
-	int status = 0;
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_mem_dqinfo *oinfo;
-	int exclusive = 0;
-	int cnt;
-	qid_t id;
-
-	mlog_entry_void();
-
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (type != -1 && cnt != type)
-			continue;
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-		status = ocfs2_lock_global_qf(oinfo, 0);
-		if (status < 0)
-			goto out;
-		/* This is just a performance optimization not a reliable test.
-		 * Since we hold an inode lock, noone can actually release
-		 * the structure until we are finished with initialization. */
-		if (inode->i_dquot[cnt] != NODQUOT) {
-			ocfs2_unlock_global_qf(oinfo, 0);
-			continue;
-		}
-		/* When we have inode lock, we know that no dquot_release() can
-		 * run and thus we can safely check whether we need to
-		 * read+modify global file to get quota information or whether
-		 * our node already has it. */
-		if (cnt == USRQUOTA)
-			id = inode->i_uid;
-		else if (cnt == GRPQUOTA)
-			id = inode->i_gid;
-		else
-			BUG();
-		/* Obtain exclusion from quota off... */
-		down_write(&sb_dqopt(sb)->dqptr_sem);
-		exclusive = !dquot_is_cached(sb, id, cnt);
-		up_write(&sb_dqopt(sb)->dqptr_sem);
-		if (exclusive) {
-			status = ocfs2_lock_global_qf(oinfo, 1);
-			if (status < 0) {
-				exclusive = 0;
-				mlog_errno(status);
-				goto out_ilock;
-			}
-			handle = ocfs2_start_trans(OCFS2_SB(sb),
-					ocfs2_calc_qinit_credits(sb, cnt));
-			if (IS_ERR(handle)) {
-				status = PTR_ERR(handle);
-				mlog_errno(status);
-				goto out_ilock;
-			}
-		}
-		dquot_initialize(inode, cnt);
-		if (exclusive) {
-			ocfs2_commit_trans(OCFS2_SB(sb), handle);
-			ocfs2_unlock_global_qf(oinfo, 1);
-		}
-		ocfs2_unlock_global_qf(oinfo, 0);
-	}
-	mlog_exit(0);
-	return 0;
-out_ilock:
-	if (exclusive)
-		ocfs2_unlock_global_qf(oinfo, 1);
-	ocfs2_unlock_global_qf(oinfo, 0);
-out:
-	mlog_exit(status);
-	return status;
-}
-
-static int ocfs2_dquot_drop_slow(struct inode *inode)
-{
-	int status = 0;
-	int cnt;
-	int got_lock[MAXQUOTAS] = {0, 0};
-	handle_t *handle;
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_mem_dqinfo *oinfo;
-
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-		status = ocfs2_lock_global_qf(oinfo, 1);
-		if (status < 0)
-			goto out;
-		got_lock[cnt] = 1;
-	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb),
-			ocfs2_calc_qinit_credits(sb, USRQUOTA) +
-			ocfs2_calc_qinit_credits(sb, GRPQUOTA));
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out;
-	}
-	dquot_drop(inode);
-	ocfs2_commit_trans(OCFS2_SB(sb), handle);
-out:
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (got_lock[cnt]) {
-			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-			ocfs2_unlock_global_qf(oinfo, 1);
-		}
-	return status;
-}
-
-/* See the comment before ocfs2_dquot_initialize. */
-static int ocfs2_dquot_drop(struct inode *inode)
-{
-	int status = 0;
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_mem_dqinfo *oinfo;
-	int exclusive = 0;
-	int cnt;
-	int got_lock[MAXQUOTAS] = {0, 0};
-
-	mlog_entry_void();
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-		status = ocfs2_lock_global_qf(oinfo, 0);
-		if (status < 0)
-			goto out;
-		got_lock[cnt] = 1;
-	}
-	/* Lock against anyone releasing references so that when when we check
-	 * we know we are not going to be last ones to release dquot */
-	down_write(&sb_dqopt(sb)->dqptr_sem);
-	/* Urgh, this is a terrible hack :( */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] != NODQUOT &&
-		    atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
-			exclusive = 1;
-			break;
-		}
-	}
-	if (!exclusive)
-		dquot_drop_locked(inode);
-	up_write(&sb_dqopt(sb)->dqptr_sem);
-out:
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (got_lock[cnt]) {
-			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-			ocfs2_unlock_global_qf(oinfo, 0);
-		}
-	/* In case we bailed out because we had to do expensive locking
-	 * do it now... */
-	if (exclusive)
-		status = ocfs2_dquot_drop_slow(inode);
-	mlog_exit(status);
-	return status;
-}
-
 static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
 {
 	struct ocfs2_dquot *dquot =
@@ -991,8 +826,8 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
 }
 
 struct dquot_operations ocfs2_quota_operations = {
-	.initialize	= ocfs2_dquot_initialize,
-	.drop		= ocfs2_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
-- 
cgit v1.2.3


From 19d00cc196a3a66fd074f62b39d219f743b92338 Mon Sep 17 00:00:00 2001
From: Wang Cong <xiyou.wangcong@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: cleanup fs/btrfs/super.c::btrfs_control_ioctl()

- Remove the unused local variable 'len';
- Check return value of kmalloc().

Signed-off-by: Wang Cong <wangcong@zeuux.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 92c9b543deff..795b6246bcdf 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -583,17 +583,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
 	int ret = -ENOTTY;
-	int len;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+	if (!vol)
+		return -ENOMEM;
+
 	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
 		ret = -EFAULT;
 		goto out;
 	}
-	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
 
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
-- 
cgit v1.2.3


From 070604040b86511cc2df0f25f98e26c5529bd928 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: cleanup xattr code

Andrew's review of the xattr code revealed some minor issues that this patch
addresses.  Just an error return fix, got rid of a useless statement and
commented one of the trickier parts of __btrfs_getxattr.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/xattr.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e270894..b4fa5f4b6ad1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -45,9 +45,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
 				strlen(name), 0);
-	if (!di || IS_ERR(di)) {
+	if (!di) {
 		ret = -ENODATA;
 		goto out;
+	} else if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
 	}
 
 	leaf = path->nodes[0];
@@ -62,6 +65,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 		ret = -ERANGE;
 		goto out;
 	}
+
+	/*
+	 * The way things are packed into the leaf is like this
+	 * |struct btrfs_dir_item|name|data|
+	 * where name is the xattr name, so security.foo, and data is the
+	 * content of the xattr.  data_ptr points to the location in memory
+	 * where the data starts in the in memory leaf
+	 */
 	data_ptr = (unsigned long)((char *)(di + 1) +
 				   btrfs_dir_name_len(leaf, di));
 	read_extent_buffer(leaf, buffer, data_ptr,
@@ -176,7 +187,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	ret = 0;
 	advance = 0;
 	while (1) {
 		leaf = path->nodes[0];
-- 
cgit v1.2.3


From 7eaebe7d503c3ef240ac7b3efc5433fe647c0298 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: removed unused #include <version.h>'s

Removed unused #include <version.h>'s in btrfs

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 1 -
 fs/btrfs/compression.c  | 1 -
 fs/btrfs/disk-io.c      | 1 -
 fs/btrfs/extent-tree.c  | 1 -
 fs/btrfs/extent_io.c    | 1 -
 fs/btrfs/extent_map.c   | 1 -
 fs/btrfs/file.c         | 1 -
 fs/btrfs/inode.c        | 1 -
 fs/btrfs/ioctl.c        | 1 -
 fs/btrfs/super.c        | 1 -
 fs/btrfs/volumes.c      | 1 -
 11 files changed, 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe0..d5f4e94f2ca2 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/version.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d9..ab07627084f1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/version.h>
 #include <linux/pagevec.h>
 #include "compat.h"
 #include "ctree.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae5..37e12f620392 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..cdc961e7556a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
-#include <linux/version.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1fa..c9446d4840ed 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
-#include <linux/version.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include "extent_io.h"
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada32..50da69da20ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/version.h>
 #include <linux/hardirq.h>
 #include "extent_map.h"
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145e..fbcbf43f5114 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
-#include <linux/version.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab41..44dbd550c4bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
-#include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb5..988fdc8b49eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
 #include <linux/security.h>
-#include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
 #include "compat.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 795b6246bcdf..4c0b7569ba52 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
 #include <linux/ctype.h>
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
-#include <linux/version.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1cca2b5..f6e1fc55104e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
-#include <linux/version.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
-- 
cgit v1.2.3


From 119e10cf1b2f6a6cafff74f32373d631489f54c2 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: Remove extra KERN_INFO in the middle of a line

The "devid <xxx> transid <xxx>" printk in btrfs_scan_one_device()
actually follows another printk that doesn't end in a newline (since the
intention is for the two printks to make one line of output), so the
KERN_INFO just ends up messing up the output:

    device label exp <6>devid 1 transid 9 /dev/sda5

Fix this by changing the extra KERN_INFO to KERN_CONT.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f6e1fc55104e..397c8db1bc27 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -577,7 +577,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
-- 
cgit v1.2.3


From 57506d50ed6db7b0e7ddc9845e86e81f140983d5 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: check return value for kthread_run() correctly

kthread_run() returns the kthread or ERR_PTR(-ENOMEM), not NULL.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 37e12f620392..0d8ccd625ba2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1739,13 +1739,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
-	if (!fs_info->cleaner_kthread)
+	if (IS_ERR(fs_info->cleaner_kthread))
 		goto fail_csum_root;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
 						   "btrfs-transaction");
-	if (!fs_info->transaction_kthread)
+	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;
 
 	if (btrfs_super_log_root(disk_super) != 0) {
-- 
cgit v1.2.3


From c6e308713a47527f88a277ee95b7c5d1db80f77b Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 21 Jan 2009 10:59:08 -0500
Subject: Btrfs: simplify iteration codes

Merge list_for_each* and list_entry to list_for_each_entry*

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c      | 15 ++++-----------
 fs/btrfs/extent-tree.c  |  8 ++------
 fs/btrfs/inode.c        |  5 ++---
 fs/btrfs/ordered-data.c |  4 +---
 fs/btrfs/transaction.c  |  4 +---
 fs/btrfs/volumes.c      | 35 +++++++++--------------------------
 6 files changed, 19 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d8ccd625ba2..26a18779e84b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1135,7 +1135,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
 	int ret = 0;
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
 #if 0
@@ -1143,8 +1142,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	    btrfs_congested_async(info, 0))
 		return 1;
 #endif
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
@@ -1162,13 +1160,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
  */
 static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct btrfs_fs_info *info;
 
 	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
 
@@ -1994,7 +1990,6 @@ static int write_dev_supers(struct btrfs_device *device,
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
@@ -2010,8 +2005,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
 			continue;
@@ -2044,8 +2038,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	}
 
 	total_errors = 0;
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev)
 			continue;
 		if (!dev->in_fs_metadata || !dev->writeable)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cdc961e7556a..a4e36c38b81e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -325,10 +325,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
 	struct list_head *head = &info->space_info;
-	struct list_head *cur;
 	struct btrfs_space_info *found;
-	list_for_each(cur, head) {
-		found = list_entry(cur, struct btrfs_space_info, list);
+	list_for_each_entry(found, head, list) {
 		if (found->flags == flags)
 			return found;
 	}
@@ -3013,7 +3011,6 @@ loop_check:
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 {
 	struct btrfs_block_group_cache *cache;
-	struct list_head *l;
 
 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
@@ -3021,8 +3018,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       (info->full) ? "" : "not ");
 
 	down_read(&info->groups_sem);
-	list_for_each(l, &info->block_groups) {
-		cache = list_entry(l, struct btrfs_block_group_cache, list);
+	list_for_each_entry(cache, &info->block_groups, list) {
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
 		       "%llu pinned %llu reserved\n",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44dbd550c4bd..45cf03ee1bc2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1323,12 +1323,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, u64 file_offset,
 			     struct list_head *list)
 {
-	struct list_head *cur;
 	struct btrfs_ordered_sum *sum;
 
 	btrfs_set_trans_block_group(trans, inode);
-	list_for_each(cur, list) {
-		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+
+	list_for_each_entry(sum, list, list) {
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 	}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a20940170274..77c2411a5f0f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 	struct btrfs_sector_sum *sector_sums;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	struct list_head *cur;
 	unsigned long num_sectors;
 	unsigned long i;
 	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 		return 1;
 
 	mutex_lock(&tree->mutex);
-	list_for_each_prev(cur, &ordered->list) {
-		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
+	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
 		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f9443340..919172de5c9a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -852,11 +852,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_pending_snapshot *pending;
 	struct list_head *head = &trans->transaction->pending_snapshots;
-	struct list_head *cur;
 	int ret;
 
-	list_for_each(cur, head) {
-		pending = list_entry(cur, struct btrfs_pending_snapshot, list);
+	list_for_each_entry(pending, head, list) {
 		ret = create_pending_snapshot(trans, fs_info, pending);
 		BUG_ON(ret);
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 397c8db1bc27..fd0bedb07a64 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -103,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
 						   u64 devid, u8 *uuid)
 {
 	struct btrfs_device *dev;
-	struct list_head *cur;
 
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (dev->devid == devid &&
 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
@@ -117,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
 
 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
-	struct list_head *cur;
 	struct btrfs_fs_devices *fs_devices;
 
-	list_for_each(cur, &fs_uuids) {
-		fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
+	list_for_each_entry(fs_devices, &fs_uuids, list) {
 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 			return fs_devices;
 	}
@@ -344,14 +340,11 @@ error:
 
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *tmp;
-	struct list_head *cur;
-	struct btrfs_device *device;
+	struct btrfs_device *device, *next;
 
 	mutex_lock(&uuid_mutex);
 again:
-	list_for_each_safe(cur, tmp, &fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata)
 			continue;
 
@@ -382,14 +375,12 @@ again:
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *cur;
 	struct btrfs_device *device;
 
 	if (--fs_devices->opened > 0)
 		return 0;
 
-	list_for_each(cur, &fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
 			close_bdev_exclusive(device->bdev, device->mode);
 			fs_devices->open_devices--;
@@ -438,7 +429,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct block_device *latest_bdev = NULL;
 	struct buffer_head *bh;
@@ -449,8 +439,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	int seeding = 1;
 	int ret = 0;
 
-	list_for_each(cur, head) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, head, dev_list) {
 		if (device->bdev)
 			continue;
 		if (!device->name)
@@ -1016,14 +1005,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (strcmp(device_path, "missing") == 0) {
-		struct list_head *cur;
 		struct list_head *devices;
 		struct btrfs_device *tmp;
 
 		device = NULL;
 		devices = &root->fs_info->fs_devices->devices;
-		list_for_each(cur, devices) {
-			tmp = list_entry(cur, struct btrfs_device, dev_list);
+		list_for_each_entry(tmp, devices, dev_list) {
 			if (tmp->in_fs_metadata && !tmp->bdev) {
 				device = tmp;
 				break;
@@ -1279,7 +1266,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
 	struct block_device *bdev;
-	struct list_head *cur;
 	struct list_head *devices;
 	struct super_block *sb = root->fs_info->sb;
 	u64 total_bytes;
@@ -1303,8 +1289,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
-	list_for_each(cur, devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
 			goto error;
@@ -1703,7 +1688,6 @@ static u64 div_factor(u64 num, int factor)
 int btrfs_balance(struct btrfs_root *dev_root)
 {
 	int ret;
-	struct list_head *cur;
 	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
 	struct btrfs_device *device;
 	u64 old_size;
@@ -1722,8 +1706,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	dev_root = dev_root->fs_info->dev_root;
 
 	/* step one make some room on all the devices */
-	list_for_each(cur, devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, devices, dev_list) {
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
-- 
cgit v1.2.3


From 3dfdb9348ada18c74c39b9ae7b115e0594792281 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: fix locking issue in btrfs_remove_block_group

We should hold the block_group_cache_lock while modifying the
block groups red-black tree. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a4e36c38b81e..3bed6a7e4b22 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5952,9 +5952,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	btrfs_remove_free_space_cache(block_group);
+	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	btrfs_remove_free_space_cache(block_group);
 	down_write(&block_group->space_info->groups_sem);
 	list_del(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
-- 
cgit v1.2.3


From 5a7be515b1f4569aac601170fc681741434cca92 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: Fix infinite loop in btrfs_extent_post_op

btrfs_extent_post_op calls finish_current_insert and del_pending_extents. They
both may enter infinite loops.

finish_current_insert enters an infinite loop if it only finds some backrefs to
update.  The fix is to check for pending backref updates before restarting the
loop.

The infinite loop in del_pending_extents is due to the skipped variable
not being properly reset before looping around.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3bed6a7e4b22..aeaec84ebed8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2156,7 +2156,8 @@ again:
 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
-			if (skipped && all && !num_inserts) {
+			if (skipped && all && !num_inserts &&
+			    list_empty(&update_list)) {
 				skipped = 0;
 				search = 0;
 				continue;
@@ -2544,6 +2545,7 @@ again:
 		if (ret) {
 			if (all && skipped && !nr) {
 				search = 0;
+				skipped = 0;
 				continue;
 			}
 			mutex_unlock(&info->extent_ins_mutex);
-- 
cgit v1.2.3


From 653249ff9aea51e1ace6bd437389f06e2b84393f Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: remove duplicated #include

Removed duplicated #include "compat.h" in
fs/btrfs/extent-tree.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aeaec84ebed8..c643433629a7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,7 +29,6 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
-#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
-- 
cgit v1.2.3


From 95029d7d598babf62276d9006e575992b1333ba5 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: change/remove typedef

Change one typedef to a regular enum, and remove an unused one.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f88113..e1fec636f37f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -454,17 +454,11 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-typedef enum {
+enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE = 0,
 	BTRFS_COMPRESS_ZLIB = 1,
 	BTRFS_COMPRESS_LAST = 2,
-} btrfs_compression_type;
-
-/* we don't understand any encryption methods right now */
-typedef enum {
-	BTRFS_ENCRYPTION_NONE = 0,
-	BTRFS_ENCRYPTION_LAST = 1,
-} btrfs_encryption_type;
+};
 
 struct btrfs_inode_item {
 	/* nfs style generation number */
-- 
cgit v1.2.3


From 86288a198d8e4e8411ff02f9ab848245e8f11257 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: fix stop searching test in replace_one_extent

replace_one_extent searches tree leaves for references to a given extent. It
stops searching if it goes beyond the last possible position.

The last possible position is computed by adding the starting offset of a found
file extent to the full size of the extent. The code uses physical size of the
extent as the full size. This is incorrect when compression is used.

The fix is to get the full size from the ram_bytes field of the file extent item.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c643433629a7..1d7f043152b0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4440,7 +4440,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	u64 lock_end = 0;
 	u64 num_bytes;
 	u64 ext_offset;
-	u64 first_pos;
+	u64 search_end = (u64)-1;
 	u32 nritems;
 	int nr_scaned = 0;
 	int extent_locked = 0;
@@ -4448,7 +4448,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	memcpy(&key, leaf_key, sizeof(key));
-	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
 		if (key.objectid < ref_path->owner_objectid ||
 		    (key.objectid == ref_path->owner_objectid &&
@@ -4497,7 +4496,7 @@ next:
 			if ((key.objectid > ref_path->owner_objectid) ||
 			    (key.objectid == ref_path->owner_objectid &&
 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
-			    (key.offset >= first_pos + extent_key->offset))
+			    key.offset >= search_end)
 				break;
 		}
 
@@ -4530,8 +4529,10 @@ next:
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 		ext_offset = btrfs_file_extent_offset(leaf, fi);
 
-		if (first_pos > key.offset - ext_offset)
-			first_pos = key.offset - ext_offset;
+		if (search_end == (u64)-1) {
+			search_end = key.offset - ext_offset +
+				btrfs_file_extent_ram_bytes(leaf, fi);
+		}
 
 		if (!extent_locked) {
 			lock_start = key.offset;
@@ -4720,7 +4721,7 @@ next:
 		}
 skip:
 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
-		    key.offset >= first_pos + extent_key->offset)
+		    key.offset >= search_end)
 			break;
 
 		cond_resched();
-- 
cgit v1.2.3


From 7e6628544abad773222d8b177f738ac2db1859de Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: open_ctree() error handling can oops on fs_info

a bug in open_ctree:

struct btrfs_root *open_ctree(..)
{
....
	if (!extent_root || !tree_root || !fs_info ||
	    !chunk_root || !dev_root || !csum_root) {
		err = -ENOMEM;
		goto fail;
//When code flow goes to "fail", fs_info may be NULL or uninitialized.
	}
....

fail:
	btrfs_close_devices(fs_info->fs_devices);// !
	btrfs_mapping_tree_free(&fs_info->mapping_tree);// !

	kfree(extent_root);
	kfree(tree_root);
	bdi_destroy(&fs_info->bdi);// !
...
)

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 26a18779e84b..3cf17257f89d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1823,13 +1823,14 @@ fail_sb_buffer:
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-fail:
+
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	bdi_destroy(&fs_info->bdi);
 
+fail:
 	kfree(extent_root);
 	kfree(tree_root);
-	bdi_destroy(&fs_info->bdi);
 	kfree(fs_info);
 	kfree(chunk_root);
 	kfree(dev_root);
-- 
cgit v1.2.3


From 7237f1833601dcc435a64176c2c347ec4bd959f9 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 12:54:03 -0500
Subject: Btrfs: fix tree logs parallel sync

To improve performance, btrfs_sync_log merges tree log sync
requests. But it wrongly merges sync requests for different
tree logs. If multiple tree logs are synced at the same time,
only one of them actually gets synced.

This patch makes the following changes to fix the bug:

Move most tree log related fields in btrfs_fs_info to
btrfs_root. This allows merging sync requests separately
for each tree log.

Don't insert the root item into the log root tree immediately
after the log tree is allocated. The root item for a log tree is
inserted when the log tree gets synced for the first time. This
allows syncing the log root tree without first syncing all
log trees.

At tree-log sync, btrfs_sync_log first syncs the log tree,
then updates the corresponding root item in the log root tree,
syncs the log root tree, and finally updates the super block.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       |  13 +-
 fs/btrfs/disk-io.c     |  79 +++++++++--
 fs/btrfs/disk-io.h     |   2 +
 fs/btrfs/extent-tree.c |  10 +-
 fs/btrfs/file.c        |   4 +-
 fs/btrfs/tree-log.c    | 350 +++++++++++++++++++++++--------------------------
 6 files changed, 248 insertions(+), 210 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e1fec636f37f..de103a8a815e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -695,9 +695,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
-
 	wait_queue_head_t async_submit_wait;
-	wait_queue_head_t tree_log_wait;
 
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
@@ -724,10 +722,6 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
-	atomic_t tree_log_writers;
-	atomic_t tree_log_commit;
-	unsigned long tree_log_batch;
-	u64 tree_log_transid;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -827,7 +821,14 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+
 	struct mutex log_mutex;
+	wait_queue_head_t log_writer_wait;
+	wait_queue_head_t log_commit_wait[2];
+	atomic_t log_writers;
+	atomic_t log_commit[2];
+	unsigned long log_transid;
+	unsigned long log_batch;
 
 	u64 objectid;
 	u64 last_trans;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3cf17257f89d..7feac5a475e9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -849,6 +849,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	init_waitqueue_head(&root->log_writer_wait);
+	init_waitqueue_head(&root->log_commit_wait[0]);
+	init_waitqueue_head(&root->log_commit_wait[1]);
+	atomic_set(&root->log_commit[0], 0);
+	atomic_set(&root->log_commit[1], 0);
+	atomic_set(&root->log_writers, 0);
+	root->log_batch = 0;
+	root->log_transid = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -933,15 +941,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct extent_buffer *leaf;
 
 	root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (!root)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
@@ -950,12 +959,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+	/*
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
 	root->ref_cows = 0;
 
-	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
-					    0, BTRFS_TREE_LOG_OBJECTID,
-					    trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, BTRFS_TREE_LOG_OBJECTID,
+				      trans->transid, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		kfree(root);
+		return ERR_CAST(leaf);
+	}
 
+	root->node = leaf;
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
 	btrfs_set_header_bytenr(root->node, root->node->start);
@@ -967,7 +987,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			    BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
-	fs_info->log_root_tree = root;
+	return root;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *log_root;
+
+	log_root = alloc_log_tree(trans, fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+	WARN_ON(fs_info->log_root_tree);
+	fs_info->log_root_tree = log_root;
+	return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root)
+{
+	struct btrfs_root *log_root;
+	struct btrfs_inode_item *inode_item;
+
+	log_root = alloc_log_tree(trans, root->fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+
+	log_root->last_trans = trans->transid;
+	log_root->root_key.offset = root->root_key.objectid;
+
+	inode_item = &log_root->root_item.inode;
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
+	btrfs_set_root_generation(&log_root->root_item, trans->transid);
+
+	WARN_ON(root->log_root);
+	root->log_root = log_root;
+	root->log_transid = 0;
 	return 0;
 }
 
@@ -1530,10 +1591,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
-	init_waitqueue_head(&fs_info->tree_log_wait);
-	atomic_set(&fs_info->tree_log_commit, 0);
-	atomic_set(&fs_info->tree_log_writers, 0);
-	fs_info->tree_log_transid = 0;
 
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b7..494a56eb2986 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d7f043152b0..3b26f0980946 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2698,13 +2698,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	/* if metadata always pin */
 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			struct btrfs_block_group_cache *cache;
-
-			/* btrfs_free_reserved_extent */
-			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-			BUG_ON(!cache);
-			btrfs_add_free_space(cache, bytenr, num_bytes);
-			put_block_group(cache);
+			mutex_lock(&root->fs_info->pinned_mutex);
+			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fbcbf43f5114..3e8023efaff7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1214,10 +1214,10 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
-	root->fs_info->tree_log_batch++;
+	root->log_batch++;
 	filemap_fdatawrite(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
-	root->fs_info->tree_log_batch++;
+	root->log_batch++;
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..4f26f3ed0c87 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -77,104 +77,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  * and once to do all the other items.
  */
 
-/*
- * btrfs_add_log_tree adds a new per-subvolume log tree into the
- * tree of log tree roots.  This must be called with a tree log transaction
- * running (see start_log_trans).
- */
-static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root)
-{
-	struct btrfs_key key;
-	struct btrfs_root_item root_item;
-	struct btrfs_inode_item *inode_item;
-	struct extent_buffer *leaf;
-	struct btrfs_root *new_root = root;
-	int ret;
-	u64 objectid = root->root_key.objectid;
-
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID,
-				      trans->transid, 0, 0, 0);
-	if (IS_ERR(leaf)) {
-		ret = PTR_ERR(leaf);
-		return ret;
-	}
-
-	btrfs_set_header_nritems(leaf, 0);
-	btrfs_set_header_level(leaf, 0);
-	btrfs_set_header_bytenr(leaf, leaf->start);
-	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
-
-	write_extent_buffer(leaf, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
-			    BTRFS_FSID_SIZE);
-	btrfs_mark_buffer_dirty(leaf);
-
-	inode_item = &root_item.inode;
-	memset(inode_item, 0, sizeof(*inode_item));
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
-
-	btrfs_set_root_bytenr(&root_item, leaf->start);
-	btrfs_set_root_generation(&root_item, trans->transid);
-	btrfs_set_root_level(&root_item, 0);
-	btrfs_set_root_refs(&root_item, 0);
-	btrfs_set_root_used(&root_item, 0);
-
-	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
-	root_item.drop_level = 0;
-
-	btrfs_tree_unlock(leaf);
-	free_extent_buffer(leaf);
-	leaf = NULL;
-
-	btrfs_set_root_dirid(&root_item, 0);
-
-	key.objectid = BTRFS_TREE_LOG_OBJECTID;
-	key.offset = objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
-				&root_item);
-	if (ret)
-		goto fail;
-
-	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
-					       &key);
-	BUG_ON(!new_root);
-
-	WARN_ON(root->log_root);
-	root->log_root = new_root;
-
-	/*
-	 * log trees do not get reference counted because they go away
-	 * before a real commit is actually done.  They do store pointers
-	 * to file data extents, and those reference counts still get
-	 * updated (along with back refs to the log tree).
-	 */
-	new_root->ref_cows = 0;
-	new_root->last_trans = trans->transid;
-
-	/*
-	 * we need to make sure the root block for this new tree
-	 * is marked as dirty in the dirty_log_pages tree.  This
-	 * is how it gets flushed down to disk at tree log commit time.
-	 *
-	 * the tree logging mutex keeps others from coming in and changing
-	 * the new_root->node, so we can safely access it here
-	 */
-	set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
-			 new_root->node->start + new_root->node->len - 1,
-			 GFP_NOFS);
-
-fail:
-	return ret;
-}
-
 /*
  * start a sub transaction and setup the log tree
  * this increments the log tree writer count to make the people
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root)
 {
 	int ret;
+
+	mutex_lock(&root->log_mutex);
+	if (root->log_root) {
+		root->log_batch++;
+		atomic_inc(&root->log_writers);
+		mutex_unlock(&root->log_mutex);
+		return 0;
+	}
 	mutex_lock(&root->fs_info->tree_log_mutex);
 	if (!root->fs_info->log_root_tree) {
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		ret = btrfs_add_log_tree(trans, root);
 		BUG_ON(ret);
 	}
-	atomic_inc(&root->fs_info->tree_log_writers);
-	root->fs_info->tree_log_batch++;
 	mutex_unlock(&root->fs_info->tree_log_mutex);
+	root->log_batch++;
+	atomic_inc(&root->log_writers);
+	mutex_unlock(&root->log_mutex);
 	return 0;
 }
 
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
 	if (!root->log_root)
 		return -ENOENT;
 
-	mutex_lock(&root->fs_info->tree_log_mutex);
+	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
 		ret = 0;
-		atomic_inc(&root->fs_info->tree_log_writers);
-		root->fs_info->tree_log_batch++;
+		atomic_inc(&root->log_writers);
 	}
-	mutex_unlock(&root->fs_info->tree_log_mutex);
+	mutex_unlock(&root->log_mutex);
 	return ret;
 }
 
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
  */
 static int end_log_trans(struct btrfs_root *root)
 {
-	atomic_dec(&root->fs_info->tree_log_writers);
-	smp_mb();
-	if (waitqueue_active(&root->fs_info->tree_log_wait))
-		wake_up(&root->fs_info->tree_log_wait);
+	if (atomic_dec_and_test(&root->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&root->log_writer_wait))
+			wake_up(&root->log_writer_wait);
+	}
 	return 0;
 }
 
@@ -1902,26 +1813,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		}
 	}
 	btrfs_free_path(path);
-	if (wc->free)
-		free_extent_buffer(log->node);
 	return ret;
 }
 
-static int wait_log_commit(struct btrfs_root *log)
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	int ret;
+
+	if (log->log_transid == 1) {
+		/* insert root item on the first sync */
+		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	} else {
+		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	}
+	return ret;
+}
+
+static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 {
 	DEFINE_WAIT(wait);
-	u64 transid = log->fs_info->tree_log_transid;
+	int index = transid % 2;
 
+	/*
+	 * we only allow two pending log transactions at a time,
+	 * so we know that if ours is more than 2 older than the
+	 * current transaction, we're done
+	 */
 	do {
-		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		mutex_unlock(&log->fs_info->tree_log_mutex);
-		if (atomic_read(&log->fs_info->tree_log_commit))
+		prepare_to_wait(&root->log_commit_wait[index],
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (root->log_transid < transid + 2 &&
+		    atomic_read(&root->log_commit[index]))
 			schedule();
-		finish_wait(&log->fs_info->tree_log_wait, &wait);
-		mutex_lock(&log->fs_info->tree_log_mutex);
-	} while (transid == log->fs_info->tree_log_transid &&
-		atomic_read(&log->fs_info->tree_log_commit));
+		finish_wait(&root->log_commit_wait[index], &wait);
+		mutex_lock(&root->log_mutex);
+	} while (root->log_transid < transid + 2 &&
+		 atomic_read(&root->log_commit[index]));
+	return 0;
+}
+
+static int wait_for_writer(struct btrfs_root *root)
+{
+	DEFINE_WAIT(wait);
+	while (atomic_read(&root->log_writers)) {
+		prepare_to_wait(&root->log_writer_wait,
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (atomic_read(&root->log_writers))
+			schedule();
+		mutex_lock(&root->log_mutex);
+		finish_wait(&root->log_writer_wait, &wait);
+	}
 	return 0;
 }
 
@@ -1933,57 +1883,114 @@ static int wait_log_commit(struct btrfs_root *log)
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root)
 {
+	int index1;
+	int index2;
 	int ret;
-	unsigned long batch;
 	struct btrfs_root *log = root->log_root;
+	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
 
-	mutex_lock(&log->fs_info->tree_log_mutex);
-	if (atomic_read(&log->fs_info->tree_log_commit)) {
-		wait_log_commit(log);
-		goto out;
+	mutex_lock(&root->log_mutex);
+	index1 = root->log_transid % 2;
+	if (atomic_read(&root->log_commit[index1])) {
+		wait_log_commit(root, root->log_transid);
+		mutex_unlock(&root->log_mutex);
+		return 0;
 	}
-	atomic_set(&log->fs_info->tree_log_commit, 1);
+	atomic_set(&root->log_commit[index1], 1);
+
+	/* wait for previous tree log sync to complete */
+	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+		wait_log_commit(root, root->log_transid - 1);
 
 	while (1) {
-		batch = log->fs_info->tree_log_batch;
-		mutex_unlock(&log->fs_info->tree_log_mutex);
+		unsigned long batch = root->log_batch;
+		mutex_unlock(&root->log_mutex);
 		schedule_timeout_uninterruptible(1);
-		mutex_lock(&log->fs_info->tree_log_mutex);
-
-		while (atomic_read(&log->fs_info->tree_log_writers)) {
-			DEFINE_WAIT(wait);
-			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&log->fs_info->tree_log_mutex);
-			if (atomic_read(&log->fs_info->tree_log_writers))
-				schedule();
-			mutex_lock(&log->fs_info->tree_log_mutex);
-			finish_wait(&log->fs_info->tree_log_wait, &wait);
-		}
-		if (batch == log->fs_info->tree_log_batch)
+		mutex_lock(&root->log_mutex);
+		wait_for_writer(root);
+		if (batch == root->log_batch)
 			break;
 	}
 
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
-	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
-			       &root->fs_info->log_root_tree->dirty_log_pages);
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+
+	root->log_batch = 0;
+	root->log_transid++;
+	log->log_transid = root->log_transid;
+	smp_mb();
+	/*
+	 * log tree has been flushed to disk, new modifications of
+	 * the log will be written to new positions. so it's safe to
+	 * allow log writers to go in.
+	 */
+	mutex_unlock(&root->log_mutex);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_writers);
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&log_root_tree->log_writer_wait))
+			wake_up(&log_root_tree->log_writer_wait);
+	}
+
+	index2 = log_root_tree->log_transid % 2;
+	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		wait_log_commit(log_root_tree, log_root_tree->log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out;
+	}
+	atomic_set(&log_root_tree->log_commit[index2], 1);
+
+	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
+		wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+
+	wait_for_writer(log_root_tree);
+
+	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+				&log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
-				 log->fs_info->log_root_tree->node->start);
+				log_root_tree->node->start);
 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
-		       btrfs_header_level(log->fs_info->log_root_tree->node));
+				btrfs_header_level(log_root_tree->node));
+
+	log_root_tree->log_batch = 0;
+	log_root_tree->log_transid++;
+	smp_mb();
+
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	/*
+	 * nobody else is going to jump in and write the the ctree
+	 * super here because the log_commit atomic below is protecting
+	 * us.  We must be called with a transaction handle pinning
+	 * the running transaction open, so a full commit can't hop
+	 * in and cause problems either.
+	 */
+	write_ctree_super(trans, root->fs_info->tree_root, 2);
 
-	write_ctree_super(trans, log->fs_info->tree_root, 2);
-	log->fs_info->tree_log_transid++;
-	log->fs_info->tree_log_batch = 0;
-	atomic_set(&log->fs_info->tree_log_commit, 0);
+	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
-	if (waitqueue_active(&log->fs_info->tree_log_wait))
-		wake_up(&log->fs_info->tree_log_wait);
+	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+		wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
-	mutex_unlock(&log->fs_info->tree_log_mutex);
+	atomic_set(&root->log_commit[index1], 0);
+	smp_mb();
+	if (waitqueue_active(&root->log_commit_wait[index1]))
+		wake_up(&root->log_commit_wait[index1]);
 	return 0;
 }
 
@@ -2019,37 +2026,17 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 				   start, end, GFP_NOFS);
 	}
 
-	log = root->log_root;
-	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
-			     &log->root_key);
-	BUG_ON(ret);
+	if (log->log_transid > 0) {
+		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+				     &log->root_key);
+		BUG_ON(ret);
+	}
 	root->log_root = NULL;
-	kfree(root->log_root);
+	free_extent_buffer(log->node);
+	kfree(log);
 	return 0;
 }
 
-/*
- * helper function to update the item for a given subvolumes log root
- * in the tree of log roots
- */
-static int update_log_root(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *log)
-{
-	u64 bytenr = btrfs_root_bytenr(&log->root_item);
-	int ret;
-
-	if (log->node->start == bytenr)
-		return 0;
-
-	btrfs_set_root_bytenr(&log->root_item, log->node->start);
-	btrfs_set_root_generation(&log->root_item, trans->transid);
-	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
-	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
-				&log->root_key, &log->root_item);
-	BUG_ON(ret);
-	return ret;
-}
-
 /*
  * If both a file and directory are logged, and unlinks or renames are
  * mixed in, we have a few interesting corners:
@@ -2711,11 +2698,6 @@ next_slot:
 
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
-
-	mutex_lock(&root->fs_info->tree_log_mutex);
-	ret = update_log_root(trans, log);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->tree_log_mutex);
 out:
 	return 0;
 }
-- 
cgit v1.2.3


From 35054394c4b3cecd52577c2662c84da1f3e73525 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 21 Jan 2009 13:11:13 -0500
Subject: Btrfs: stop providing a bmap operation to avoid swapfile corruptions

Swapfiles use bmap to build a list of extents belonging to the file,
and they assume these extents won't change over the life of the file.
They also use the resulting list to do IO directly to the block device.

This causes problems for btrfs in a few ways:

btrfs returns logical block numbers through bmap, and these are not suitable
for IO.  They might translate to different devices, raid etc.

COW means that file block mappings are going to change frequently.

Using swapfiles on btrfs will lead to corruption, so we're avoiding the
problem for now by dropping bmap support entirely.  A later commit
will add fiemap support for people that really want to know how
a file is laid out.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 45cf03ee1bc2..2e25d698bab0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4156,11 +4156,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	return -EINVAL;
 }
 
-static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
-{
-	return extent_bmap(mapping, iblock, btrfs_get_extent);
-}
-
 int btrfs_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -4985,13 +4980,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
 
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file.  They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen.  So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
 	.readpages	= btrfs_readpages,
 	.sync_page	= block_sync_page,
-	.bmap		= btrfs_bmap,
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
-- 
cgit v1.2.3


From 1506fcc8189cdd4b95e06df7845a09f18b4526a6 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 21 Jan 2009 14:39:14 -0500
Subject: Btrfs: fiemap support

Now that bmap support is gone, this is the only way to get extent
mappings for userland.  These are still not valid for IO, but they
can tell us if a file has holes or how much fragmentation there is.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/btrfs/extent_io.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h |  2 ++
 fs/btrfs/inode.c     |  7 ++++
 3 files changed, 101 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9446d4840ed..a3b0676403f7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2854,6 +2854,98 @@ out:
 	return sector;
 }
 
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent)
+{
+	int ret;
+	u64 off = start;
+	u64 max = start + len;
+	u32 flags = 0;
+	u64 disko = 0;
+	struct extent_map *em = NULL;
+	int end = 0;
+	u64 em_start = 0, em_len = 0;
+	unsigned long emflags;
+	ret = 0;
+
+	if (len == 0)
+		return -EINVAL;
+
+	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+		GFP_NOFS);
+	em = get_extent(inode, NULL, 0, off, max - off, 0);
+	if (!em)
+		goto out;
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+	while (!end) {
+		off = em->start + em->len;
+		if (off >= max)
+			end = 1;
+
+		em_start = em->start;
+		em_len = em->len;
+
+		disko = 0;
+		flags = 0;
+
+		switch (em->block_start) {
+		case EXTENT_MAP_LAST_BYTE:
+			end = 1;
+			flags |= FIEMAP_EXTENT_LAST;
+			break;
+		case EXTENT_MAP_HOLE:
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
+			break;
+		case EXTENT_MAP_INLINE:
+			flags |= (FIEMAP_EXTENT_DATA_INLINE |
+				  FIEMAP_EXTENT_NOT_ALIGNED);
+			break;
+		case EXTENT_MAP_DELALLOC:
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+			break;
+		default:
+			disko = em->block_start;
+			break;
+		}
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			flags |= FIEMAP_EXTENT_ENCODED;
+
+		emflags = em->flags;
+		free_extent_map(em);
+		em = NULL;
+
+		if (!end) {
+			em = get_extent(inode, NULL, 0, off, max - off, 0);
+			if (!em)
+				goto out;
+			if (IS_ERR(em)) {
+				ret = PTR_ERR(em);
+				goto out;
+			}
+			emflags = em->flags;
+		}
+		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+			flags |= FIEMAP_EXTENT_LAST;
+			end = 1;
+		}
+
+		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+					em_len, flags);
+		if (ret)
+			goto out_free;
+	}
+out_free:
+	free_extent_map(em);
+out:
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+			GFP_NOFS);
+	return ret;
+}
+
 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a79137..e80c6d96b318 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -193,6 +193,8 @@ int extent_commit_write(struct extent_io_tree *tree,
 			unsigned from, unsigned to);
 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		get_extent_t *get_extent);
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent);
 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2e25d698bab0..288c2cdc7543 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4156,6 +4156,12 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	return -EINVAL;
 }
 
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+}
+
 int btrfs_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -5021,6 +5027,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.fallocate	= btrfs_fallocate,
+	.fiemap		= btrfs_fiemap,
 };
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
-- 
cgit v1.2.3


From 24179f488092267c9a033d7e25ce7a58af50ff79 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 19 Jan 2009 13:13:33 -0600
Subject: dlm: fix plock notify callback to lockd

We should use the original copy of the file_lock, fl, instead
of the copy, flc, in the lockd notify callback.  The range in flc has
been modified by posix_lock_file(), so it will not match a copy of the
lock in lockd.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/plock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index eba87ff3177b..502b1ea5ef6b 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -168,7 +168,7 @@ static int dlm_plock_callback(struct plock_op *op)
 	notify = xop->callback;
 
 	if (op->info.rv) {
-		notify(flc, NULL, op->info.rv);
+		notify(fl, NULL, op->info.rv);
 		goto out;
 	}
 
@@ -187,7 +187,7 @@ static int dlm_plock_callback(struct plock_op *op)
 			  (unsigned long long)op->info.number, file, fl);
 	}
 
-	rv = notify(flc, NULL, 0);
+	rv = notify(fl, NULL, 0);
 	if (rv) {
 		/* XXX: We need to cancel the fs lock here: */
 		log_print("dlm_plock_callback: lock granted after lock request "
-- 
cgit v1.2.3


From 20d5a39929232a715f29e6cb7e3f0d0c790f41eb Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 21 Jan 2009 11:34:50 -0500
Subject: dlm: initialize file_lock struct in GETLK before copying conflicting
 lock

dlm_posix_get fills out the relevant fields in the file_lock before
returning when there is a lock conflict, but doesn't clean out any of
the other fields in the file_lock.

When nfsd does a NFSv4 lockt call, it sets the fl_lmops to
nfsd_posix_mng_ops before calling the lower fs. When the lock comes back
after testing a lock on GFS2, it still has that field set. This confuses
nfsd into thinking that the file_lock is a nfsd4 lock.

Fix this by making DLM reinitialize the file_lock before copying the
fields from the conflicting lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/plock.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 502b1ea5ef6b..894a32d438d5 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -304,7 +304,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (rv == -ENOENT)
 		rv = 0;
 	else if (rv > 0) {
+		locks_init_lock(fl);
 		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+		fl->fl_flags = FL_POSIX;
 		fl->fl_pid = op->info.pid;
 		fl->fl_start = op->info.start;
 		fl->fl_end = op->info.end;
-- 
cgit v1.2.3


From 74e2d06521913443c7e2697037909f5efc200ec5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 21 Jan 2009 15:22:17 +1100
Subject: Long btree pointers are still 64 bit on disk

[XFS] Long btree pointers are still 64 bit on disk

On 32 bit machines with CONFIG_LBD=n, XFS reduces the
in memory size of xfs_fsblock_t to 32 bits so that it
will fit within 32 bit addressing. However, the disk format
for long btree pointers is still 64 bits in size.

The recent btree rewrite failed to take this into account
when initialising new btree blocks, setting sibling pointers
to NULL and checking if they are NULL. Hence checking whether
a 64 bit NULL was the same as a 32 bit NULL was failing,
resulting in NULL sibling pointers failing to be detected
correctly. This showed up as WANT_CORRUPTED_GOTO shutdowns
in xfs_btree_delrec.

Fix this by making all the comparisons and setting of long
pointer btree NULL blocks to the disk format, not the
in memory format. i.e. use NULLDFSBNO.

Reported-by: Alexander Beregalov <a.beregalov@gmail.com>
Reported-by: Jacek Luczak <difrost.kernel@gmail.com>
Reported-by: Danny ter Haar <dth@dth.net>
Tested-by: Jacek Luczak <difrost.kernel@gmail.com>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_btree.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 4681519ded91..e73c332eb23f 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -843,7 +843,7 @@ xfs_btree_ptr_is_null(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return be64_to_cpu(ptr->l) == NULLFSBLOCK;
+		return be64_to_cpu(ptr->l) == NULLDFSBNO;
 	else
 		return be32_to_cpu(ptr->s) == NULLAGBLOCK;
 }
@@ -854,7 +854,7 @@ xfs_btree_set_ptr_null(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		ptr->l = cpu_to_be64(NULLFSBLOCK);
+		ptr->l = cpu_to_be64(NULLDFSBNO);
 	else
 		ptr->s = cpu_to_be32(NULLAGBLOCK);
 }
@@ -918,8 +918,8 @@ xfs_btree_init_block(
 	new->bb_numrecs = cpu_to_be16(numrecs);
 
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
-		new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+		new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+		new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
 	} else {
 		new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 		new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
@@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
+		ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO);
 
 		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
 	} else {
-- 
cgit v1.2.3


From b16ecfe2f985f77901a36ee5a99c7d3400313341 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:22:31 +0300
Subject: fs/Kconfig: move reiserfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 86 +----------------------------------------------------
 fs/reiserfs/Kconfig | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 85 deletions(-)
 create mode 100644 fs/reiserfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 51307b0fdf0f..03fde694969e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -27,91 +27,7 @@ config FS_MBCACHE
 	default y if EXT4_FS=y && EXT4_FS_XATTR
 	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 
-config REISERFS_FS
-	tristate "Reiserfs support"
-	help
-	  Stores not just filenames but the files themselves in a balanced
-	  tree.  Uses journalling.
-
-	  Balanced trees are more efficient than traditional file system
-	  architectural foundations.
-
-	  In general, ReiserFS is as fast as ext2, but is very efficient with
-	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see <http://www.namesys.com/> for links.
-
-	  It is more easily extended to have features currently found in
-	  database and keyword search systems than block allocation based file
-	  systems are.  The next version will be so extended, and will support
-	  plugins consistent with our motto ``It takes more than a license to
-	  make source code open.''
-
-	  Read <http://www.namesys.com/> to learn more about reiserfs.
-
-	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
-
-	  If you like it, you can pay us to add new features to it that you
-	  need, buy a support contract, or pay us to port it to another OS.
-
-config REISERFS_CHECK
-	bool "Enable reiserfs debug mode"
-	depends on REISERFS_FS
-	help
-	  If you set this to Y, then ReiserFS will perform every check it can
-	  possibly imagine of its internal consistency throughout its
-	  operation.  It will also go substantially slower.  More than once we
-	  have forgotten that this was on, and then gone despondent over the
-	  latest benchmarks.:-) Use of this option allows our team to go all
-	  out in checking for consistency when debugging without fear of its
-	  effect on end users.  If you are on the verge of sending in a bug
-	  report, say Y and you might get a useful error message.  Almost
-	  everyone should say N.
-
-config REISERFS_PROC_INFO
-	bool "Stats in /proc/fs/reiserfs"
-	depends on REISERFS_FS && PROC_FS
-	help
-	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
-	  various ReiserFS statistics and internal data at the expense of
-	  making your kernel or module slightly larger (+8 KB). This also
-	  increases the amount of kernel memory required for each mount.
-	  Almost everyone but ReiserFS developers and people fine-tuning
-	  reiserfs or tracing problems should say N.
-
-config REISERFS_FS_XATTR
-	bool "ReiserFS extended attributes"
-	depends on REISERFS_FS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config REISERFS_FS_POSIX_ACL
-	bool "ReiserFS POSIX Access Control Lists"
-	depends on REISERFS_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config REISERFS_FS_SECURITY
-	bool "ReiserFS Security Labels"
-	depends on REISERFS_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ReiserFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
+source "fs/reiserfs/Kconfig"
 
 config JFS_FS
 	tristate "JFS filesystem support"
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
new file mode 100644
index 000000000000..949b8c6addc8
--- /dev/null
+++ b/fs/reiserfs/Kconfig
@@ -0,0 +1,85 @@
+config REISERFS_FS
+	tristate "Reiserfs support"
+	help
+	  Stores not just filenames but the files themselves in a balanced
+	  tree.  Uses journalling.
+
+	  Balanced trees are more efficient than traditional file system
+	  architectural foundations.
+
+	  In general, ReiserFS is as fast as ext2, but is very efficient with
+	  large directories and small files.  Additional patches are needed
+	  for NFS and quotas, please see <http://www.namesys.com/> for links.
+
+	  It is more easily extended to have features currently found in
+	  database and keyword search systems than block allocation based file
+	  systems are.  The next version will be so extended, and will support
+	  plugins consistent with our motto ``It takes more than a license to
+	  make source code open.''
+
+	  Read <http://www.namesys.com/> to learn more about reiserfs.
+
+	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
+
+	  If you like it, you can pay us to add new features to it that you
+	  need, buy a support contract, or pay us to port it to another OS.
+
+config REISERFS_CHECK
+	bool "Enable reiserfs debug mode"
+	depends on REISERFS_FS
+	help
+	  If you set this to Y, then ReiserFS will perform every check it can
+	  possibly imagine of its internal consistency throughout its
+	  operation.  It will also go substantially slower.  More than once we
+	  have forgotten that this was on, and then gone despondent over the
+	  latest benchmarks.:-) Use of this option allows our team to go all
+	  out in checking for consistency when debugging without fear of its
+	  effect on end users.  If you are on the verge of sending in a bug
+	  report, say Y and you might get a useful error message.  Almost
+	  everyone should say N.
+
+config REISERFS_PROC_INFO
+	bool "Stats in /proc/fs/reiserfs"
+	depends on REISERFS_FS && PROC_FS
+	help
+	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
+	  various ReiserFS statistics and internal data at the expense of
+	  making your kernel or module slightly larger (+8 KB). This also
+	  increases the amount of kernel memory required for each mount.
+	  Almost everyone but ReiserFS developers and people fine-tuning
+	  reiserfs or tracing problems should say N.
+
+config REISERFS_FS_XATTR
+	bool "ReiserFS extended attributes"
+	depends on REISERFS_FS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config REISERFS_FS_POSIX_ACL
+	bool "ReiserFS POSIX Access Control Lists"
+	depends on REISERFS_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config REISERFS_FS_SECURITY
+	bool "ReiserFS Security Labels"
+	depends on REISERFS_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ReiserFS filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
-- 
cgit v1.2.3


From f5c77969b33cc5cbb4534289bf23cb1794f9d37c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:24:27 +0300
Subject: fs/Kconfig: move jfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 51 +--------------------------------------------------
 fs/jfs/Kconfig | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 50 deletions(-)
 create mode 100644 fs/jfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 03fde694969e..b39675cc0fc0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,56 +28,7 @@ config FS_MBCACHE
 	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 
 source "fs/reiserfs/Kconfig"
-
-config JFS_FS
-	tristate "JFS filesystem support"
-	select NLS
-	help
-	  This is a port of IBM's Journaled Filesystem .  More information is
-	  available in the file <file:Documentation/filesystems/jfs.txt>.
-
-	  If you do not intend to use the JFS filesystem, say N.
-
-config JFS_POSIX_ACL
-	bool "JFS POSIX Access Control Lists"
-	depends on JFS_FS
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config JFS_SECURITY
-	bool "JFS Security Labels"
-	depends on JFS_FS
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the jfs filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JFS_DEBUG
-	bool "JFS debugging"
-	depends on JFS_FS
-	help
-	  If you are experiencing any problems with the JFS filesystem, say
-	  Y here.  This will result in additional debugging messages to be
-	  written to the system log.  Under normal circumstances, this
-	  results in very little overhead.
-
-config JFS_STATISTICS
-	bool "JFS statistics"
-	depends on JFS_FS
-	help
-	  Enabling this option will cause statistics from the JFS file system
-	  to be made available to the user in the /proc/fs/jfs/ directory.
+source "fs/jfs/Kconfig"
 
 config FS_POSIX_ACL
 # Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
new file mode 100644
index 000000000000..9ff619a6f9cc
--- /dev/null
+++ b/fs/jfs/Kconfig
@@ -0,0 +1,49 @@
+config JFS_FS
+	tristate "JFS filesystem support"
+	select NLS
+	help
+	  This is a port of IBM's Journaled Filesystem .  More information is
+	  available in the file <file:Documentation/filesystems/jfs.txt>.
+
+	  If you do not intend to use the JFS filesystem, say N.
+
+config JFS_POSIX_ACL
+	bool "JFS POSIX Access Control Lists"
+	depends on JFS_FS
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFS_SECURITY
+	bool "JFS Security Labels"
+	depends on JFS_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jfs filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFS_DEBUG
+	bool "JFS debugging"
+	depends on JFS_FS
+	help
+	  If you are experiencing any problems with the JFS filesystem, say
+	  Y here.  This will result in additional debugging messages to be
+	  written to the system log.  Under normal circumstances, this
+	  results in very little overhead.
+
+config JFS_STATISTICS
+	bool "JFS statistics"
+	depends on JFS_FS
+	help
+	  Enabling this option will cause statistics from the JFS file system
+	  to be made available to the user in the /proc/fs/jfs/ directory.
-- 
cgit v1.2.3


From 2fe4371dff3f1a5a1f7d91f1b090076954f4d17e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:26:11 +0300
Subject: fs/Kconfig: move ocfs2 out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 87 +-------------------------------------------------------
 fs/ocfs2/Kconfig | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 86 deletions(-)
 create mode 100644 fs/ocfs2/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b39675cc0fc0..9fbc43f973d4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -49,92 +49,7 @@ config FILE_LOCKING
 
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
-
-config OCFS2_FS
-	tristate "OCFS2 file system support"
-	depends on NET && SYSFS
-	select CONFIGFS_FS
-	select JBD2
-	select CRC32
-	select QUOTA
-	select QUOTA_TREE
-	help
-	  OCFS2 is a general purpose extent based shared disk cluster file
-	  system with many similarities to ext3. It supports 64 bit inode
-	  numbers, and has automatically extending metadata groups which may
-	  also make it attractive for non-clustered use.
-
-	  You'll want to install the ocfs2-tools package in order to at least
-	  get "mount.ocfs2".
-
-	  Project web page:    http://oss.oracle.com/projects/ocfs2
-	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
-	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
-
-	  For more information on OCFS2, see the file
-	  <file:Documentation/filesystems/ocfs2.txt>.
-
-config OCFS2_FS_O2CB
-	tristate "O2CB Kernelspace Clustering"
-	depends on OCFS2_FS
-	default y
-	help
-	  OCFS2 includes a simple kernelspace clustering package, the OCFS2
-	  Cluster Base.  It only requires a very small userspace component
-	  to configure it. This comes with the standard ocfs2-tools package.
-	  O2CB is limited to maintaining a cluster for OCFS2 file systems.
-	  It cannot manage any other cluster applications.
-
-	  It is always safe to say Y here, as the clustering method is
-	  run-time selectable.
-
-config OCFS2_FS_USERSPACE_CLUSTER
-	tristate "OCFS2 Userspace Clustering"
-	depends on OCFS2_FS && DLM
-	default y
-	help
-	  This option will allow OCFS2 to use userspace clustering services
-	  in conjunction with the DLM in fs/dlm.  If you are using a
-	  userspace cluster manager, say Y here.
-
-	  It is safe to say Y, as the clustering method is run-time
-	  selectable.
-
-config OCFS2_FS_STATS
-	bool "OCFS2 statistics"
-	depends on OCFS2_FS
-	default y
-	help
-	  This option allows some fs statistics to be captured. Enabling
-	  this option may increase the memory consumption.
-
-config OCFS2_DEBUG_MASKLOG
-	bool "OCFS2 logging support"
-	depends on OCFS2_FS
-	default y
-	help
-	  The ocfs2 filesystem has an extensive logging system.  The system
-	  allows selection of events to log via files in /sys/o2cb/logmask/.
-	  This option will enlarge your kernel, but it allows debugging of
-	  ocfs2 filesystem issues.
-
-config OCFS2_DEBUG_FS
-	bool "OCFS2 expensive checks"
-	depends on OCFS2_FS
-	default n
-	help
-	  This option will enable expensive consistency checks. Enable
-	  this option for debugging only as it is likely to decrease
-	  performance of the filesystem.
-
-config OCFS2_FS_POSIX_ACL
-	bool "OCFS2 POSIX Access Control Lists"
-	depends on OCFS2_FS
-	select FS_POSIX_ACL
-	default n
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
+source "fs/ocfs2/Kconfig"
 
 config BTRFS_FS
 	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 000000000000..701b7a3a872e
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,85 @@
+config OCFS2_FS
+	tristate "OCFS2 file system support"
+	depends on NET && SYSFS
+	select CONFIGFS_FS
+	select JBD2
+	select CRC32
+	select QUOTA
+	select QUOTA_TREE
+	help
+	  OCFS2 is a general purpose extent based shared disk cluster file
+	  system with many similarities to ext3. It supports 64 bit inode
+	  numbers, and has automatically extending metadata groups which may
+	  also make it attractive for non-clustered use.
+
+	  You'll want to install the ocfs2-tools package in order to at least
+	  get "mount.ocfs2".
+
+	  Project web page:    http://oss.oracle.com/projects/ocfs2
+	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+	  For more information on OCFS2, see the file
+	  <file:Documentation/filesystems/ocfs2.txt>.
+
+config OCFS2_FS_O2CB
+	tristate "O2CB Kernelspace Clustering"
+	depends on OCFS2_FS
+	default y
+	help
+	  OCFS2 includes a simple kernelspace clustering package, the OCFS2
+	  Cluster Base.  It only requires a very small userspace component
+	  to configure it. This comes with the standard ocfs2-tools package.
+	  O2CB is limited to maintaining a cluster for OCFS2 file systems.
+	  It cannot manage any other cluster applications.
+
+	  It is always safe to say Y here, as the clustering method is
+	  run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+	tristate "OCFS2 Userspace Clustering"
+	depends on OCFS2_FS && DLM
+	default y
+	help
+	  This option will allow OCFS2 to use userspace clustering services
+	  in conjunction with the DLM in fs/dlm.  If you are using a
+	  userspace cluster manager, say Y here.
+
+	  It is safe to say Y, as the clustering method is run-time
+	  selectable.
+
+config OCFS2_FS_STATS
+	bool "OCFS2 statistics"
+	depends on OCFS2_FS
+	default y
+	help
+	  This option allows some fs statistics to be captured. Enabling
+	  this option may increase the memory consumption.
+
+config OCFS2_DEBUG_MASKLOG
+	bool "OCFS2 logging support"
+	depends on OCFS2_FS
+	default y
+	help
+	  The ocfs2 filesystem has an extensive logging system.  The system
+	  allows selection of events to log via files in /sys/o2cb/logmask/.
+	  This option will enlarge your kernel, but it allows debugging of
+	  ocfs2 filesystem issues.
+
+config OCFS2_DEBUG_FS
+	bool "OCFS2 expensive checks"
+	depends on OCFS2_FS
+	default n
+	help
+	  This option will enable expensive consistency checks. Enable
+	  this option for debugging only as it is likely to decrease
+	  performance of the filesystem.
+
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
+	depends on OCFS2_FS
+	select FS_POSIX_ACL
+	default n
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
-- 
cgit v1.2.3


From 335debee07f2d4187a6073d7764ed56bb2ae52f4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:27:30 +0300
Subject: fs/Kconfig: move btrfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 20 +-------------------
 fs/btrfs/Kconfig | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)
 create mode 100644 fs/btrfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9fbc43f973d4..51f2aba92c22 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,25 +50,7 @@ config FILE_LOCKING
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
-
-config BTRFS_FS
-	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
-	depends on EXPERIMENTAL
-	select LIBCRC32C
-	select ZLIB_INFLATE
-	select ZLIB_DEFLATE
-	help
-	  Btrfs is a new filesystem with extents, writable snapshotting,
-	  support for multiple devices and many more features.
-
-	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
-	  FINALIZED.  You should say N here unless you are interested in
-	  testing Btrfs with non-critical data.
-
-	  To compile this file system support as a module, choose M here. The
-	  module will be called btrfs.
-
-	  If unsure, say N.
+source "fs/btrfs/Kconfig"
 
 endif # BLOCK
 
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 000000000000..f8fcf999ea1b
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,18 @@
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	help
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
+
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED.  You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 90ffd467933eaf581e11fec51e7ba16fc9bd542d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:31:56 +0300
Subject: fs/Kconfig: move autofs, autofs4 out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig         | 44 ++------------------------------------------
 fs/autofs/Kconfig  | 21 +++++++++++++++++++++
 fs/autofs4/Kconfig | 20 ++++++++++++++++++++
 3 files changed, 43 insertions(+), 42 deletions(-)
 create mode 100644 fs/autofs/Kconfig
 create mode 100644 fs/autofs4/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 51f2aba92c22..70527fe6b630 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -112,48 +112,8 @@ config QUOTACTL
 	depends on XFS_QUOTA || QUOTA
 	default y
 
-config AUTOFS_FS
-	tristate "Kernel automounter support"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from the autofs
-	  package; you can find the location in <file:Documentation/Changes>.
-	  You also want to answer Y to "NFS file system support", below.
-
-	  If you want to use the newer version of the automounter with more
-	  features, say N here and say Y to "Kernel automounter v4 support",
-	  below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs.
-
-	  If you are not a part of a fairly large, distributed network, you
-	  probably do not need an automounter, and can say N here.
-
-config AUTOFS4_FS
-	tristate "Kernel automounter version 4 support (also supports v3)"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from
-	  <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
-	  want to answer Y to "NFS file system support", below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs4.  You will need to add "alias autofs autofs4" to your
-	  modules configuration file.
-
-	  If you are not a part of a fairly large, distributed network or
-	  don't have a laptop which needs to dynamically reconfigure to the
-	  local network, you probably do not need an automounter, and can say
-	  N here.
+source "fs/autofs/Kconfig"
+source "fs/autofs4/Kconfig"
 
 config FUSE_FS
 	tristate "FUSE (Filesystem in Userspace) support"
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
new file mode 100644
index 000000000000..5f3bea90911e
--- /dev/null
+++ b/fs/autofs/Kconfig
@@ -0,0 +1,21 @@
+config AUTOFS_FS
+	tristate "Kernel automounter support"
+	help
+	  The automounter is a tool to automatically mount remote file systems
+	  on demand. This implementation is partially kernel-based to reduce
+	  overhead in the already-mounted case; this is unlike the BSD
+	  automounter (amd), which is a pure user space daemon.
+
+	  To use the automounter you need the user-space tools from the autofs
+	  package; you can find the location in <file:Documentation/Changes>.
+	  You also want to answer Y to "NFS file system support", below.
+
+	  If you want to use the newer version of the automounter with more
+	  features, say N here and say Y to "Kernel automounter v4 support",
+	  below.
+
+	  To compile this support as a module, choose M here: the module will be
+	  called autofs.
+
+	  If you are not a part of a fairly large, distributed network, you
+	  probably do not need an automounter, and can say N here.
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
new file mode 100644
index 000000000000..1204d6384d39
--- /dev/null
+++ b/fs/autofs4/Kconfig
@@ -0,0 +1,20 @@
+config AUTOFS4_FS
+	tristate "Kernel automounter version 4 support (also supports v3)"
+	help
+	  The automounter is a tool to automatically mount remote file systems
+	  on demand. This implementation is partially kernel-based to reduce
+	  overhead in the already-mounted case; this is unlike the BSD
+	  automounter (amd), which is a pure user space daemon.
+
+	  To use the automounter you need the user-space tools from
+	  <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+	  want to answer Y to "NFS file system support", below.
+
+	  To compile this support as a module, choose M here: the module will be
+	  called autofs4.  You will need to add "alias autofs autofs4" to your
+	  modules configuration file.
+
+	  If you are not a part of a fairly large, distributed network or
+	  don't have a laptop which needs to dynamically reconfigure to the
+	  local network, you probably do not need an automounter, and can say
+	  N here.
-- 
cgit v1.2.3


From 3ef7784e47975e31148c25b6fa795949fdc16d9c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:33:25 +0300
Subject: fs/Kconfig: move fuse out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 17 +----------------
 fs/fuse/Kconfig | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 16 deletions(-)
 create mode 100644 fs/fuse/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 70527fe6b630..8b36059d2b0c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -114,22 +114,7 @@ config QUOTACTL
 
 source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
-
-config FUSE_FS
-	tristate "FUSE (Filesystem in Userspace) support"
-	help
-	  With FUSE it is possible to implement a fully functional filesystem
-	  in a userspace program.
-
-	  There's also companion library: libfuse.  This library along with
-	  utilities is available from the FUSE homepage:
-	  <http://fuse.sourceforge.net/>
-
-	  See <file:Documentation/filesystems/fuse.txt> for more information.
-	  See <file:Documentation/Changes> for needed library/utility version.
-
-	  If you want to develop a userspace FS, or if you want to use
-	  a filesystem based on FUSE, answer Y or M.
+source "fs/fuse/Kconfig"
 
 config GENERIC_ACL
 	bool
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 000000000000..0cf160a94eda
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,15 @@
+config FUSE_FS
+	tristate "FUSE (Filesystem in Userspace) support"
+	help
+	  With FUSE it is possible to implement a fully functional filesystem
+	  in a userspace program.
+
+	  There's also a companion library: libfuse.  This library along with
+	  utilities is available from the FUSE homepage:
+	  <http://fuse.sourceforge.net/>
+
+	  See <file:Documentation/filesystems/fuse.txt> for more information.
+	  See <file:Documentation/Changes> for needed library/utility version.
+
+	  If you want to develop a userspace FS, or if you want to use
+	  a filesystem based on FUSE, answer Y or M.
-- 
cgit v1.2.3


From ddfaccd995b2d1bb1df4461ee9403ba9fdcbee04 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:35:21 +0300
Subject: fs/Kconfig: move iso9660, udf out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 60 ++------------------------------------------------------
 fs/isofs/Kconfig | 39 ++++++++++++++++++++++++++++++++++++
 fs/udf/Kconfig   | 18 +++++++++++++++++
 3 files changed, 59 insertions(+), 58 deletions(-)
 create mode 100644 fs/isofs/Kconfig
 create mode 100644 fs/udf/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 8b36059d2b0c..b4868b8fd999 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -123,64 +123,8 @@ config GENERIC_ACL
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
-config ISO9660_FS
-	tristate "ISO 9660 CDROM file system support"
-	help
-	  This is the standard file system used on CD-ROMs.  It was previously
-	  known as "High Sierra File System" and is called "hsfs" on other
-	  Unix systems.  The so-called Rock-Ridge extensions which allow for
-	  long Unix filenames and symbolic links are also supported by this
-	  driver.  If you have a CD-ROM drive and want to do more with it than
-	  just listen to audio CDs and watch its LEDs, say Y (and read
-	  <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>), thereby
-	  enlarging your kernel by about 27 KB; otherwise say N.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called isofs.
-
-config JOLIET
-	bool "Microsoft Joliet CDROM extensions"
-	depends on ISO9660_FS
-	select NLS
-	help
-	  Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
-	  which allows for long filenames in unicode format (unicode is the
-	  new 16 bit character code, successor to ASCII, which encodes the
-	  characters of almost all languages of the world; see
-	  <http://www.unicode.org/> for more information).  Say Y here if you
-	  want to be able to read Joliet CD-ROMs under Linux.
-
-config ZISOFS
-	bool "Transparent decompression extension"
-	depends on ISO9660_FS
-	select ZLIB_INFLATE
-	help
-	  This is a Linux-specific extension to RockRidge which lets you store
-	  data in compressed form on a CD-ROM and have it transparently
-	  decompressed when the CD-ROM is accessed.  See
-	  <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
-	  necessary to create such a filesystem.  Say Y here if you want to be
-	  able to read such compressed CD-ROMs.
-
-config UDF_FS
-	tristate "UDF file system support"
-	select CRC_ITU_T
-	help
-	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
-	  you intend to mount DVD discs or CDRW's written in packet mode, or
-	  if written to by other UDF utilities, such as DirectCD.
-	  Please read <file:Documentation/filesystems/udf.txt>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called udf.
-
-	  If unsure, say N.
-
-config UDF_NLS
-	bool
-	default y
-	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
+source "fs/isofs/Kconfig"
+source "fs/udf/Kconfig"
 
 endmenu
 endif # BLOCK
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
new file mode 100644
index 000000000000..8ab9878e3671
--- /dev/null
+++ b/fs/isofs/Kconfig
@@ -0,0 +1,39 @@
+config ISO9660_FS
+	tristate "ISO 9660 CDROM file system support"
+	help
+	  This is the standard file system used on CD-ROMs.  It was previously
+	  known as "High Sierra File System" and is called "hsfs" on other
+	  Unix systems.  The so-called Rock-Ridge extensions which allow for
+	  long Unix filenames and symbolic links are also supported by this
+	  driver.  If you have a CD-ROM drive and want to do more with it than
+	  just listen to audio CDs and watch its LEDs, say Y (and read
+	  <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>), thereby
+	  enlarging your kernel by about 27 KB; otherwise say N.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called isofs.
+
+config JOLIET
+	bool "Microsoft Joliet CDROM extensions"
+	depends on ISO9660_FS
+	select NLS
+	help
+	  Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
+	  which allows for long filenames in unicode format (unicode is the
+	  new 16 bit character code, successor to ASCII, which encodes the
+	  characters of almost all languages of the world; see
+	  <http://www.unicode.org/> for more information).  Say Y here if you
+	  want to be able to read Joliet CD-ROMs under Linux.
+
+config ZISOFS
+	bool "Transparent decompression extension"
+	depends on ISO9660_FS
+	select ZLIB_INFLATE
+	help
+	  This is a Linux-specific extension to RockRidge which lets you store
+	  data in compressed form on a CD-ROM and have it transparently
+	  decompressed when the CD-ROM is accessed.  See
+	  <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
+	  necessary to create such a filesystem.  Say Y here if you want to be
+	  able to read such compressed CD-ROMs.
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
new file mode 100644
index 000000000000..0e0e99bd6bce
--- /dev/null
+++ b/fs/udf/Kconfig
@@ -0,0 +1,18 @@
+config UDF_FS
+	tristate "UDF file system support"
+	select CRC_ITU_T
+	help
+	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
+	  you intend to mount DVD discs or CDRW's written in packet mode, or
+	  if written to by other UDF utilities, such as DirectCD.
+	  Please read <file:Documentation/filesystems/udf.txt>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called udf.
+
+	  If unsure, say N.
+
+config UDF_NLS
+	bool
+	default y
+	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
-- 
cgit v1.2.3


From 1c6ace019bce5e918a3d6cd53948652e14850644 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:37:59 +0300
Subject: fs/Kconfig: move fat out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 98 +---------------------------------------------------------
 fs/fat/Kconfig | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 97 deletions(-)
 create mode 100644 fs/fat/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b4868b8fd999..fdb2c351b4a7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -132,103 +132,7 @@ endif # BLOCK
 if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
-config FAT_FS
-	tristate
-	select NLS
-	help
-	  If you want to use one of the FAT-based file systems (the MS-DOS and
-	  VFAT (Windows 95) file systems), then you must say Y or M here
-	  to include FAT support. You will then be able to mount partitions or
-	  diskettes with FAT-based file systems and transparently access the
-	  files on them, i.e. MSDOS files will look and behave just like all
-	  other Unix files.
-
-	  This FAT support is not a file system in itself, it only provides
-	  the foundation for the other file systems. You will have to say Y or
-	  M to at least one of "MSDOS fs support" or "VFAT fs support" in
-	  order to make use of it.
-
-	  Another way to read and write MSDOS floppies and hard drive
-	  partitions from within Linux (but not transparently) is with the
-	  mtools ("man mtools") program suite. You don't need to say Y here in
-	  order to do that.
-
-	  If you need to move large files on floppies between a DOS and a
-	  Linux box, say Y here, mount the floppy under Linux with an MSDOS
-	  file system and use GNU tar's M option. GNU tar is a program
-	  available for Unix and DOS ("man tar" or "info tar").
-
-	  The FAT support will enlarge your kernel by about 37 KB. If unsure,
-	  say Y.
-
-	  To compile this as a module, choose M here: the module will be called
-	  fat.  Note that if you compile the FAT support as a module, you
-	  cannot compile any of the FAT-based file systems into the kernel
-	  -- they will have to be modules as well.
-
-config MSDOS_FS
-	tristate "MSDOS fs support"
-	select FAT_FS
-	help
-	  This allows you to mount MSDOS partitions of your hard drive (unless
-	  they are compressed; to access compressed MSDOS partitions under
-	  Linux, you can either use the DOS emulator DOSEMU, described in the
-	  DOSEMU-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
-	  <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
-	  intend to use dosemu with a non-compressed MSDOS partition, say Y
-	  here) and MSDOS floppies. This means that file access becomes
-	  transparent, i.e. the MSDOS files look and behave just like all
-	  other Unix files.
-
-	  If you have Windows 95 or Windows NT installed on your MSDOS
-	  partitions, you should use the VFAT file system (say Y to "VFAT fs
-	  support" below), or you will not be able to see the long filenames
-	  generated by Windows 95 / Windows NT.
-
-	  This option will enlarge your kernel by about 7 KB. If unsure,
-	  answer Y. This will only work if you said Y to "DOS FAT fs support"
-	  as well. To compile this as a module, choose M here: the module will
-	  be called msdos.
-
-config VFAT_FS
-	tristate "VFAT (Windows-95) fs support"
-	select FAT_FS
-	help
-	  This option provides support for normal Windows file systems with
-	  long filenames.  That includes non-compressed FAT-based file systems
-	  used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
-	  programs from the mtools package.
-
-	  The VFAT support enlarges your kernel by about 10 KB and it only
-	  works if you said Y to the "DOS FAT fs support" above.  Please read
-	  the file <file:Documentation/filesystems/vfat.txt> for details.  If
-	  unsure, say Y.
-
-	  To compile this as a module, choose M here: the module will be called
-	  vfat.
-
-config FAT_DEFAULT_CODEPAGE
-	int "Default codepage for FAT"
-	depends on MSDOS_FS || VFAT_FS
-	default 437
-	help
-	  This option should be set to the codepage of your FAT filesystems.
-	  It can be overridden with the "codepage" mount option.
-	  See <file:Documentation/filesystems/vfat.txt> for more information.
-
-config FAT_DEFAULT_IOCHARSET
-	string "Default iocharset for FAT"
-	depends on VFAT_FS
-	default "iso8859-1"
-	help
-	  Set this to the default input/output character set you'd
-	  like FAT to use. It should probably match the character set
-	  that most of your FAT filesystems use, and can be overridden
-	  with the "iocharset" mount option for FAT filesystems.
-	  Note that "utf8" is not recommended for FAT filesystems.
-	  If unsure, you shouldn't set "utf8" here.
-	  See <file:Documentation/filesystems/vfat.txt> for more information.
+source "fs/fat/Kconfig"
 
 config NTFS_FS
 	tristate "NTFS file system support"
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
new file mode 100644
index 000000000000..d0a69ff25375
--- /dev/null
+++ b/fs/fat/Kconfig
@@ -0,0 +1,97 @@
+config FAT_FS
+	tristate
+	select NLS
+	help
+	  If you want to use one of the FAT-based file systems (the MS-DOS and
+	  VFAT (Windows 95) file systems), then you must say Y or M here
+	  to include FAT support. You will then be able to mount partitions or
+	  diskettes with FAT-based file systems and transparently access the
+	  files on them, i.e. MSDOS files will look and behave just like all
+	  other Unix files.
+
+	  This FAT support is not a file system in itself, it only provides
+	  the foundation for the other file systems. You will have to say Y or
+	  M to at least one of "MSDOS fs support" or "VFAT fs support" in
+	  order to make use of it.
+
+	  Another way to read and write MSDOS floppies and hard drive
+	  partitions from within Linux (but not transparently) is with the
+	  mtools ("man mtools") program suite. You don't need to say Y here in
+	  order to do that.
+
+	  If you need to move large files on floppies between a DOS and a
+	  Linux box, say Y here, mount the floppy under Linux with an MSDOS
+	  file system and use GNU tar's M option. GNU tar is a program
+	  available for Unix and DOS ("man tar" or "info tar").
+
+	  The FAT support will enlarge your kernel by about 37 KB. If unsure,
+	  say Y.
+
+	  To compile this as a module, choose M here: the module will be called
+	  fat.  Note that if you compile the FAT support as a module, you
+	  cannot compile any of the FAT-based file systems into the kernel
+	  -- they will have to be modules as well.
+
+config MSDOS_FS
+	tristate "MSDOS fs support"
+	select FAT_FS
+	help
+	  This allows you to mount MSDOS partitions of your hard drive (unless
+	  they are compressed; to access compressed MSDOS partitions under
+	  Linux, you can either use the DOS emulator DOSEMU, described in the
+	  DOSEMU-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
+	  <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
+	  intend to use dosemu with a non-compressed MSDOS partition, say Y
+	  here) and MSDOS floppies. This means that file access becomes
+	  transparent, i.e. the MSDOS files look and behave just like all
+	  other Unix files.
+
+	  If you have Windows 95 or Windows NT installed on your MSDOS
+	  partitions, you should use the VFAT file system (say Y to "VFAT fs
+	  support" below), or you will not be able to see the long filenames
+	  generated by Windows 95 / Windows NT.
+
+	  This option will enlarge your kernel by about 7 KB. If unsure,
+	  answer Y. This will only work if you said Y to "DOS FAT fs support"
+	  as well. To compile this as a module, choose M here: the module will
+	  be called msdos.
+
+config VFAT_FS
+	tristate "VFAT (Windows-95) fs support"
+	select FAT_FS
+	help
+	  This option provides support for normal Windows file systems with
+	  long filenames.  That includes non-compressed FAT-based file systems
+	  used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
+	  programs from the mtools package.
+
+	  The VFAT support enlarges your kernel by about 10 KB and it only
+	  works if you said Y to the "DOS FAT fs support" above.  Please read
+	  the file <file:Documentation/filesystems/vfat.txt> for details.  If
+	  unsure, say Y.
+
+	  To compile this as a module, choose M here: the module will be called
+	  vfat.
+
+config FAT_DEFAULT_CODEPAGE
+	int "Default codepage for FAT"
+	depends on MSDOS_FS || VFAT_FS
+	default 437
+	help
+	  This option should be set to the codepage of your FAT filesystems.
+	  It can be overridden with the "codepage" mount option.
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+config FAT_DEFAULT_IOCHARSET
+	string "Default iocharset for FAT"
+	depends on VFAT_FS
+	default "iso8859-1"
+	help
+	  Set this to the default input/output character set you'd
+	  like FAT to use. It should probably match the character set
+	  that most of your FAT filesystems use, and can be overridden
+	  with the "iocharset" mount option for FAT filesystems.
+	  Note that "utf8" is not recommended for FAT filesystems.
+	  If unsure, you shouldn't set "utf8" here.
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
-- 
cgit v1.2.3


From 9d73ac9e8faffa3b930fcebbf4ebcd25f8061ada Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:39:20 +0300
Subject: fs/Kconfig: move ntfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 80 +--------------------------------------------------------
 fs/ntfs/Kconfig | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 79 deletions(-)
 create mode 100644 fs/ntfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index fdb2c351b4a7..f746fd6cb728 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -133,85 +133,7 @@ if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
-
-config NTFS_FS
-	tristate "NTFS file system support"
-	select NLS
-	help
-	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-
-	  Saying Y or M here enables read support.  There is partial, but
-	  safe, write support available.  For write support you must also
-	  say Y to "NTFS write support" below.
-
-	  There are also a number of user-space tools available, called
-	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
-	  without NTFS support enabled in the kernel.
-
-	  This is a rewrite from scratch of Linux NTFS support and replaced
-	  the old NTFS code starting with Linux 2.5.11.  A backport to
-	  the Linux 2.4 kernel series is separately available as a patch
-	  from the project web site.
-
-	  For more information see <file:Documentation/filesystems/ntfs.txt>
-	  and <http://www.linux-ntfs.org/>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ntfs.
-
-	  If you are not using Windows NT, 2000, XP or 2003 in addition to
-	  Linux on your computer it is safe to say N.
-
-config NTFS_DEBUG
-	bool "NTFS debugging support"
-	depends on NTFS_FS
-	help
-	  If you are experiencing any problems with the NTFS file system, say
-	  Y here.  This will result in additional consistency checks to be
-	  performed by the driver as well as additional debugging messages to
-	  be written to the system log.  Note that debugging messages are
-	  disabled by default.  To enable them, supply the option debug_msgs=1
-	  at the kernel command line when booting the kernel or as an option
-	  to insmod when loading the ntfs module.  Once the driver is active,
-	  you can enable debugging messages by doing (as root):
-	  echo 1 > /proc/sys/fs/ntfs-debug
-	  Replacing the "1" with "0" would disable debug messages.
-
-	  If you leave debugging messages disabled, this results in little
-	  overhead, but enabling debug messages results in very significant
-	  slowdown of the system.
-
-	  When reporting bugs, please try to have available a full dump of
-	  debugging messages while the misbehaviour was occurring.
-
-config NTFS_RW
-	bool "NTFS write support"
-	depends on NTFS_FS
-	help
-	  This enables the partial, but safe, write support in the NTFS driver.
-
-	  The only supported operation is overwriting existing files, without
-	  changing the file length.  No file or directory creation, deletion or
-	  renaming is possible.  Note only non-resident files can be written to
-	  so you may find that some very small files (<500 bytes or so) cannot
-	  be written to.
-
-	  While we cannot guarantee that it will not damage any data, we have
-	  so far not received a single report where the driver would have
-	  damaged someones data so we assume it is perfectly safe to use.
-
-	  Note:  While write support is safe in this version (a rewrite from
-	  scratch of the NTFS support), it should be noted that the old NTFS
-	  write support, included in Linux 2.5.10 and before (since 1997),
-	  is not safe.
-
-	  This is currently useful with TopologiLinux.  TopologiLinux is run
-	  on top of any DOS/Microsoft Windows system without partitioning your
-	  hard disk.  Unlike other Linux distributions TopologiLinux does not
-	  need its own partition.  For more information see
-	  <http://topologi-linux.sourceforge.net/>
-
-	  It is perfectly safe to say N here.
+source "fs/ntfs/Kconfig"
 
 endmenu
 endif # BLOCK
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 000000000000..f5a868cc9152
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
+config NTFS_FS
+	tristate "NTFS file system support"
+	select NLS
+	help
+	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+
+	  Saying Y or M here enables read support.  There is partial, but
+	  safe, write support available.  For write support you must also
+	  say Y to "NTFS write support" below.
+
+	  There are also a number of user-space tools available, called
+	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
+	  without NTFS support enabled in the kernel.
+
+	  This is a rewrite from scratch of Linux NTFS support and replaced
+	  the old NTFS code starting with Linux 2.5.11.  A backport to
+	  the Linux 2.4 kernel series is separately available as a patch
+	  from the project web site.
+
+	  For more information see <file:Documentation/filesystems/ntfs.txt>
+	  and <http://www.linux-ntfs.org/>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ntfs.
+
+	  If you are not using Windows NT, 2000, XP or 2003 in addition to
+	  Linux on your computer it is safe to say N.
+
+config NTFS_DEBUG
+	bool "NTFS debugging support"
+	depends on NTFS_FS
+	help
+	  If you are experiencing any problems with the NTFS file system, say
+	  Y here.  This will result in additional consistency checks to be
+	  performed by the driver as well as additional debugging messages to
+	  be written to the system log.  Note that debugging messages are
+	  disabled by default.  To enable them, supply the option debug_msgs=1
+	  at the kernel command line when booting the kernel or as an option
+	  to insmod when loading the ntfs module.  Once the driver is active,
+	  you can enable debugging messages by doing (as root):
+	  echo 1 > /proc/sys/fs/ntfs-debug
+	  Replacing the "1" with "0" would disable debug messages.
+
+	  If you leave debugging messages disabled, this results in little
+	  overhead, but enabling debug messages results in very significant
+	  slowdown of the system.
+
+	  When reporting bugs, please try to have available a full dump of
+	  debugging messages while the misbehaviour was occurring.
+
+config NTFS_RW
+	bool "NTFS write support"
+	depends on NTFS_FS
+	help
+	  This enables the partial, but safe, write support in the NTFS driver.
+
+	  The only supported operation is overwriting existing files, without
+	  changing the file length.  No file or directory creation, deletion or
+	  renaming is possible.  Note only non-resident files can be written to
+	  so you may find that some very small files (<500 bytes or so) cannot
+	  be written to.
+
+	  While we cannot guarantee that it will not damage any data, we have
+	  so far not received a single report where the driver would have
+	  damaged someone's data so we assume it is perfectly safe to use.
+
+	  Note:  While write support is safe in this version (a rewrite from
+	  scratch of the NTFS support), it should be noted that the old NTFS
+	  write support, included in Linux 2.5.10 and before (since 1997),
+	  is not safe.
+
+	  This is currently useful with TopologiLinux.  TopologiLinux is run
+	  on top of any DOS/Microsoft Windows system without partitioning your
+	  hard disk.  Unlike other Linux distributions TopologiLinux does not
+	  need its own partition.  For more information see
+	  <http://topologi-linux.sourceforge.net/>
+
+	  It is perfectly safe to say N here.
-- 
cgit v1.2.3


From 5f3a211a8b02222498f134ea961fe29c97a4801f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:40:58 +0300
Subject: fs/Kconfig: move sysfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 25 +------------------------
 fs/sysfs/Kconfig | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 24 deletions(-)
 create mode 100644 fs/sysfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f746fd6cb728..e9103b9862b4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -141,30 +141,7 @@ endif # BLOCK
 menu "Pseudo filesystems"
 
 source "fs/proc/Kconfig"
-
-config SYSFS
-	bool "sysfs file system support" if EMBEDDED
-	default y
-	help
-	The sysfs filesystem is a virtual filesystem that the kernel uses to
-	export internal kernel objects, their attributes, and their
-	relationships to one another.
-
-	Users can use sysfs to ascertain useful information about the running
-	kernel, such as the devices the kernel has discovered on each bus and
-	which driver each is bound to. sysfs can also be used to tune devices
-	and other kernel subsystems.
-
-	Some system agents rely on the information in sysfs to operate.
-	/sbin/hotplug uses device and object attributes in sysfs to assist in
-	delegating policy decisions, like persistently naming devices.
-
-	sysfs is currently used by the block subsystem to mount the root
-	partition.  If sysfs is disabled you must specify the boot device on
-	the kernel boot command line via its major and minor numbers.  For
-	example, "root=03:01" for /dev/hda1.
-
-	Designers of embedded systems may wish to say N here to conserve space.
+source "fs/sysfs/Kconfig"
 
 config TMPFS
 	bool "Virtual memory file system support (former shm fs)"
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
new file mode 100644
index 000000000000..f4b67588b9d6
--- /dev/null
+++ b/fs/sysfs/Kconfig
@@ -0,0 +1,23 @@
+config SYSFS
+	bool "sysfs file system support" if EMBEDDED
+	default y
+	help
+	The sysfs filesystem is a virtual filesystem that the kernel uses to
+	export internal kernel objects, their attributes, and their
+	relationships to one another.
+
+	Users can use sysfs to ascertain useful information about the running
+	kernel, such as the devices the kernel has discovered on each bus and
+	which driver each is bound to. sysfs can also be used to tune devices
+	and other kernel subsystems.
+
+	Some system agents rely on the information in sysfs to operate.
+	/sbin/hotplug uses device and object attributes in sysfs to assist in
+	delegating policy decisions, like persistently naming devices.
+
+	sysfs is currently used by the block subsystem to mount the root
+	partition.  If sysfs is disabled you must specify the boot device on
+	the kernel boot command line via its major and minor numbers.  For
+	example, "root=03:01" for /dev/hda1.
+
+	Designers of embedded systems may wish to say N here to conserve space.
-- 
cgit v1.2.3


From 4591dabe27ec0f7928fb73d93694698e21dc769e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:42:52 +0300
Subject: fs/Kconfig: move configfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 12 +-----------
 fs/configfs/Kconfig | 11 +++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)
 create mode 100644 fs/configfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e9103b9862b4..d7d7f1b93635 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -182,17 +182,7 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
-config CONFIGFS_FS
-	tristate "Userspace-driven configuration filesystem"
-	depends on SYSFS
-	help
-	  configfs is a ram-based filesystem that provides the converse
-	  of sysfs's functionality. Where sysfs is a filesystem-based
-	  view of kernel objects, configfs is a filesystem-based manager
-	  of kernel objects, or config_items.
-
-	  Both sysfs and configfs can and should exist together on the
-	  same system. One is not a replacement for the other.
+source "fs/configfs/Kconfig"
 
 endmenu
 
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
new file mode 100644
index 000000000000..13587cc97a0b
--- /dev/null
+++ b/fs/configfs/Kconfig
@@ -0,0 +1,11 @@
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem"
+	depends on SYSFS
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
-- 
cgit v1.2.3


From bc2de2ae67177bc60bb9ab41c97ea4f827d52821 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:48:46 +0300
Subject: fs/Kconfig: move adfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 28 +---------------------------
 fs/adfs/Kconfig | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 27 deletions(-)
 create mode 100644 fs/adfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d7d7f1b93635..e4492c75efe6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -201,33 +201,7 @@ menuconfig MISC_FILESYSTEMS
 
 if MISC_FILESYSTEMS
 
-config ADFS_FS
-	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  The Acorn Disc Filing System is the standard file system of the
-	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
-	  systems and the Acorn Archimedes range of machines. If you say Y
-	  here, Linux will be able to read from ADFS partitions on hard drives
-	  and from ADFS-formatted floppy discs. If you also want to be able to
-	  write to those devices, say Y to "ADFS write support" below.
-
-	  The ADFS partition should be the first partition (i.e.,
-	  /dev/[hs]d?1) on each of your drives. Please read the file
-	  <file:Documentation/filesystems/adfs.txt> for further details.
-
-	  To compile this code as a module, choose M here: the module will be
-	  called adfs.
-
-	  If unsure, say N.
-
-config ADFS_FS_RW
-	bool "ADFS write support (DANGEROUS)"
-	depends on ADFS_FS
-	help
-	  If you say Y here, you will be able to write to ADFS partitions on
-	  hard drives and ADFS-formatted floppy disks. This is experimental
-	  codes, so if you're unsure, say N.
+source "fs/adfs/Kconfig"
 
 config AFFS_FS
 	tristate "Amiga FFS file system support (EXPERIMENTAL)"
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
new file mode 100644
index 000000000000..e55182a74605
--- /dev/null
+++ b/fs/adfs/Kconfig
@@ -0,0 +1,27 @@
+config ADFS_FS
+	tristate "ADFS file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  The Acorn Disc Filing System is the standard file system of the
+	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
+	  systems and the Acorn Archimedes range of machines. If you say Y
+	  here, Linux will be able to read from ADFS partitions on hard drives
+	  and from ADFS-formatted floppy discs. If you also want to be able to
+	  write to those devices, say Y to "ADFS write support" below.
+
+	  The ADFS partition should be the first partition (i.e.,
+	  /dev/[hs]d?1) on each of your drives. Please read the file
+	  <file:Documentation/filesystems/adfs.txt> for further details.
+
+	  To compile this code as a module, choose M here: the module will be
+	  called adfs.
+
+	  If unsure, say N.
+
+config ADFS_FS_RW
+	bool "ADFS write support (DANGEROUS)"
+	depends on ADFS_FS
+	help
+	  If you say Y here, you will be able to write to ADFS partitions on
+	  hard drives and ADFS-formatted floppy disks. This is experimental
+	  code, so if you're unsure, say N.
-- 
cgit v1.2.3


From 10951bf05d952bf6d13094f66a0dccd11dec311e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:49:44 +0300
Subject: fs/Kconfig: move affs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 23 +----------------------
 fs/affs/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 fs/affs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e4492c75efe6..3e025af4d8b4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -202,28 +202,7 @@ menuconfig MISC_FILESYSTEMS
 if MISC_FILESYSTEMS
 
 source "fs/adfs/Kconfig"
-
-config AFFS_FS
-	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  The Fast File System (FFS) is the common file system used on hard
-	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
-	  if you want to be able to read and write files from and to an Amiga
-	  FFS partition on your hard drive.  Amiga floppies however cannot be
-	  read with this driver due to an incompatibility of the floppy
-	  controller used in an Amiga and the standard floppy controller in
-	  PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
-	  and <file:fs/affs/Changes>.
-
-	  With this driver you can also mount disk files used by Bernd
-	  Schmidt's Un*X Amiga Emulator
-	  (<http://www.freiburg.linux.de/~uae/>).
-	  If you want to do this, you will also need to say Y or M to "Loop
-	  device support", above.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called affs.  If unsure, say N.
+source "fs/affs/Kconfig"
 
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
new file mode 100644
index 000000000000..cfad9afb4762
--- /dev/null
+++ b/fs/affs/Kconfig
@@ -0,0 +1,21 @@
+config AFFS_FS
+	tristate "Amiga FFS file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  The Fast File System (FFS) is the common file system used on hard
+	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
+	  if you want to be able to read and write files from and to an Amiga
+	  FFS partition on your hard drive.  Amiga floppies however cannot be
+	  read with this driver due to an incompatibility of the floppy
+	  controller used in an Amiga and the standard floppy controller in
+	  PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
+	  and <file:fs/affs/Changes>.
+
+	  With this driver you can also mount disk files used by Bernd
+	  Schmidt's Un*X Amiga Emulator
+	  (<http://www.freiburg.linux.de/~uae/>).
+	  If you want to do this, you will also need to say Y or M to "Loop
+	  device support", above.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called affs.  If unsure, say N.
-- 
cgit v1.2.3


From 295c896cb95de18004ef5e1b53f44c2ad001f936 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:50:50 +0300
Subject: fs/Kconfig: move ecryptfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 13 +------------
 fs/ecryptfs/Kconfig | 11 +++++++++++
 2 files changed, 12 insertions(+), 12 deletions(-)
 create mode 100644 fs/ecryptfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3e025af4d8b4..1c79baf55db2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,18 +203,7 @@ if MISC_FILESYSTEMS
 
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
-
-config ECRYPT_FS
-	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && CRYPTO && NET
-	help
-	  Encrypted filesystem that operates on the VFS layer.  See
-	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
-	  eCryptfs.  Userspace components are required and can be
-	  obtained from <http://ecryptfs.sf.net>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ecryptfs.
+source "fs/ecryptfs/Kconfig"
 
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
new file mode 100644
index 000000000000..0c754e64232b
--- /dev/null
+++ b/fs/ecryptfs/Kconfig
@@ -0,0 +1,11 @@
+config ECRYPT_FS
+	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && KEYS && CRYPTO && NET
+	help
+	  Encrypted filesystem that operates on the VFS layer.  See
+	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
+	  eCryptfs.  Userspace components are required and can be
+	  obtained from <http://ecryptfs.sf.net>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ecryptfs.
-- 
cgit v1.2.3


From b08bac1f185b2281c3decb4f8e15e8f41f96e974 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:53:24 +0300
Subject: fs/Kconfig: move hfs, hfsplus out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig         | 29 ++---------------------------
 fs/hfs/Kconfig     | 12 ++++++++++++
 fs/hfsplus/Kconfig | 13 +++++++++++++
 3 files changed, 27 insertions(+), 27 deletions(-)
 create mode 100644 fs/hfs/Kconfig
 create mode 100644 fs/hfsplus/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 1c79baf55db2..3b48ab4f0b77 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -204,33 +204,8 @@ if MISC_FILESYSTEMS
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
-
-config HFS_FS
-	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	select NLS
-	help
-	  If you say Y here, you will be able to mount Macintosh-formatted
-	  floppy disks and hard drive partitions with full read-write access.
-	  Please read <file:Documentation/filesystems/hfs.txt> to learn about
-	  the available mount options.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called hfs.
-
-config HFSPLUS_FS
-	tristate "Apple Extended HFS file system support"
-	depends on BLOCK
-	select NLS
-	select NLS_UTF8
-	help
-	  If you say Y here, you will be able to mount extended format
-	  Macintosh-formatted hard drive partitions with full read-write access.
-
-	  This file system is often called HFS+ and was introduced with
-	  MacOS 8. It includes all Mac specific filesystem data such as
-	  data forks and creator codes, but it also has several UNIX
-	  style features such as file ownership and permissions.
+source "fs/hfs/Kconfig"
+source "fs/hfsplus/Kconfig"
 
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
new file mode 100644
index 000000000000..b77c5bc20f8a
--- /dev/null
+++ b/fs/hfs/Kconfig
@@ -0,0 +1,12 @@
+config HFS_FS
+	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select NLS
+	help
+	  If you say Y here, you will be able to mount Macintosh-formatted
+	  floppy disks and hard drive partitions with full read-write access.
+	  Please read <file:Documentation/filesystems/hfs.txt> to learn about
+	  the available mount options.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called hfs.
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
new file mode 100644
index 000000000000..a63371815aab
--- /dev/null
+++ b/fs/hfsplus/Kconfig
@@ -0,0 +1,13 @@
+config HFSPLUS_FS
+	tristate "Apple Extended HFS file system support"
+	depends on BLOCK
+	select NLS
+	select NLS_UTF8
+	help
+	  If you say Y here, you will be able to mount extended format
+	  Macintosh-formatted hard drive partitions with full read-write access.
+
+	  This file system is often called HFS+ and was introduced with
+	  MacOS 8. It includes all Mac specific filesystem data such as
+	  data forks and creator codes, but it also has several UNIX
+	  style features such as file ownership and permissions.
-- 
cgit v1.2.3


From 0b09eb32985d5fbec567e83b18db3dec14d1fef9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:54:16 +0300
Subject: fs/Kconfig: move befs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 28 +---------------------------
 fs/befs/Kconfig | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 27 deletions(-)
 create mode 100644 fs/befs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3b48ab4f0b77..cfddc0a76add 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -206,33 +206,7 @@ source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
 source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
-
-config BEFS_FS
-	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	select NLS
-	help
-	  The BeOS File System (BeFS) is the native file system of Be, Inc's
-	  BeOS. Notable features include support for arbitrary attributes
-	  on files and directories, and database-like indices on selected
-	  attributes. (Also note that this driver doesn't make those features
-	  available at this time). It is a 64 bit filesystem, so it supports
-	  extremely large volumes and files.
-
-	  If you use this filesystem, you should also say Y to at least one
-	  of the NLS (native language support) options below.
-
-	  If you don't know what this is about, say N.
-
-	  To compile this as a module, choose M here: the module will be
-	  called befs.
-
-config BEFS_DEBUG
-	bool "Debug BeFS"
-	depends on BEFS_FS
-	help
-	  If you say Y here, you can use the 'debug' mount option to enable
-	  debugging output from the driver.
+source "fs/befs/Kconfig"
 
 config BFS_FS
 	tristate "BFS file system support (EXPERIMENTAL)"
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
new file mode 100644
index 000000000000..7835d30f211f
--- /dev/null
+++ b/fs/befs/Kconfig
@@ -0,0 +1,26 @@
+config BEFS_FS
+	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select NLS
+	help
+	  The BeOS File System (BeFS) is the native file system of Be, Inc's
+	  BeOS. Notable features include support for arbitrary attributes
+	  on files and directories, and database-like indices on selected
+	  attributes. (Also note that this driver doesn't make those features
+	  available at this time). It is a 64 bit filesystem, so it supports
+	  extremely large volumes and files.
+
+	  If you use this filesystem, you should also say Y to at least one
+	  of the NLS (native language support) options below.
+
+	  If you don't know what this is about, say N.
+
+	  To compile this as a module, choose M here: the module will be
+	  called befs.
+
+config BEFS_DEBUG
+	bool "Debug BeFS"
+	depends on BEFS_FS
+	help
+	  If you say Y here, you can use the 'debug' mount option to enable
+	  debugging output from the driver.
-- 
cgit v1.2.3


From 0ff423849de3fe98c06d30a8ac73103c8741914c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:55:13 +0300
Subject: fs/Kconfig: move bfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 23 +----------------------
 fs/bfs/Kconfig | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 22 deletions(-)
 create mode 100644 fs/bfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index cfddc0a76add..9acf3a2d2313 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -207,28 +207,7 @@ source "fs/ecryptfs/Kconfig"
 source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
 source "fs/befs/Kconfig"
-
-config BFS_FS
-	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  Boot File System (BFS) is a file system used under SCO UnixWare to
-	  allow the bootloader access to the kernel image and other important
-	  files during the boot process.  It is usually mounted under /stand
-	  and corresponds to the slice marked as "STAND" in the UnixWare
-	  partition.  You should say Y if you want to read or write the files
-	  on your /stand slice from within Linux.  You then also need to say Y
-	  to "UnixWare slices support", below.  More information about the BFS
-	  file system is contained in the file
-	  <file:Documentation/filesystems/bfs.txt>.
-
-	  If you don't know what this is about, say N.
-
-	  To compile this as a module, choose M here: the module will be called
-	  bfs.  Note that the file system of your root partition (the one
-	  containing the directory /) cannot be compiled as a module.
-
-
+source "fs/bfs/Kconfig"
 
 config EFS_FS
 	tristate "EFS file system support (read only) (EXPERIMENTAL)"
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
new file mode 100644
index 000000000000..c2336c62024f
--- /dev/null
+++ b/fs/bfs/Kconfig
@@ -0,0 +1,19 @@
+config BFS_FS
+	tristate "BFS file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  Boot File System (BFS) is a file system used under SCO UnixWare to
+	  allow the bootloader access to the kernel image and other important
+	  files during the boot process.  It is usually mounted under /stand
+	  and corresponds to the slice marked as "STAND" in the UnixWare
+	  partition.  You should say Y if you want to read or write the files
+	  on your /stand slice from within Linux.  You then also need to say Y
+	  to "UnixWare slices support", below.  More information about the BFS
+	  file system is contained in the file
+	  <file:Documentation/filesystems/bfs.txt>.
+
+	  If you don't know what this is about, say N.
+
+	  To compile this as a module, choose M here: the module will be called
+	  bfs.  Note that the file system of your root partition (the one
+	  containing the directory /) cannot be compiled as a module.
-- 
cgit v1.2.3


From 571f0a0bdeeb2d1692751b6c5df15dafb483c7ff Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:56:07 +0300
Subject: fs/Kconfig: move efs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 17 +----------------
 fs/efs/Kconfig | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 16 deletions(-)
 create mode 100644 fs/efs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9acf3a2d2313..fad19083285c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -208,22 +208,7 @@ source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
 source "fs/befs/Kconfig"
 source "fs/bfs/Kconfig"
-
-config EFS_FS
-	tristate "EFS file system support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
-	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
-	  uses the XFS file system for hard disk partitions however).
-
-	  This implementation only offers read-only access. If you don't know
-	  what all this is about, it's safe to say N. For more information
-	  about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
-
-	  To compile the EFS file system support as a module, choose M here: the
-	  module will be called efs.
-
+source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
new file mode 100644
index 000000000000..6ebfc1c207a8
--- /dev/null
+++ b/fs/efs/Kconfig
@@ -0,0 +1,14 @@
+config EFS_FS
+	tristate "EFS file system support (read only) (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
+	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
+	  uses the XFS file system for hard disk partitions however).
+
+	  This implementation only offers read-only access. If you don't know
+	  what all this is about, it's safe to say N. For more information
+	  about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
+
+	  To compile the EFS file system support as a module, choose M here: the
+	  module will be called efs.
-- 
cgit v1.2.3


From 2a22783be0fbbd63599dd6aacf8bc2ddab941bf7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:56:54 +0300
Subject: fs/Kconfig: move cramfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig        | 21 +--------------------
 fs/cramfs/Kconfig | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 20 deletions(-)
 create mode 100644 fs/cramfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index fad19083285c..d7b84dfed4f8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -212,26 +212,7 @@ source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
-
-config CRAMFS
-	tristate "Compressed ROM file system support (cramfs)"
-	depends on BLOCK
-	select ZLIB_INFLATE
-	help
-	  Saying Y here includes support for CramFs (Compressed ROM File
-	  System).  CramFs is designed to be a simple, small, and compressed
-	  file system for ROM based embedded systems.  CramFs is read-only,
-	  limited to 256MB file systems (with 16MB files), and doesn't support
-	  16/32 bits uid/gid, hard links and timestamps.
-
-	  See <file:Documentation/filesystems/cramfs.txt> and
-	  <file:fs/cramfs/README> for further information.
-
-	  To compile this as a module, choose M here: the module will be called
-	  cramfs.  Note that the root file system (the one containing the
-	  directory /) cannot be compiled as a module.
-
-	  If unsure, say N.
+source "fs/cramfs/Kconfig"
 
 config SQUASHFS
 	tristate "SquashFS 4.0 - Squashed file system support"
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
new file mode 100644
index 000000000000..cd06466f365e
--- /dev/null
+++ b/fs/cramfs/Kconfig
@@ -0,0 +1,19 @@
+config CRAMFS
+	tristate "Compressed ROM file system support (cramfs)"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for CramFs (Compressed ROM File
+	  System).  CramFs is designed to be a simple, small, and compressed
+	  file system for ROM based embedded systems.  CramFs is read-only,
+	  limited to 256MB file systems (with 16MB files), and doesn't support
+	  16/32 bits uid/gid, hard links and timestamps.
+
+	  See <file:Documentation/filesystems/cramfs.txt> and
+	  <file:fs/cramfs/README> for further information.
+
+	  To compile this as a module, choose M here: the module will be called
+	  cramfs.  Note that the root file system (the one containing the
+	  directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 22635ec9e0cb5afbc1eaa25495ae28da8416aac3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:57:46 +0300
Subject: fs/Kconfig: move squashfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 53 +----------------------------------------------------
 fs/squashfs/Kconfig | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 52 deletions(-)
 create mode 100644 fs/squashfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d7b84dfed4f8..d44a698463c7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -213,58 +213,7 @@ source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
 source "fs/cramfs/Kconfig"
-
-config SQUASHFS
-	tristate "SquashFS 4.0 - Squashed file system support"
-	depends on BLOCK
-	select ZLIB_INFLATE
-	help
-	  Saying Y here includes support for SquashFS 4.0 (a Compressed
-	  Read-Only File System).  Squashfs is a highly compressed read-only
-	  filesystem for Linux.  It uses zlib compression to compress both
-	  files, inodes and directories.  Inodes in the system are very small
-	  and all blocks are packed to minimise data overhead. Block sizes
-	  greater than 4K are supported up to a maximum of 1 Mbytes (default
-	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
-	  (larger than 4GB), full uid/gid information, hard links and
-	  timestamps.  
-
-	  Squashfs is intended for general read-only filesystem use, for
-	  archival use (i.e. in cases where a .tar.gz file may be used), and in
-	  embedded systems where low overhead is needed.  Further information
-	  and tools are available from http://squashfs.sourceforge.net.
-
-	  If you want to compile this as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want),
-	  say M here and read <file:Documentation/modules.txt>.  The module
-	  will be called squashfs.  Note that the root file system (the one
-	  containing the directory /) cannot be compiled as a module.
-
-	  If unsure, say N.
-
-config SQUASHFS_EMBEDDED
-
-	bool "Additional option for memory-constrained systems" 
-	depends on SQUASHFS
-	default n
-	help
-	  Saying Y here allows you to specify cache size.
-
-	  If unsure, say N.
-
-config SQUASHFS_FRAGMENT_CACHE_SIZE
-	int "Number of fragments cached" if SQUASHFS_EMBEDDED
-	depends on SQUASHFS
-	default "3"
-	help
-	  By default SquashFS caches the last 3 fragments read from
-	  the filesystem.  Increasing this amount may mean SquashFS
-	  has to re-read fragments less often from disk, at the expense
-	  of extra system memory.  Decreasing this amount will mean
-	  SquashFS uses less memory at the expense of extra reads from disk.
-
-	  Note there must be at least one cached fragment.  Anything
-	  much more than three will probably not make much difference.
+source "fs/squashfs/Kconfig"
 
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
new file mode 100644
index 000000000000..25a00d19d686
--- /dev/null
+++ b/fs/squashfs/Kconfig
@@ -0,0 +1,51 @@
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System).  Squashfs is a highly compressed read-only
+	  filesystem for Linux.  It uses zlib compression to compress both
+	  files, inodes and directories.  Inodes in the system are very small
+	  and all blocks are packed to minimise data overhead. Block sizes
+	  greater than 4K are supported up to a maximum of 1 Mbytes (default
+	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
+	  (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed.  Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here and read <file:Documentation/modules.txt>.  The module
+	  will be called squashfs.  Note that the root file system (the one
+	  containing the directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+	bool "Additional option for memory-constrained systems"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem.  Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory.  Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment.  Anything
+	  much more than three will probably not make much difference.
-- 
cgit v1.2.3


From 22135169ddc536b1f7d7f070c7980fe4bcdaa20b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:58:51 +0300
Subject: fs/Kconfig: move vxfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 18 +-----------------
 fs/freevxfs/Kconfig | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 17 deletions(-)
 create mode 100644 fs/freevxfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d44a698463c7..58ab4df56441 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -214,23 +214,7 @@ source "fs/jffs2/Kconfig"
 source "fs/ubifs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
-
-config VXFS_FS
-	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
-	depends on BLOCK
-	help
-	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
-	  file system format.  VERITAS VxFS(TM) is the standard file system
-	  of SCO UnixWare (and possibly others) and optionally available
-	  for Sunsoft Solaris, HP-UX and many other operating systems.
-	  Currently only readonly access is supported.
-
-	  NOTE: the file system type as used by mount(1), mount(2) and
-	  fstab(5) is 'vxfs' as it describes the file system format, not
-	  the actual driver.
-
-	  To compile this as a module, choose M here: the module will be
-	  called freevxfs.  If unsure, say N.
+source "fs/freevxfs/Kconfig"
 
 config MINIX_FS
 	tristate "Minix file system support"
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
new file mode 100644
index 000000000000..8dc1cd5c1efe
--- /dev/null
+++ b/fs/freevxfs/Kconfig
@@ -0,0 +1,16 @@
+config VXFS_FS
+	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+	depends on BLOCK
+	help
+	  FreeVxFS is a file system driver that supports the VERITAS VxFS(TM)
+	  file system format.  VERITAS VxFS(TM) is the standard file system
+	  of SCO UnixWare (and possibly others) and optionally available
+	  for Sunsoft Solaris, HP-UX and many other operating systems.
+	  Currently only readonly access is supported.
+
+	  NOTE: the file system type as used by mount(1), mount(2) and
+	  fstab(5) is 'vxfs' as it describes the file system format, not
+	  the actual driver.
+
+	  To compile this as a module, choose M here: the module will be
+	  called freevxfs.  If unsure, say N.
-- 
cgit v1.2.3


From 8b1cd7d3c5daaed6c4dec3697c1fc113eb817df0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:59:49 +0300
Subject: fs/Kconfig: move minix out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 19 +------------------
 fs/minix/Kconfig | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 18 deletions(-)
 create mode 100644 fs/minix/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 58ab4df56441..3323379fdb3c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -215,24 +215,7 @@ source "fs/ubifs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
-
-config MINIX_FS
-	tristate "Minix file system support"
-	depends on BLOCK
-	help
-	  Minix is a simple operating system used in many classes about OS's.
-	  The minix file system (method to organize files on a hard disk
-	  partition or a floppy disk) was the original file system for Linux,
-	  but has been superseded by the second extended file system ext2fs.
-	  You don't want to use the minix file system on your hard disk
-	  because of certain built-in restrictions, but it is sometimes found
-	  on older Linux floppy disks.  This option will enlarge your kernel
-	  by about 28 KB. If unsure, say N.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called minix.  Note that the file system of your root
-	  partition (the one containing the directory /) cannot be compiled as
-	  a module.
+source "fs/minix/Kconfig"
 
 config OMFS_FS
 	tristate "SonicBlue Optimized MPEG File System support"
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
new file mode 100644
index 000000000000..0fd7ca994264
--- /dev/null
+++ b/fs/minix/Kconfig
@@ -0,0 +1,17 @@
+config MINIX_FS
+	tristate "Minix file system support"
+	depends on BLOCK
+	help
+	  Minix is a simple operating system used in many classes about OS's.
+	  The minix file system (method to organize files on a hard disk
+	  partition or a floppy disk) was the original file system for Linux,
+	  but has been superseded by the second extended file system ext2fs.
+	  You don't want to use the minix file system on your hard disk
+	  because of certain built-in restrictions, but it is sometimes found
+	  on older Linux floppy disks.  This option will enlarge your kernel
+	  by about 28 KB. If unsure, say N.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called minix.  Note that the file system of your root
+	  partition (the one containing the directory /) cannot be compiled as
+	  a module.
-- 
cgit v1.2.3


From da55e6f92830df9bba7c87438344479c60d44fdb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:00:41 +0300
Subject: fs/Kconfig: move omfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 15 +--------------
 fs/omfs/Kconfig | 13 +++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)
 create mode 100644 fs/omfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3323379fdb3c..da5e8f956a82 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -216,20 +216,7 @@ source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
 source "fs/minix/Kconfig"
-
-config OMFS_FS
-	tristate "SonicBlue Optimized MPEG File System support"
-	depends on BLOCK
-	select CRC_ITU_T
-	help
-	  This is the proprietary file system used by the Rio Karma music
-	  player and ReplayTV DVR.  Despite the name, this filesystem is not
-	  more efficient than a standard FS for MPEG files, in fact likely
-	  the opposite is true.  Say Y if you have either of these devices
-	  and wish to mount its disk.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called omfs.  If unsure, say N.
+source "fs/omfs/Kconfig"
 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
new file mode 100644
index 000000000000..b1b9a0aba6fd
--- /dev/null
+++ b/fs/omfs/Kconfig
@@ -0,0 +1,13 @@
+config OMFS_FS
+	tristate "SonicBlue Optimized MPEG File System support"
+	depends on BLOCK
+	select CRC_ITU_T
+	help
+	  This is the proprietary file system used by the Rio Karma music
+	  player and ReplayTV DVR.  Despite the name, this filesystem is not
+	  more efficient than a standard FS for MPEG files, in fact likely
+	  the opposite is true.  Say Y if you have either of these devices
+	  and wish to mount its disk.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called omfs.  If unsure, say N.
-- 
cgit v1.2.3


From 928ea192959f188e6a4de95b293e3973887917b5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:01:26 +0300
Subject: fs/Kconfig: move hpfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 17 +----------------
 fs/hpfs/Kconfig | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 16 deletions(-)
 create mode 100644 fs/hpfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index da5e8f956a82..9bead7c680d7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -217,22 +217,7 @@ source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
 source "fs/minix/Kconfig"
 source "fs/omfs/Kconfig"
-
-config HPFS_FS
-	tristate "OS/2 HPFS file system support"
-	depends on BLOCK
-	help
-	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
-	  is the file system used for organizing files on OS/2 hard disk
-	  partitions. Say Y if you want to be able to read files from and
-	  write files to an OS/2 HPFS partition on your hard drive. OS/2
-	  floppies however are in regular MSDOS format, so you don't need this
-	  option in order to be able to read them. Read
-	  <file:Documentation/filesystems/hpfs.txt>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called hpfs.  If unsure, say N.
-
+source "fs/hpfs/Kconfig"
 
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
new file mode 100644
index 000000000000..56bd15c5bf6c
--- /dev/null
+++ b/fs/hpfs/Kconfig
@@ -0,0 +1,14 @@
+config HPFS_FS
+	tristate "OS/2 HPFS file system support"
+	depends on BLOCK
+	help
+	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
+	  is the file system used for organizing files on OS/2 hard disk
+	  partitions. Say Y if you want to be able to read files from and
+	  write files to an OS/2 HPFS partition on your hard drive. OS/2
+	  floppies however are in regular MSDOS format, so you don't need this
+	  option in order to be able to read them. Read
+	  <file:Documentation/filesystems/hpfs.txt>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called hpfs.  If unsure, say N.
-- 
cgit v1.2.3


From 4c7415830c7ab465ff54ca7ffc20bfb1b59906c3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:02:21 +0300
Subject: fs/Kconfig: move qnx4 out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 27 +--------------------------
 fs/qnx4/Kconfig | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 26 deletions(-)
 create mode 100644 fs/qnx4/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9bead7c680d7..b348d2e8cc66 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -218,32 +218,7 @@ source "fs/freevxfs/Kconfig"
 source "fs/minix/Kconfig"
 source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
-
-config QNX4FS_FS
-	tristate "QNX4 file system support (read only)"
-	depends on BLOCK
-	help
-	  This is the file system used by the real-time operating systems
-	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
-	  Further information is available at <http://www.qnx.com/>.
-	  Say Y if you intend to mount QNX hard disks or floppies.
-	  Unless you say Y to "QNX4FS read-write support" below, you will
-	  only be able to read these file systems.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called qnx4.
-
-	  If you don't know whether you need it, then you don't need it:
-	  answer N.
-
-config QNX4FS_RW
-	bool "QNX4FS write support (DANGEROUS)"
-	depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
-	help
-	  Say Y if you want to test write support for QNX4 file systems.
-
-	  It's currently broken, so for now:
-	  answer N.
+source "fs/qnx4/Kconfig"
 
 config ROMFS_FS
 	tristate "ROM file system support"
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
new file mode 100644
index 000000000000..be8e0e1445b6
--- /dev/null
+++ b/fs/qnx4/Kconfig
@@ -0,0 +1,25 @@
+config QNX4FS_FS
+	tristate "QNX4 file system support (read only)"
+	depends on BLOCK
+	help
+	  This is the file system used by the real-time operating systems
+	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
+	  Further information is available at <http://www.qnx.com/>.
+	  Say Y if you intend to mount QNX hard disks or floppies.
+	  Unless you say Y to "QNX4FS read-write support" below, you will
+	  only be able to read these file systems.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called qnx4.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
+
+config QNX4FS_RW
+	bool "QNX4FS write support (DANGEROUS)"
+	depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
+	help
+	  Say Y if you want to test write support for QNX4 file systems.
+
+	  It's currently broken, so for now:
+	  answer N.
-- 
cgit v1.2.3


From 41810246df2e65c66dc1f0da79b282a95b664fc7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:03:34 +0300
Subject: fs/Kconfig: move romfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 19 +------------------
 fs/romfs/Kconfig | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 18 deletions(-)
 create mode 100644 fs/romfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b348d2e8cc66..d8672ccdc69e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -219,24 +219,7 @@ source "fs/minix/Kconfig"
 source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
-
-config ROMFS_FS
-	tristate "ROM file system support"
-	depends on BLOCK
-	---help---
-	  This is a very small read-only file system mainly intended for
-	  initial ram disks of installation disks, but it could be used for
-	  other read-only media as well.  Read
-	  <file:Documentation/filesystems/romfs.txt> for details.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called romfs.  Note that the file system of your
-	  root partition (the one containing the directory /) cannot be a
-	  module.
-
-	  If you don't know whether you need it, then you don't need it:
-	  answer N.
-
+source "fs/romfs/Kconfig"
 
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
new file mode 100644
index 000000000000..1a17020f9faf
--- /dev/null
+++ b/fs/romfs/Kconfig
@@ -0,0 +1,16 @@
+config ROMFS_FS
+	tristate "ROM file system support"
+	depends on BLOCK
+	---help---
+	  This is a very small read-only file system mainly intended for
+	  initial ram disks of installation disks, but it could be used for
+	  other read-only media as well.  Read
+	  <file:Documentation/filesystems/romfs.txt> for details.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called romfs.  Note that the file system of your
+	  root partition (the one containing the directory /) cannot be a
+	  module.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
-- 
cgit v1.2.3


From 8af915ba1d1eae1f9f31fa8c5db8040492dc4785 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:04:23 +0300
Subject: fs/Kconfig: move sysv out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 39 +--------------------------------------
 fs/sysv/Kconfig | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 38 deletions(-)
 create mode 100644 fs/sysv/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d8672ccdc69e..e1cdb8310647 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -220,44 +220,7 @@ source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
-
-config SYSV_FS
-	tristate "System V/Xenix/V7/Coherent file system support"
-	depends on BLOCK
-	help
-	  SCO, Xenix and Coherent are commercial Unix systems for Intel
-	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
-	  here would allow you to read from their floppies and hard disk
-	  partitions.
-
-	  If you have floppies or hard disk partitions like that, it is likely
-	  that they contain binaries from those other Unix systems; in order
-	  to run these binaries, you will want to install linux-abi which is
-	  a set of kernel modules that lets you run SCO, Xenix, Wyse,
-	  UnixWare, Dell Unix and System V programs under Linux.  It is
-	  available via FTP (user: ftp) from
-	  <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
-	  NOTE: that will work only for binaries from Intel-based systems;
-	  PDP ones will have to wait until somebody ports Linux to -11 ;-)
-
-	  If you only intend to mount files from some other Unix over the
-	  network using NFS, you don't need the System V file system support
-	  (but you need NFS file system support obviously).
-
-	  Note that this option is generally not needed for floppies, since a
-	  good portable way to transport files and directories between unixes
-	  (and even other operating systems) is given by the tar program ("man
-	  tar" or preferably "info tar").  Note also that this option has
-	  nothing whatsoever to do with the option "System V IPC". Read about
-	  the System V file system in
-	  <file:Documentation/filesystems/sysv-fs.txt>.
-	  Saying Y here will enlarge your kernel by about 27 KB.
-
-	  To compile this as a module, choose M here: the module will be called
-	  sysv.
-
-	  If you haven't heard about all of this before, it's safe to say N.
-
+source "fs/sysv/Kconfig"
 
 config UFS_FS
 	tristate "UFS file system support (read only)"
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
new file mode 100644
index 000000000000..33aeb4b75db1
--- /dev/null
+++ b/fs/sysv/Kconfig
@@ -0,0 +1,36 @@
+config SYSV_FS
+	tristate "System V/Xenix/V7/Coherent file system support"
+	depends on BLOCK
+	help
+	  SCO, Xenix and Coherent are commercial Unix systems for Intel
+	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
+	  here would allow you to read from their floppies and hard disk
+	  partitions.
+
+	  If you have floppies or hard disk partitions like that, it is likely
+	  that they contain binaries from those other Unix systems; in order
+	  to run these binaries, you will want to install linux-abi which is
+	  a set of kernel modules that lets you run SCO, Xenix, Wyse,
+	  UnixWare, Dell Unix and System V programs under Linux.  It is
+	  available via FTP (user: ftp) from
+	  <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>.
+	  NOTE: that will work only for binaries from Intel-based systems;
+	  PDP ones will have to wait until somebody ports Linux to -11 ;-)
+
+	  If you only intend to mount files from some other Unix over the
+	  network using NFS, you don't need the System V file system support
+	  (but you need NFS file system support obviously).
+
+	  Note that this option is generally not needed for floppies, since a
+	  good portable way to transport files and directories between unixes
+	  (and even other operating systems) is given by the tar program ("man
+	  tar" or preferably "info tar").  Note also that this option has
+	  nothing whatsoever to do with the option "System V IPC". Read about
+	  the System V file system in
+	  <file:Documentation/filesystems/sysv-fs.txt>.
+	  Saying Y here will enlarge your kernel by about 27 KB.
+
+	  To compile this as a module, choose M here: the module will be called
+	  sysv.
+
+	  If you haven't heard about all of this before, it's safe to say N.
-- 
cgit v1.2.3


From a276a52f9f1b1059bddade71df18ceb6481534a6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:05:02 +0300
Subject: fs/Kconfig: move ufs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 45 +--------------------------------------------
 fs/ufs/Kconfig | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 44 deletions(-)
 create mode 100644 fs/ufs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e1cdb8310647..35941e8a17c5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -221,50 +221,7 @@ source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
-
-config UFS_FS
-	tristate "UFS file system support (read only)"
-	depends on BLOCK
-	help
-	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
-	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
-	  Unixes can create and mount hard disk partitions and diskettes using
-	  this file system as well. Saying Y here will allow you to read from
-	  these partitions; if you also want to write to them, say Y to the
-	  experimental "UFS file system write support", below. Please read the
-	  file <file:Documentation/filesystems/ufs.txt> for more information.
-
-          The recently released UFS2 variant (used in FreeBSD 5.x) is
-          READ-ONLY supported.
-
-	  Note that this option is generally not needed for floppies, since a
-	  good portable way to transport files and directories between unixes
-	  (and even other operating systems) is given by the tar program ("man
-	  tar" or preferably "info tar").
-
-	  When accessing NeXTstep files, you may need to convert them from the
-	  NeXT character set to the Latin1 character set; use the program
-	  recode ("info recode") for this purpose.
-
-	  To compile the UFS file system support as a module, choose M here: the
-	  module will be called ufs.
-
-	  If you haven't heard about all of this before, it's safe to say N.
-
-config UFS_FS_WRITE
-	bool "UFS file system write support (DANGEROUS)"
-	depends on UFS_FS && EXPERIMENTAL
-	help
-	  Say Y here if you want to try writing to UFS partitions. This is
-	  experimental, so you should back up your UFS partitions beforehand.
-
-config UFS_DEBUG
-	bool "UFS debugging"
-	depends on UFS_FS
-	help
-	  If you are experiencing any problems with the UFS filesystem, say
-	  Y here.  This will result in _many_ additional debugging messages to be
-	  written to the system log.
+source "fs/ufs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
new file mode 100644
index 000000000000..e4f10a40768a
--- /dev/null
+++ b/fs/ufs/Kconfig
@@ -0,0 +1,43 @@
+config UFS_FS
+	tristate "UFS file system support (read only)"
+	depends on BLOCK
+	help
+	  BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
+	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
+	  Unixes can create and mount hard disk partitions and diskettes using
+	  this file system as well. Saying Y here will allow you to read from
+	  these partitions; if you also want to write to them, say Y to the
+	  experimental "UFS file system write support", below. Please read the
+	  file <file:Documentation/filesystems/ufs.txt> for more information.
+
+	  The recently released UFS2 variant (used in FreeBSD 5.x) is
+	  READ-ONLY supported.
+
+	  Note that this option is generally not needed for floppies, since a
+	  good portable way to transport files and directories between unixes
+	  (and even other operating systems) is given by the tar program ("man
+	  tar" or preferably "info tar").
+
+	  When accessing NeXTstep files, you may need to convert them from the
+	  NeXT character set to the Latin1 character set; use the program
+	  recode ("info recode") for this purpose.
+
+	  To compile the UFS file system support as a module, choose M here: the
+	  module will be called ufs.
+
+	  If you haven't heard about all of this before, it's safe to say N.
+
+config UFS_FS_WRITE
+	bool "UFS file system write support (DANGEROUS)"
+	depends on UFS_FS && EXPERIMENTAL
+	help
+	  Say Y here if you want to try writing to UFS partitions. This is
+	  experimental, so you should back up your UFS partitions beforehand.
+
+config UFS_DEBUG
+	bool "UFS debugging"
+	depends on UFS_FS
+	help
+	  If you are experiencing any problems with the UFS filesystem, say
+	  Y here.  This will result in _many_ additional debugging messages to be
+	  written to the system log.
-- 
cgit v1.2.3


From 97afe47ac378615d727fc2f0ffa1b58e9837f438 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:07:41 +0300
Subject: fs/Kconfig: move nfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 87 +---------------------------------------------------------
 fs/nfs/Kconfig | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 86 deletions(-)
 create mode 100644 fs/nfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 35941e8a17c5..f07c72b76662 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -241,92 +241,7 @@ menuconfig NETWORK_FILESYSTEMS
 
 if NETWORK_FILESYSTEMS
 
-config NFS_FS
-	tristate "NFS client support"
-	depends on INET
-	select LOCKD
-	select SUNRPC
-	select NFS_ACL_SUPPORT if NFS_V3_ACL
-	help
-	  Choose Y here if you want to access files residing on other
-	  computers using Sun's Network File System protocol.  To compile
-	  this file system support as a module, choose M here: the module
-	  will be called nfs.
-
-	  To mount file systems exported by NFS servers, you also need to
-	  install the user space mount.nfs command which can be found in
-	  the Linux nfs-utils package, available from http://linux-nfs.org/.
-	  Information about using the mount command is available in the
-	  mount(8) man page.  More detail about the Linux NFS client
-	  implementation is available via the nfs(5) man page.
-
-	  Below you can choose which versions of the NFS protocol are
-	  available in the kernel to mount NFS servers.  Support for NFS
-	  version 2 (RFC 1094) is always available when NFS_FS is selected.
-
-	  To configure a system which mounts its root file system via NFS
-	  at boot time, say Y here, select "Kernel level IP
-	  autoconfiguration" in the NETWORK menu, and select "Root file
-	  system on NFS" below.  You cannot compile this file system as a
-	  module in this case.
-
-	  If unsure, say N.
-
-config NFS_V3
-	bool "NFS client support for NFS version 3"
-	depends on NFS_FS
-	help
-	  This option enables support for version 3 of the NFS protocol
-	  (RFC 1813) in the kernel's NFS client.
-
-	  If unsure, say Y.
-
-config NFS_V3_ACL
-	bool "NFS client support for the NFSv3 ACL protocol extension"
-	depends on NFS_V3
-	help
-	  Some NFS servers support an auxiliary NFSv3 ACL protocol that
-	  Sun added to Solaris but never became an official part of the
-	  NFS version 3 protocol.  This protocol extension allows
-	  applications on NFS clients to manipulate POSIX Access Control
-	  Lists on files residing on NFS servers.  NFS servers enforce
-	  ACLs on local files whether this protocol is available or not.
-
-	  Choose Y here if your NFS server supports the Solaris NFSv3 ACL
-	  protocol extension and you want your NFS client to allow
-	  applications to access and modify ACLs on files on the server.
-
-	  Most NFS servers don't support the Solaris NFSv3 ACL protocol
-	  extension.  You can choose N here or specify the "noacl" mount
-	  option to prevent your NFS client from trying to use the NFSv3
-	  ACL protocol.
-
-	  If unsure, say N.
-
-config NFS_V4
-	bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFS_FS && EXPERIMENTAL
-	select RPCSEC_GSS_KRB5
-	help
-	  This option enables support for version 4 of the NFS protocol
-	  (RFC 3530) in the kernel's NFS client.
-
-	  To mount NFS servers using NFSv4, you also need to install user
-	  space programs which can be found in the Linux nfs-utils package,
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
-
-config ROOT_NFS
-	bool "Root file system on NFS"
-	depends on NFS_FS=y && IP_PNP
-	help
-	  If you want your system to mount its root file system via NFS,
-	  choose Y here.  This is common practice for managing systems
-	  without local permanent storage.  For details, read
-	  <file:Documentation/filesystems/nfsroot.txt>.
-
-	  Most people say N here.
+source "fs/nfs/Kconfig"
 
 config NFSD
 	tristate "NFS server support"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 000000000000..36fe20d6eba2
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,86 @@
+config NFS_FS
+	tristate "NFS client support"
+	depends on INET
+	select LOCKD
+	select SUNRPC
+	select NFS_ACL_SUPPORT if NFS_V3_ACL
+	help
+	  Choose Y here if you want to access files residing on other
+	  computers using Sun's Network File System protocol.  To compile
+	  this file system support as a module, choose M here: the module
+	  will be called nfs.
+
+	  To mount file systems exported by NFS servers, you also need to
+	  install the user space mount.nfs command which can be found in
+	  the Linux nfs-utils package, available from http://linux-nfs.org/.
+	  Information about using the mount command is available in the
+	  mount(8) man page.  More detail about the Linux NFS client
+	  implementation is available via the nfs(5) man page.
+
+	  Below you can choose which versions of the NFS protocol are
+	  available in the kernel to mount NFS servers.  Support for NFS
+	  version 2 (RFC 1094) is always available when NFS_FS is selected.
+
+	  To configure a system which mounts its root file system via NFS
+	  at boot time, say Y here, select "Kernel level IP
+	  autoconfiguration" in the NETWORK menu, and select "Root file
+	  system on NFS" below.  You cannot compile this file system as a
+	  module in this case.
+
+	  If unsure, say N.
+
+config NFS_V3
+	bool "NFS client support for NFS version 3"
+	depends on NFS_FS
+	help
+	  This option enables support for version 3 of the NFS protocol
+	  (RFC 1813) in the kernel's NFS client.
+
+	  If unsure, say Y.
+
+config NFS_V3_ACL
+	bool "NFS client support for the NFSv3 ACL protocol extension"
+	depends on NFS_V3
+	help
+	  Some NFS servers support an auxiliary NFSv3 ACL protocol that
+	  Sun added to Solaris but never became an official part of the
+	  NFS version 3 protocol.  This protocol extension allows
+	  applications on NFS clients to manipulate POSIX Access Control
+	  Lists on files residing on NFS servers.  NFS servers enforce
+	  ACLs on local files whether this protocol is available or not.
+
+	  Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+	  protocol extension and you want your NFS client to allow
+	  applications to access and modify ACLs on files on the server.
+
+	  Most NFS servers don't support the Solaris NFSv3 ACL protocol
+	  extension.  You can choose N here or specify the "noacl" mount
+	  option to prevent your NFS client from trying to use the NFSv3
+	  ACL protocol.
+
+	  If unsure, say N.
+
+config NFS_V4
+	bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
+	depends on NFS_FS && EXPERIMENTAL
+	select RPCSEC_GSS_KRB5
+	help
+	  This option enables support for version 4 of the NFS protocol
+	  (RFC 3530) in the kernel's NFS client.
+
+	  To mount NFS servers using NFSv4, you also need to install user
+	  space programs which can be found in the Linux nfs-utils package,
+	  available from http://linux-nfs.org/.
+
+	  If unsure, say N.
+
+config ROOT_NFS
+	bool "Root file system on NFS"
+	depends on NFS_FS=y && IP_PNP
+	help
+	  If you want your system to mount its root file system via NFS,
+	  choose Y here.  This is common practice for managing systems
+	  without local permanent storage.  For details, read
+	  <file:Documentation/filesystems/nfsroot.txt>.
+
+	  Most people say N here.
-- 
cgit v1.2.3


From e2b329e2002685c1b0fa3c06caadc0936b7f507f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:08:58 +0300
Subject: fs/Kconfig: move nfsd out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 82 +--------------------------------------------------------
 fs/nfsd/Kconfig | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 81 deletions(-)
 create mode 100644 fs/nfsd/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f07c72b76662..acceb6e62bff 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -242,87 +242,7 @@ menuconfig NETWORK_FILESYSTEMS
 if NETWORK_FILESYSTEMS
 
 source "fs/nfs/Kconfig"
-
-config NFSD
-	tristate "NFS server support"
-	depends on INET
-	select LOCKD
-	select SUNRPC
-	select EXPORTFS
-	select NFS_ACL_SUPPORT if NFSD_V2_ACL
-	help
-	  Choose Y here if you want to allow other computers to access
-	  files residing on this system using Sun's Network File System
-	  protocol.  To compile the NFS server support as a module,
-	  choose M here: the module will be called nfsd.
-
-	  You may choose to use a user-space NFS server instead, in which
-	  case you can choose N here.
-
-	  To export local file systems using NFS, you also need to install
-	  user space programs which can be found in the Linux nfs-utils
-	  package, available from http://linux-nfs.org/.  More detail about
-	  the Linux NFS server implementation is available via the
-	  exports(5) man page.
-
-	  Below you can choose which versions of the NFS protocol are
-	  available to clients mounting the NFS server on this system.
-	  Support for NFS version 2 (RFC 1094) is always available when
-	  CONFIG_NFSD is selected.
-
-	  If unsure, say N.
-
-config NFSD_V2_ACL
-	bool
-	depends on NFSD
-
-config NFSD_V3
-	bool "NFS server support for NFS version 3"
-	depends on NFSD
-	help
-	  This option enables support in your system's NFS server for
-	  version 3 of the NFS protocol (RFC 1813).
-
-	  If unsure, say Y.
-
-config NFSD_V3_ACL
-	bool "NFS server support for the NFSv3 ACL protocol extension"
-	depends on NFSD_V3
-	select NFSD_V2_ACL
-	help
-	  Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
-	  never became an official part of the NFS version 3 protocol.
-	  This protocol extension allows applications on NFS clients to
-	  manipulate POSIX Access Control Lists on files residing on NFS
-	  servers.  NFS servers enforce POSIX ACLs on local files whether
-	  this protocol is available or not.
-
-	  This option enables support in your system's NFS server for the
-	  NFSv3 ACL protocol extension allowing NFS clients to manipulate
-	  POSIX ACLs on files exported by your system's NFS server.  NFS
-	  clients which support the Solaris NFSv3 ACL protocol can then
-	  access and modify ACLs on your NFS server.
-
-	  To store ACLs on your NFS server, you also need to enable ACL-
-	  related CONFIG options for your local file systems of choice.
-
-	  If unsure, say N.
-
-config NFSD_V4
-	bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFSD && PROC_FS && EXPERIMENTAL
-	select NFSD_V3
-	select FS_POSIX_ACL
-	select RPCSEC_GSS_KRB5
-	help
-	  This option enables support in your system's NFS server for
-	  version 4 of the NFS protocol (RFC 3530).
-
-	  To export files using NFSv4, you need to install additional user
-	  space programs which can be found in the Linux nfs-utils package,
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
+source "fs/nfsd/Kconfig"
 
 config LOCKD
 	tristate
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
new file mode 100644
index 000000000000..44d7d04dab95
--- /dev/null
+++ b/fs/nfsd/Kconfig
@@ -0,0 +1,80 @@
+config NFSD
+	tristate "NFS server support"
+	depends on INET
+	select LOCKD
+	select SUNRPC
+	select EXPORTFS
+	select NFS_ACL_SUPPORT if NFSD_V2_ACL
+	help
+	  Choose Y here if you want to allow other computers to access
+	  files residing on this system using Sun's Network File System
+	  protocol.  To compile the NFS server support as a module,
+	  choose M here: the module will be called nfsd.
+
+	  You may choose to use a user-space NFS server instead, in which
+	  case you can choose N here.
+
+	  To export local file systems using NFS, you also need to install
+	  user space programs which can be found in the Linux nfs-utils
+	  package, available from http://linux-nfs.org/.  More detail about
+	  the Linux NFS server implementation is available via the
+	  exports(5) man page.
+
+	  Below you can choose which versions of the NFS protocol are
+	  available to clients mounting the NFS server on this system.
+	  Support for NFS version 2 (RFC 1094) is always available when
+	  CONFIG_NFSD is selected.
+
+	  If unsure, say N.
+
+config NFSD_V2_ACL
+	bool
+	depends on NFSD
+
+config NFSD_V3
+	bool "NFS server support for NFS version 3"
+	depends on NFSD
+	help
+	  This option enables support in your system's NFS server for
+	  version 3 of the NFS protocol (RFC 1813).
+
+	  If unsure, say Y.
+
+config NFSD_V3_ACL
+	bool "NFS server support for the NFSv3 ACL protocol extension"
+	depends on NFSD_V3
+	select NFSD_V2_ACL
+	help
+	  Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
+	  never became an official part of the NFS version 3 protocol.
+	  This protocol extension allows applications on NFS clients to
+	  manipulate POSIX Access Control Lists on files residing on NFS
+	  servers.  NFS servers enforce POSIX ACLs on local files whether
+	  this protocol is available or not.
+
+	  This option enables support in your system's NFS server for the
+	  NFSv3 ACL protocol extension allowing NFS clients to manipulate
+	  POSIX ACLs on files exported by your system's NFS server.  NFS
+	  clients which support the Solaris NFSv3 ACL protocol can then
+	  access and modify ACLs on your NFS server.
+
+	  To store ACLs on your NFS server, you also need to enable ACL-
+	  related CONFIG options for your local file systems of choice.
+
+	  If unsure, say N.
+
+config NFSD_V4
+	bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
+	depends on NFSD && PROC_FS && EXPERIMENTAL
+	select NFSD_V3
+	select FS_POSIX_ACL
+	select RPCSEC_GSS_KRB5
+	help
+	  This option enables support in your system's NFS server for
+	  version 4 of the NFS protocol (RFC 3530).
+
+	  To export files using NFSv4, you need to install additional user
+	  space programs which can be found in the Linux nfs-utils package,
+	  available from http://linux-nfs.org/.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 9098c24f35f7da6c89a83420acf21e3d7b35151d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:11:56 +0300
Subject: fs/Kconfig: move sunrpc out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig | 80 +-------------------------------------------------------------
 1 file changed, 1 insertion(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index acceb6e62bff..1d7c0f6fade4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -264,85 +264,7 @@ config NFS_COMMON
 	depends on NFSD || NFS_FS
 	default y
 
-config SUNRPC
-	tristate
-
-config SUNRPC_GSS
-	tristate
-
-config SUNRPC_XPRT_RDMA
-	tristate
-	depends on SUNRPC && INFINIBAND && EXPERIMENTAL
-	default SUNRPC && INFINIBAND
-	help
-	  This option enables an RPC client transport capability that
-	  allows the NFS client to mount servers via an RDMA-enabled
-	  transport.
-
-	  To compile RPC client RDMA transport support as a module,
-	  choose M here: the module will be called xprtrdma.
-
-	  If unsure, say N.
-
-config SUNRPC_REGISTER_V4
-	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	default n
-	help
-	  Sun added support for registering RPC services at an IPv6
-	  address by creating two new versions of the rpcbind protocol
-	  (RFC 1833).
-
-	  This option enables support in the kernel RPC server for
-	  registering kernel RPC services via version 4 of the rpcbind
-	  protocol.  If you enable this option, you must run a portmapper
-	  daemon that supports rpcbind protocol version 4.
-
-	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
-	  requires that you enable this option and use a portmapper that
-	  supports rpcbind version 4.
-
-	  If unsure, say N to get traditional behavior (register kernel
-	  RPC services using only rpcbind version 2).  Distributions
-	  using the legacy Linux portmapper daemon must say N here.
-
-config RPCSEC_GSS_KRB5
-	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	select SUNRPC_GSS
-	select CRYPTO
-	select CRYPTO_MD5
-	select CRYPTO_DES
-	select CRYPTO_CBC
-	help
-	  Choose Y here to enable Secure RPC using the Kerberos version 5
-	  GSS-API mechanism (RFC 1964).
-
-	  Secure RPC calls with Kerberos require an auxiliary user-space
-	  daemon which may be found in the Linux nfs-utils package
-	  available from http://linux-nfs.org/.  In addition, user-space
-	  Kerberos support should be installed.
-
-	  If unsure, say N.
-
-config RPCSEC_GSS_SPKM3
-	tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	select SUNRPC_GSS
-	select CRYPTO
-	select CRYPTO_MD5
-	select CRYPTO_DES
-	select CRYPTO_CAST5
-	select CRYPTO_CBC
-	help
-	  Choose Y here to enable Secure RPC using the SPKM3 public key
-	  GSS-API mechansim (RFC 2025).
-
-	  Secure RPC calls with SPKM3 require an auxiliary userspace
-	  daemon which may be found in the Linux nfs-utils package
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
+source "net/sunrpc/Kconfig"
 
 config SMB_FS
 	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-- 
cgit v1.2.3


From 213a41d404d5ed16528df5aa0ed215adcb1e9d66 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:13:16 +0300
Subject: fs/Kconfig: move smbfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 58 +-------------------------------------------------------
 fs/smbfs/Kconfig | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 57 deletions(-)
 create mode 100644 fs/smbfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 1d7c0f6fade4..c05ccea75c3a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -265,63 +265,7 @@ config NFS_COMMON
 	default y
 
 source "net/sunrpc/Kconfig"
-
-config SMB_FS
-	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-	depends on INET
-	select NLS
-	help
-	  SMB (Server Message Block) is the protocol Windows for Workgroups
-	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-	  files and printers over local networks.  Saying Y here allows you to
-	  mount their file systems (often called "shares" in this context) and
-	  access them just like any other Unix directory.  Currently, this
-	  works only if the Windows machines use TCP/IP as the underlying
-	  transport protocol, and not NetBEUI.  For details, read
-	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>.
-
-	  Note: if you just want your box to act as an SMB *server* and make
-	  files and printing services available to Windows clients (which need
-	  to have a TCP/IP stack), you don't need to say Y here; you can use
-	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-	  for that.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile the SMB support as a module, choose M here:
-	  the module will be called smbfs.  Most people say N, however.
-
-config SMB_NLS_DEFAULT
-	bool "Use a default NLS"
-	depends on SMB_FS
-	help
-	  Enabling this will make smbfs use nls translations by default. You
-	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-	  settings and you need to give the default nls for the SMB server as
-	  CONFIG_SMB_NLS_REMOTE.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
-config SMB_NLS_REMOTE
-	string "Default Remote NLS Option"
-	depends on SMB_NLS_DEFAULT
-	default "cp437"
-	help
-	  This setting allows you to specify a default value for which
-	  codepage the server uses. If this field is left blank no
-	  translations will be done by default. The local codepage/charset
-	  default to CONFIG_NLS_DEFAULT.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
+source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
 
 config NCP_FS
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
new file mode 100644
index 000000000000..e668127c8b2e
--- /dev/null
+++ b/fs/smbfs/Kconfig
@@ -0,0 +1,55 @@
+config SMB_FS
+	tristate "SMB file system support (OBSOLETE, please use CIFS)"
+	depends on INET
+	select NLS
+	help
+	  SMB (Server Message Block) is the protocol Windows for Workgroups
+	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
+	  files and printers over local networks.  Saying Y here allows you to
+	  mount their file systems (often called "shares" in this context) and
+	  access them just like any other Unix directory.  Currently, this
+	  works only if the Windows machines use TCP/IP as the underlying
+	  transport protocol, and not NetBEUI.  For details, read
+	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>.
+
+	  Note: if you just want your box to act as an SMB *server* and make
+	  files and printing services available to Windows clients (which need
+	  to have a TCP/IP stack), you don't need to say Y here; you can use
+	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
+	  for that.
+
+	  General information about how to connect Linux, Windows machines and
+	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+	  To compile the SMB support as a module, choose M here:
+	  the module will be called smbfs.  Most people say N, however.
+
+config SMB_NLS_DEFAULT
+	bool "Use a default NLS"
+	depends on SMB_FS
+	help
+	  Enabling this will make smbfs use nls translations by default. You
+	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
+	  settings and you need to give the default nls for the SMB server as
+	  CONFIG_SMB_NLS_REMOTE.
+
+	  The nls settings can be changed at mount time, if your smbmount
+	  supports that, using the codepage and iocharset parameters.
+
+	  smbmount from samba 2.2.0 or later supports this.
+
+config SMB_NLS_REMOTE
+	string "Default Remote NLS Option"
+	depends on SMB_NLS_DEFAULT
+	default "cp437"
+	help
+	  This setting allows you to specify a default value for which
+	  codepage the server uses. If this field is left blank no
+	  translations will be done by default. The local codepage/charset
+	  default to CONFIG_NLS_DEFAULT.
+
+	  The nls settings can be changed at mount time, if your smbmount
+	  supports that, using the codepage and iocharset parameters.
+
+	  smbmount from samba 2.2.0 or later supports this.
-- 
cgit v1.2.3


From 9d7d6447ef455f4561f63bf6e8f6bef58b42a0a3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:14:15 +0300
Subject: fs/Kconfig: move the rest of ncpfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 22 ----------------------
 fs/ncpfs/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index c05ccea75c3a..86a4f1173fa6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -267,28 +267,6 @@ config NFS_COMMON
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
-
-config NCP_FS
-	tristate "NCP file system support (to mount NetWare volumes)"
-	depends on IPX!=n || INET
-	help
-	  NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
-	  used by Novell NetWare clients to talk to file servers.  It is to
-	  IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
-	  to mount NetWare file server volumes and to access them just like
-	  any other Unix directory.  For details, please read the file
-	  <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
-	  the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
-
-	  You do not have to say Y here if you want your Linux box to act as a
-	  file *server* for Novell NetWare clients.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile this as a module, choose M here: the module will be called
-	  ncpfs.  Say N unless you are connected to a Novell network.
-
 source "fs/ncpfs/Kconfig"
 
 config CODA_FS
diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig
index 142808427b25..c931cf22a1f6 100644
--- a/fs/ncpfs/Kconfig
+++ b/fs/ncpfs/Kconfig
@@ -1,6 +1,27 @@
 #
 # NCP Filesystem configuration
 #
+config NCP_FS
+	tristate "NCP file system support (to mount NetWare volumes)"
+	depends on IPX!=n || INET
+	help
+	  NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
+	  used by Novell NetWare clients to talk to file servers.  It is to
+	  IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
+	  to mount NetWare file server volumes and to access them just like
+	  any other Unix directory.  For details, please read the file
+	  <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
+	  the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
+
+	  You do not have to say Y here if you want your Linux box to act as a
+	  file *server* for Novell NetWare clients.
+
+	  General information about how to connect Linux, Windows machines and
+	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+	  To compile this as a module, choose M here: the module will be called
+	  ncpfs.  Say N unless you are connected to a Novell network.
+
 config NCPFS_PACKET_SIGNING
 	bool "Packet signatures"
 	depends on NCP_FS
-- 
cgit v1.2.3


From 33a1a6fedf08bbcb4b4df74498d697e7a88d39f2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:15:06 +0300
Subject: fs/Kconfig: move coda out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 23 +----------------------
 fs/coda/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 fs/coda/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 86a4f1173fa6..f5cd88790b0f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -268,28 +268,7 @@ source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
-
-config CODA_FS
-	tristate "Coda file system support (advanced network fs)"
-	depends on INET
-	help
-	  Coda is an advanced network file system, similar to NFS in that it
-	  enables you to mount file systems of a remote server and access them
-	  with regular Unix commands as if they were sitting on your hard
-	  disk.  Coda has several advantages over NFS: support for
-	  disconnected operation (e.g. for laptops), read/write server
-	  replication, security model for authentication and encryption,
-	  persistent client caches and write back caching.
-
-	  If you say Y here, your Linux box will be able to act as a Coda
-	  *client*.  You will need user level code as well, both for the
-	  client and server.  Servers are currently user level, i.e. they need
-	  no kernel support.  Please read
-	  <file:Documentation/filesystems/coda.txt> and check out the Coda
-	  home page <http://www.coda.cs.cmu.edu/>.
-
-	  To compile the coda client support as a module, choose M here: the
-	  module will be called coda.
+source "fs/coda/Kconfig"
 
 config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig
new file mode 100644
index 000000000000..c0e5a7fad06d
--- /dev/null
+++ b/fs/coda/Kconfig
@@ -0,0 +1,21 @@
+config CODA_FS
+	tristate "Coda file system support (advanced network fs)"
+	depends on INET
+	help
+	  Coda is an advanced network file system, similar to NFS in that it
+	  enables you to mount file systems of a remote server and access them
+	  with regular Unix commands as if they were sitting on your hard
+	  disk.  Coda has several advantages over NFS: support for
+	  disconnected operation (e.g. for laptops), read/write server
+	  replication, security model for authentication and encryption,
+	  persistent client caches and write back caching.
+
+	  If you say Y here, your Linux box will be able to act as a Coda
+	  *client*.  You will need user level code as well, both for the
+	  client and server.  Servers are currently user level, i.e. they need
+	  no kernel support.  Please read
+	  <file:Documentation/filesystems/coda.txt> and check out the Coda
+	  home page <http://www.coda.cs.cmu.edu/>.
+
+	  To compile the coda client support as a module, choose M here: the
+	  module will be called coda.
-- 
cgit v1.2.3


From b2480c7fbfed172e6ec3ba1c8e80f05a3721b24a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:16:02 +0300
Subject: fs/Kconfig: move afs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 23 +----------------------
 fs/afs/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 fs/afs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f5cd88790b0f..0563f9f1ab5e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -269,28 +269,7 @@ source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
-
-config AFS_FS
-	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
-	select AF_RXRPC
-	help
-	  If you say Y here, you will get an experimental Andrew File System
-	  driver. It currently only supports unsecured read-only AFS access.
-
-	  See <file:Documentation/filesystems/afs.txt> for more information.
-
-	  If unsure, say N.
-
-config AFS_DEBUG
-	bool "AFS dynamic debugging"
-	depends on AFS_FS
-	help
-	  Say Y here to make runtime controllable debugging messages appear.
-
-	  See <file:Documentation/filesystems/afs.txt> for more information.
-
-	  If unsure, say N.
+source "fs/afs/Kconfig"
 
 config 9P_FS
 	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
new file mode 100644
index 000000000000..e7b522fe15e1
--- /dev/null
+++ b/fs/afs/Kconfig
@@ -0,0 +1,21 @@
+config AFS_FS
+	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select AF_RXRPC
+	help
+	  If you say Y here, you will get an experimental Andrew File System
+	  driver. It currently only supports unsecured read-only AFS access.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
+
+config AFS_DEBUG
+	bool "AFS dynamic debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 0fcb44088970b18eaf2df4579d64840be6e3bf39 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:16:42 +0300
Subject: fs/Kconfig: move 9p out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/9p/Kconfig | 10 ++++++++++
 fs/Kconfig    | 12 +-----------
 2 files changed, 11 insertions(+), 11 deletions(-)
 create mode 100644 fs/9p/Kconfig

(limited to 'fs')

diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 000000000000..74e0723e90bc
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,10 @@
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+	depends on INET && NET_9P && EXPERIMENTAL
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
diff --git a/fs/Kconfig b/fs/Kconfig
index 0563f9f1ab5e..93945dd0b1ae 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -270,17 +270,7 @@ source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
 source "fs/afs/Kconfig"
-
-config 9P_FS
-	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
-	depends on INET && NET_9P && EXPERIMENTAL
-	help
-	  If you say Y here, you will get experimental support for
-	  Plan 9 resource sharing via the 9P2000 protocol.
-
-	  See <http://v9fs.sf.net> for more information.
-
-	  If unsure, say N.
+source "fs/9p/Kconfig"
 
 endif # NETWORK_FILESYSTEMS
 
-- 
cgit v1.2.3


From a717531942f488209dded30f6bc648167bcefa72 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 22 Jan 2009 09:23:10 -0500
Subject: Btrfs: do less aggressive btree readahead

Just before reading a leaf, btrfs scans the node for blocks that are
close by and reads them too.  It tries to build up a large window
of IO looking for blocks that are within a max distance from the top
and bottom of the IO window.

This patch changes things to just look for blocks within 64k of the
target block.  It will trigger less IO and make for lower latencies on
the read side.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c0776816..2603ee539b7a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1210,8 +1210,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 	struct btrfs_disk_key disk_key;
 	u32 nritems;
 	u64 search;
-	u64 lowest_read;
-	u64 highest_read;
+	u64 target;
 	u64 nread = 0;
 	int direction = path->reada;
 	struct extent_buffer *eb;
@@ -1235,8 +1234,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		return;
 	}
 
-	highest_read = search;
-	lowest_read = search;
+	target = search;
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
@@ -1256,24 +1254,15 @@ static noinline void reada_for_search(struct btrfs_root *root,
 				break;
 		}
 		search = btrfs_node_blockptr(node, nr);
-		if ((search >= lowest_read && search <= highest_read) ||
-		    (search < lowest_read && lowest_read - search <= 16384) ||
-		    (search > highest_read && search - highest_read <= 16384)) {
+		if ((search <= target && target - search <= 65536) ||
+		    (search > target && search - target <= 65536)) {
 			readahead_tree_block(root, search, blocksize,
 				     btrfs_node_ptr_generation(node, nr));
 			nread += blocksize;
 		}
 		nscan++;
-		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
+		if ((nread > 65536 || nscan > 32))
 			break;
-
-		if (nread > (256 * 1024) || nscan > 128)
-			break;
-
-		if (search < lowest_read)
-			lowest_read = search;
-		if (search > highest_read)
-			highest_read = search;
 	}
 }
 
-- 
cgit v1.2.3


From 82c1593cad3dfc97661764c8bc62aa1a416e9ea8 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 20 Jan 2009 16:46:02 +0200
Subject: UBIFS: simplify locking

This patch simplifies the lock_[23]_inodes functions. We do not have
to care about locking order, because VFS already does this for
@i_mutex and this is enough. Thanks to Al Viro for suggesting this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/dir.c | 92 +++++++++++++++++++++++-----------------------------------
 1 file changed, 36 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index d29b771cce45..f55d523c52bb 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
 }
 
 /**
- * lock_2_inodes - lock two UBIFS inodes.
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
  */
 static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-	if (inode1->i_ino < inode2->i_ino) {
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
-		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
-	} else {
-		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
-	}
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
 }
 
 /**
- * unlock_2_inodes - unlock two UBIFS inodes inodes.
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
  */
 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 	mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 
 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
 		dentry->d_name.name, inode->i_ino, dir->i_ino);
-
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = check_dir_empty(c, dentry->d_inode);
 	if (err)
 		return err;
@@ -922,59 +926,30 @@ out_budg:
 }
 
 /**
- * lock_3_inodes - lock three UBIFS inodes for rename.
+ * lock_3_inodes - a wrapper for locking three UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
  * @inode3: third inode
  *
- * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
- * be null.
+ * This function is used for 'ubifs_rename()' and @inode1 may be the same as
+ * @inode2 whereas @inode3 may be %NULL.
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
  */
 static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 			  struct inode *inode3)
 {
-	struct inode *i1, *i2, *i3;
-
-	if (!inode3) {
-		if (inode1 != inode2) {
-			lock_2_inodes(inode1, inode2);
-			return;
-		}
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
-		return;
-	}
-
-	if (inode1 == inode2) {
-		lock_2_inodes(inode1, inode3);
-		return;
-	}
-
-	/* 3 different inodes */
-	if (inode1 < inode2) {
-		i3 = inode2;
-		if (inode1 < inode3) {
-			i1 = inode1;
-			i2 = inode3;
-		} else {
-			i1 = inode3;
-			i2 = inode1;
-		}
-	} else {
-		i3 = inode1;
-		if (inode2 < inode3) {
-			i1 = inode2;
-			i2 = inode3;
-		} else {
-			i1 = inode3;
-			i2 = inode2;
-		}
-	}
-	mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
-	lock_2_inodes(i2, i3);
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	if (inode2 != inode1)
+		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+	if (inode3)
+		mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
 }
 
 /**
- * unlock_3_inodes - unlock three UBIFS inodes for rename.
+ * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
  * @inode1: first inode
  * @inode2: second inode
  * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
 			    struct inode *inode3)
 {
-	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
-	if (inode1 != inode2)
-		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
 	if (inode3)
 		mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
+	if (inode1 != inode2)
+		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 
 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		"dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
 		old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
 		new_dentry->d_name.name, new_dir->i_ino);
+	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+	if (unlink)
+		ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+
 
 	if (unlink && is_dir) {
 		err = check_dir_empty(c, new_inode);
-- 
cgit v1.2.3


From e4d9b6cbfc98d696a28d2c24a3d49768695811ee Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 23 Jan 2009 14:17:36 +0200
Subject: UBIFS: fix LEB list freeing

When freeing the c->idx_gc list, we have to release the LEBs as well,
because we might be called from the re-mount to read-only mode code.
Otherwise the LEBs stay taken forever, which may cause problems when
we re-mount back to R/W mode.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c     | 16 ++++++++++++----
 fs/ubifs/lprops.c |  8 ++++++++
 fs/ubifs/super.c  | 42 +++++++++++++++++++++++++++---------------
 fs/ubifs/ubifs.h  |  2 +-
 4 files changed, 48 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index b2e5f1133377..9760154d874b 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -830,21 +830,29 @@ out:
  * ubifs_destroy_idx_gc - destroy idx_gc list.
  * @c: UBIFS file-system description object
  *
- * This function destroys the idx_gc list. It is called when unmounting or
- * remounting read-only so locks are not needed.
+ * This function destroys the @c->idx_gc list. It is called when unmounting or
+ * remounting read-only so locks are not needed. Returns zero in case of
+ * success and a negative error code in case of failure.
  */
-void ubifs_destroy_idx_gc(struct ubifs_info *c)
+int ubifs_destroy_idx_gc(struct ubifs_info *c)
 {
+	int ret = 0;
+
 	while (!list_empty(&c->idx_gc)) {
+		int err;
 		struct ubifs_gced_idx_leb *idx_gc;
 
 		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
 				    list);
-		c->idx_gc_cnt -= 1;
+		err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
+					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
+		if (err && !ret)
+			ret = err;
 		list_del(&idx_gc->list);
 		kfree(idx_gc);
 	}
 
+	return ret;
 }
 
 /**
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index dfd2bcece27a..68328c59762b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 
 out:
 	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err("cannot change properties of LEB %d, error %d",
+			  lnum, err);
 	return err;
 }
 
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 
 out:
 	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err("cannot update properties of LEB %d, error %d",
+			  lnum, err);
 	return err;
 }
 
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
 	lpp = ubifs_lpt_lookup(c, lnum);
 	if (IS_ERR(lpp)) {
 		err = PTR_ERR(lpp);
+		ubifs_err("cannot read properties of LEB %d, error %d",
+			  lnum, err);
 		goto out;
 	}
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index da99da098efd..807bbd3c8b4b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1469,9 +1469,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 {
 	int err, lnum;
 
-	if (c->ro_media)
-		return -EINVAL;
-
 	mutex_lock(&c->umount_mutex);
 	c->remounting_rw = 1;
 	c->always_chk_crc = 1;
@@ -1605,9 +1602,13 @@ out:
  */
 static void commit_on_unmount(struct ubifs_info *c)
 {
-	struct super_block *sb = c->vfs_sb;
 	long long bud_bytes;
 
+	if (!c->fast_unmount) {
+		dbg_gen("skip committing - fast unmount enabled");
+		return;
+	}
+
 	/*
 	 * This function is called before the background thread is stopped, so
 	 * we may race with ongoing commit, which means we have to take
@@ -1617,8 +1618,11 @@ static void commit_on_unmount(struct ubifs_info *c)
 	bud_bytes = c->bud_bytes;
 	spin_unlock(&c->buds_lock);
 
-	if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
+	if (bud_bytes) {
+		dbg_gen("run commit");
 		ubifs_run_commit(c);
+	} else
+		dbg_gen("journal is empty, do not run commit");
 }
 
 /**
@@ -1633,6 +1637,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
+	ubifs_assert(!c->ro_media);
+
 	commit_on_unmount(c);
 
 	mutex_lock(&c->umount_mutex);
@@ -1646,16 +1652,17 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 		del_timer_sync(&c->jheads[i].wbuf.timer);
 	}
 
-	if (!c->ro_media) {
-		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
-		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
-		c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
-		err = ubifs_write_master(c);
-		if (err)
-			ubifs_ro_mode(c, err);
-	}
+	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+	c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+	err = ubifs_write_master(c);
+	if (err)
+		ubifs_ro_mode(c, err);
+
+	err = ubifs_destroy_idx_gc(c);
+	if (err)
+		ubifs_ro_mode(c, err);
 
-	ubifs_destroy_idx_gc(c);
 	free_wbufs(c);
 	vfree(c->orph_buf);
 	c->orph_buf = NULL;
@@ -1754,6 +1761,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+		if (c->ro_media) {
+			ubifs_msg("cannot re-mount R/W, UBIFS is working in "
+				  "R/O mode");
+			return -EINVAL;
+		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
@@ -2044,7 +2056,7 @@ static void ubifs_kill_sb(struct super_block *sb)
 	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
 	 * in order to be outside BKL.
 	 */
-	if (sb->s_root)
+	if (sb->s_root && !(sb->s_flags & MS_RDONLY))
 		commit_on_unmount(c);
 	/* The un-mount routine is actually done in put_super() */
 	generic_shutdown_super(sb);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2e78d6ac007e..ee9517a7b024 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1593,7 +1593,7 @@ int ubifs_replay_journal(struct ubifs_info *c);
 int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
 int ubifs_gc_start_commit(struct ubifs_info *c);
 int ubifs_gc_end_commit(struct ubifs_info *c);
-void ubifs_destroy_idx_gc(struct ubifs_info *c);
+int ubifs_destroy_idx_gc(struct ubifs_info *c);
 int ubifs_get_idx_gc_leb(struct ubifs_info *c);
 int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
 
-- 
cgit v1.2.3


From 84abf972ccff5c13d10b672972949eba431a6e0e Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 23 Jan 2009 14:54:59 +0200
Subject: UBIFS: add re-mount debugging checks

We observe corrupted space accounting when re-mounting. So add some
debugging checks to catch problems like this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c |  35 +++++++++++-----
 fs/ubifs/debug.c  | 120 +++++++++++++++++++++++++++++++++++++++---------------
 fs/ubifs/debug.h  |  36 +++++++++-------
 fs/ubifs/file.c   |   1 -
 fs/ubifs/lprops.c |   4 +-
 fs/ubifs/super.c  |  14 +++++--
 fs/ubifs/ubifs.h  |   3 +-
 7 files changed, 148 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 175f9c590b77..f393620890ee 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 }
 
 /**
- * ubifs_get_free_space - return amount of free space.
+ * ubifs_get_free_space_nolock - return amount of free space.
  * @c: UBIFS file-system description object
  *
  * This function calculates amount of free space to report to user-space.
@@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * traditional file-systems, because they have way less overhead than UBIFS.
  * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
-long long ubifs_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 {
-	int min_idx_lebs, rsvd_idx_lebs, lebs;
+	int rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
-	spin_lock(&c->space_lock);
-	min_idx_lebs = c->min_idx_lebs;
-	ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	available = ubifs_calc_available(c, min_idx_lebs);
+	available = ubifs_calc_available(c, c->min_idx_lebs);
 
 	/*
 	 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 	 * Note, the calculations below are similar to what we have in
 	 * 'do_budget_space()', so refer there for comments.
 	 */
-	if (min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+	if (c->min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
 	else
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
 	lebs -= rsvd_idx_lebs;
 	available += lebs * (c->dark_wm - c->leb_overhead);
-	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
 		free = ubifs_reported_space(c, available - outstanding);
@@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 		free = 0;
 	return free;
 }
+
+/**
+ * ubifs_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns amount of free space to report to
+ * user-space.
+ */
+long long ubifs_get_free_space(struct ubifs_info *c)
+{
+	long long free;
+
+	spin_lock(&c->space_lock);
+	free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+
+	return free;
+}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 792c5a16c182..9a41f6f245b7 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
 	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
 	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
 	       c->gc_lnum, c->ihead_lnum);
-	for (i = 0; i < c->jhead_cnt; i++)
-		printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
-		       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+	/* If we are in R/O mode, journal heads do not exist */
+	if (c->jheads)
+		for (i = 0; i < c->jhead_cnt; i++)
+			printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+			       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
 	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
 		bud = rb_entry(rb, struct ubifs_bud, rb);
 		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c)
 	/* Print budgeting predictions */
 	available = ubifs_calc_available(c, c->min_idx_lebs);
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	if (available > outstanding)
-		free = ubifs_reported_space(c, available - outstanding);
-	else
-		free = 0;
+	free = ubifs_get_free_space_nolock(c);
 	printk(KERN_DEBUG "Budgeting predictions:\n");
 	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
 	       available, outstanding, free);
@@ -860,6 +859,65 @@ void dbg_dump_index(struct ubifs_info *c)
 	dbg_walk_index(c, NULL, dump_znode, NULL);
 }
 
+/**
+ * dbg_save_space_info - save information about flash space.
+ * @c: UBIFS file-system description object
+ *
+ * This function saves information about UBIFS free space, dirty space, etc, in
+ * order to check it later.
+ */
+void dbg_save_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+
+	ubifs_get_lp_stats(c, &d->saved_lst);
+
+	spin_lock(&c->space_lock);
+	d->saved_free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * dbg_check_space_info - check flash space information.
+ * @c: UBIFS file-system description object
+ *
+ * This function compares current flash space information with the information
+ * which was saved when the 'dbg_save_space_info()' function was called.
+ * Returns zero if the information has not changed, and %-EINVAL if it has
+ * changed.
+ */
+int dbg_check_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+	struct ubifs_lp_stats lst;
+	long long avail, free;
+
+	spin_lock(&c->space_lock);
+	avail = ubifs_calc_available(c, c->min_idx_lebs);
+	spin_unlock(&c->space_lock);
+	free = ubifs_get_free_space(c);
+
+	if (free != d->saved_free) {
+		ubifs_err("free space changed from %lld to %lld",
+			  d->saved_free, free);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_msg("saved lprops statistics dump");
+	dbg_dump_lstats(&d->saved_lst);
+	ubifs_get_lp_stats(c, &lst);
+	ubifs_msg("current lprops statistics dump");
+	dbg_dump_lstats(&lst);
+	spin_lock(&c->space_lock);
+	dbg_dump_budg(c);
+	spin_unlock(&c->space_lock);
+	dump_stack();
+	return -EINVAL;
+}
+
 /**
  * dbg_check_synced_i_size - check synchronized inode size.
  * @inode: inode to check
@@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c)
  * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
  * contain the stuff specific to particular file-system mounts.
  */
-static struct dentry *debugfs_rootdir;
+static struct dentry *dfs_rootdir;
 
 /**
  * dbg_debugfs_init - initialize debugfs file-system.
@@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir;
  */
 int dbg_debugfs_init(void)
 {
-	debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
-	if (IS_ERR(debugfs_rootdir)) {
-		int err = PTR_ERR(debugfs_rootdir);
+	dfs_rootdir = debugfs_create_dir("ubifs", NULL);
+	if (IS_ERR(dfs_rootdir)) {
+		int err = PTR_ERR(dfs_rootdir);
 		ubifs_err("cannot create \"ubifs\" debugfs directory, "
 			  "error %d\n", err);
 		return err;
@@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void)
  */
 void dbg_debugfs_exit(void)
 {
-	debugfs_remove(debugfs_rootdir);
+	debugfs_remove(dfs_rootdir);
 }
 
 static int open_debugfs_file(struct inode *inode, struct file *file)
@@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 	struct ubifs_info *c = file->private_data;
 	struct ubifs_debug_info *d = c->dbg;
 
-	if (file->f_path.dentry == d->dump_lprops)
+	if (file->f_path.dentry == d->dfs_dump_lprops)
 		dbg_dump_lprops(c);
-	else if (file->f_path.dentry == d->dump_budg) {
+	else if (file->f_path.dentry == d->dfs_dump_budg) {
 		spin_lock(&c->space_lock);
 		dbg_dump_budg(c);
 		spin_unlock(&c->space_lock);
-	} else if (file->f_path.dentry == d->dump_tnc) {
+	} else if (file->f_path.dentry == d->dfs_dump_tnc) {
 		mutex_lock(&c->tnc_mutex);
 		dbg_dump_tnc(c);
 		mutex_unlock(&c->tnc_mutex);
@@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 	return count;
 }
 
-static const struct file_operations debugfs_fops = {
+static const struct file_operations dfs_fops = {
 	.open = open_debugfs_file,
 	.write = write_debugfs_file,
 	.owner = THIS_MODULE,
@@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
 	struct dentry *dent;
 	struct ubifs_debug_info *d = c->dbg;
 
-	sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
-	d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
-					      debugfs_rootdir);
-	if (IS_ERR(d->debugfs_dir)) {
-		err = PTR_ERR(d->debugfs_dir);
+	sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+	d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
+	if (IS_ERR(d->dfs_dir)) {
+		err = PTR_ERR(d->dfs_dir);
 		ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
-			  d->debugfs_dir_name, err);
+			  d->dfs_dir_name, err);
 		goto out;
 	}
 
 	fname = "dump_lprops";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_lprops = dent;
+	d->dfs_dump_lprops = dent;
 
 	fname = "dump_budg";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_budg = dent;
+	d->dfs_dump_budg = dent;
 
 	fname = "dump_tnc";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_tnc = dent;
+	d->dfs_dump_tnc = dent;
 
 	return 0;
 
@@ -2531,7 +2585,7 @@ out_remove:
 	err = PTR_ERR(dent);
 	ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
 		  fname, err);
-	debugfs_remove_recursive(d->debugfs_dir);
+	debugfs_remove_recursive(d->dfs_dir);
 out:
 	return err;
 }
@@ -2542,7 +2596,7 @@ out:
  */
 void dbg_debugfs_exit_fs(struct ubifs_info *c)
 {
-	debugfs_remove_recursive(c->dbg->debugfs_dir);
+	debugfs_remove_recursive(c->dbg->dfs_dir);
 }
 
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9820d6999f7e..c1cd73b2e06e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -41,15 +41,17 @@
  * @chk_lpt_wastage: used by LPT tree size checker
  * @chk_lpt_lebs: used by LPT tree size checker
  * @new_nhead_offs: used by LPT tree size checker
- * @new_ihead_lnum: used by debugging to check ihead_lnum
- * @new_ihead_offs: used by debugging to check ihead_offs
+ * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
+ * @new_ihead_offs: used by debugging to check @c->ihead_offs
  *
- * debugfs_dir_name: name of debugfs directory containing this file-system's
- *                   files
- * debugfs_dir: direntry object of the file-system debugfs directory
- * dump_lprops: "dump lprops" debugfs knob
- * dump_budg: "dump budgeting information" debugfs knob
- * dump_tnc: "dump TNC" debugfs knob
+ * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
+ * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ *
+ * dfs_dir_name: name of debugfs directory containing this file-system's files
+ * dfs_dir: direntry object of the file-system debugfs directory
+ * dfs_dump_lprops: "dump lprops" debugfs knob
+ * dfs_dump_budg: "dump budgeting information" debugfs knob
+ * dfs_dump_tnc: "dump TNC" debugfs knob
  */
 struct ubifs_debug_info {
 	void *buf;
@@ -69,11 +71,14 @@ struct ubifs_debug_info {
 	int new_ihead_lnum;
 	int new_ihead_offs;
 
-	char debugfs_dir_name[100];
-	struct dentry *debugfs_dir;
-	struct dentry *dump_lprops;
-	struct dentry *dump_budg;
-	struct dentry *dump_tnc;
+	struct ubifs_lp_stats saved_lst;
+	long long saved_free;
+
+	char dfs_dir_name[100];
+	struct dentry *dfs_dir;
+	struct dentry *dfs_dump_lprops;
+	struct dentry *dfs_dump_budg;
+	struct dentry *dfs_dump_tnc;
 };
 
 #define ubifs_assert(expr) do {                                                \
@@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
 		   dbg_znode_callback znode_cb, void *priv);
 
 /* Checking functions */
-
+void dbg_save_space_info(struct ubifs_info *c);
+int dbg_check_space_info(struct ubifs_info *c);
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 
 #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
+#define dbg_save_space_info(c)                     ({})
+#define dbg_check_space_info(c)                    0
 #define dbg_check_old_index(c, zroot)              0
 #define dbg_check_cats(c)                          0
 #define dbg_check_ltab(c)                          0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 17443d97e6f1..93b6de51f261 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
 	struct page *page;
 
-
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
 
 	if (unlikely(c->ro_media))
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 68328c59762b..4cdd284dea56 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
  * @c: UBIFS file-system description object
  * @st: return statistics
  */
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
 {
 	spin_lock(&c->space_lock);
-	memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
+	memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
 	spin_unlock(&c->space_lock);
 }
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 807bbd3c8b4b..5c814a71f33a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1470,6 +1470,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	int err, lnum;
 
 	mutex_lock(&c->umount_mutex);
+	dbg_save_space_info(c);
 	c->remounting_rw = 1;
 	c->always_chk_crc = 1;
 
@@ -1573,8 +1574,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	c->vfs_sb->s_flags &= ~MS_RDONLY;
 	c->remounting_rw = 0;
 	c->always_chk_crc = 0;
+	err = dbg_check_space_info(c);
 	mutex_unlock(&c->umount_mutex);
-	return 0;
+	return err;
 
 out:
 	vfree(c->orph_buf);
@@ -1629,8 +1631,8 @@ static void commit_on_unmount(struct ubifs_info *c)
  * ubifs_remount_ro - re-mount in read-only mode.
  * @c: UBIFS file-system description object
  *
- * We rely on VFS to have stopped writing. Possibly the background thread could
- * be running a commit, however kthread_stop will wait in that case.
+ * We assume VFS has stopped writing. Possibly the background thread could be
+ * running a commit, however kthread_stop will wait in that case.
  */
 static void ubifs_remount_ro(struct ubifs_info *c)
 {
@@ -1640,13 +1642,14 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	ubifs_assert(!c->ro_media);
 
 	commit_on_unmount(c);
-
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
 		kthread_stop(c->bgt);
 		c->bgt = NULL;
 	}
 
+	dbg_save_space_info(c);
+
 	for (i = 0; i < c->jhead_cnt; i++) {
 		ubifs_wbuf_sync(&c->jheads[i].wbuf);
 		del_timer_sync(&c->jheads[i].wbuf.timer);
@@ -1669,6 +1672,9 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
+	err = dbg_check_space_info(c);
+	if (err)
+		ubifs_ro_mode(c, err);
 	mutex_unlock(&c->umount_mutex);
 }
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ee9517a7b024..f1754354029f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1495,6 +1495,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 			 struct ubifs_budget_req *req);
 long long ubifs_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space_nolock(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
 long long ubifs_reported_space(const struct ubifs_info *c, long long free);
@@ -1646,7 +1647,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 					   const struct ubifs_lprops *lp,
 					   int free, int dirty, int flags,
 					   int idx_gc_cnt);
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
 void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 		      int cat);
 void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
-- 
cgit v1.2.3


From b4978e949104844224ecf786170c9263efa601f3 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 23 Jan 2009 18:23:03 +0200
Subject: UBIFS: always clean up GC LEB space

When we mount UBIFS, GC LEB may contain out-of-date information,
and UBIFS should update lprops and set free space for that LEB.
Currently UBIFS does this only if mounted R/W. But for R/O mount
we have to do the same, because otherwise we will have incorrect
FS free space reported to user-space.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5c814a71f33a..336073e4c391 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_namelen = UBIFS_MAX_NLEN;
 	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
 	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
+	ubifs_assert(buf->f_bfree <= c->block_cnt);
 	return 0;
 }
 
@@ -735,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c)
  * take_gc_lnum - reserve GC LEB.
  * @c: UBIFS file-system description object
  *
- * This function ensures that the LEB reserved for garbage collection is
- * unmapped and is marked as "taken" in lprops. We also have to set free space
- * to LEB size and dirty space to zero, because lprops may contain out-of-date
- * information if the file-system was un-mounted before it has been committed.
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function ensures that the LEB reserved for garbage collection is marked
+ * as "taken" in lprops. We also have to set free space to LEB size and dirty
+ * space to zero, because lprops may contain out-of-date information if the
+ * file-system was un-mounted before it has been committed. This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
  */
 static int take_gc_lnum(struct ubifs_info *c)
 {
@@ -751,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
 		return -EINVAL;
 	}
 
-	err = ubifs_leb_unmap(c, c->gc_lnum);
-	if (err)
-		return err;
-
 	/* And we have to tell lprops that this LEB is taken */
 	err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
 				  LPROPS_TAKEN, 0, 0);
@@ -1280,10 +1277,19 @@ static int mount_ubifs(struct ubifs_info *c)
 			if (err)
 				goto out_orphans;
 			err = ubifs_rcvry_gc_commit(c);
-		} else
+		} else {
 			err = take_gc_lnum(c);
+			if (err)
+				goto out_orphans;
+
+			/*
+			 * GC LEB may contain garbage if there was an unclean
+			 * reboot, and it should be un-mapped.
+			 */
+			err = ubifs_leb_unmap(c, c->gc_lnum);
+		}
 		if (err)
 			goto out_orphans;
 
 		err = dbg_check_lprops(c);
 		if (err)
@@ -1292,6 +1298,16 @@ static int mount_ubifs(struct ubifs_info *c)
 		err = ubifs_recover_size(c);
 		if (err)
 			goto out_orphans;
+	} else {
+		/*
+		 * Even if we mount read-only, we have to set space in GC LEB
+		 * to proper value because this affects UBIFS free space
+		 * reporting. We do not want to have a situation when
+		 * re-mounting from R/O to R/W changes amount of free space.
+		 */
+		err = take_gc_lnum(c);
+		if (err)
+			goto out_orphans;
 	}
 
 	spin_lock(&ubifs_infos_lock);
@@ -1316,6 +1332,8 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_infos;
 
 	c->always_chk_crc = 0;
+	/* GC LEB has to be empty and taken at this point */
+	ubifs_assert(c->lst.taken_empty_lebs == 1);
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1561,7 +1579,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	if (c->need_recovery)
 		err = ubifs_rcvry_gc_commit(c);
 	else
-		err = take_gc_lnum(c);
+		err = ubifs_leb_unmap(c, c->gc_lnum);
 	if (err)
 		goto out;
 
@@ -1786,6 +1804,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		c->bu.buf = NULL;
 	}
 
+	ubifs_assert(c->lst.taken_empty_lebs == 1);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 49d128aa60751a010640f4763d11577e2f508853 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Mon, 26 Jan 2009 10:55:40 +0200
Subject: UBIFS: ensure orphan area head is initialized

When mounting read-only the orphan area head is
not initialized.  It must be initialized when
remounting read/write, but it was not.  This patch
fixes that.

[Artem: sorry, added comment tweaking noise]
Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/orphan.c | 38 +++++++++++++++++++-------------------
 fs/ubifs/super.c  |  6 ++++++
 fs/ubifs/ubifs.h  |  1 +
 3 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9e6f403f170e..152a7b34a141 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
  * Orphans are accumulated in a rb-tree. When an inode's link count drops to
  * zero, the inode number is added to the rb-tree. It is removed from the tree
  * when the inode is deleted.  Any new orphans that are in the orphan tree when
- * the commit is run, are written to the orphan area in 1 or more orph nodes.
+ * the commit is run, are written to the orphan area in 1 or more orphan nodes.
  * If the orphan area is full, it is consolidated to make space.  There is
  * always enough space because validation prevents the user from creating more
  * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
 }
 
 /**
- * do_write_orph_node - write a node
+ * do_write_orph_node - write a node to the orphan head.
  * @c: UBIFS file-system description object
  * @len: length of node
  * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
 }
 
 /**
- * write_orph_node - write an orph node
+ * write_orph_node - write an orphan node.
  * @c: UBIFS file-system description object
  * @atomic: write atomically
  *
- * This function builds an orph node from the cnext list and writes it to the
+ * This function builds an orphan node from the cnext list and writes it to the
  * orphan head. On success, %0 is returned, otherwise a negative error code
  * is returned.
  */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
 }
 
 /**
- * write_orph_nodes - write orph nodes until there are no more to commit
+ * write_orph_nodes - write orphan nodes until there are no more to commit.
  * @c: UBIFS file-system description object
  * @atomic: write atomically
  *
- * This function writes orph nodes for all the orphans to commit. On success,
+ * This function writes orphan nodes for all the orphans to commit. On success,
  * %0 is returned, otherwise a negative error code is returned.
  */
 static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
 }
 
 /**
- * clear_orphans - erase all LEBs used for orphans.
+ * ubifs_clear_orphans - erase all LEBs used for orphans.
  * @c: UBIFS file-system description object
  *
  * If recovery is not required, then the orphans from the previous session
  * are not needed. This function locates the LEBs used to record
  * orphans, and un-maps them.
  */
-static int clear_orphans(struct ubifs_info *c)
+int ubifs_clear_orphans(struct ubifs_info *c)
 {
 	int lnum, err;
 
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
  * do_kill_orphans - remove orphan inodes from the index.
  * @c: UBIFS file-system description object
  * @sleb: scanned LEB
- * @last_cmt_no: cmt_no of last orph node read is passed and returned here
+ * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
  * @outofdate: whether the LEB is out of date is returned here
- * @last_flagged: whether the end orph node is encountered
+ * @last_flagged: whether the end orphan node is encountered
  *
  * This function is a helper to the 'kill_orphans()' function. It goes through
  * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		/*
 		 * The commit number on the master node may be less, because
 		 * of a failed commit. If there are several failed commits in a
-		 * row, the commit number written on orph nodes will continue to
-		 * increase (because the commit number is adjusted here) even
+		 * row, the commit number written on orphan nodes will continue
+		 * to increase (because the commit number is adjusted here) even
 		 * though the commit number on the master node stays the same
 		 * because the master node has not been re-written.
 		 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 			c->cmt_no = cmt_no;
 		if (cmt_no < *last_cmt_no && *last_flagged) {
 			/*
-			 * The last orph node had a higher commit number and was
-			 * flagged as the last written for that commit number.
-			 * That makes this orph node, out of date.
+			 * The last orphan node had a higher commit number and
+			 * was flagged as the last written for that commit
+			 * number. That makes this orphan node, out of date.
 			 */
 			if (!first) {
 				ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
 	/*
 	 * Orph nodes always start at c->orph_first and are written to each
 	 * successive LEB in turn. Generally unused LEBs will have been unmapped
-	 * but may contain out of date orph nodes if the unmap didn't go
-	 * through. In addition, the last orph node written for each commit is
+	 * but may contain out of date orphan nodes if the unmap didn't go
+	 * through. In addition, the last orphan node written for each commit is
 	 * marked (top bit of orph->cmt_no is set to 1). It is possible that
-	 * there are orph nodes from the next commit (i.e. the commit did not
+	 * there are orphan nodes from the next commit (i.e. the commit did not
 	 * complete successfully). In that case, no orphans will have been lost
 	 * due to the way that orphans are written, and any orphans added will
 	 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
 	if (unclean)
 		err = kill_orphans(c);
 	else if (!read_only)
-		err = clear_orphans(c);
+		err = ubifs_clear_orphans(c);
 
 	return err;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 336073e4c391..fd7fc7f3b7a6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1524,6 +1524,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		err = ubifs_recover_inl_heads(c, c->sbuf);
 		if (err)
 			goto out;
+	} else {
+		/* A readonly mount is not allowed to have orphans */
+		ubifs_assert(c->tot_orphans == 0);
+		err = ubifs_clear_orphans(c);
+		if (err)
+			goto out;
 	}
 
 	if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f1754354029f..9999ff0aaa43 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1604,6 +1604,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
 int ubifs_orphan_start_commit(struct ubifs_info *c);
 int ubifs_orphan_end_commit(struct ubifs_info *c);
 int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+int ubifs_clear_orphans(struct ubifs_info *c);
 
 /* lpt.c */
 int ubifs_calc_lpt_geom(struct ubifs_info *c);
-- 
cgit v1.2.3


From bb875b38dc5e343bdb696b2eab8233e4d195e208 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 26 Jan 2009 15:00:58 +0100
Subject: fuse: fix NULL deref in fuse_file_alloc()

ff is set to NULL and then dereferenced on line 65.  Compile tested only.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e8162646a9b5..d9fdb7cec538 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -54,7 +54,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 		ff->reserved_req = fuse_request_alloc();
 		if (!ff->reserved_req) {
 			kfree(ff);
-			ff = NULL;
+			return NULL;
 		} else {
 			INIT_LIST_HEAD(&ff->write_entry);
 			atomic_set(&ff->count, 0);
-- 
cgit v1.2.3


From 3ddf1e7f57237ac7c5d5bfb7058f1ea4f970b661 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:58 +0100
Subject: fuse: fix missing fput on error

Fix the leaking file reference if allocation or initialization of
fuse_conn failed.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 fs/fuse/inode.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 47c96fdca1ac..6893717b6536 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -829,15 +829,20 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		return -EINVAL;
 
-	if (file->f_op != &fuse_dev_operations)
+	if (file->f_op != &fuse_dev_operations) {
+		fput(file);
 		return -EINVAL;
+	}
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
-	if (!fc)
+	if (!fc) {
+		fput(file);
 		return -ENOMEM;
+	}
 
 	err = fuse_conn_init(fc, sb);
 	if (err) {
+		fput(file);
 		kfree(fc);
 		return err;
 	}
-- 
cgit v1.2.3


From c2b8f006909b9bf9e165dfdf3c378527938c4497 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:58 +0100
Subject: fuse: fuse_fill_super error handling cleanup

Clean up error handling for the whole of fuse_fill_super() function.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/inode.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6893717b6536..dc649f6bc3e5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -805,16 +805,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	int err;
 	int is_bdev = sb->s_bdev != NULL;
 
+	err = -EINVAL;
 	if (sb->s_flags & MS_MANDLOCK)
-		return -EINVAL;
+		goto err;
 
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
-		return -EINVAL;
+		goto err;
 
 	if (is_bdev) {
 #ifdef CONFIG_BLOCK
+		err = -EINVAL;
 		if (!sb_set_blocksize(sb, d.blksize))
-			return -EINVAL;
+			goto err;
 #endif
 	} else {
 		sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -826,25 +828,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_export_op = &fuse_export_operations;
 
 	file = fget(d.fd);
+	err = -EINVAL;
 	if (!file)
-		return -EINVAL;
+		goto err;
 
-	if (file->f_op != &fuse_dev_operations) {
-		fput(file);
-		return -EINVAL;
-	}
+	if (file->f_op != &fuse_dev_operations)
+		goto err_fput;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
-	if (!fc) {
-		fput(file);
-		return -ENOMEM;
-	}
+	err = -ENOMEM;
+	if (!fc)
+		goto err_fput;
 
 	err = fuse_conn_init(fc, sb);
 	if (err) {
-		fput(file);
 		kfree(fc);
-		return err;
+		goto err_fput;
 	}
 
 	fc->release = fuse_free_conn;
@@ -859,12 +858,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	err = -ENOMEM;
 	root = fuse_get_root_inode(sb, d.rootmode);
 	if (!root)
-		goto err;
+		goto err_put_conn;
 
 	root_dentry = d_alloc_root(root);
 	if (!root_dentry) {
 		iput(root);
-		goto err;
+		goto err_put_conn;
 	}
 
 	init_req = fuse_request_alloc();
@@ -908,9 +907,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
- err:
-	fput(file);
+ err_put_conn:
 	fuse_conn_put(fc);
+ err_fput:
+	fput(file);
+ err:
 	return err;
 }
 
-- 
cgit v1.2.3


From 26c3679101dbccc054dcf370143941844ba70531 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:59 +0100
Subject: fuse: destroy bdi on umount

If a fuse filesystem is unmounted but the device file descriptor
remains open and a new mount reuses the old device number, then the
mount fails with EEXIST and the following warning is printed in the
kernel log:

  WARNING: at fs/sysfs/dir.c:462 sysfs_add_one+0x35/0x3d()
  sysfs: duplicate filename '0:15' can not be created

The cause is that the bdi belonging to the fuse filesystem was
destroyed only after the device file was released.  Fix this by
calling bdi_destroy() from fuse_put_super() instead.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 fs/fuse/dev.c   | 3 ++-
 fs/fuse/inode.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e0c7ada08a1f..c4a3d9bbdaa8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -281,7 +281,8 @@ __releases(&fc->lock)
 			fc->blocked = 0;
 			wake_up_all(&fc->blocked_waitq);
 		}
-		if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
+		if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+		    fc->connected) {
 			clear_bdi_congested(&fc->bdi, READ);
 			clear_bdi_congested(&fc->bdi, WRITE);
 		}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index dc649f6bc3e5..459b73dd45e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -292,6 +292,7 @@ static void fuse_put_super(struct super_block *sb)
 	list_del(&fc->entry);
 	fuse_ctl_remove_conn(fc);
 	mutex_unlock(&fuse_mutex);
+	bdi_destroy(&fc->bdi);
 	fuse_conn_put(fc);
 }
 
@@ -532,7 +533,6 @@ void fuse_conn_put(struct fuse_conn *fc)
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
-		bdi_destroy(&fc->bdi);
 		fc->release(fc);
 	}
 }
-- 
cgit v1.2.3


From f6d47a1761896dcd89e3184399a8962dff17267d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:59 +0100
Subject: fuse: fix poll notify

Move fuse_copy_finish() to before calling fuse_notify_poll_wakeup().
This is not a big issue because fuse_notify_poll_wakeup() should be
atomic, but it's cleaner this way, and later uses of notification will
need to be able to finish the copying before performing some actions.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c4a3d9bbdaa8..ba76b68c52ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -826,16 +826,21 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
 			    struct fuse_copy_state *cs)
 {
 	struct fuse_notify_poll_wakeup_out outarg;
-	int err;
+	int err = -EINVAL;
 
 	if (size != sizeof(outarg))
-		return -EINVAL;
+		goto err;
 
 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		return err;
+		goto err;
 
+	fuse_copy_finish(cs);
 	return fuse_notify_poll_wakeup(fc, &outarg);
+
+err:
+	fuse_copy_finish(cs);
+	return err;
 }
 
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
@@ -846,6 +851,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		return fuse_notify_poll(fc, size, cs);
 
 	default:
+		fuse_copy_finish(cs);
 		return -EINVAL;
 	}
 }
@@ -924,7 +930,6 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 	 */
 	if (!oh.unique) {
 		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
-		fuse_copy_finish(&cs);
 		return err ? err : nbytes;
 	}
 
-- 
cgit v1.2.3


From 6ba87c9b920bea8c2703308d31eb7de925242c30 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 26 Jan 2009 16:12:20 +0200
Subject: UBIFS: fix assertions

I introduced wrong assertions in one of the previous commits; this
patch fixes them.

Also, initialize debugfs after the debugging check. This is a little
nicer because we want the FS data to be accessible to external users
after everything has been initialized.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index fd7fc7f3b7a6..dbfc88714716 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1320,20 +1320,21 @@ static int mount_ubifs(struct ubifs_info *c)
 		else {
 			c->need_recovery = 0;
 			ubifs_msg("recovery completed");
+			/* GC LEB has to be empty and taken at this point */
+			ubifs_assert(c->lst.taken_empty_lebs == 1);
 		}
-	}
+	} else
+		ubifs_assert(c->lst.taken_empty_lebs == 1);
 
-	err = dbg_debugfs_init_fs(c);
+	err = dbg_check_filesystem(c);
 	if (err)
 		goto out_infos;
 
-	err = dbg_check_filesystem(c);
+	err = dbg_debugfs_init_fs(c);
 	if (err)
 		goto out_infos;
 
 	c->always_chk_crc = 0;
-	/* GC LEB has to be empty and taken at this point */
-	ubifs_assert(c->lst.taken_empty_lebs == 1);
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1663,7 +1664,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 
 	commit_on_unmount(c);
 	mutex_lock(&c->umount_mutex);
-- 
cgit v1.2.3


From 3632dee2f8b8a9720329f29eeaa4ec4669a3aff8 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Thu, 22 Jan 2009 15:29:45 +0100
Subject: inotify: clean up inotify_read and fix locking problems

If userspace supplies an invalid pointer to a read() of an inotify
instance, the inotify device's event list mutex is unlocked twice.
This causes an imbalance which effectively leaves the data structure
unprotected, and we can trigger oopses by accessing the inotify
instance from different tasks concurrently.

The best fix (contributed largely by Linus) is a total rewrite
of the function in question:

On Thu, Jan 22, 2009 at 7:05 AM, Linus Torvalds wrote:
> The thing to notice is that:
>
>  - locking is done in just one place, and there is no question about it
>   not having an unlock.
>
>  - that whole double-while(1)-loop thing is gone.
>
>  - use multiple functions to make nesting and error handling sane
>
>  - do error testing after doing the things you always need to do, ie do
>   this:
>
>        mutex_lock(..)
>        ret = function_call();
>        mutex_unlock(..)
>
>        .. test ret here ..
>
>   instead of doing conditional exits with unlocking or freeing.
>
> So if the code is written in this way, it may still be buggy, but at least
> it's not buggy because of subtle "forgot to unlock" or "forgot to free"
> issues.
>
> This _always_ unlocks if it locked, and it always frees if it got a
> non-error kevent.

Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Robert Love <rlove@google.com>
Cc: <stable@kernel.org>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_user.c | 135 +++++++++++++++++++++------------------
 1 file changed, 74 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d53a1838d6e8..bed766e435b5 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -427,10 +427,61 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 	return ret;
 }
 
+/*
+ * Get an inotify_kernel_event if one exists and is small
+ * enough to fit in "count". Return an error pointer if
+ * not large enough.
+ *
+ * Called with the device ev_mutex held.
+ */
+static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
+						  size_t count)
+{
+	size_t event_size = sizeof(struct inotify_event);
+	struct inotify_kernel_event *kevent;
+
+	if (list_empty(&dev->events))
+		return NULL;
+
+	kevent = inotify_dev_get_event(dev);
+	if (kevent->name)
+		event_size += kevent->event.len;
+
+	if (event_size > count)
+		return ERR_PTR(-EINVAL);
+
+	remove_kevent(dev, kevent);
+	return kevent;
+}
+
+/*
+ * Copy an event to user space, returning how much we copied.
+ *
+ * We already checked that the event size is smaller than the
+ * buffer we had in "get_one_event()" above.
+ */
+static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
+				  char __user *buf)
+{
+	size_t event_size = sizeof(struct inotify_event);
+
+	if (copy_to_user(buf, &kevent->event, event_size))
+		return -EFAULT;
+
+	if (kevent->name) {
+		buf += event_size;
+
+		if (copy_to_user(buf, kevent->name, kevent->event.len))
+			return -EFAULT;
+
+		event_size += kevent->event.len;
+	}
+	return event_size;
+}
+
 static ssize_t inotify_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *pos)
 {
-	size_t event_size = sizeof (struct inotify_event);
 	struct inotify_device *dev;
 	char __user *start;
 	int ret;
@@ -440,81 +491,43 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	dev = file->private_data;
 
 	while (1) {
+		struct inotify_kernel_event *kevent;
 
 		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
 
 		mutex_lock(&dev->ev_mutex);
-		if (!list_empty(&dev->events)) {
-			ret = 0;
-			break;
-		}
+		kevent = get_one_event(dev, count);
 		mutex_unlock(&dev->ev_mutex);
 
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(kevent, buf);
+			free_kevent(kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
 		}
 
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
 			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count) {
-			if (ret == 0 && count > 0) {
-				/*
-				 * could not get a single event because we
-				 * didn't have enough buffer space.
-				 */
-				ret = -EINVAL;
-			}
+		ret = -EINTR;
+		if (signal_pending(current))
 			break;
-		}
-		remove_kevent(dev, kevent);
 
-		/*
-		 * Must perform the copy_to_user outside the mutex in order
-		 * to avoid a lock order reversal with mmap_sem.
-		 */
-		mutex_unlock(&dev->ev_mutex);
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
+		if (start != buf)
 			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
-
-		free_kevent(kevent);
 
-		mutex_lock(&dev->ev_mutex);
+		schedule();
 	}
-	mutex_unlock(&dev->ev_mutex);
 
+	finish_wait(&dev->wq, &wait);
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
 	return ret;
 }
 
-- 
cgit v1.2.3


From fdff73f094e7220602cc3f8959c7230517976412 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 26 Jan 2009 19:06:41 -0500
Subject: ext4: Initialize the new group descriptor when resizing the
 filesystem

Make sure all of the fields of the group descriptor are properly
initialized.  Previously, we allowed the bg_flags field to contain
random garbage, which could trigger non-deterministic behavior,
including a kernel OOPS.

http://bugzilla.kernel.org/show_bug.cgi?id=12433

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/resize.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c328be5d6885..c06886abd658 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -861,12 +861,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
 
+	memset(gdp, 0, EXT4_DESC_SIZE(sb));
 	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
 	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
 	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
 	ext4_free_blks_set(sb, gdp, input->free_blocks_count);
 	ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
-	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+	gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
 	/*
-- 
cgit v1.2.3


From 9fd9784c91db79e953ea3fe3741f885bdc390a72 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Date: Mon, 26 Jan 2009 19:26:26 -0500
Subject: ext4: Fix building with EXT4FS_DEBUG

When bg_free_blocks_count was renamed to bg_free_blocks_count_lo in
560671a0, its uses under EXT4FS_DEBUG were not changed to the helper
ext4_free_blks_count.

Another commit, 498e5f24, also did not change everything needed under
EXT4FS_DEBUG, thus making it spill some warnings related to printing
format.

This commit fixes both issues and makes ext4 build again when
EXT4FS_DEBUG is enabled.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 6 +++---
 fs/ext4/extents.c | 2 +-
 fs/ext4/mballoc.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6bba06b09dd1..9a50b8052dcf 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -684,15 +684,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+		desc_count += ext4_free_blks_count(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
 			continue;
 
 		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-		printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
-			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+		printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+			i, ext4_free_blks_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54bf0623a9ae..e2eab196875f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3048,7 +3048,7 @@ retry:
 			WARN_ON(ret <= 0);
 			printk(KERN_ERR "%s: ext4_ext_get_blocks "
 				    "returned error inode#%lu, block=%u, "
-				    "max_blocks=%lu", __func__,
+				    "max_blocks=%u", __func__,
 				    inode->i_ino, block, max_blocks);
 #endif
 			ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 918aec0c8a11..deba54f6cbed 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3025,7 +3025,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		goto out_err;
 
 	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
-			gdp->bg_free_blocks_count);
+			ext4_free_blks_count(sb, gdp));
 
 	err = ext4_journal_get_write_access(handle, gdp_bh);
 	if (err)
-- 
cgit v1.2.3


From 6f7ab6d458bbfc2f55d295fa3e6b9e69cdb1d517 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 27 Jan 2009 16:12:31 +0200
Subject: UBIFS: fix no_chk_data_crc

When data CRC checking is disabled, UBIFS returns an incorrect return
code from the 'try_read_node()' function (0, which means CRC error,
instead of 1), which makes the caller re-read the data node again, but
using a different code path, so the second read is fine. Thus, we read
the same node twice. And the result of this is that UBIFS is slower
with no_chk_data_crc option than it is with chk_data_crc option.
This patch fixes the problem.

Reported-by: Reuben Dowle <Reuben.Dowle@navico.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c    | 22 ++++++++++++++--------
 fs/ubifs/tnc.c   | 12 ++++++++----
 fs/ubifs/ubifs.h |  2 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af69..e8e632a1dcdf 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
  * would have been wasted for padding to the nearest minimal I/O unit boundary.
  * Instead, data first goes to the write-buffer and is flushed when the
  * buffer is full or when it is not used for some time (by timer). This is
- * similarto the mechanism is used by JFFS2.
+ * similar to the mechanism is used by JFFS2.
  *
  * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
  * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * @lnum: logical eraseblock number
  * @offs: offset within the logical eraseblock
  * @quiet: print no messages
- * @chk_crc: indicates whether to always check the CRC
+ * @must_chk_crc: indicates whether to always check the CRC
  *
  * This function checks node magic number and CRC checksum. This function also
  * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * node length in the common header could cause UBIFS to read memory outside of
  * allocated buffer when checking the CRC checksum.
  *
- * This function returns zero in case of success %-EUCLEAN in case of bad CRC
- * or magic.
+ * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
+ * true, which is controlled by corresponding UBIFS mount option. However, if
+ * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
+ * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
+ * ignored and CRC is checked.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
  */
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet, int chk_crc)
+		     int offs, int quiet, int must_chk_crc)
 {
 	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		   node_len > c->ranges[type].max_len)
 		goto out_len;
 
-	if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
-		if (c->no_chk_data_crc)
-			return 0;
+	if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
+	     c->no_chk_data_crc)
+		return 0;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f7e36f545527..fa28a84c6a1b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
  * This function performs that same function as ubifs_read_node except that
  * it does not require that there is actually a node present and instead
  * the return code indicates if a node was read.
+ *
+ * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
+ * is true (it is controlled by corresponding mount option). However, if
+ * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
+ * checked.
  */
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 			 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 	if (node_len != len)
 		return 0;
 
-	if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
-		if (c->no_chk_data_crc)
-			return 0;
+	if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
+		return 1;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
  *
  * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
  * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
- * maxumum possible amount of nodes for bulk-read.
+ * maximum possible amount of nodes for bulk-read.
  */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 9999ff0aaa43..29dfa816077b 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1428,7 +1428,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
 		     int offs, int dtype);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet, int chk_crc);
+		     int offs, int quiet, int must_chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
-- 
cgit v1.2.3


From f0e0059b9c18426cffdcc04161062251a8f9741e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sun, 25 Jan 2009 20:53:00 -0600
Subject: don't reallocate sxp variable passed into xfs_swapext

fixes kernel.org bugzilla 12538, xfs_fsr fails on 2.6.29-rc kernels

Regression caused by 743bb4650da9e2595d6cedd01c680b5b9398c74a

This was an embarrassing mistake, reallocating the sxp pointer passed
in from the main ioctl switch.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reported-by: Paul Martin <pm@debian.org>
Tested-by: Paul Martin <pm@debian.org>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_dfrag.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee713492..f8278cfcc1d3 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
 	struct file	*file, *target_file;
 	int		error = 0;
 
-	sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-	if (!sxp) {
-		error = XFS_ERROR(ENOMEM);
-		goto out;
-	}
-
 	/* Pull information for the target fd */
 	file = fget((int)sxp->sx_fdtarget);
 	if (!file) {
 		error = XFS_ERROR(EINVAL);
-		goto out_free_sxp;
+		goto out;
 	}
 
 	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
 	fput(target_file);
  out_put_file:
 	fput(file);
- out_free_sxp:
-	kmem_free(sxp);
  out:
 	return error;
 }
-- 
cgit v1.2.3


From bf935a78814cc9b96d09f612912178adc964ce9c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 20 Jan 2009 19:32:59 -0500
Subject: nfsd: fix null dereference on error path

We're forgetting to check the return value from groups_alloc().

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index c903e04aa217..b860d3484cd7 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -49,6 +49,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		new->fsuid = exp->ex_anon_uid;
 		new->fsgid = exp->ex_anon_gid;
 		gi = groups_alloc(0);
+		if (!gi)
+			goto oom;
 	} else if (flags & NFSEXP_ROOTSQUASH) {
 		if (!new->fsuid)
 			new->fsuid = exp->ex_anon_uid;
-- 
cgit v1.2.3


From b914152a6fbd2cd0441bc293ae8b3f3f1a9407b6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 20 Jan 2009 19:34:22 -0500
Subject: nfsd: fix cred leak on every rpc

Since override_creds() took its own reference on new, we need to release
our own reference.

(Note the put_cred on the return value puts the *old* value of
current->creds, not the new passed-in value).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index b860d3484cd7..5573508f707f 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -87,6 +87,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
 	put_cred(override_creds(new));
+	put_cred(new);
 	return 0;
 
 oom:
-- 
cgit v1.2.3


From fa82a491275a613b15489aab4b99acecb00958d3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 22 Jan 2009 14:16:04 -0500
Subject: nfsd: only set file_lock.fl_lmops in nfsd4_lockt if a stateowner is
 found

nfsd4_lockt does a search for a lockstateowner when building the lock
struct to test. If one is found, it'll set fl_owner to it. Regardless of
whether that happens, it'll also set fl_lmops. Given that this lock is
basically a "lightweight" lock that's just used for checking conflicts,
setting fl_lmops is probably not appropriate for it.

This behavior exposed a bug in DLM's GETLK implementation where it
wasn't clearing out the fields in the file_lock before filling in
conflicting lock info. While we were able to fix this in DLM, it
still seems pointless and dangerous to set the fl_lmops this way
when we may have a NULL lockstateowner.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@pig.fieldses.org>
---
 fs/nfsd/nfs4state.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 88db7d3ec120..b6f60f48e94b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2871,7 +2871,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_flags = FL_POSIX;
-	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lockt->lt_offset;
 	file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
-- 
cgit v1.2.3


From 4a29d2005b0f28d018d36d209c47f3973a725df5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Tue, 27 Jan 2009 15:22:54 +0200
Subject: UBIFS: fix LPT out-of-space bug (again)

The function to traverse and dirty the LPT was still not
dirtying all nodes, with the result that the LPT could
run out of space.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt_commit.c | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 96ca95707175..3216a1f277f8 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -556,23 +556,23 @@ no_space:
 }
 
 /**
- * next_pnode - find next pnode.
+ * next_pnode_to_dirty - find next pnode to dirty.
  * @c: UBIFS file-system description object
  * @pnode: pnode
  *
- * This function returns the next pnode or %NULL if there are no more pnodes.
+ * This function returns the next pnode to dirty or %NULL if there are no more
+ * pnodes.  Note that pnodes that have never been written (lnum == 0) are
+ * skipped.
  */
-static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
-				      struct ubifs_pnode *pnode)
+static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
+					       struct ubifs_pnode *pnode)
 {
 	struct ubifs_nnode *nnode;
 	int iip;
 
 	/* Try to go right */
 	nnode = pnode->parent;
-	iip = pnode->iip + 1;
-	if (iip < UBIFS_LPT_FANOUT) {
-		/* We assume here that LEB zero is never an LPT LEB */
+	for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
 		if (nnode->nbranch[iip].lnum)
 			return ubifs_get_pnode(c, nnode, iip);
 	}
@@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 		nnode = nnode->parent;
 		if (!nnode)
 			return NULL;
-		/* We assume here that LEB zero is never an LPT LEB */
-	} while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
+		for (; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+       } while (iip >= UBIFS_LPT_FANOUT);
 
 	/* Go right */
 	nnode = ubifs_get_nnode(c, nnode, iip);
@@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 
 	/* Go down to level 1 */
 	while (nnode->level > 1) {
-		nnode = ubifs_get_nnode(c, nnode, 0);
+		for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+		if (iip >= UBIFS_LPT_FANOUT) {
+			/*
+			 * Should not happen, but we need to keep going
+			 * if it does.
+			 */
+			iip = 0;
+		}
+		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
 			return (void *)nnode;
 	}
 
-	return ubifs_get_pnode(c, nnode, 0);
+	for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
+		if (nnode->nbranch[iip].lnum)
+			break;
+	if (iip >= UBIFS_LPT_FANOUT)
+		/* Should not happen, but we need to keep going if it does */
+		iip = 0;
+	return ubifs_get_pnode(c, nnode, iip);
 }
 
 /**
@@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
 	pnode = pnode_lookup(c, 0);
 	while (pnode) {
 		do_make_pnode_dirty(c, pnode);
-		pnode = next_pnode(c, pnode);
+		pnode = next_pnode_to_dirty(c, pnode);
 		if (IS_ERR(pnode))
 			return PTR_ERR(pnode);
 	}
-- 
cgit v1.2.3


From 89f135d8b53bcccafd91a075366d2704ba257cf3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Jan 2009 15:34:27 -0500
Subject: Btrfs: fix readdir on 32 bit machines

After btrfs_readdir has gone through all the directory items, it
sets the directory f_pos to the largest possible int.  This way
applications that mix readdir with creating new files don't
end up in an endless loop finding the new directory items as they go.

It was a workaround for a bug in git, but the assumption was that if git
could make this looping mistake then it would be a common problem.

The largest possible int chosen was INT_LIMIT(typeof(file->f_pos)),
and it is possible for that to be a larger number than 32 bit glibc
expects to come out of readdir.

This patch switches that to INT_LIMIT(off_t), which should keep
applications happy on 32 and 64 bit machines.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 288c2cdc7543..2bb65e9b1448 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3263,7 +3263,7 @@ skip:
 
 	/* Reached end of directory/root. Bump pos past the last item. */
 	if (key_type == BTRFS_DIR_INDEX_KEY)
-		filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+		filp->f_pos = INT_LIMIT(off_t);
 	else
 		filp->f_pos++;
 nopos:
-- 
cgit v1.2.3


From 0496e02d8791e7f06673a19a181be30dad6eff70 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 30 Dec 2008 12:39:16 -0500
Subject: cifs: turn smb_send into a wrapper around smb_sendv

cifs: turn smb_send into a wrapper around smb_sendv

Rename smb_send2 to smb_sendv to make it consistent with kernel naming
conventions for functions that take a vector.

There's no need to have 2 functions to handle sending SMB calls. Turn
smb_send into a wrapper around smb_sendv. This also allows us to
properly mark the socket as needing to be reconnected when there's a
partial send from smb_send.

Also, in practice we always use the address and noblocksnd flag
that's attached to the TCP_Server_Info. There's no need to pass
them in as separate args to smb_sendv.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |   4 +-
 fs/cifs/connect.c   |   4 +-
 fs/cifs/transport.c | 107 ++++++++++------------------------------------------
 3 files changed, 22 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 06f6779988bf..382ba6298809 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -35,8 +35,8 @@ extern struct smb_hdr *cifs_buf_get(void);
 extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
-extern int smb_send(struct socket *, struct smb_hdr *,
-			unsigned int /* length */ , struct sockaddr *, bool);
+extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
+			unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9ea394ee075..7419576228fb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1860,9 +1860,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 			smb_buf = (struct smb_hdr *)ses_init_buf;
 			/* sizeof RFC1002_SESSION_REQUEST with no scope */
 			smb_buf->smb_buf_length = 0x81000044;
-			rc = smb_send(socket, smb_buf, 0x44,
-				(struct sockaddr *) &server->addr.sockAddr,
-				server->noblocksnd);
+			rc = smb_send(server, smb_buf, 0x44);
 			kfree(ses_init_buf);
 			msleep(1); /* RFC1001 layer in at least one server
 				      requires very short break before negprot
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ebe6599ed3a..2c7efd26992d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 	spin_unlock(&GlobalMid_Lock);
 }
 
-int
-smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-	 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
-{
-	int rc = 0;
-	int i = 0;
-	struct msghdr smb_msg;
-	struct kvec iov;
-	unsigned len = smb_buf_length + 4;
-
-	if (ssocket == NULL)
-		return -ENOTSOCK; /* BB eventually add reconnect code here */
-	iov.iov_base = smb_buffer;
-	iov.iov_len = len;
-
-	smb_msg.msg_name = sin;
-	smb_msg.msg_namelen = sizeof(struct sockaddr);
-	smb_msg.msg_control = NULL;
-	smb_msg.msg_controllen = 0;
-	if (noblocksnd)
-		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
-	else
-		smb_msg.msg_flags = MSG_NOSIGNAL;
-
-	/* smb header is converted in header_assemble. bcc and rest of SMB word
-	   area, and byte area if necessary, is converted to littleendian in
-	   cifssmb.c and RFC1001 len is converted to bigendian in smb_send
-	   Flags2 is converted in SendReceive */
-
-	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-	cFYI(1, ("Sending smb of length %d", smb_buf_length));
-	dump_smb(smb_buffer, len);
-
-	while (len > 0) {
-		rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len);
-		if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
-			i++;
-		/* smaller timeout here than send2 since smaller size */
-		/* Although it may not be required, this also is smaller
-		   oplock break time */
-			if (i > 12) {
-				cERROR(1,
-				   ("sends on sock %p stuck for 7 seconds",
-				    ssocket));
-				rc = -EAGAIN;
-				break;
-			}
-			msleep(1 << i);
-			continue;
-		}
-		if (rc < 0)
-			break;
-		else
-			i = 0; /* reset i after each successful send */
-		iov.iov_base += rc;
-		iov.iov_len -= rc;
-		len -= rc;
-	}
-
-	if (rc < 0) {
-		cERROR(1, ("Error %d sending data on socket to server", rc));
-	} else {
-		rc = 0;
-	}
-
-	/* Don't want to modify the buffer as a
-	   side effect of this call. */
-	smb_buffer->smb_buf_length = smb_buf_length;
-
-	return rc;
-}
-
 static int
-smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
-	  struct sockaddr *sin, bool noblocksnd)
+smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 {
 	int rc = 0;
 	int i = 0;
@@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 	if (ssocket == NULL)
 		return -ENOTSOCK; /* BB eventually add reconnect code here */
 
-	smb_msg.msg_name = sin;
+	smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	if (noblocksnd)
+	if (server->noblocksnd)
 		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
 	else
 		smb_msg.msg_flags = MSG_NOSIGNAL;
@@ -339,6 +266,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 	return rc;
 }
 
+int
+smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
+	 unsigned int smb_buf_length)
+{
+	struct kvec iov;
+
+	iov.iov_base = smb_buffer;
+	iov.iov_len = smb_buf_length + 4;
+
+	return smb_sendv(server, &iov, 1);
+}
+
 static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 {
 	if (long_op == CIFS_ASYNC_OP) {
@@ -540,9 +479,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send2(ses->server, iov, n_vec,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr),
-		       ses->server->noblocksnd);
+	rc = smb_sendv(ses->server, iov, n_vec);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -736,9 +673,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr),
-		      ses->server->noblocksnd);
+	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -879,9 +814,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
 		mutex_unlock(&ses->server->srv_mutex);
 		return rc;
 	}
-	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-	      (struct sockaddr *) &(ses->server->addr.sockAddr),
-	      ses->server->noblocksnd);
+	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
 	mutex_unlock(&ses->server->srv_mutex);
 	return rc;
 }
@@ -973,9 +906,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr),
-		      ses->server->noblocksnd);
+	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
-- 
cgit v1.2.3


From 6a7f8d36c00ab7adef5fb633f7805c91e8c1e139 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 12 Jan 2009 21:03:25 +0000
Subject: [CIFS] Rename md5 functions to avoid collision with new rt modules

When rt modules were added they (each) included their own md5
with names which collided with the existing names of cifs's md5 functions.

Renaming cifs's md5 functions so we don't collide with them.

> Stephen Rothwell wrote:
> When CIFS is built-in (=y) and staging/rt28[67]0 =y, there are multiple
> definitions of:
>
> build-r8250.out:(.text+0x1d8ad0): multiple definition of `MD5Init'
> build-r8250.out:(.text+0x1dbb30): multiple definition of `MD5Update'
> build-r8250.out:(.text+0x1db9b0): multiple definition of `MD5Final'
>
> all of which need to have more unique identifiers for their global
> symbols (e.g., rt28_md5_init, cifs_md5_init, foo, blah, bar).
>

CC: Greg K-H <gregkh@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 18 +++++++++---------
 fs/cifs/md5.c         | 38 +++++++++++++++++++-------------------
 fs/cifs/md5.h         |  6 +++---
 3 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d4839cf0cb2c..7c9809523f42 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 	if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
 		return -EINVAL;
 
-	MD5Init(&context);
-	MD5Update(&context, (char *)&key->data, key->len);
-	MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+	cifs_MD5_init(&context);
+	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
 
-	MD5Final(signature, &context);
+	cifs_MD5_final(signature, &context);
 	return 0;
 }
 
@@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 	if ((iov == NULL) || (signature == NULL) || (key == NULL))
 		return -EINVAL;
 
-	MD5Init(&context);
-	MD5Update(&context, (char *)&key->data, key->len);
+	cifs_MD5_init(&context);
+	cifs_MD5_update(&context, (char *)&key->data, key->len);
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			MD5Update(&context, iov[0].iov_base+4,
+			cifs_MD5_update(&context, iov[0].iov_base+4,
 				  iov[0].iov_len-4);
 		} else
-			MD5Update(&context, iov[i].iov_base, iov[i].iov_len);
+			cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
 	}
 
-	MD5Final(signature, &context);
+	cifs_MD5_final(signature, &context);
 
 	return 0;
 }
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 462bbfefd4b6..98b66a54c319 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -10,8 +10,8 @@
  * with every copy.
  *
  * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to MD5Init, call MD5Update as
- * needed on buffers full of bytes, and then call MD5Final, which
+ * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
+ * needed on buffers full of bytes, and then call cifs_MD5_final, which
  * will fill a supplied 16-byte array with the digest.
  */
 
@@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs)
  * initialization constants.
  */
 void
-MD5Init(struct MD5Context *ctx)
+cifs_MD5_init(struct MD5Context *ctx)
 {
 	ctx->buf[0] = 0x67452301;
 	ctx->buf[1] = 0xefcdab89;
@@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx)
  * of bytes.
  */
 void
-MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
+cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
 {
 	register __u32 t;
 
@@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
  * 1 0* (64-bit count of bits processed, MSB-first)
  */
 void
-MD5Final(unsigned char digest[16], struct MD5Context *ctx)
+cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
 {
 	unsigned int count;
 	unsigned char *p;
@@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
 
 /*
  * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data.  MD5Update blocks
+ * reflect the addition of 16 longwords of new data.  cifs_MD5_update blocks
  * the data and converts bytes into longwords for this routine.
  */
 static void
@@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		unsigned char tk[16];
 		struct MD5Context tctx;
 
-		MD5Init(&tctx);
-		MD5Update(&tctx, key, key_len);
-		MD5Final(tk, &tctx);
+		cifs_MD5_init(&tctx);
+		cifs_MD5_update(&tctx, key, key_len);
+		cifs_MD5_final(tk, &tctx);
 
 		key = tk;
 		key_len = 16;
@@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		ctx->k_opad[i] ^= 0x5c;
 	}
 
-	MD5Init(&ctx->ctx);
-	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
+	cifs_MD5_init(&ctx->ctx);
+	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
 }
 #endif
 
@@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
 		ctx->k_opad[i] ^= 0x5c;
 	}
 
-	MD5Init(&ctx->ctx);
-	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
+	cifs_MD5_init(&ctx->ctx);
+	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
 }
 
 /***********************************************************************
@@ -328,7 +328,7 @@ void
 hmac_md5_update(const unsigned char *text, int text_len,
 		struct HMACMD5Context *ctx)
 {
-	MD5Update(&ctx->ctx, text, text_len);	/* then text of datagram */
+	cifs_MD5_update(&ctx->ctx, text, text_len);	/* then text of datagram */
 }
 
 /***********************************************************************
@@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
 {
 	struct MD5Context ctx_o;
 
-	MD5Final(digest, &ctx->ctx);
+	cifs_MD5_final(digest, &ctx->ctx);
 
-	MD5Init(&ctx_o);
-	MD5Update(&ctx_o, ctx->k_opad, 64);
-	MD5Update(&ctx_o, digest, 16);
-	MD5Final(digest, &ctx_o);
+	cifs_MD5_init(&ctx_o);
+	cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
+	cifs_MD5_update(&ctx_o, digest, 16);
+	cifs_MD5_final(digest, &ctx_o);
 }
 
 /***********************************************************
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index f7d4f4197bac..6fba8cb402fd 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -20,10 +20,10 @@ struct HMACMD5Context {
 };
 #endif				/* _HMAC_MD5_H */
 
-void MD5Init(struct MD5Context *context);
-void MD5Update(struct MD5Context *context, unsigned char const *buf,
+void cifs_MD5_init(struct MD5Context *context);
+void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
 			unsigned len);
-void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
 
 /* The following definitions come from lib/hmacmd5.c  */
 
-- 
cgit v1.2.3


From 42c245447c8c3f998dfe880aba18b6e5129d2976 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 13 Jan 2009 22:03:55 +0000
Subject: [CIFS] revalidate parent inode when rmdir done within that directory

When a search is pending of a parent directory, and a child directory
within it is removed, we need to reset the parent directory's time
so that we don't reuse the (now stale) search results.

Thanks to Gunter Kukkukk for reporting this:

> got the following failure notification on irc #samba:
>
> A user was updating from subversion 1.4 to 1.5, where the
> repository is located on a samba share (independent of
> unix extensions = Yes or No).
> svn 1.4 did work, 1.5 does not.
>
> The user did a lot of stracing of subversion - and wrote a
> testapplet to simulate the failing behaviour.
> I've converted the C++ source to C and added some error cases.
>
> When using "./testdir" on a local file system, "result2"
> is always (nil) as expected - cifs vfs behaves different here!
>
>   ./testdir /mnt/cifs/mounted/share
>
> returns a (failing) valid pointer.

Acked-by: Dave Kleikamp <shaggy@us.ibm.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES | 4 +++-
 fs/cifs/inode.c | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 080703a15f44..73ac7ebd1dfc 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ rather than posix (advisory) byte range locks, even though server would
 support posix byte range locks.  Fix query of root inode when prefixpath
 specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
-Samba servers (worked to Windows).
+Samba servers (worked to Windows).  Fix rmdir so that pending search
+(readdir) requests do not get invalid results which include the now
+removed directory.
 
 Version 1.55
 ------------
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5ab9896fdcb2..bcf7b5184664 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1285,6 +1285,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	cifsInode = CIFS_I(direntry->d_inode);
 	cifsInode->time = 0;	/* force revalidate to go get info when
 				   needed */
+
+	cifsInode = CIFS_I(inode);
+	cifsInode->time = 0;	/* force revalidate to get parent dir info
+				   since cached search results now invalid */
+
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
 		current_fs_time(inode->i_sb);
 
-- 
cgit v1.2.3


From f818dd55c4a8b3519e203900bde0bb780d36e799 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 19 Jan 2009 02:38:35 +0000
Subject: [CIFS] some cleanup to dir.c prior to addition of posix_open

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 56 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 838d9c720a5c..964aad03c5ad 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,6 +129,17 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+static void setup_cifs_dentry(struct cifsTconInfo *tcon,
+			      struct dentry *direntry,
+			      struct inode *newinode)
+{
+	if (tcon->nocase)
+		direntry->d_op = &cifs_ci_dentry_ops;
+	else
+		direntry->d_op = &cifs_dentry_ops;
+	d_instantiate(direntry, newinode);
+}
+
 /* Inode operations in similar order to how they appear in Linux file fs.h */
 
 int
@@ -139,14 +150,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int xid;
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
+	/* BB below access is too much for the mknod to request */
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	char *full_path = NULL;
 	FILE_ALL_INFO *buf = NULL;
 	struct inode *newinode = NULL;
-	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	int disposition = FILE_OVERWRITE_IF;
 	bool write_only = false;
@@ -154,7 +165,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -162,6 +173,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
+	mode &= ~current->fs->umask;
+
 	if (nd && (nd->flags & LOOKUP_OPEN)) {
 		int oflags = nd->intent.open.flags;
 
@@ -196,17 +209,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
-
 	/*
 	 * if we're not using unix extensions, see if we need to set
 	 * ATTR_READONLY on the create call
 	 */
-	if (!pTcon->unix_ext && (mode & S_IWUGO) == 0)
+	if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
 		create_options |= CREATE_OPTION_READONLY;
 
 	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, create_options,
 			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -215,7 +226,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	if (rc == -EIO) {
 		/* old server, retry the open legacy style */
-		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
 			desiredAccess, create_options,
 			&fileHandle, &oplock, buf, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -225,7 +236,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	} else {
 		/* If Open reported that we actually created a file
 		then we now have to set the mode if possible */
-		if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+		if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
 			struct cifs_unix_set_info_args args = {
 				.mode	= mode,
 				.ctime	= NO_CHANGE_64,
@@ -244,20 +255,20 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 				args.uid = NO_CHANGE_64;
 				args.gid = NO_CHANGE_64;
 			}
-			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
 				cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		} else {
 			/* BB implement mode setting via Windows security
 			   descriptors e.g. */
-			/* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/
+			/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
 
 			/* Could set r/o dos attribute if mode & 0222 == 0 */
 		}
 
 		/* server might mask mode so we have to query for it */
-		if (pTcon->unix_ext)
+		if (tcon->unix_ext)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
 						 inode->i_sb, xid);
 		else {
@@ -283,22 +294,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		}
 
 		if (rc != 0) {
-			cFYI(1,
-			     ("Create worked but get_inode_info failed rc = %d",
-			      rc));
-		} else {
-			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
-			else
-				direntry->d_op = &cifs_dentry_ops;
-			d_instantiate(direntry, newinode);
-		}
+			cFYI(1, ("Create worked, get_inode_info failed rc = %d",
+				 rc));
+		} else
+			setup_cifs_dentry(tcon, direntry, newinode);
+
 		if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
 			(!(nd->flags & LOOKUP_OPEN))) {
 			/* mknod case - do not leave file open */
-			CIFSSMBClose(xid, pTcon, fileHandle);
+			CIFSSMBClose(xid, tcon, fileHandle);
 		} else if (newinode) {
-			pCifsFile =
+			struct cifsFileInfo *pCifsFile =
 			   kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
 
 			if (pCifsFile == NULL)
@@ -316,7 +322,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			/* set the following in open now
 				pCifsFile->pfile = file; */
 			write_lock(&GlobalSMBSeslock);
-			list_add(&pCifsFile->tlist, &pTcon->openFileList);
+			list_add(&pCifsFile->tlist, &tcon->openFileList);
 			pCifsInode = CIFS_I(newinode);
 			if (pCifsInode) {
 				/* if readable file instance put first in list*/
-- 
cgit v1.2.3


From da505c386c9f993e43861791dae339b2219cf8dd Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 19 Jan 2009 03:49:35 +0000
Subject: [CIFS] Make socket retry timeouts consistent between blocking and
 nonblocking cases

We have used approximately 15 second timeouts on nonblocking sends in the past, and
also 15 second SMB timeout (waiting for server responses, for most request types).
Now that we can do blocking tcp sends,
make blocking send timeout approximately the same (15 seconds).

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c   |  4 ++--
 fs/cifs/transport.c | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7419576228fb..a3537a90a9d9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1802,7 +1802,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 	 *  user space buffer
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
-	socket->sk->sk_sndtimeo = 3 * HZ;
+	socket->sk->sk_sndtimeo = 5 * HZ;
 
 	/* make the bufsizes depend on wsize/rsize and max requests */
 	if (server->noautotune) {
@@ -1953,7 +1953,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 	 * user space buffer
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
-	socket->sk->sk_sndtimeo = 3 * HZ;
+	socket->sk->sk_sndtimeo = 5 * HZ;
 	server->ssocket = socket;
 
 	return rc;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 2c7efd26992d..0ad3e2d116a6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -199,7 +199,25 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 				    n_vec - first_vec, total_len);
 		if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
 			i++;
-			if (i >= 14) {
+			/* if blocking send we try 3 times, since each can block
+			   for 5 seconds. For nonblocking  we have to try more
+			   but wait increasing amounts of time allowing time for
+			   socket to clear.  The overall time we wait in either
+			   case to send on the socket is about 15 seconds.
+			   Similarly we wait for 15 seconds for
+			   a response from the server in SendReceive[2]
+			   for the server to send a response back for
+			   most types of requests (except SMB Write
+			   past end of file which can be slow, and
+			   blocking lock operations). NFS waits slightly longer
+			   than CIFS, but this can make it take longer for
+			   nonresponsive servers to be detected and 15 seconds
+			   is more than enough time for modern networks to
+			   send a packet.  In most cases if we fail to send
+			   after the retries we will kill the socket and
+			   reconnect which may clear the network problem.
+			*/
+			if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
 				cERROR(1,
 				   ("sends on sock %p stuck for 15 seconds",
 				    ssocket));
-- 
cgit v1.2.3


From a9ac49d303f967be0dabd97cb722c4a13109c6c2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 22 Jan 2009 14:43:21 -0500
Subject: cifs: make sure we allocate enough storage for socket address

The sockaddr declared on the stack in cifs_get_tcp_session is too small
for IPv6 addresses. Change it from "struct sockaddr" to "struct
sockaddr_storage" to prevent stack corruption when IPv6 is used.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a3537a90a9d9..2209be943051 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1354,7 +1354,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr)
 {
 	struct list_head *tmp;
 	struct TCP_Server_Info *server;
@@ -1374,11 +1374,11 @@ cifs_find_tcp_session(struct sockaddr *addr)
 		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (addr->sa_family == AF_INET &&
+		if (addr->ss_family == AF_INET &&
 		    (addr4->sin_addr.s_addr !=
 		     server->addr.sockAddr.sin_addr.s_addr))
 			continue;
-		else if (addr->sa_family == AF_INET6 &&
+		else if (addr->ss_family == AF_INET6 &&
 			 memcmp(&server->addr.sockAddr6.sin6_addr,
 				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
 			continue;
@@ -1419,12 +1419,12 @@ static struct TCP_Server_Info *
 cifs_get_tcp_session(struct smb_vol *volume_info)
 {
 	struct TCP_Server_Info *tcp_ses = NULL;
-	struct sockaddr addr;
+	struct sockaddr_storage addr;
 	struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
 	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	int rc;
 
-	memset(&addr, 0, sizeof(struct sockaddr));
+	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
@@ -1435,9 +1435,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 			rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
 					    &sin_server6->sin6_addr.in6_u);
 			if (rc > 0)
-				addr.sa_family = AF_INET6;
+				addr.ss_family = AF_INET6;
 		} else {
-			addr.sa_family = AF_INET;
+			addr.ss_family = AF_INET;
 		}
 
 		if (rc <= 0) {
@@ -1502,7 +1502,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	tcp_ses->tcpStatus = CifsNew;
 	++tcp_ses->srv_count;
 
-	if (addr.sa_family == AF_INET6) {
+	if (addr.ss_family == AF_INET6) {
 		cFYI(1, ("attempting ipv6 connect"));
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
-- 
cgit v1.2.3


From 3eb14297c4b85af0c5e6605e18d93b6031330d71 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 29 Jan 2009 11:17:24 +0200
Subject: UBIFS: sync wbufs after syncing inodes and pages

All writes go through wbufs so they must be sync'd
after syncing inodes and pages.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index dbfc88714716..3ddd754262b4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -450,16 +450,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	/*
-	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
-	 * do this if it waits for an already running commit.
-	 */
-	for (i = 0; i < c->jhead_cnt; i++) {
-		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
-		if (err)
-			return err;
-	}
-
 	/*
 	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
 	 * pages, so synchronize them first, then commit the journal. Strictly
@@ -471,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	 */
 	generic_sync_sb_inodes(sb, &wbc);
 
+	/*
+	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
+	 * do this if it waits for an already running commit.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
 	err = ubifs_run_commit(c);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 227c75c91dbfa037d109ab7ef45b7f5ba9cab6d0 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 29 Jan 2009 11:53:51 +0200
Subject: UBIFS: spelling fix 'date' -> 'data'

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 2 +-
 fs/ubifs/gc.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 9a41f6f245b7..e975bd82f38b 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1407,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
  * @c: UBIFS file-system description object
  * @leaf_cb: called for each leaf node
  * @znode_cb: called for each indexing node
- * @priv: private date which is passed to callbacks
+ * @priv: private data which is passed to callbacks
  *
  * This function walks the UBIFS index and calls the @leaf_cb for each leaf
  * node and @znode_cb for each indexing node. Returns zero in case of success
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9760154d874b..bad3339a800d 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -401,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 
 		/*
 		 * Don't release the LEB until after the next commit, because
-		 * it may contain date which is needed for recovery. So
+		 * it may contain data which is needed for recovery. So
 		 * although we freed this LEB, it will become usable only after
 		 * the commit.
 		 */
-- 
cgit v1.2.3


From b466f17d780c5b72427f36aef22ecdec9f1d0689 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 29 Jan 2009 12:59:33 +0200
Subject: UBIFS: remount ro fixes

- preserve the idx_gc list - it will be needed in the same
state, should UBIFS be remounted rw again
- prevent remounting ro if we have switched to read only
mode (due to a fatal error)

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c    | 18 +++++-------------
 fs/ubifs/super.c | 14 +++++++-------
 fs/ubifs/ubifs.h |  2 +-
 3 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index bad3339a800d..a711d33b3d3e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -830,29 +830,21 @@ out:
  * ubifs_destroy_idx_gc - destroy idx_gc list.
  * @c: UBIFS file-system description object
  *
- * This function destroys the @c->idx_gc list. It is called when unmounting or
- * remounting read-only so locks are not needed. Returns zero in case of
- * success and a negative error code in case of failure.
+ * This function destroys the @c->idx_gc list. It is called when unmounting
+ * so locks are not needed. Returns zero in case of success and a negative
+ * error code in case of failure.
  */
-int ubifs_destroy_idx_gc(struct ubifs_info *c)
+void ubifs_destroy_idx_gc(struct ubifs_info *c)
 {
-	int ret = 0;
-
 	while (!list_empty(&c->idx_gc)) {
-		int err;
 		struct ubifs_gced_idx_leb *idx_gc;
 
 		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
 				    list);
-		err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
-					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
-		if (err && !ret)
-			ret = err;
+		c->idx_gc_cnt -= 1;
 		list_del(&idx_gc->list);
 		kfree(idx_gc);
 	}
-
-	return ret;
 }
 
 /**
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3ddd754262b4..daa679d3a03e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1687,10 +1687,6 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	if (err)
 		ubifs_ro_mode(c, err);
 
-	err = ubifs_destroy_idx_gc(c);
-	if (err)
-		ubifs_ro_mode(c, err);
-
 	free_wbufs(c);
 	vfree(c->orph_buf);
 	c->orph_buf = NULL;
@@ -1793,15 +1789,19 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
-			ubifs_msg("cannot re-mount R/W, UBIFS is working in "
-				  "R/O mode");
+			ubifs_msg("cannot re-mount due to prior errors");
 			return -EINVAL;
 		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
-	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+		if (c->ro_media) {
+			ubifs_msg("cannot re-mount due to prior errors");
+			return -EINVAL;
+		}
 		ubifs_remount_ro(c);
+	}
 
 	if (c->bulk_read == 1)
 		bu_init(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 29dfa816077b..535f87426791 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1594,7 +1594,7 @@ int ubifs_replay_journal(struct ubifs_info *c);
 int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
 int ubifs_gc_start_commit(struct ubifs_info *c);
 int ubifs_gc_end_commit(struct ubifs_info *c);
-int ubifs_destroy_idx_gc(struct ubifs_info *c);
+void ubifs_destroy_idx_gc(struct ubifs_info *c);
 int ubifs_get_idx_gc_leb(struct ubifs_info *c);
 int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
 
-- 
cgit v1.2.3


From a2b9df3ff691db8e5e521dccd231a8098bbf7416 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 29 Jan 2009 16:22:54 +0200
Subject: UBIFS: return sensible error codes

When mounting/re-mounting, UBIFS returns EINVAL even if the ENOSPC
or EROFS codes are much better, just because we have not found
references to ENOSPC/EROFS in mount (2) man pages. This patch
changes this behaviour and makes UBIFS return real error code,
because:

1. It is just less confusing and more logical
2. mount is not described in SUSv3, so it seems to be not really
   well-standardized
3. we do not cover all cases, and any random error code not documented
   in the man pages may be returned anyway

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/master.c |  2 +-
 fs/ubifs/super.c  | 11 +++--------
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf565..a88f33801b98 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
 	int err, lnum, offs, len;
 
 	if (c->ro_media)
-		return -EINVAL;
+		return -EROFS;
 
 	lnum = UBIFS_MST_LNUM;
 	offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index daa679d3a03e..ab85eb8cce79 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1085,12 +1085,7 @@ static int check_free_space(struct ubifs_info *c)
 		ubifs_err("insufficient free space to mount in read/write mode");
 		dbg_dump_budg(c);
 		dbg_dump_lprops(c);
-		/*
-		 * We return %-EINVAL instead of %-ENOSPC because it seems to
-		 * be the closest error code mentioned in the mount function
-		 * documentation.
-		 */
-		return -EINVAL;
+		return -ENOSPC;
 	}
 	return 0;
 }
@@ -1790,7 +1785,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
-			return -EINVAL;
+			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
 		if (err)
@@ -1798,7 +1793,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
-			return -EINVAL;
+			return -EROFS;
 		}
 		ubifs_remount_ro(c);
 	}
-- 
cgit v1.2.3


From 27ad27993313312a4ad0047d0a944c425cd511a5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 29 Jan 2009 16:34:30 +0200
Subject: UBIFS: remove fast unmounting

This UBIFS feature has never worked properly, and it was a mistake
to add it because we simply have no use-cases. So, let's still accept
the fast_unmount mount option, but ignore it. This does not change
much, because UBIFS commits in sync_fs anyway, and sync_fs is called
while unmounting.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 50 +++++---------------------------------------------
 fs/ubifs/ubifs.h |  2 --
 2 files changed, 5 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ab85eb8cce79..1182b66a5491 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -957,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		/*
+		 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
+		 * We accepte them in order to be backware-compatible. But this
+		 * should be removed at some point.
+		 */
 		case Opt_fast_unmount:
 			c->mount_opts.unmount_mode = 2;
-			c->fast_unmount = 1;
 			break;
 		case Opt_norm_unmount:
 			c->mount_opts.unmount_mode = 1;
-			c->fast_unmount = 0;
 			break;
 		case Opt_bulk_read:
 			c->mount_opts.bulk_read = 2;
@@ -1359,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
 	       c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
 	       c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
 	       c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
-	dbg_msg("fast unmount:        %d", c->fast_unmount);
 	dbg_msg("big_lpt              %d", c->big_lpt);
 	dbg_msg("log LEBs:            %d (%d - %d)",
 		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1615,38 +1617,6 @@ out:
 	return err;
 }
 
-/**
- * commit_on_unmount - commit the journal when un-mounting.
- * @c: UBIFS file-system description object
- *
- * This function is called during un-mounting and re-mounting, and it commits
- * the journal unless the "fast unmount" mode is enabled.
- */
-static void commit_on_unmount(struct ubifs_info *c)
-{
-	long long bud_bytes;
-
-	if (!c->fast_unmount) {
-		dbg_gen("skip committing - fast unmount enabled");
-		return;
-	}
-
-	/*
-	 * This function is called before the background thread is stopped, so
-	 * we may race with ongoing commit, which means we have to take
-	 * @c->bud_lock to access @c->bud_bytes.
-	 */
-	spin_lock(&c->buds_lock);
-	bud_bytes = c->bud_bytes;
-	spin_unlock(&c->buds_lock);
-
-	if (bud_bytes) {
-		dbg_gen("run commit");
-		ubifs_run_commit(c);
-	} else
-		dbg_gen("journal is empty, do not run commit");
-}
-
 /**
  * ubifs_remount_ro - re-mount in read-only mode.
  * @c: UBIFS file-system description object
@@ -1661,7 +1631,6 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	ubifs_assert(!c->need_recovery);
 	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 
-	commit_on_unmount(c);
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
 		kthread_stop(c->bgt);
@@ -2077,15 +2046,6 @@ out_close:
 
 static void ubifs_kill_sb(struct super_block *sb)
 {
-	struct ubifs_info *c = sb->s_fs_info;
-
-	/*
-	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
-	 * in order to be outside BKL.
-	 */
-	if (sb->s_root && !(sb->s_flags & MS_RDONLY))
-		commit_on_unmount(c);
-	/* The un-mount routine is actually done in put_super() */
 	generic_shutdown_super(sb);
 }
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 535f87426791..039a68bee29a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -961,7 +961,6 @@ struct ubifs_debug_info;
  * @cs_lock: commit state lock
  * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
  *
- * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
@@ -1202,7 +1201,6 @@ struct ubifs_info {
 	spinlock_t cs_lock;
 	wait_queue_head_t cmt_wq;
 
-	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
-- 
cgit v1.2.3


From df1c46b2b6876d0a1b1b4740f009fa69d95ebbc9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 29 Jan 2009 16:53:35 -0800
Subject: tun: Add some missing TUN compat ioctl translations.

Based upon a report from Michael Tokarev <mjt@tls.msk.ru>:

	Just saw in dmesg:

	ioctl32(kvm:4408): Unknown cmd fd(9) cmd(800454cf){t:'T';sz:4} arg(ffc668e4) on /dev/net/tun

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/compat_ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5235c67e7594..c8f8d5904f5e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
 		 * cannot be fixed without breaking all existing apps.
 		 */
 		case TUNSETIFF:
+		case TUNGETIFF:
 		case SIOCGIFFLAGS:
 		case SIOCGIFMETRIC:
 		case SIOCGIFMTU:
@@ -1982,6 +1983,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM)
 COMPATIBLE_IOCTL(TUNSETDEBUG)
 COMPATIBLE_IOCTL(TUNSETPERSIST)
 COMPATIBLE_IOCTL(TUNSETOWNER)
+COMPATIBLE_IOCTL(TUNSETLINK)
+COMPATIBLE_IOCTL(TUNSETGROUP)
+COMPATIBLE_IOCTL(TUNGETFEATURES)
+COMPATIBLE_IOCTL(TUNSETOFFLOAD)
+COMPATIBLE_IOCTL(TUNSETTXFILTER)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2573,6 +2579,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
 HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
 HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
+HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
 HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
 HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
 HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
-- 
cgit v1.2.3


From 9df04e1f25effde823a600e755b51475d438f56b Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Thu, 29 Jan 2009 14:25:26 -0800
Subject: epoll: drop max_user_instances and rely only on max_user_watches

Linus suggested to put limits where the money is, and max_user_watches
already does that w/out the need of max_user_instances.  That has the
advantage to mitigate the potential DoS while allowing pretty generous
default behavior.

Allowing top 4% of low memory (per user) to be allocated in epoll watches,
we have:

LOMEM    MAX_WATCHES (per user)
512MB    ~178000
1GB      ~356000
2GB      ~712000

A box with 512MB of lomem, will meet some challenge in hitting 180K
watches, socket buffers math teaches us.  No more max_user_instances
limits then.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Bron Gondwana <brong@fastmail.fm>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ba2f9ec71192..011b9b8c90c6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
 /*
  * Configuration options available inside /proc/sys/fs/epoll/
  */
-/* Maximum number of epoll devices, per user */
-static int max_user_instances __read_mostly;
 /* Maximum number of epoll watched descriptors, per user */
 static int max_user_watches __read_mostly;
 
@@ -260,14 +258,6 @@ static struct kmem_cache *pwq_cache __read_mostly;
 static int zero;
 
 ctl_table epoll_table[] = {
-	{
-		.procname	= "max_user_instances",
-		.data		= &max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &zero,
-	},
 	{
 		.procname	= "max_user_watches",
 		.data		= &max_user_watches,
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
-	atomic_dec(&ep->user->epoll_devs);
 	free_uid(ep->user);
 	kfree(ep);
 }
@@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep)
 	struct eventpoll *ep;
 
 	user = get_current_user();
-	error = -EMFILE;
-	if (unlikely(atomic_read(&user->epoll_devs) >=
-			max_user_instances))
-		goto free_uid;
 	error = -ENOMEM;
 	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 	if (unlikely(!ep))
@@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
-	atomic_inc(&ep->user->epoll_devs);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void)
 	struct sysinfo si;
 
 	si_meminfo(&si);
-	max_user_instances = 128;
-	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+	/*
+	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
+	 */
+	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
 		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
-- 
cgit v1.2.3


From b9ec63f78b425c0e16cc95605b5d4ff2dc228b97 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 30 Jan 2009 00:00:24 -0500
Subject: ext4: Remove bogus BUG() check in ext4_bmap()

The code to support journal-less ext4 operation added a BUG to
ext4_bmap() which fired if there was no journal and the
EXT4_STATE_JDATA bit was set in the i_state field.  This caused
running the filefrag program (which uses the FIBMAP ioctl) to trigger
a BUG().

The EXT4_STATE_JDATA bit is only used for ext4_bmap(), and it's
harmless for the bit to be set.  We could add a check in
__ext4_journalled_writepage() and ext4_journalled_write_end() to only
set the EXT4_STATE_JDATA bit if the journal is present, but that adds
an extra test and jump instruction.  It's easier to simply remove the
BUG check.

http://bugzilla.kernel.org/show_bug.cgi?id=12568

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b4386dafeb0c..03ba20be1329 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2821,9 +2821,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);
 	}
 
-	BUG_ON(!EXT4_JOURNAL(inode) &&
-	       EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
-
 	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
-- 
cgit v1.2.3


From 7b24fc4d7eb611da367dea3aad45473050aacd6c Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:38 -0500
Subject: block: Don't verify integrity metadata on read error

If we get an I/O error on a read request there is no point in doing a
verify pass on the integrity buffer.  Adjust the completion path
accordingly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d6..8396d741f804 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -465,7 +465,7 @@ static int bio_integrity_verify(struct bio *bio)
 
 		if (ret) {
 			kunmap_atomic(kaddr, KM_USER0);
-			break;
+			return ret;
 		}
 
 		sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +493,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio_integrity_payload *bip =
 		container_of(work, struct bio_integrity_payload, bip_work);
 	struct bio *bio = bip->bip_bio;
-	int error = bip->bip_error;
+	int error;
 
-	if (bio_integrity_verify(bio)) {
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		error = -EIO;
-	}
+	error = bio_integrity_verify(bio);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-
-	if (bio->bi_end_io)
-		bio->bi_end_io(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -525,7 +520,17 @@ void bio_integrity_endio(struct bio *bio, int error)
 
 	BUG_ON(bip->bip_bio != bio);
 
-	bip->bip_error = error;
+	/* In case of an I/O error there is no point in verifying the
+	 * integrity metadata.  Restore original bio end_io handler
+	 * and run it.
+	 */
+	if (error) {
+		bio->bi_end_io = bip->bip_end_io;
+		bio_endio(bio, error);
+
+		return;
+	}
+
 	INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
 	queue_work(kintegrityd_wq, &bip->bip_work);
 }
-- 
cgit v1.2.3


From 8ae372e3bb4acaca37ffa2ce54f4cf8dd60a94fa Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:39 -0500
Subject: block: Remove obsolete BUG_ON

Now that bio_vecs are no longer cleared in bvec_alloc_bs() the following
BUG_ON must go.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8396d741f804..549b0144da11 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 
 	iv = bip_vec_idx(bip, bip->bip_vcnt);
 	BUG_ON(iv == NULL);
-	BUG_ON(iv->bv_page != NULL);
 
 	iv->bv_page = page;
 	iv->bv_len = len;
-- 
cgit v1.2.3


From 0e2bedaa394f74fa9f75ee937488c33d90039b5a Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 30 Jan 2009 21:24:41 +0000
Subject: [CIFS] ipv6_addr_equal for address comparison

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2209be943051..005df85219a8 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
-#include <linux/ipv6.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
@@ -35,6 +34,7 @@
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -1379,8 +1379,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		     server->addr.sockAddr.sin_addr.s_addr))
 			continue;
 		else if (addr->ss_family == AF_INET6 &&
-			 memcmp(&server->addr.sockAddr6.sin6_addr,
-				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+			 !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
+					  &addr6->sin6_addr))
 			continue;
 
 		++server->srv_count;
-- 
cgit v1.2.3


From ea455f8ab68338ba69f5d3362b342c115bea8e13 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 23:20:31 +0100
Subject: ocfs2: Push out dropping of dentry lock to ocfs2_wq

Dropping of last reference to dentry lock is a complicated operation involving
dropping of reference to inode. This can get complicated and quota code in
particular needs to obtain some quota locks which leads to potential deadlock.
Thus we defer dropping of inode reference to ocfs2_wq.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dcache.c | 42 +++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/dcache.h |  9 ++++++++-
 fs/ocfs2/ocfs2.h  |  6 ++++++
 fs/ocfs2/super.c  |  3 +++
 4 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e88..e9d7c2038c0f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
+#include "super.h"
 
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
 	return ret;
 }
 
+static DEFINE_SPINLOCK(dentry_list_lock);
+
+/* We limit the number of dentry locks to drop in one go. We have
+ * this limit so that we don't starve other users of ocfs2_wq. */
+#define DL_INODE_DROP_COUNT 64
+
+/* Drop inode references from dentry locks */
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+	struct ocfs2_dentry_lock *dl;
+	int drop_count = DL_INODE_DROP_COUNT;
+
+	spin_lock(&dentry_list_lock);
+	while (osb->dentry_lock_list && drop_count--) {
+		dl = osb->dentry_lock_list;
+		osb->dentry_lock_list = dl->dl_next;
+		spin_unlock(&dentry_list_lock);
+		iput(dl->dl_inode);
+		kfree(dl);
+		spin_lock(&dentry_list_lock);
+	}
+	if (osb->dentry_lock_list)
+		queue_work(ocfs2_wq, &osb->dentry_lock_work);
+	spin_unlock(&dentry_list_lock);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -318,16 +347,23 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 				   struct ocfs2_dentry_lock *dl)
 {
-	iput(dl->dl_inode);
 	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
 	ocfs2_lock_res_free(&dl->dl_lockres);
-	kfree(dl);
+
+	/* We leave dropping of inode reference to ocfs2_wq as that can
+	 * possibly lead to inode deletion which gets tricky */
+	spin_lock(&dentry_list_lock);
+	if (!osb->dentry_lock_list)
+		queue_work(ocfs2_wq, &osb->dentry_lock_work);
+	dl->dl_next = osb->dentry_lock_list;
+	osb->dentry_lock_list = dl;
+	spin_unlock(&dentry_list_lock);
 }
 
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl)
 {
-	int unlock = 0;
+	int unlock;
 
 	BUG_ON(dl->dl_count == 0);
 
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d9883..d06e16c06640 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
 extern struct dentry_operations ocfs2_dentry_ops;
 
 struct ocfs2_dentry_lock {
+	/* Use count of dentry lock */
 	unsigned int		dl_count;
-	u64			dl_parent_blkno;
+	union {
+		/* Linked list of dentry locks to release */
+		struct ocfs2_dentry_lock *dl_next;
+		u64			dl_parent_blkno;
+	};
 
 	/*
 	 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
+void ocfs2_drop_dl_inodes(struct work_struct *work);
+
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ad5c24a29edd..077384135f4e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -210,6 +210,7 @@ struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
 struct ocfs2_quota_recovery;
+struct ocfs2_dentry_lock;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
@@ -325,6 +326,11 @@ struct ocfs2_super
 	struct list_head blocked_lock_list;
 	unsigned long blocked_lock_count;
 
+	/* List of dentry locks to release. Anyone can add locks to
+	 * the list, ocfs2_wq processes the list  */
+	struct ocfs2_dentry_lock *dentry_lock_list;
+	struct work_struct dentry_lock_work;
+
 	wait_queue_head_t		osb_mount_event;
 
 	/* Truncate log info */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 43ed11345b59..b1cb38fbe807 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1887,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
 	journal->j_state = OCFS2_JOURNAL_FREE;
 
+	INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
+	osb->dentry_lock_list = NULL;
+
 	/* get some pseudo constants for clustersize bits */
 	osb->s_clustersize_bits =
 		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
-- 
cgit v1.2.3


From f8afead7169f0f28a4b421bcbdb510e52a2d094d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 23:20:32 +0100
Subject: ocfs2: Fix possible deadlock in ocfs2_write_dquot()

It could happen that some limit has been set via quotactl() and in parallel
->mark_dirty() is called from another thread doing e.g. dquot_alloc_space(). In
such case ocfs2_write_dquot() must not try to sync the dquot because that needs
global quota lock but that ranks above transaction start.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f4efa89baee5..1ed0f7c86869 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	if (dquot->dq_flags & mask)
 		sync = 1;
 	spin_unlock(&dq_data_lock);
-	if (!sync) {
+	/* This is a slight hack but we can't afford getting global quota
+	 * lock if we already have a transaction started. */
+	if (!sync || journal_current_handle()) {
 		status = ocfs2_write_dquot(dquot);
 		goto out;
 	}
-- 
cgit v1.2.3


From 0e0333429a6280e6eb3c98845e4eed90d5f8078a Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Wed, 17 Dec 2008 14:23:52 -0800
Subject: configfs: Silence lockdep on mkdir(), rmdir() and
 configfs_depend_item()

When attaching default groups (subdirs) of a new group (in mkdir() or
in configfs_register()), configfs recursively takes inode's mutexes
along the path from the parent of the new group to the default
subdirs. This is needed to ensure that the VFS will not race with
operations on these sub-dirs. This is safe for the following reasons:

- the VFS allows one to lock first an inode and second one of its
  children (The lock subclasses for this pattern are respectively
  I_MUTEX_PARENT and I_MUTEX_CHILD);
- from this rule any inode path can be recursively locked in
  descending order as long as it stays under a single mountpoint and
  does not follow symlinks.

Unfortunately lockdep does not know (yet?) how to handle such
recursion.

I've tried to use Peter Zijlstra's lock_set_subclass() helper to
upgrade i_mutexes from I_MUTEX_CHILD to I_MUTEX_PARENT when we know
that we might recursively lock some of their descendants, but this
usage does not seem to fit the purpose of lock_set_subclass() because
it leads to several i_mutex locked with subclass I_MUTEX_PARENT by
the same task.

>From inside configfs it is not possible to serialize those recursive
locking with a top-level one, because mkdir() and rmdir() are already
called with inodes locked by the VFS. So using some
mutex_lock_nest_lock() is not an option.

I am proposing two solutions:
1) one that wraps recursive mutex_lock()s with
   lockdep_off()/lockdep_on().
2) (as suggested earlier by Peter Zijlstra) one that puts the
   i_mutexes recursively locked in different classes based on their
   depth from the top-level config_group created. This
   induces an arbitrary limit (MAX_LOCK_DEPTH - 2 == 46) on the
   nesting of configfs default groups whenever lockdep is activated
   but this limit looks reasonably high. Unfortunately, this also
   isolates VFS operations on configfs default groups from the others
   and thus lowers the chances to detect locking issues.

This patch implements solution 1).

Solution 2) looks better from lockdep's point of view, but fails with
configfs_depend_item(). This needs to rework the locking
scheme of configfs_depend_item() by removing the variable lock recursion
depth, and I think that it's doable thanks to the configfs_dirent_lock.
For now, let's stick to solution 1).

Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/configfs/dir.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341f3e82..9c2358391147 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -553,12 +553,24 @@ static void detach_groups(struct config_group *group)
 
 		child = sd->s_dentry;
 
+		/*
+		 * Note: we hide this from lockdep since we have no way
+		 * to teach lockdep about recursive
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+		 * in an inode tree, which are valid as soon as
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+		 * parent inode to one of its children.
+		 */
+		lockdep_off();
 		mutex_lock(&child->d_inode->i_mutex);
+		lockdep_on();
 
 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
 
+		lockdep_off();
 		mutex_unlock(&child->d_inode->i_mutex);
+		lockdep_on();
 
 		d_delete(child);
 		dput(child);
@@ -748,11 +760,22 @@ static int configfs_attach_item(struct config_item *parent_item,
 			 * We are going to remove an inode and its dentry but
 			 * the VFS may already have hit and used them. Thus,
 			 * we must lock them as rmdir() would.
+			 *
+			 * Note: we hide this from lockdep since we have no way
+			 * to teach lockdep about recursive
+			 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+			 * in an inode tree, which are valid as soon as
+			 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+			 * parent inode to one of its children.
 			 */
+			lockdep_off();
 			mutex_lock(&dentry->d_inode->i_mutex);
+			lockdep_on();
 			configfs_remove_dir(item);
 			dentry->d_inode->i_flags |= S_DEAD;
+			lockdep_off();
 			mutex_unlock(&dentry->d_inode->i_mutex);
+			lockdep_on();
 			d_delete(dentry);
 		}
 	}
@@ -787,14 +810,25 @@ static int configfs_attach_group(struct config_item *parent_item,
 		 *
 		 * We must also lock the inode to remove it safely in case of
 		 * error, as rmdir() would.
+		 *
+		 * Note: we hide this from lockdep since we have no way
+		 * to teach lockdep about recursive
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+		 * in an inode tree, which are valid as soon as
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+		 * parent inode to one of its children.
 		 */
+		lockdep_off();
 		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+		lockdep_on();
 		ret = populate_groups(to_config_group(item));
 		if (ret) {
 			configfs_detach_item(item);
 			dentry->d_inode->i_flags |= S_DEAD;
 		}
+		lockdep_off();
 		mutex_unlock(&dentry->d_inode->i_mutex);
+		lockdep_on();
 		if (ret)
 			d_delete(dentry);
 	}
@@ -956,7 +990,17 @@ static int configfs_depend_prep(struct dentry *origin,
 	BUG_ON(!origin || !sd);
 
 	/* Lock this guy on the way down */
+	/*
+	 * Note: we hide this from lockdep since we have no way
+	 * to teach lockdep about recursive
+	 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+	 * in an inode tree, which are valid as soon as
+	 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+	 * parent inode to one of its children.
+	 */
+	lockdep_off();
 	mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+	lockdep_on();
 	if (sd->s_element == target)  /* Boo-yah */
 		goto out;
 
@@ -970,7 +1014,9 @@ static int configfs_depend_prep(struct dentry *origin,
 	}
 
 	/* We looped all our children and didn't find target */
+	lockdep_off();
 	mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+	lockdep_on();
 	ret = -ENOENT;
 
 out:
@@ -990,11 +1036,16 @@ static void configfs_depend_rollback(struct dentry *origin,
 	struct dentry *dentry = item->ci_dentry;
 
 	while (dentry != origin) {
+		/* See comments in configfs_depend_prep() */
+		lockdep_off();
 		mutex_unlock(&dentry->d_inode->i_mutex);
+		lockdep_on();
 		dentry = dentry->d_parent;
 	}
 
+	lockdep_off();
 	mutex_unlock(&origin->d_inode->i_mutex);
+	lockdep_on();
 }
 
 int configfs_depend_item(struct configfs_subsystem *subsys,
@@ -1329,8 +1380,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 			}
 
 			/* Wait until the racing operation terminates */
+			/*
+			 * Note: we hide this from lockdep since we are locked
+			 * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why
+			 * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or
+			 * I_MUTEX_QUOTA are not relevant for the locked inode.
+			 */
+			lockdep_off();
 			mutex_lock(wait_mutex);
 			mutex_unlock(wait_mutex);
+			lockdep_on();
 		}
 	} while (ret == -EAGAIN);
 
-- 
cgit v1.2.3


From 554e7f9e043e29da79c044f7a55efe4fad40701e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 8 Jan 2009 08:21:43 +0800
Subject: ocfs2: Access the xattr bucket only before modifying it.

In ocfs2_xattr_value_truncate, we may call b-tree codes which will
extend the journal transaction. It has a potential problem that it
may cause the already-accessed-but-not-dirtied buffers to be lost. So we'd
better access the bucket after we call ocfs2_xattr_value_truncate.
And as for the root buffer for the xattr value, b-tree code will
access and dirty it, so we don't need to worry about it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e1d638af6ac3..915039fffe6e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4729,13 +4729,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	vb.vb_xv = (struct ocfs2_xattr_value_root *)
 		(vb.vb_bh->b_data + offset % blocksize);
 
-	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	/*
 	 * From here on out we have to dirty the bucket.  The generic
 	 * value calls only modify one of the bucket's bhs, but we need
@@ -4748,12 +4741,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_dirty;
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	xe->xe_value_size = cpu_to_le64(len);
 
-out_dirty:
 	ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
 
 out:
-- 
cgit v1.2.3


From a4b91965d39d5d53b470d6aa62cba155a6f3ffe1 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 29 Jan 2009 17:12:31 -0800
Subject: ocfs2: Wakeup the downconvert thread after a successful cancel
 convert

When two nodes holding PR locks on a resource concurrently attempt to
upconvert the locks to EX, the master sends a BAST to one of the nodes. This
message tells that node to first cancel convert the upconvert request,
followed by downconvert to a NL. Only when this lock is downconverted to NL,
can the master upconvert the first node's lock to EX.

While the fs was doing the cancel convert, it was forgetting to wake up the
dc thread after a successful cancel, leading to a deadlock.

Reported-and-Tested-by: David Teigland <teigland@redhat.com>
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b0c4cadd4c45..206a2370876a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2860,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 	case OCFS2_UNLOCK_CANCEL_CONVERT:
 		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
 		lockres->l_action = OCFS2_AST_INVALID;
+		/* Downconvert thread may have requeued this lock, we
+		 * need to wake it. */
+		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
 		break;
 	case OCFS2_UNLOCK_DROP_LOCK:
 		lockres->l_level = DLM_LOCK_IV;
-- 
cgit v1.2.3


From fd4ef231962ab44fd1004e87f9d7c6809f00cd64 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Thu, 29 Jan 2009 15:06:21 -0800
Subject: ocfs2: add quota call to ocfs2_remove_btree_range()

We weren't reclaiming the clusters which get free'd from this function,
so any user punching holes in a file would still have those bytes accounted
against him/her. Add the call to vfs_dq_free_space_nodirty() to fix this.
Interestingly enough, the journal credits calculation already took this into
account.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/alloc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d861096c9d81..60fe74035db5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5390,6 +5390,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		goto out;
 	}
 
+	vfs_dq_free_space_nodirty(inode,
+				  ocfs2_clusters_to_bytes(inode->i_sb, len));
+
 	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
 				  dealloc);
 	if (ret) {
-- 
cgit v1.2.3


From 6139a2360987f55e4490a7813cf69df74ec8b93a Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 22 Jan 2009 15:37:47 +1100
Subject: xfs: Check buffer lengths in log recovery

Before trying to obtain, read or write a buffer,
check that the buffer length is actually valid. If
it is not valid, then something read in the recovery
process has been corrupted and we should abort
recovery.

Reported-by: Eric Sesterhenn <snakebyte@gmx.de>
Tested-by: Eric Sesterhenn <snakebyte@gmx.de>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94c..b1047de2fffd 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 xfs_buf_t *
 xlog_get_bp(
 	xlog_t		*log,
-	int		num_bblks)
+	int		nbblks)
 {
-	ASSERT(num_bblks > 0);
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_get_bp(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return NULL;
+	}
 
 	if (log->l_sectbb_log) {
-		if (num_bblks > 1)
-			num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
-		num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
+		if (nbblks > 1)
+			nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
 	}
-	return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
+	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 
 void
@@ -102,6 +107,13 @@ xlog_bread(
 {
 	int		error;
 
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_bread(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return EFSCORRUPTED;
+	}
+
 	if (log->l_sectbb_log) {
 		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
 {
 	int		error;
 
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_bwrite(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return EFSCORRUPTED;
+	}
+
 	if (log->l_sectbb_log) {
 		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
-- 
cgit v1.2.3


From 43f3f057c56d030546145696627f13f95735be95 Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Thu, 22 Jan 2009 21:34:05 -0600
Subject: [XFS] Warn on transaction in flight on read-only remount

Till VFS can correctly support read-only remount without racing,
use WARN_ON instead of BUG_ON on detecting transaction in flight
after quiescing filesystem.

Signed-off-by: Felix Blyakher <felixb@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_sync.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c26..a608e72fa405 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
 	/* flush inodes and push all remaining buffers out to disk */
 	xfs_quiesce_fs(mp);
 
-	ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
 
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
-- 
cgit v1.2.3


From a68370515356a3eddbfaf7f56418b3cf85d76c2c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:19:41 -0500
Subject: Btrfs: Catch missed bios in the async bio submission thread

The async bio submission thread was missing some bios that were
added after it had decided there was no work left to do.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 10 +++++++++-
 fs/btrfs/volumes.c      | 11 +++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d5f4e94f2ca2..f2e80f3768ec 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -349,6 +349,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker = work->worker;
 	unsigned long flags;
+	int wake = 0;
 
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
@@ -367,10 +368,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
 			       &worker->workers->worker_list);
 		spin_unlock_irqrestore(&worker->workers->lock, flags);
 	}
+	if (!worker->working) {
+		wake = 1;
+		worker->working = 1;
+	}
 
 	spin_unlock_irqrestore(&worker->lock, flags);
-
+	if (wake)
+		wake_up_process(worker->task);
 out:
+
 	return 0;
 }
 
@@ -397,6 +404,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	}
 
 	spin_lock_irqsave(&worker->lock, flags);
+
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 	list_add_tail(&work->list, &worker->pending);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fd0bedb07a64..bcd14ebccae1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -154,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 loop:
 	spin_lock(&device->io_lock);
 
+loop_lock:
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
@@ -203,7 +204,7 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) &&
+		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
 
@@ -215,7 +216,8 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
-			device->running_pending = 0;
+
+			device->running_pending = 1;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
@@ -224,6 +226,11 @@ loop:
 	}
 	if (again)
 		goto loop;
+
+	spin_lock(&device->io_lock);
+	if (device->pending_bios)
+		goto loop_lock;
+	spin_unlock(&device->io_lock);
 done:
 	return 0;
 }
-- 
cgit v1.2.3


From bef62ef339c15d7721da88958b03f7b544464722 Mon Sep 17 00:00:00 2001
From: Christian Hesse <mail@earthworm.de>
Date: Wed, 4 Feb 2009 09:28:28 -0500
Subject: Btrfs: make btrfs acls selectable This patch adds a menu entry to
 kconfig to enable acls for btrfs. This allows you to enable FS_POSIX_ACL at
 kernel compile time.

(updated by Jeff Mahoney to make the changes in fs/btrfs/Kconfig instead)

Signed-off-by: Christian Hesse <mail@earthworm.de>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/Kconfig | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf999ea1b..7bb3c020e570 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
 	  module will be called btrfs.
 
 	  If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+	bool "Btrfs POSIX Access Control Lists"
+	depends on BTRFS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
-- 
cgit v1.2.3


From 0279b4cd86685b5eea467c1b74ce94f0add2c0a3 Mon Sep 17 00:00:00 2001
From: Jim Owens <jowens@hp.com>
Date: Wed, 4 Feb 2009 09:29:13 -0500
Subject: Btrfs: selinux support Add call to LSM security initialization and
 save resulting security xattr for new inodes.

Add xattr support to symlink inode ops.

Set inode->i_op for existing special files.

Signed-off-by: jim owens <jowens@hp.com>
---
 fs/btrfs/inode.c | 23 +++++++++++++++++++----
 fs/btrfs/xattr.c | 32 ++++++++++++++++++++++++++++++++
 fs/btrfs/xattr.h |  2 ++
 3 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bb65e9b1448..4a79e1c5ebd0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -90,6 +90,16 @@ static noinline int cow_file_range(struct inode *inode,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
 
+static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
+{
+	int err;
+
+	err = btrfs_init_acl(inode, dir);
+	if (!err)
+		err = btrfs_xattr_security_init(inode, dir);
+	return err;
+}
+
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
  * are countless ways this is incorrect, but it is better than nothing.
@@ -2037,6 +2047,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		break;
 	default:
+		inode->i_op = &btrfs_special_inode_operations;
 		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 	}
@@ -3584,7 +3595,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -3647,7 +3658,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -3770,7 +3781,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	drop_on_err = 1;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err)
 		goto out_fail;
 
@@ -4732,7 +4743,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -5043,4 +5054,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 	.permission	= btrfs_permission,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
 };
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b4fa5f4b6ad1..312b9435e9f8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/rwsem.h>
 #include <linux/xattr.h>
+#include <linux/security.h>
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "transaction.h"
@@ -330,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 		return -EOPNOTSUPP;
 	return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
 }
+
+int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *suffix;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
+		       GFP_NOFS);
+	if (!name) {
+		err = -ENOMEM;
+	} else {
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
+		err = __btrfs_setxattr(inode, name, value, len, 0);
+		kfree(name);
+	}
+
+	kfree(suffix);
+	kfree(value);
+	return err;
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68d..c71e9c3cf3f7 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags);
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
+extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
+
 #endif /* __XATTR__ */
-- 
cgit v1.2.3


From b51912c91fcf7581cc7b4550f1bb96422809d9ed Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:23:24 -0500
Subject: Btrfs: async threads should try harder to find work

Tracing shows the delay between when an async thread goes to sleep
and when more work is added is often very short.  This commit adds
a little bit of delay and extra checking to the code right before
we schedule out.

It allows more work to be added to the worker
without requiring notifications from other procs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 50 +++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/disk-io.c      |  2 ++
 2 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index f2e80f3768ec..c84ca1f5259a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -19,7 +19,8 @@
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-# include <linux/freezer.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -142,6 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
+again_locked:
 		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
@@ -164,14 +166,50 @@ static int worker_loop(void *arg)
 			check_idle_worker(worker);
 
 		}
-		worker->working = 0;
 		if (freezing(current)) {
+			worker->working = 0;
+			spin_unlock_irq(&worker->lock);
 			refrigerator();
 		} else {
-			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&worker->lock);
-			if (!kthread_should_stop())
+			if (!kthread_should_stop()) {
+				cpu_relax();
+				/*
+				 * we've dropped the lock, did someone else
+				 * jump_in?
+				 */
+				smp_mb();
+				if (!list_empty(&worker->pending))
+					continue;
+
+				/*
+				 * this short schedule allows more work to
+				 * come in without the queue functions
+				 * needing to go through wake_up_process()
+				 *
+				 * worker->working is still 1, so nobody
+				 * is going to try and wake us up
+				 */
+				schedule_timeout(1);
+				smp_mb();
+				if (!list_empty(&worker->pending))
+					continue;
+
+				/* still no more work?, sleep for real */
+				spin_lock_irq(&worker->lock);
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (!list_empty(&worker->pending))
+					goto again_locked;
+
+				/*
+				 * this makes sure we get a wakeup when someone
+				 * adds something new to the queue
+				 */
+				worker->working = 0;
+				spin_unlock_irq(&worker->lock);
+
 				schedule();
+			}
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -355,8 +393,8 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
+	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
 	 * list
@@ -405,9 +443,9 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 
+	list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
-	list_add_tail(&work->list, &worker->pending);
 
 	/*
 	 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7feac5a475e9..9c3810047976 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1679,6 +1679,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * low idle thresh
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
+	fs_info->endio_meta_workers.idle_thresh = 4;
+
 	fs_info->endio_write_workers.idle_thresh = 64;
 	fs_info->endio_meta_write_workers.idle_thresh = 64;
 
-- 
cgit v1.2.3


From b7a9f29fcf4e53e9ca7982331649fa2013e69c99 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:23:45 -0500
Subject: Btrfs: sort references by byte number during btrfs_inc_ref

When a block goes through cow, we update the reference counts of
everything that block points to.  The internal pointers of the block
can be in just about any order, and it is likely to have clusters of
things that are close together and clusters of things that are not.

To help reduce the seeks that come with updating all of these reference
counts, sort them by byte number before actual updates are done.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b26f0980946..7a22f2e6ec47 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/sort.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -1521,15 +1522,50 @@ out:
 	return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
-		  u32 *nr_extents)
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to.  The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ */
+struct refsort {
+	u64 bytenr;
+	u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+	const struct refsort *a = a_void;
+	const struct refsort *b = b_void;
+
+	if (a->bytenr < b->bytenr)
+		return -1;
+	if (a->bytenr > b->bytenr)
+		return 1;
+	return 0;
+}
+
+
+noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *orig_buf,
+			   struct extent_buffer *buf, u32 *nr_extents)
 {
 	u64 bytenr;
 	u64 ref_root;
 	u64 orig_root;
 	u64 ref_generation;
 	u64 orig_generation;
+	struct refsort *sorted;
 	u32 nritems;
 	u32 nr_file_extents = 0;
 	struct btrfs_key key;
@@ -1538,6 +1574,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int level;
 	int ret = 0;
 	int faili = 0;
+	int refi = 0;
+	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, u64, u64);
 
@@ -1549,6 +1587,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
+	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
+	BUG_ON(!sorted);
+
 	if (root->ref_cows) {
 		process_func = __btrfs_inc_extent_ref;
 	} else {
@@ -1561,6 +1602,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		process_func = __btrfs_update_extent_ref;
 	}
 
+	/*
+	 * we make two passes through the items.  In the first pass we
+	 * only record the byte number and slot.  Then we sort based on
+	 * byte number and do the actual work based on the sorted results
+	 */
 	for (i = 0; i < nritems; i++) {
 		cond_resched();
 		if (level == 0) {
@@ -1577,6 +1623,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 
 			nr_file_extents++;
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		} else {
+			bytenr = btrfs_node_blockptr(buf, i);
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		}
+	}
+	/*
+	 * if refi == 0, we didn't actually put anything into the sorted
+	 * array and we're done
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		cond_resched();
+		slot = sorted[i].slot;
+		bytenr = sorted[i].bytenr;
+
+		if (level == 0) {
+			btrfs_item_key_to_cpu(buf, &key, slot);
 
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
@@ -1585,25 +1657,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   key.objectid);
 
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		}
 	}
 out:
+	kfree(sorted);
 	if (nr_extents) {
 		if (level == 0)
 			*nr_extents = nr_file_extents;
@@ -1612,6 +1684,7 @@ out:
 	}
 	return 0;
 fail:
+	kfree(sorted);
 	WARN_ON(1);
 	return ret;
 }
-- 
cgit v1.2.3


From 3935127c50c84106d654ef14962cff28c660bc62 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:24:05 -0500
Subject: Btrfs: disable leak debugging checks in extent_io.c

extent_io.c has debugging code to report and free leaked extent_state
and extent_buffer objects at rmmod time.  This helps track down
leaks and it saves you from rebooting just to properly remove the
kmem_cache object.

But, the code runs under a fairly expensive spinlock and the checks to
see if it is currently enabled are not entirely consistent.  Some use
#ifdef and some #if.

This changes everything to #if and disables the leak checking.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a3b0676403f7..2ea7f052722c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,7 +30,7 @@ static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
 #define LEAK_DEBUG 0
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 static DEFINE_SPINLOCK(leak_lock);
 #endif
 
@@ -119,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 #endif
 
@@ -129,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&state->leak_list, &states);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -144,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 		unsigned long flags;
 #endif
 		WARN_ON(state->tree);
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 		spin_lock_irqsave(&leak_lock, flags);
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
@@ -2983,7 +2983,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 #endif
 
@@ -2991,7 +2991,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb->start = start;
 	eb->len = len;
 	mutex_init(&eb->mutex);
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -3003,7 +3003,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 	spin_lock_irqsave(&leak_lock, flags);
 	list_del(&eb->leak_list);
-- 
cgit v1.2.3


From c487685d7c18a8481900755aa5c56a7a74193101 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:24:25 -0500
Subject: Btrfs: hash_lock is no longer needed

Before metadata is written to disk, it is updated to reflect that writeout
has begun.  Once this update is done, the block must be cow'd before it
can be modified again.

This update was originally synchronized by using a per-fs spinlock.  Today
the buffers for the metadata blocks are locked before writeout begins,
and everyone that tests the flag has the buffer locked as well.

So, the per-fs spinlock (called hash_lock for no good reason) is no
longer required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 7 +------
 fs/btrfs/ctree.h   | 1 -
 fs/btrfs/disk-io.c | 4 ----
 3 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2603ee539b7a..3b6e35aafc9e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -388,16 +388,14 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 	}
 
-	spin_lock(&root->fs_info->hash_lock);
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    btrfs_header_owner(buf) == root->root_key.objectid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
-		spin_unlock(&root->fs_info->hash_lock);
 		WARN_ON(prealloc_dest);
 		return 0;
 	}
-	spin_unlock(&root->fs_info->hash_lock);
+
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0,
@@ -1376,14 +1374,11 @@ again:
 			int wret;
 
 			/* is a cow on this block not required */
-			spin_lock(&root->fs_info->hash_lock);
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
-				spin_unlock(&root->fs_info->hash_lock);
 				goto cow_done;
 			}
-			spin_unlock(&root->fs_info->hash_lock);
 
 			/* ok, we have to cow, is our old prealloc the right
 			 * size?
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index de103a8a815e..f2b8d26b0472 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -703,7 +703,6 @@ struct btrfs_fs_info {
 	struct super_block *sb;
 	struct inode *btree_inode;
 	struct backing_dev_info bdi;
-	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9c3810047976..549271607c17 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1503,7 +1503,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -2361,7 +2360,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_buffer *eb;
 	unsigned long len;
@@ -2376,9 +2374,7 @@ int btree_lock_page_hook(struct page *page)
 		goto out;
 
 	btrfs_tree_lock(eb);
-	spin_lock(&root->fs_info->hash_lock);
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-	spin_unlock(&root->fs_info->hash_lock);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
-- 
cgit v1.2.3


From b4ce94de9b4d64e8ab3cf155d13653c666e22b9b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:25:08 -0500
Subject: Btrfs: Change btree locking to use explicit blocking points

Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.

So far, btrfs has been using a mutex along with a trylock loop.
Most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.

This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.

We'll be able to get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.

The basic idea is:

btrfs_tree_lock() returns with the spin lock held

btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock.  The buffer is
still considered locked by all of the btrfs code.

If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.

Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time.  So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.

btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.

btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.

ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 234 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ctree.h       |   4 +
 fs/btrfs/disk-io.c     |  10 ++-
 fs/btrfs/extent-tree.c |   5 ++
 fs/btrfs/extent_io.c   |  18 ++--
 fs/btrfs/extent_io.h   |  16 +++-
 fs/btrfs/inode.c       |   3 +
 fs/btrfs/locking.c     | 208 +++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/locking.h     |   6 ++
 fs/btrfs/tree-defrag.c |   1 +
 fs/btrfs/tree-log.c    |   4 +
 11 files changed, 470 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3b6e35aafc9e..3af777357acb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
 	return path;
 }
 
+/*
+ * set all locked nodes in the path to blocking locks.  This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_set_lock_blocking(p->nodes[i]);
+	}
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_clear_lock_blocking(p->nodes[i]);
+	}
+}
+
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
+	/* cow is set to blocking by btrfs_init_new_buffer */
+
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
@@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+	if (parent)
+		btrfs_set_lock_blocking(parent);
+	btrfs_set_lock_blocking(buf);
+
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0,
 				 prealloc_dest);
@@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (parent_nritems == 1)
 		return 0;
 
+	btrfs_set_lock_blocking(parent);
+
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
@@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			search_start = last_block;
 
 		btrfs_tree_lock(cur);
+		btrfs_set_lock_blocking(cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
@@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mid = path->nodes[level];
+
 	WARN_ON(!path->locks[level]);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
@@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
 		btrfs_tree_lock(child);
+		btrfs_set_lock_blocking(child);
 		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
@@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
+
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
@@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left, 0);
 		if (wret) {
@@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right, 0);
 		if (wret) {
@@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		u32 left_nr;
 
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
+
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	 */
 	if (right) {
 		u32 right_nr;
+
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
+
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1264,6 +1309,68 @@ static noinline void reada_for_search(struct btrfs_root *root,
 	}
 }
 
+/*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+				      struct btrfs_path *path, int level)
+{
+	int slot;
+	int nritems;
+	struct extent_buffer *parent;
+	struct extent_buffer *eb;
+	u64 gen;
+	u64 block1 = 0;
+	u64 block2 = 0;
+	int ret = 0;
+	int blocksize;
+
+	parent = path->nodes[level - 1];
+	if (!parent)
+		return 0;
+
+	nritems = btrfs_header_nritems(parent);
+	slot = path->slots[level];
+	blocksize = btrfs_level_size(root, level);
+
+	if (slot > 0) {
+		block1 = btrfs_node_blockptr(parent, slot - 1);
+		gen = btrfs_node_ptr_generation(parent, slot - 1);
+		eb = btrfs_find_tree_block(root, block1, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block1 = 0;
+		free_extent_buffer(eb);
+	}
+	if (slot < nritems) {
+		block2 = btrfs_node_blockptr(parent, slot + 1);
+		gen = btrfs_node_ptr_generation(parent, slot + 1);
+		eb = btrfs_find_tree_block(root, block2, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block2 = 0;
+		free_extent_buffer(eb);
+	}
+	if (block1 || block2) {
+		ret = -EAGAIN;
+		btrfs_release_path(root, path);
+		if (block1)
+			readahead_tree_block(root, block1, blocksize, 0);
+		if (block2)
+			readahead_tree_block(root, block2, blocksize, 0);
+
+		if (block1) {
+			eb = read_tree_block(root, block1, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+		if (block1) {
+			eb = read_tree_block(root, block2, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+	}
+	return ret;
+}
+
+
 /*
  * when we walk down the tree, it is usually safe to unlock the higher layers
  * in the tree.  The exceptions are when our path goes through slot 0, because
@@ -1314,6 +1421,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 	}
 }
 
+/*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node.  This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+	int i;
+
+	if (path->keep_locks || path->lowest_level)
+		return;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			break;
+		if (!path->locks[i])
+			break;
+		btrfs_tree_unlock(path->nodes[i]);
+		path->locks[i] = 0;
+	}
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -1385,6 +1518,7 @@ again:
 			 */
 			if (prealloc_block.objectid &&
 			    prealloc_block.offset != b->len) {
+				btrfs_set_path_blocking(p);
 				btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
@@ -1409,6 +1543,8 @@ again:
 				goto again;
 			}
 
+			btrfs_set_path_blocking(p);
+
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
@@ -1430,6 +1566,22 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
+		btrfs_clear_path_blocking(p);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way to for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 *
+		 * If cow is true, then we might be changing slot zero,
+		 * which may require changing the parent.  So, we can't
+		 * drop the lock until after we know which slot we're
+		 * operating on.
+		 */
+		if (!cow)
+			btrfs_unlock_up_safe(p, level + 1);
+
 		ret = check_block(root, p, level);
 		if (ret) {
 			ret = -1;
@@ -1437,6 +1589,7 @@ cow_done:
 		}
 
 		ret = bin_search(b, key, level, &slot);
+
 		if (level != 0) {
 			if (ret && slot > 0)
 				slot -= 1;
@@ -1444,7 +1597,16 @@ cow_done:
 			if ((p->search_for_split || ins_len > 0) &&
 			    btrfs_header_nritems(b) >=
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret = split_node(trans, root, p, level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = split_node(trans, root, p, level);
+				btrfs_clear_path_blocking(p);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1453,8 +1615,16 @@ cow_done:
 				b = p->nodes[level];
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
-				int sret = balance_level(trans, root, p,
-							 level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = balance_level(trans, root, p, level);
+				btrfs_clear_path_blocking(p);
+
 				if (sret) {
 					ret = sret;
 					goto done;
@@ -1488,7 +1658,7 @@ cow_done:
 				 * of the btree by dropping locks before
 				 * we read.
 				 */
-				if (level > 1) {
+				if (level > 0) {
 					btrfs_release_path(NULL, p);
 					if (tmp)
 						free_extent_buffer(tmp);
@@ -1503,6 +1673,7 @@ cow_done:
 						free_extent_buffer(tmp);
 					goto again;
 				} else {
+					btrfs_set_path_blocking(p);
 					if (tmp)
 						free_extent_buffer(tmp);
 					if (should_reada)
@@ -1512,14 +1683,29 @@ cow_done:
 					b = read_node_slot(root, b, slot);
 				}
 			}
-			if (!p->skip_locking)
-				btrfs_tree_lock(b);
+			if (!p->skip_locking) {
+				int lret;
+
+				btrfs_clear_path_blocking(p);
+				lret = btrfs_try_spin_lock(b);
+
+				if (!lret) {
+					btrfs_set_path_blocking(p);
+					btrfs_tree_lock(b);
+					btrfs_clear_path_blocking(p);
+				}
+			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
-				int sret = split_leaf(trans, root, key,
+				int sret;
+
+				btrfs_set_path_blocking(p);
+				sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
+				btrfs_clear_path_blocking(p);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1533,12 +1719,16 @@ cow_done:
 	}
 	ret = 1;
 done:
+	/*
+	 * we don't really know what they plan on doing with the path
+	 * from here on, so for now just mark it as blocking
+	 */
+	btrfs_set_path_blocking(p);
 	if (prealloc_block.objectid) {
 		btrfs_free_reserved_extent(root,
 			   prealloc_block.objectid,
 			   prealloc_block.offset);
 	}
-
 	return ret;
 }
 
@@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
 	BUG_ON(ret);
 
+	btrfs_set_lock_blocking(eb);
+
 	parent = eb;
 	while (1) {
 		level = btrfs_header_level(parent);
@@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(root, bytenr, blocksize,
 					     generation);
 			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
 		}
 
 		/*
@@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 				eb = read_tree_block(root, bytenr, blocksize,
 						generation);
 				btrfs_tree_lock(eb);
+				btrfs_set_lock_blocking(eb);
 			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size)
 		goto out_unlock;
@@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size) {
 		ret = 1;
@@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
+	/*
+	 * make sure any changes to the path from split_leaf leave it
+	 * in a blocking state
+	 */
+	btrfs_set_path_blocking(path);
+
 	leaf = path->nodes[0];
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
@@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 out:
+	btrfs_unlock_up_safe(path, 1);
 	return ret;
 }
 
@@ -3705,12 +3910,14 @@ find_next_key:
 		 */
 		if (slot >= nritems) {
 			path->slots[level] = slot;
+			btrfs_set_path_blocking(path);
 			sret = btrfs_find_next_key(root, path, min_key, level,
 						  cache_only, min_trans);
 			if (sret == 0) {
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
+				btrfs_clear_path_blocking(path);
 				goto out;
 			}
 		}
@@ -3722,16 +3929,20 @@ find_next_key:
 			unlock_up(path, level, 1);
 			goto out;
 		}
+		btrfs_set_path_blocking(path);
 		cur = read_node_slot(root, cur, slot);
 
 		btrfs_tree_lock(cur);
+
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
+		btrfs_clear_path_blocking(path);
 	}
 out:
 	if (ret == 0)
 		memcpy(min_key, &found_key, sizeof(found_key));
+	btrfs_set_path_blocking(path);
 	return ret;
 }
 
@@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	if (ret < 0)
 		return ret;
 
+	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
+		/* the path was set to blocking above */
 		if (level == 1 && (path->locks[1] || path->skip_locking) &&
 		    path->reada)
 			reada_for_search(root, path, level, slot, 0);
@@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (!path->skip_locking) {
 			WARN_ON(!btrfs_tree_locked(c));
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			path->locks[level] = 1;
 		if (!level)
 			break;
+
+		btrfs_set_path_blocking(path);
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
 		if (!path->skip_locking) {
 			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 	}
 done:
@@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 
 	while (1) {
 		if (path->slots[0] == 0) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
 				return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f2b8d26b0472..531db112c8bd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
 int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 549271607c17..5aebddd71193 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	else
 		WARN_ON(1);
 	return buf;
@@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
 		WARN_ON(!btrfs_tree_locked(buf));
+
+		/* ugh, clear_extent_buffer_dirty can be expensive */
+		btrfs_set_lock_blocking(buf);
+
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
@@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
+	btrfs_set_lock_blocking(buf);
+
 	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	return ret;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a22f2e6ec47..ed1e25d72483 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
+
+	btrfs_set_lock_blocking(buf);
 	btrfs_set_buffer_uptodate(buf);
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		set_extent_dirty(&root->dirty_log_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
@@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	}
 	trans->blocks_used++;
+	/* this returns a buffer locked for blocking */
 	return buf;
 }
 
@@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
 
 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
 					      &refs);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2ea7f052722c..dd5df53e045a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
-	mutex_init(&eb->mutex);
+	spin_lock_init(&eb->lock);
+	init_waitqueue_head(&eb->lock_wq);
+
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
@@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		unlock_page(p);
 	}
 	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	spin_lock(&tree->buffer_lock);
 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	unsigned long num_pages;
 
 	num_pages = num_extent_pages(eb->start, eb->len);
-	eb->flags &= ~EXTENT_UPTODATE;
+	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
 			      GFP_NOFS);
@@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	struct page *page;
 	int pg_uptodate = 1;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
-			eb->flags |= EXTENT_UPTODATE;
+			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 		goto unlock_exit;
 	}
 
@@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (!ret)
-		eb->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	return ret;
 
 unlock_exit:
@@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
-		WARN_ON(!mutex_is_locked(&eb->mutex));
 	}
 	err = map_private_extent_buffer(eb, start, min_len, token, map,
 				       map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e80c6d96b318..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
 
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
+	unsigned long bflags;
 	atomic_t refs;
-	int flags;
 	struct list_head leak_list;
 	struct rb_node rb_node;
-	struct mutex mutex;
+
+	/* the spinlock is used to protect most operations */
+	spinlock_t lock;
+
+	/*
+	 * when we keep the lock held while blocking, waiters go onto
+	 * the wq
+	 */
+	wait_queue_head_t lock_wq;
 };
 
 struct extent_map_tree;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a79e1c5ebd0..ebd7d6c37df8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
+#include "locking.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
 	btrfs_free_path(path);
@@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
+	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				  struct btrfs_inode_item);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..68fd9ccf1805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
 #include "locking.h"
 
 /*
- * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
- * and the spin is not tuned very extensively.  The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
+ * on
  */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock(&eb->lock);
+}
+#endif
 
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait.  After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
 {
-	int i;
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		spin_unlock(&eb->lock);
+	}
+	/* exit with the spin lock released and the bit set */
+}
 
-	if (mutex_trylock(&eb->mutex))
-		return 0;
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		spin_nested(eb);
+		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		smp_mb__after_clear_bit();
+	}
+	/* exit with the spin lock held */
+}
+
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for very long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin on the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+	int i;
 	for (i = 0; i < 512; i++) {
 		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		if (need_resched())
+			break;
+	}
+	return 0;
+}
+
+/*
+ * This is somewhat different from trylock.  It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+	int i;
+
+	spin_nested(eb);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		return 1;
+	spin_unlock(&eb->lock);
+
+	/* spin for a bit on the BLOCKING flag */
+	for (i = 0; i < 2; i++) {
+		if (!btrfs_spin_on_block(eb))
+			break;
+
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
+	return 0;
+}
+
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup.  The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want.  We want a single proc
+ * to be notified that the lock is ready for taking.  If that proc
+ * already happens to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+			       int sync, void *key)
+{
+	autoremove_wake_function(wait, mode, sync, key);
+	return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+	DEFINE_WAIT(wait);
+	wait.func = btrfs_wake_function;
+
+	while(1) {
+		spin_nested(eb);
+
+		/* nobody is blocking, exit with the spinlock held */
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
 			return 0;
+
+		/*
+		 * we have the spinlock, but the real owner is blocking.
+		 * wait for them
+		 */
+		spin_unlock(&eb->lock);
+
+		/*
+		 * spin for a bit, and if the blocking flag goes away,
+		 * loop around
+		 */
+		if (btrfs_spin_on_block(eb))
+			continue;
+
+		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			schedule();
+
+		finish_wait(&eb->lock_wq, &wait);
 	}
-	cpu_relax();
-	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
 
+/*
+ * Very quick trylock, this does not spin or schedule.  It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-	return mutex_trylock(&eb->mutex);
+	if (spin_trylock(&eb->lock)) {
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+			/*
+			 * we've got the spinlock, but the real owner is
+			 * blocking.  Drop the spinlock and return failure
+			 */
+			spin_unlock(&eb->lock);
+			return 0;
+		}
+		return 1;
+	}
+	/* someone else has the spinlock, give up */
+	return 0;
 }
 
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	mutex_unlock(&eb->mutex);
+	/*
+	 * if we were a blocking owner, we don't have the spinlock held
+	 * just clear the bit and look for waiters
+	 */
+	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		smp_mb__after_clear_bit();
+	else
+		spin_unlock(&eb->lock);
+
+	if (waitqueue_active(&eb->lock_wq))
+		wake_up(&eb->lock_wq);
 	return 0;
 }
 
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-	return mutex_is_locked(&eb->mutex);
+	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+			spin_is_locked(&eb->lock);
 }
 
 /*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
 	int i;
 	struct extent_buffer *eb;
+
 	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
 		eb = path->nodes[i];
 		if (!eb)
 			break;
 		smp_mb();
-		if (!list_empty(&eb->mutex.wait_list))
+		if (spin_is_contended(&eb->lock) ||
+		    waitqueue_active(&eb->lock_wq))
 			return 1;
 	}
 	return 0;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..d92e707f5870 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,12 @@
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_tree_locked(struct extent_buffer *eb);
+
 int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
 int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		u32 nritems;
 
 		root_node = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(root_node);
 		nritems = btrfs_header_nritems(root_node);
 		root->defrag_max.objectid = 0;
 		/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4f26f3ed0c87..20794290256b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_tree_lock(next);
 				clean_tree_block(trans, root, next);
+				btrfs_set_lock_blocking(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		next = path->nodes[*level];
 		btrfs_tree_lock(next);
 		clean_tree_block(trans, root, next);
+		btrfs_set_lock_blocking(next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
@@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_tree_lock(next);
 				clean_tree_block(trans, root, next);
+				btrfs_set_lock_blocking(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 			btrfs_tree_lock(next);
 			clean_tree_block(trans, log, next);
+			btrfs_set_lock_blocking(next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
-- 
cgit v1.2.3


From bd56b30205bc09da0beb80d4ba3d4c7309792da5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:27:02 -0500
Subject: Btrfs: Make btrfs_drop_snapshot work in larger and more efficient
 chunks

Every transaction in btrfs creates a new snapshot, and then schedules the
snapshot from the last transaction for deletion.  Snapshot deletion
works by walking down the btree and dropping the reference counts
on each btree block during the walk.

If a given leaf or node has a reference count greater than one,
the reference count is decremented and the subtree pointed to by that
node is ignored.

If the reference count is one, walking continues down into that node
or leaf, and the references of everything it points to are decremented.

The old code would try to work in small pieces, walking down the tree
until it found the lowest leaf or node to free and then returning.  This
was very friendly to the rest of the FS because it didn't have a huge
impact on other operations.

But it wouldn't always keep up with the rate that new commits added new
snapshots for deletion, and it wasn't very optimal for the extent
allocation tree because it wasn't finding leaves that were close together
on disk and processing them at the same time.

This changes things to walk down to a level 1 node and then process it
in bulk.  All the leaf pointers are sorted and the leaves are dropped
in order based on their extent number.

The extent allocation tree and commit code are now fast enough for
this kind of bulk processing to work without slowing the rest of the FS
down.  Overall it does less IO and is better able to keep up with
snapshot deletions under high load.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 306 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/inode.c       |   2 +
 fs/btrfs/ref-cache.c   |   1 +
 fs/btrfs/ref-cache.h   |   1 -
 4 files changed, 265 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ed1e25d72483..1d3e9262a9da 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1533,6 +1533,11 @@ out:
  * struct refsort is used to match byte number to slot in the btree block.
  * we sort based on the byte number and then use the slot to actually
  * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
  */
 struct refsort {
 	u64 bytenr;
@@ -3457,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
+	struct refsort *sorted;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
+	int refi = 0;
+	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	/* we do this loop twice.  The first time we build a list
+	 * of the extents we have a reference on, then we sort the list
+	 * by bytenr.  The second time around we actually do the
+	 * extent freeing.
+	 */
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
+
+		/* only extents have references, skip everything else */
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
+
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+		/* inline extents live in the btree, they don't have refs */
 		if (btrfs_file_extent_type(leaf, fi) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
-		/*
-		 * FIXME make sure to insert a trans record that
-		 * repeats the snapshot del on crash
-		 */
+
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		/* holes don't have refs */
 		if (disk_bytenr == 0)
 			continue;
 
+		sorted[refi].bytenr = disk_bytenr;
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		u64 disk_bytenr;
+
+		disk_bytenr = sorted[i].bytenr;
+		slot = sorted[i].slot;
+
+		cond_resched();
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
@@ -3497,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
 	}
+out:
+	kfree(sorted);
 	return 0;
 }
 
@@ -3506,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int ret;
-	struct btrfs_extent_info *info = ref->extents;
+	struct btrfs_extent_info *info;
+	struct refsort *sorted;
+
+	if (ref->nritems == 0)
+		return 0;
 
+	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+	for (i = 0; i < ref->nritems; i++) {
+		sorted[i].bytenr = ref->extents[i].bytenr;
+		sorted[i].slot = i;
+	}
+	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the items in the ref were sorted when the ref was inserted
+	 * into the ref cache, so this is already in order
+	 */
 	for (i = 0; i < ref->nritems; i++) {
+		info = ref->extents + sorted[i].slot;
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
@@ -3565,6 +3625,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 	return ret;
 }
 
+/*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 root_owner;
+	u64 root_gen;
+	struct extent_buffer *eb = path->nodes[1];
+	struct extent_buffer *leaf;
+	struct btrfs_leaf_ref *ref;
+	struct refsort *sorted = NULL;
+	int nritems = btrfs_header_nritems(eb);
+	int ret;
+	int i;
+	int refi = 0;
+	int slot = path->slots[1];
+	u32 blocksize = btrfs_level_size(root, 0);
+	u32 refs;
+
+	if (nritems == 0)
+		goto out;
+
+	root_owner = btrfs_header_owner(eb);
+	root_gen = btrfs_header_generation(eb);
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+	/*
+	 * step one, sort all the leaf pointers so we don't scribble
+	 * randomly into the extent allocation tree
+	 */
+	for (i = slot; i < nritems; i++) {
+		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	/*
+	 * nritems won't be zero, but if we're picking up drop_snapshot
+	 * after a crash, slot might be > 0, so double check things
+	 * just in case.
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the first loop frees everything the leaves point to
+	 */
+	for (i = 0; i < refi; i++) {
+		u64 ptr_gen;
+
+		bytenr = sorted[i].bytenr;
+
+		/*
+		 * check the reference count on this leaf.  If it is > 1
+		 * we just decrement it below and don't update any
+		 * of the refs the leaf points to.
+		 */
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+
+		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+		/*
+		 * the leaf only had one reference, which means the
+		 * only thing pointing to this leaf is the snapshot
+		 * we're deleting.  It isn't possible for the reference
+		 * count to increase again later
+		 *
+		 * The reference cache is checked for the leaf,
+		 * and if found we'll be able to drop any refs held by
+		 * the leaf without needing to read it in.
+		 */
+		ref = btrfs_lookup_leaf_ref(root, bytenr);
+		if (ref && ref->generation != ptr_gen) {
+			btrfs_free_leaf_ref(root, ref);
+			ref = NULL;
+		}
+		if (ref) {
+			ret = cache_drop_leaf_ref(trans, root, ref);
+			BUG_ON(ret);
+			btrfs_remove_leaf_ref(root, ref);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			/*
+			 * the leaf wasn't in the reference cache, so
+			 * we have to read it.
+			 */
+			leaf = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
+			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+			BUG_ON(ret);
+			free_extent_buffer(leaf);
+		}
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+
+	/*
+	 * run through the loop again to free the refs on the leaves.
+	 * This is faster than doing it in the loop above because
+	 * the leaves are likely to be clustered together.  We end up
+	 * working in nice chunks on the extent allocation tree.
+	 */
+	for (i = 0; i < refi; i++) {
+		bytenr = sorted[i].bytenr;
+		ret = __btrfs_free_extent(trans, root, bytenr,
+					blocksize, eb->start,
+					root_owner, root_gen, 0, 1);
+		BUG_ON(ret);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+out:
+	kfree(sorted);
+
+	/*
+	 * update the path to show we've processed the entire level 1
+	 * node.  This will get saved into the root's drop_snapshot_progress
+	 * field so these drops are not repeated again if this transaction
+	 * commits.
+	 */
+	path->slots[1] = nritems;
+	return 0;
+}
+
 /*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
@@ -3580,7 +3786,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
-	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -3607,17 +3812,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
 			break;
+
+		/* the new code goes down to level 1 and does all the
+		 * leaves pointed to by that node in bulk.  So, this check
+		 * for level 0 will always be false.
+		 *
+		 * But, the disk format allows the drop_snapshot_progress
+		 * field in the root to leave things in a state where
+		 * a leaf will need cleaning up here.  If someone crashes
+		 * with the old code and then boots with the new code,
+		 * we might find a leaf here.
+		 */
 		if (*level == 0) {
 			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
+
+		/*
+		 * once we get to level one, process the whole node
+		 * at once, including everything below it.
+		 */
+		if (*level == 1) {
+			ret = drop_level_one_refs(trans, root, path);
+			BUG_ON(ret);
+			break;
+		}
+
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
+
+		/*
+		 * if there is more than one reference, we don't need
+		 * to read that node to drop any references it has.  We
+		 * just drop the ref we hold on that node and move on to the
+		 * next slot in this level.
+		 */
 		if (refs != 1) {
 			parent = path->nodes[*level];
 			root_owner = btrfs_header_owner(parent);
@@ -3636,46 +3870,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 			continue;
 		}
+
 		/*
-		 * at this point, we have a single ref, and since the
-		 * only place referencing this extent is a dead root
-		 * the reference count should never go higher.
-		 * So, we don't need to check it again
+		 * we need to keep freeing things in the next level down.
+		 * read the block and loop around to process it
 		 */
-		if (*level == 1) {
-			ref = btrfs_lookup_leaf_ref(root, bytenr);
-			if (ref && ref->generation != ptr_gen) {
-				btrfs_free_leaf_ref(root, ref);
-				ref = NULL;
-			}
-			if (ref) {
-				ret = cache_drop_leaf_ref(trans, root, ref);
-				BUG_ON(ret);
-				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(root, ref);
-				*level = 0;
-				break;
-			}
-		}
-		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-			free_extent_buffer(next);
-
-			next = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			cond_resched();
-#if 0
-			/*
-			 * this is a debugging check and can go away
-			 * the ref should never go all the way down to 1
-			 * at this point
-			 */
-			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-						&refs);
-			BUG_ON(ret);
-			WARN_ON(refs != 1);
-#endif
-		}
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			free_extent_buffer(path->nodes[*level-1]);
@@ -3700,11 +3900,16 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
+	/*
+	 * cleanup and free the reference on the last node
+	 * we processed
+	 */
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
+
 	*level += 1;
 	BUG_ON(ret);
 
@@ -3824,6 +4029,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
 			struct btrfs_disk_key disk_key;
+
+			/*
+			 * there is more work to do in this level.
+			 * Update the drop_progress marker to reflect
+			 * the work we've done so far, and then bump
+			 * the slot number
+			 */
 			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
@@ -3835,6 +4047,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			struct extent_buffer *parent;
+
+			/*
+			 * this whole node is done, free our reference
+			 * on it and go up one level
+			 */
 			if (path->nodes[*level] == root->node)
 				parent = path->nodes[*level];
 			else
@@ -4849,6 +5066,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
+
 		ret = btrfs_add_leaf_ref(root, ref, 0);
 		WARN_ON(ret);
 		btrfs_free_leaf_ref(root, ref);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ebd7d6c37df8..95ea58cb3065 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2441,6 +2441,8 @@ next_node:
 			ref->generation = leaf_gen;
 			ref->nritems = 0;
 
+			btrfs_sort_leaf_ref(ref);
+
 			ret = btrfs_add_leaf_ref(root, ref, 0);
 			WARN_ON(ret);
 			btrfs_free_leaf_ref(root, ref);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9eab..d0cc62bccb94 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
 #include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c59..bc283ad2db73 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 			   int shared);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-
 #endif
-- 
cgit v1.2.3


From 8c087b5183adab186a298f2d6ed39aefdcae413c Mon Sep 17 00:00:00 2001
From: Chris Ball <cjb@laptop.org>
Date: Wed, 4 Feb 2009 09:29:54 -0500
Subject: Btrfs: Handle SGID bit when creating inodes Before this patch, new
 files/dirs would ignore the SGID bit on their parent directory and always be
 owned by the creating user's uid/gid.

Signed-off-by: Chris Ball <cjb@laptop.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95ea58cb3065..5792816d4fca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3472,7 +3472,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		root->highest_inode = objectid;
 
 	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
-- 
cgit v1.2.3


From 811449496b3e3caa9a8cf43feacbade0153324f2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 4 Feb 2009 09:18:33 -0500
Subject: Btrfs: join the transaction in __btrfs_setxattr

With selinux on we end up calling __btrfs_setxattr when we create an inode,
which calls btrfs_start_transaction().  The problem is we've already called
that in btrfs_new_inode, and in btrfs_start_transaction we end up doing a
wait_current_trans().  If btrfs-transaction has started committing it will wait
for all handles to finish, while the other process is waiting for the
transaction to commit.  This is fixed by using btrfs_join_transaction, which
won't wait for the transaction to commit.  Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 312b9435e9f8..a9d3bf4d2689 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -98,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
 	if (!path)
 		return -ENOMEM;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
 	/* first lets see if we already have this xattr */
-- 
cgit v1.2.3


From f03d9301f15fb69cdf1eb59d53c9fb72f68ecccc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:31:06 -0500
Subject: Btrfs: Don't try to compress pages past i_size

The compression code had some checks to make sure we were only
compressing bytes inside of i_size, but it wasn't catching every
case.  To make things worse, some incorrect math about the number
of bytes remaining would make it try to compress more pages than the
file really had.

The fix used here is to fall back to the non-compression code in this
case, which does all the proper cleanup of delalloc and other accounting.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5792816d4fca..9b43a6f303b8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -360,6 +360,19 @@ again:
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+	/*
+	 * we don't want to send crud past the end of i_size through
+	 * compression, that's just a waste of CPU time.  So, if the
+	 * end of the file is before the start of our current
+	 * requested range of bytes, we bail out to the uncompressed
+	 * cleanup code that can deal with all of this.
+	 *
+	 * It isn't really the fastest way to fix things, but this is a
+	 * very uncommon corner.
+	 */
+	if (actual_end <= start)
+		goto cleanup_and_bail_uncompressed;
+
 	total_compressed = actual_end - start;
 
 	/* we want to make sure that amount of ram required to uncompress
@@ -504,6 +517,7 @@ again:
 			goto again;
 		}
 	} else {
+cleanup_and_bail_uncompressed:
 		/*
 		 * No compression, but we still need to write the pages in
 		 * the file we've been given so far.  redirty the locked
-- 
cgit v1.2.3


From 06d9a8d7c24fe22836bf0b0f82db59d6f98e271e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:30:58 -0500
Subject: Btrfs: Change btrfs_truncate_inode_items to stop when it hits the
 inode

btrfs_truncate_inode_items is set up to stop doing btree searches when
it has finished removing the items for the inode.  It used to detect the
end of the inode by looking for an objectid that didn't match the
one we were searching for.

But, this would result in an extra search through the btree, which
adds extra balancing and cow costs to the operation.

This commit adds a check to see if we found the inode item, which means
we can stop searching early.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b43a6f303b8..ddb0f0ecda6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2504,7 +2504,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u32 found_type;
+	u32 found_type = (u8)-1;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
@@ -2691,6 +2691,8 @@ next:
 			if (pending_del_nr)
 				goto del_pending;
 			btrfs_release_path(root, path);
+			if (found_type == BTRFS_INODE_ITEM_KEY)
+				break;
 			goto search_again;
 		}
 
@@ -2707,6 +2709,8 @@ del_pending:
 			BUG_ON(ret);
 			pending_del_nr = 0;
 			btrfs_release_path(root, path);
+			if (found_type == BTRFS_INODE_ITEM_KEY)
+				break;
 			goto search_again;
 		}
 	}
-- 
cgit v1.2.3


From 4d081c41a4f98aecb5e86ef7d3e644cc7b52131f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:31:28 -0500
Subject: Btrfs: change btrfs_del_leaf to drop locks earlier

btrfs_del_leaf does two things.  First it removes the pointer in the
parent, and then it frees the block that has the leaf.  It has the
parent node locked for both operations.

But, it only needs the parent locked while it is deleting the pointer.
After that it can safely free the block without the parent locked.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3af777357acb..f6916ceb3920 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3630,15 +3630,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 root_gen = btrfs_header_generation(path->nodes[1]);
+	u64 parent_start = path->nodes[1]->start;
+	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
 
 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
 	if (ret)
 		return ret;
 
+	/*
+	 * btrfs_free_extent is expensive, we want to make sure we
+	 * aren't holding any locks when we call it
+	 */
+	btrfs_unlock_up_safe(path, 0);
+
 	ret = btrfs_free_extent(trans, root, bytenr,
 				btrfs_level_size(root, 0),
-				path->nodes[1]->start,
-				btrfs_header_owner(path->nodes[1]),
+				parent_start, parent_owner,
 				root_gen, 0, 1);
 	return ret;
 }
-- 
cgit v1.2.3


From 12f4daccfc3732280debba8f9ba49720372de831 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:31:42 -0500
Subject: Btrfs: fix btrfs_unlock_up_safe to walk the entire path

btrfs_unlock_up_safe would break out at the first NULL node entry or
unlocked node it found in the path.

Some of the callers have missing nodes at the lower levels of the path, so this
commit fixes things to check all the nodes in the path before returning.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f6916ceb3920..0d1e3b91e7bd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1439,9 +1439,9 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
-			break;
+			continue;
 		if (!path->locks[i])
-			break;
+			continue;
 		btrfs_tree_unlock(path->nodes[i]);
 		path->locks[i] = 0;
 	}
-- 
cgit v1.2.3


From 7b78c170dc4f538cc7ee66f47b3aac3f3974a36c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:12:46 -0500
Subject: Btrfs: Only prep for btree deletion balances when nodes are mostly
 empty

Whenever an item deletion is done, we need to balance all the nodes
in the tree to make sure we don't end up with an empty node if a pointer
is deleted.  This balance prep happens from the root of the tree down
so we can drop our locks as we go.

reada_for_balance was triggering read-ahead on neighboring nodes even
when no balancing was required.  This adds an extra check to avoid
calling balance_level() and avoid reada_for_balance() when a balance
won't be required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1e3b91e7bd..551177c0011a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1518,18 +1518,19 @@ again:
 			 */
 			if (prealloc_block.objectid &&
 			    prealloc_block.offset != b->len) {
-				btrfs_set_path_blocking(p);
+				btrfs_release_path(root, p);
 				btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
 				prealloc_block.objectid = 0;
+				goto again;
 			}
 
 			/*
 			 * for higher level blocks, try not to allocate blocks
 			 * with the block and the parent locks held.
 			 */
-			if (level > 1 && !prealloc_block.objectid &&
+			if (level > 0 && !prealloc_block.objectid &&
 			    btrfs_path_lock_waiting(p, level)) {
 				u32 size = b->len;
 				u64 hint = b->start;
@@ -1614,7 +1615,9 @@ cow_done:
 				}
 				b = p->nodes[level];
 				slot = p->slots[level];
-			} else if (ins_len < 0) {
+			} else if (ins_len < 0 &&
+				   btrfs_header_nritems(b) <
+				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
 				int sret;
 
 				sret = reada_for_balance(root, p, level);
-- 
cgit v1.2.3


From 9b0d3ace33ef225690a86d7130feadb22367f496 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:33:00 -0500
Subject: Btrfs: don't return congestion in write_cache_pages as often

On fast devices that go from congested to uncongested very quickly, pdflush
is waiting too often in congestion_wait, and the FS is backing off too
easily in write_cache_pages.

For now, fix this on the btrfs side by only checking congestion after
some bios have already gone down.  Longer term a real fix is needed
for pdflush, but that is a larger project.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dd5df53e045a..37d43b516b79 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2377,11 +2377,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	int scanned = 0;
 	int range_whole = 0;
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
-- 
cgit v1.2.3


From 436443f0f77f730f9f700095799c485356695c08 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 3 Feb 2009 23:12:34 -0800
Subject: Revert "configfs: Silence lockdep on mkdir(), rmdir() and
 configfs_depend_item()"

This reverts commit 0e0333429a6280e6eb3c98845e4eed90d5f8078a.

I committed this by accident - Joel and Louis are working with the lockdep
maintainer to provide a better solution than just turning lockdep off.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
---
 fs/configfs/dir.c | 59 -------------------------------------------------------
 1 file changed, 59 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9c2358391147..8e93341f3e82 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -553,24 +553,12 @@ static void detach_groups(struct config_group *group)
 
 		child = sd->s_dentry;
 
-		/*
-		 * Note: we hide this from lockdep since we have no way
-		 * to teach lockdep about recursive
-		 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
-		 * in an inode tree, which are valid as soon as
-		 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
-		 * parent inode to one of its children.
-		 */
-		lockdep_off();
 		mutex_lock(&child->d_inode->i_mutex);
-		lockdep_on();
 
 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
 
-		lockdep_off();
 		mutex_unlock(&child->d_inode->i_mutex);
-		lockdep_on();
 
 		d_delete(child);
 		dput(child);
@@ -760,22 +748,11 @@ static int configfs_attach_item(struct config_item *parent_item,
 			 * We are going to remove an inode and its dentry but
 			 * the VFS may already have hit and used them. Thus,
 			 * we must lock them as rmdir() would.
-			 *
-			 * Note: we hide this from lockdep since we have no way
-			 * to teach lockdep about recursive
-			 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
-			 * in an inode tree, which are valid as soon as
-			 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
-			 * parent inode to one of its children.
 			 */
-			lockdep_off();
 			mutex_lock(&dentry->d_inode->i_mutex);
-			lockdep_on();
 			configfs_remove_dir(item);
 			dentry->d_inode->i_flags |= S_DEAD;
-			lockdep_off();
 			mutex_unlock(&dentry->d_inode->i_mutex);
-			lockdep_on();
 			d_delete(dentry);
 		}
 	}
@@ -810,25 +787,14 @@ static int configfs_attach_group(struct config_item *parent_item,
 		 *
 		 * We must also lock the inode to remove it safely in case of
 		 * error, as rmdir() would.
-		 *
-		 * Note: we hide this from lockdep since we have no way
-		 * to teach lockdep about recursive
-		 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
-		 * in an inode tree, which are valid as soon as
-		 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
-		 * parent inode to one of its children.
 		 */
-		lockdep_off();
 		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
-		lockdep_on();
 		ret = populate_groups(to_config_group(item));
 		if (ret) {
 			configfs_detach_item(item);
 			dentry->d_inode->i_flags |= S_DEAD;
 		}
-		lockdep_off();
 		mutex_unlock(&dentry->d_inode->i_mutex);
-		lockdep_on();
 		if (ret)
 			d_delete(dentry);
 	}
@@ -990,17 +956,7 @@ static int configfs_depend_prep(struct dentry *origin,
 	BUG_ON(!origin || !sd);
 
 	/* Lock this guy on the way down */
-	/*
-	 * Note: we hide this from lockdep since we have no way
-	 * to teach lockdep about recursive
-	 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
-	 * in an inode tree, which are valid as soon as
-	 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
-	 * parent inode to one of its children.
-	 */
-	lockdep_off();
 	mutex_lock(&sd->s_dentry->d_inode->i_mutex);
-	lockdep_on();
 	if (sd->s_element == target)  /* Boo-yah */
 		goto out;
 
@@ -1014,9 +970,7 @@ static int configfs_depend_prep(struct dentry *origin,
 	}
 
 	/* We looped all our children and didn't find target */
-	lockdep_off();
 	mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
-	lockdep_on();
 	ret = -ENOENT;
 
 out:
@@ -1036,16 +990,11 @@ static void configfs_depend_rollback(struct dentry *origin,
 	struct dentry *dentry = item->ci_dentry;
 
 	while (dentry != origin) {
-		/* See comments in configfs_depend_prep() */
-		lockdep_off();
 		mutex_unlock(&dentry->d_inode->i_mutex);
-		lockdep_on();
 		dentry = dentry->d_parent;
 	}
 
-	lockdep_off();
 	mutex_unlock(&origin->d_inode->i_mutex);
-	lockdep_on();
 }
 
 int configfs_depend_item(struct configfs_subsystem *subsys,
@@ -1380,16 +1329,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 			}
 
 			/* Wait until the racing operation terminates */
-			/*
-			 * Note: we hide this from lockdep since we are locked
-			 * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why
-			 * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or
-			 * I_MUTEX_QUOTA are not relevant for the locked inode.
-			 */
-			lockdep_off();
 			mutex_lock(wait_mutex);
 			mutex_unlock(wait_mutex);
-			lockdep_on();
 		}
 	} while (ret == -EAGAIN);
 
-- 
cgit v1.2.3


From 806638bce99f51deccbfedbe86ab3c5cf55a1d35 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 5 Feb 2009 09:08:14 -0500
Subject: Btrfs: Fix memory leak in cache_drop_leaf_ref

The code wasn't doing a kfree on the sorted array

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d3e9262a9da..7527523c2d2d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3582,6 +3582,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		info++;
 	}
 
+	kfree(sorted);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 33da8892a2f9e7d4b2d9a35fc80833ba2d2b1aa6 Mon Sep 17 00:00:00 2001
From: Eric Biederman <ebiederm@xmission.com>
Date: Wed, 4 Feb 2009 15:12:25 -0800
Subject: seq_file: move traverse so it can be used from seq_read

In 2.6.25 some /proc files were converted to use the seq_file
infrastructure.  But seq_files do not correctly support pread(), which
broke some userspace applications.

To handle pread correctly we can't assume that f_pos is where we left it
in seq_read.  So move traverse() so that we can eventually use it in
seq_read and thus some day support pread().

Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Cc: Paul Turner <pjt@google.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 114 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 57 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1c4dc8..2716c12eacf5 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -54,6 +54,63 @@ int seq_open(struct file *file, const struct seq_operations *op)
 }
 EXPORT_SYMBOL(seq_open);
 
+static int traverse(struct seq_file *m, loff_t offset)
+{
+	loff_t pos = 0, index;
+	int error = 0;
+	void *p;
+
+	m->version = 0;
+	index = 0;
+	m->count = m->from = 0;
+	if (!offset) {
+		m->index = index;
+		return 0;
+	}
+	if (!m->buf) {
+		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+		if (!m->buf)
+			return -ENOMEM;
+	}
+	p = m->op->start(m, &index);
+	while (p) {
+		error = PTR_ERR(p);
+		if (IS_ERR(p))
+			break;
+		error = m->op->show(m, p);
+		if (error < 0)
+			break;
+		if (unlikely(error)) {
+			error = 0;
+			m->count = 0;
+		}
+		if (m->count == m->size)
+			goto Eoverflow;
+		if (pos + m->count > offset) {
+			m->from = offset - pos;
+			m->count -= m->from;
+			m->index = index;
+			break;
+		}
+		pos += m->count;
+		m->count = 0;
+		if (pos == offset) {
+			index++;
+			m->index = index;
+			break;
+		}
+		p = m->op->next(m, p, &index);
+	}
+	m->op->stop(m, p);
+	return error;
+
+Eoverflow:
+	m->op->stop(m, p);
+	kfree(m->buf);
+	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+	return !m->buf ? -ENOMEM : -EAGAIN;
+}
+
 /**
  *	seq_read -	->read() method for sequential files.
  *	@file: the file to read from
@@ -186,63 +243,6 @@ Efault:
 }
 EXPORT_SYMBOL(seq_read);
 
-static int traverse(struct seq_file *m, loff_t offset)
-{
-	loff_t pos = 0, index;
-	int error = 0;
-	void *p;
-
-	m->version = 0;
-	index = 0;
-	m->count = m->from = 0;
-	if (!offset) {
-		m->index = index;
-		return 0;
-	}
-	if (!m->buf) {
-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
-		if (!m->buf)
-			return -ENOMEM;
-	}
-	p = m->op->start(m, &index);
-	while (p) {
-		error = PTR_ERR(p);
-		if (IS_ERR(p))
-			break;
-		error = m->op->show(m, p);
-		if (error < 0)
-			break;
-		if (unlikely(error)) {
-			error = 0;
-			m->count = 0;
-		}
-		if (m->count == m->size)
-			goto Eoverflow;
-		if (pos + m->count > offset) {
-			m->from = offset - pos;
-			m->count -= m->from;
-			m->index = index;
-			break;
-		}
-		pos += m->count;
-		m->count = 0;
-		if (pos == offset) {
-			index++;
-			m->index = index;
-			break;
-		}
-		p = m->op->next(m, p, &index);
-	}
-	m->op->stop(m, p);
-	return error;
-
-Eoverflow:
-	m->op->stop(m, p);
-	kfree(m->buf);
-	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
-	return !m->buf ? -ENOMEM : -EAGAIN;
-}
-
 /**
  *	seq_lseek -	->llseek() method for sequential files.
  *	@file: the file in question
-- 
cgit v1.2.3


From f01d1d546abb2f4028b5299092f529eefb01253a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 6 Feb 2009 00:30:05 +0300
Subject: seq_file: fix big-enough lseek() + read()

lseek() further than length of the file will leave stale ->index
(second-to-last during iteration). Next seq_read() will not notice
that ->f_pos is big enough to return 0, but will print last item
as if ->f_pos is pointing to it.

Introduced in commit cb510b8172602a66467f3551b4be1911f5a7c8c2
aka "seq_file: more atomicity in traverse()".

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 2716c12eacf5..5267098532bf 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -102,6 +102,7 @@ static int traverse(struct seq_file *m, loff_t offset)
 		p = m->op->next(m, p, &index);
 	}
 	m->op->stop(m, p);
+	m->index = index;
 	return error;
 
 Eoverflow:
-- 
cgit v1.2.3


From 767b5828ad9a1b435488b5d39b5a66aeef4d25e4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 6 Feb 2009 00:32:27 +0000
Subject: braino in sg_ioctl_trans()

... and yes, gcc is insane enough to eat that without complaint.
We probably want sparse to scream on those...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c8f8d5904f5e..9c6d815dd191 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -785,7 +785,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 
 	if (copy_in_user(&sgio->status, &sgio32->status,
 			 (4 * sizeof(unsigned char)) +
-			 (2 * sizeof(unsigned (short))) +
+			 (2 * sizeof(unsigned short)) +
 			 (3 * sizeof(int))))
 		return -EFAULT;
 
-- 
cgit v1.2.3


From 42f15d77df8a7e8a2feb15041d5d30710ee7f951 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Feb 2009 11:35:57 -0500
Subject: Btrfs: Make sure dir is non-null before doing S_ISGID checks

The S_ISGID check in btrfs_new_inode caused an oops during subvol creation
because sometimes the dir is null.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ddb0f0ecda6c..8f0706210a47 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3491,7 +3491,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	inode->i_uid = current_fsuid();
 
-	if (dir->i_mode & S_ISGID) {
+	if (dir && (dir->i_mode & S_ISGID)) {
 		inode->i_gid = dir->i_gid;
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
-- 
cgit v1.2.3


From d4cf109f05ff04c6f5065c3e14165ef01a57dd53 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Fri, 6 Feb 2009 14:59:26 -0600
Subject: vfs: Don't call attach_nobh_buffers() with an empty list

This is a modification of a patch by Bill Pemberton <wfp5p@virginia.edu>

nobh_write_end() could call attach_nobh_buffers() with head == NULL.
This would result in a trap when attach_nobh_buffers() attempted to
access bh->b_this_page.

This can be illustrated by running the writev01 testcase from LTP on jfs.

This error was introduced by commit 5b41e74a "vfs: fix data leak in
nobh_write_end()".  That patch did not take into account that if
PageMappedToDisk() is true upon entry to nobh_write_begin(), then no
buffers will be allocated for the page.  In that case, we won't have to
worry about a failed write leaving uninitialized data in the page.

Of course, head != NULL implies !page_has_buffers(page), so no need to
test both.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Bill Pemberton <wfp5p@virginia.edu>
Cc: Dmitri Monakhov <dmonakhov@openvz.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index b58208f1640a..665d446b25bc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2688,7 +2688,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
 	struct buffer_head *bh;
 	BUG_ON(fsdata != NULL && page_has_buffers(page));
 
-	if (unlikely(copied < len) && !page_has_buffers(page))
+	if (unlikely(copied < len) && head)
 		attach_nobh_buffers(page, head);
 	if (page_has_buffers(page))
 		return generic_write_end(file, mapping, pos, len,
-- 
cgit v1.2.3


From 0bf2f3aec5474da80a60e1baca629af87ecb67b6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Feb 2009 11:45:46 +0000
Subject: CRED: Fix SUID exec regression

The patch:

	commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d
	CRED: Make execve() take advantage of copy-on-write credentials

moved the place in which the 'safeness' of a SUID/SGID exec was performed to
before de_thread() was called.  This means that LSM_UNSAFE_SHARE is now
calculated incorrectly.  This flag is set if any of the usage counts for
fs_struct, files_struct and sighand_struct are greater than 1 at the time the
determination is made.  All of which are true for threads created by the
pthread library.

However, since we wish to make the security calculation before irrevocably
damaging the process so that we can return it an error code in the case where
we decide we want to reject the exec request on this basis, we have to make the
determination before calling de_thread().

So, instead, we count up the number of threads (CLONE_THREAD) that are sharing
our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs
(CLONE_SIGHAND/CLONE_THREAD) with us.  These will be killed by de_thread() and
so can be discounted by check_unsafe_exec().

We do have to be careful because CLONE_THREAD does not imply FS or FILES.

We _assume_ that there will be no extra references to these structs held by the
threads we're going to kill.

This can be tested with the attached pair of programs.  Build the two programs
using the Makefile supplied, and run ./test1 as a non-root user.  If
successful, you should see something like:

	[dhowells@andromeda tmp]$ ./test1
	--TEST1--
	uid=4043, euid=4043 suid=4043
	exec ./test2
	--TEST2--
	uid=4043, euid=0 suid=0
	SUCCESS - Correct effective user ID

and if unsuccessful, something like:

	[dhowells@andromeda tmp]$ ./test1
	--TEST1--
	uid=4043, euid=4043 suid=4043
	exec ./test2
	--TEST2--
	uid=4043, euid=4043 suid=4043
	ERROR - Incorrect effective user ID!

The non-root user ID you see will depend on the user you run as.

[test1.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

static void *thread_func(void *arg)
{
	while (1) {}
}

int main(int argc, char **argv)
{
	pthread_t tid;
	uid_t uid, euid, suid;

	printf("--TEST1--\n");
	getresuid(&uid, &euid, &suid);
	printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);

	if (pthread_create(&tid, NULL, thread_func, NULL) < 0) {
		perror("pthread_create");
		exit(1);
	}

	printf("exec ./test2\n");
	execlp("./test2", "test2", NULL);
	perror("./test2");
	_exit(1);
}

[test2.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uid_t uid, euid, suid;

	getresuid(&uid, &euid, &suid);
	printf("--TEST2--\n");
	printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);

	if (euid != 0) {
		fprintf(stderr, "ERROR - Incorrect effective user ID!\n");
		exit(1);
	}
	printf("SUCCESS - Correct effective user ID\n");
	exit(0);
}

[Makefile]
CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused
all: test1 test2

test1: test1.c
	gcc $(CFLAGS) -o test1 test1.c -lpthread

test2: test2.c
	gcc $(CFLAGS) -o test2 test2.c
	sudo chown root.root test2
	sudo chmod +s test2

Reported-by: David Smith <dsmith@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David Smith <dsmith@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/compat.c   |  2 +-
 fs/exec.c     | 28 ++++++++++++++++++++++------
 fs/internal.h |  2 +-
 3 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 65a070e705ab..d0145ca27572 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename,
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm);
+	check_unsafe_exec(bprm, current->files);
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a01f1b4..929b58004b7e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds);
  * - the caller must hold current->cred_exec_mutex to protect against
  *   PTRACE_ATTACH
  */
-void check_unsafe_exec(struct linux_binprm *bprm)
+void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
 {
-	struct task_struct *p = current;
+	struct task_struct *p = current, *t;
+	unsigned long flags;
+	unsigned n_fs, n_files, n_sighand;
 
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
-	if (atomic_read(&p->fs->count) > 1 ||
-	    atomic_read(&p->files->count) > 1 ||
-	    atomic_read(&p->sighand->count) > 1)
+	n_fs = 1;
+	n_files = 1;
+	n_sighand = 1;
+	lock_task_sighand(p, &flags);
+	for (t = next_thread(p); t != p; t = next_thread(t)) {
+		if (t->fs == p->fs)
+			n_fs++;
+		if (t->files == files)
+			n_files++;
+		n_sighand++;
+	}
+
+	if (atomic_read(&p->fs->count) > n_fs ||
+	    atomic_read(&p->files->count) > n_files ||
+	    atomic_read(&p->sighand->count) > n_sighand)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
+
+	unlock_task_sighand(p, &flags);
 }
 
 /* 
@@ -1273,7 +1289,7 @@ int do_execve(char * filename,
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm);
+	check_unsafe_exec(bprm, displaced);
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..0d8ac497b3d5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,7 +43,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);
 
 /*
  * namespace.c
-- 
cgit v1.2.3


From 92dc07b1f988e8c237a38e23be660b9b8533e6fd Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 6 Feb 2009 17:34:07 -0800
Subject: elf core dump: fix get_user use

The elf_core_dump() code does its work with set_fs(KERNEL_DS) in force,
so vma_dump_size() needs to switch back with set_fs(USER_DS) to safely
use get_user() for a normal user-space address.

Checking for VM_READ optimizes out the case where get_user() would fail
anyway.  The vm_file check here was already superfluous given the control
flow earlier in the function, so that is a cleanup/optimization unrelated
to other changes but an obvious and trivial one.

Reported-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
---
 fs/binfmt_elf.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e3ff2b9e602f..33b7235f853b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 	 * check for an ELF header.  If we find one, dump the first page to
 	 * aid in determining what was mapped here.
 	 */
-	if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) {
+	if (FILTER(ELF_HEADERS) &&
+	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
 		u32 __user *header = (u32 __user *) vma->vm_start;
 		u32 word;
+		mm_segment_t fs = get_fs();
 		/*
 		 * Doing it this way gets the constant folded by GCC.
 		 */
@@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 		magic.elfmag[EI_MAG1] = ELFMAG1;
 		magic.elfmag[EI_MAG2] = ELFMAG2;
 		magic.elfmag[EI_MAG3] = ELFMAG3;
-		if (get_user(word, header) == 0 && word == magic.cmp)
+		/*
+		 * Switch to the user "segment" for get_user(),
+		 * then put back what elf_core_dump() had in place.
+		 */
+		set_fs(USER_DS);
+		if (unlikely(get_user(word, header)))
+			word = 0;
+		set_fs(fs);
+		if (word == magic.cmp)
 			return PAGE_SIZE;
 	}
 
-- 
cgit v1.2.3


From fd9fc842bbab0cb5560b0d52ce4598c898707863 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Fri, 6 Feb 2009 18:06:51 -0600
Subject: eCryptfs: Regression in unencrypted filename symlinks

The addition of filename encryption caused a regression in unencrypted
filename symlink support.  ecryptfs_copy_filename() is used when dealing
with unencrypted filenames and it reported that the new, copied filename
was a character longer than it should have been.

This caused the return value of readlink() to count the NULL byte of the
symlink target.  Most applications don't care about the extra NULL byte,
but a version control system (bzr) helped in discovering the bug.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c01e043670e2..f6caeb1d1106 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1716,7 +1716,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
 {
 	int rc = 0;
 
-	(*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
+	(*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
 	if (!(*copied_name)) {
 		rc = -ENOMEM;
 		goto out;
@@ -1726,7 +1726,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
 						 * in printing out the
 						 * string in debug
 						 * messages */
-	(*copied_name_size) = (name_size + 1);
+	(*copied_name_size) = name_size;
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 766ccb9ed406c230d13c145def08ebea1b932982 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 20 Jan 2009 15:31:31 +0100
Subject: async: Rename _special -> _domain for clarity.

Rename the async_*_special() functions to async_*_domain(), which
describes the purpose of these functions much better.
[Broke up long lines to silence checkpatch]

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a0..61dce001dd57 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb)
 		/*
 		 * wait for asynchronous fs operations to finish before going further
 		 */
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
@@ -470,7 +470,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
-- 
cgit v1.2.3


From 9d9b87c1218be78ddecbc85ec3bb91c79c1d56ab Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 4 Feb 2009 17:35:38 -0500
Subject: lockd: fix regression in lockd's handling of blocked locks

If a client requests a blocking lock, is denied, then requests it again,
then here in nlmsvc_lock() we will call vfs_lock_file() without FL_SLEEP
set, because we've already queued a block and don't need the locks code
to do it again.

But that means vfs_lock_file() will return -EAGAIN instead of
FILE_LOCK_DENIED.  So we still need to translate that -EAGAIN return
into a nlm_lck_blocked error in this case, and put ourselves back on
lockd's block list.

The bug was introduced by bde74e4bc64415b1 "locks: add special return
value for asynchronous locks".

Thanks to Frank van Maarseveen for the report; his original test
case was essentially

	for i in `seq 30`; do flock /nfsmount/foo sleep 10 & done

Tested-by: Frank van Maarseveen <frankvm@frankvm.com>
Reported-by: Frank van Maarseveen <frankvm@frankvm.com>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svclock.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6063a8e4b9f3..763b78a6e9de 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 		case -EAGAIN:
 			ret = nlm_lck_denied;
-			goto out;
+			break;
 		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
@@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 	}
 
+	ret = nlm_lck_denied;
+	if (!wait)
+		goto out;
+
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
-- 
cgit v1.2.3


From 284b066af41579f62649048fdec5c5e7091703e6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 9 Feb 2009 16:22:03 -0500
Subject: Btrfs: don't use spin_is_contended

Btrfs was using spin_is_contended to see if it should drop locks before
doing extent allocations during btrfs_search_slot.  The idea was to avoid
expensive searches in the tree unless the lock was actually contended.

But, spin_is_contended is specific to the ticket spinlocks on x86, so this
is causing compile errors everywhere else.

In practice, the contention could easily appear some time after we started
doing the extent allocation, and it makes more sense to always drop the lock
instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   |  3 +--
 fs/btrfs/locking.c | 22 ----------------------
 fs/btrfs/locking.h |  2 --
 3 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 551177c0011a..35443cc4b9a9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1530,8 +1530,7 @@ again:
 			 * for higher level blocks, try not to allocate blocks
 			 * with the block and the parent locks held.
 			 */
-			if (level > 0 && !prealloc_block.objectid &&
-			    btrfs_path_lock_waiting(p, level)) {
+			if (level > 0 && !prealloc_block.objectid) {
 				u32 size = b->len;
 				u64 hint = b->start;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 68fd9ccf1805..9ebe9385129b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -236,25 +236,3 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
 			spin_is_locked(&eb->lock);
 }
-
-/*
- * btrfs_search_slot uses this to decide if it should drop its locks
- * before doing something expensive like allocating free blocks for cow.
- */
-int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
-{
-	int i;
-	struct extent_buffer *eb;
-
-	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
-		eb = path->nodes[i];
-		if (!eb)
-			break;
-		smp_mb();
-		if (spin_is_contended(&eb->lock) ||
-		    waitqueue_active(&eb->lock_wq))
-			return 1;
-	}
-	return 0;
-}
-
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d92e707f5870..6bb0afbff928 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -26,8 +26,6 @@ int btrfs_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
-int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
-
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
 void btrfs_clear_lock_blocking(struct extent_buffer *eb);
 #endif
-- 
cgit v1.2.3


From c88ccea3143975294f5a52097546bcbb75975f52 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 10 Feb 2009 11:27:46 -0500
Subject: jbd2: Fix return value of jbd2_journal_start_commit()

The function jbd2_journal_start_commit() returns 1 if either a
transaction is committing or the function has queued a transaction
commit. But it returns 0 if we raced with somebody queueing the
transaction commit as well. This resulted in ext4_sync_fs() not
functioning correctly (description from Arthur Jones):

   In the case of a data=ordered umount with pending long symlinks
   which are delayed due to a long list of other I/O on the backing
   block device, this causes the buffer associated with the long
   symlinks to not be moved to the inode dirty list in the second
   phase of fsync_super.  Then, before they can be dirtied again,
   kjournald exits, seeing the UMOUNT flag and the dirty pages are
   never written to the backing block device, causing long symlink
   corruption and exposing new or previously freed block data to
   userspace.

This can be reproduced with a script created by Eric Sandeen
<sandeen@redhat.com>:

        #!/bin/bash

        umount /mnt/test2
        mount /dev/sdb4 /mnt/test2
        rm -f /mnt/test2/*
        dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512
        touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
        ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
        /mnt/test2/link
        umount /mnt/test2
        mount /dev/sdb4 /mnt/test2
        ls /mnt/test2/

This patch fixes jbd2_journal_start_commit() to always return 1 when
there's a transaction committing or queued for commit.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
CC: Eric Sandeen <sandeen@redhat.com>
CC: linux-ext4@vger.kernel.org
---
 fs/jbd2/journal.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb343008eded..58144102bf25 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
  */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 
 /*
  * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
  */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
-		ret = __jbd2_log_start_commit(journal, tid);
-		if (ret && ptid)
+		__jbd2_log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
 			*ptid = tid;
-	} else if (journal->j_committing_transaction && ptid) {
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
 		/*
 		 * If ext3_write_super() recently started a commit, then we
 		 * have to wait for completion of that transaction
 		 */
-		*ptid = journal->j_committing_transaction->t_tid;
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
 	spin_unlock(&journal->j_state_lock);
-- 
cgit v1.2.3


From 9eddacf9e9c03578ef2c07c9534423e823d677f8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 10 Feb 2009 06:46:05 -0500
Subject: Revert "ext4: wait on all pending commits in ext4_sync_fs()"

This undoes commit 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184.

Since jbd2_journal_start_commit() is now fixed to return 1 when we
started a transaction commit, there's some transaction waiting to be
committed or there's a transaction already committing, we don't
need to call ext4_force_commit() in ext4_sync_fs(). Furthermore
ext4_force_commit() can unnecessarily create a sync transaction, which is
expensive so it's worthwhile to remove it when we can.

http://bugzilla.kernel.org/show_bug.cgi?id=12224

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045e..a5732c58f676 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
+	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (EXT4_SB(sb)->s_journal) {
-		if (wait)
-			ret = ext4_force_commit(sb);
-		else
- 			jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
+					      &target)) {
+			if (wait)
+				jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+						     target);
+		}
 	} else {
 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
 	}
-- 
cgit v1.2.3


From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 10 Feb 2009 11:15:34 -0500
Subject: jbd2: Avoid possible NULL dereference in
 jbd2_journal_begin_ordered_truncate()

If we race with commit code setting i_transaction to NULL, we could
possibly dereference it.  Proper locking requires the journal pointer
(to access journal->j_list_lock), which we don't have.  So we have to
change the prototype of the function so that filesystem passes us the
journal pointer.  Also add a more detailed comment about why the
function jbd2_journal_begin_ordered_truncate() does what it does and
how it should be used.

Thanks to Dan Carpenter <error27@gmail.com> for pointing to the
suspicious code.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Joel Becker <joel.becker@oracle.com>
CC: linux-ext4@vger.kernel.org
CC: ocfs2-devel@oss.oracle.com
CC: mfasheh@suse.de
CC: Dan Carpenter <error27@gmail.com>
---
 fs/ext4/inode.c       |  6 ++++--
 fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++-----------
 fs/ocfs2/journal.h    |  6 ++++--
 3 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be1329..658c4a7f2578 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+					EXT4_SB(inode->i_sb)->s_journal,
+					&EXT4_I(inode)->jinode,
+					new_size);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7d..28ce21d8598e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
 }
 
 /*
- * This function must be called when inode is journaled in ordered mode
- * before truncation happens. It starts writeout of truncated part in
- * case it is in the committing transaction so that we stand to ordered
- * mode consistency guarantees.
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way.  If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
  */
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+					struct jbd2_inode *jinode,
 					loff_t new_size)
 {
-	journal_t *journal;
-	transaction_t *commit_trans;
+	transaction_t *inode_trans, *commit_trans;
 	int ret = 0;
 
-	if (!inode->i_transaction && !inode->i_next_transaction)
+	/* This is a quick check to avoid locking if not necessary */
+	if (!jinode->i_transaction)
 		goto out;
-	journal = inode->i_transaction->t_journal;
+	/* Locks are here just to force reading of recent values, it is
+	 * enough that the transaction was not committing before we started
+	 * a transaction adding the inode to orphan list */
 	spin_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
 	spin_unlock(&journal->j_state_lock);
-	if (inode->i_transaction == commit_trans) {
-		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+	spin_lock(&journal->j_list_lock);
+	inode_trans = jinode->i_transaction;
+	spin_unlock(&journal->j_list_lock);
+	if (inode_trans == commit_trans) {
+		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 			new_size, LLONG_MAX);
 		if (ret)
 			jbd2_journal_abort(journal, ret);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307c..172850a9a12a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
 					       loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+				OCFS2_SB(inode->i_sb)->journal->j_journal,
+				&OCFS2_I(inode)->ip_jinode,
+				new_size);
 }
 
 #endif /* OCFS2_JOURNAL_H */
-- 
cgit v1.2.3


From 7be2baaa0322c59ba888aa5260a8c130666acd41 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 10 Feb 2009 09:53:42 -0500
Subject: ext4: Fix to read empty directory blocks correctly in 64k

The rec_len field in the directory entry is 16 bits, so there was a
problem representing rec_len for filesystems with a 64k block size in
the case where the directory entry takes the entire 64k block.
Unfortunately, there were two schemes that were proposed; one where
all zeros meant 65536 and one where all ones (65535) meant 65536.
E2fsprogs used 0, whereas the kernel used 65535.  Oops.  Fortunately
this case happens extremely rarely, with the most common case being
the lost+found directory, created by mke2fs.

So we will be liberal in what we accept, and accept both encodings,
but we will continue to encode 65536 as 65535.  This will require a
change in e2fsprogs, but fortunately ext4 filesystems normally
have the dir_index feature enabled, which precludes having a
completely empty directory block.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c25..b0c87dce66a3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
 {
 	unsigned len = le16_to_cpu(dlen);
 
-	if (len == EXT4_MAX_REC_LEN)
+	if (len == EXT4_MAX_REC_LEN || len == 0)
 		return 1 << 16;
 	return len;
 }
-- 
cgit v1.2.3


From ba4439165f0f0d25b2fe065cf0c1ff8130b802eb Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 10 Feb 2009 11:14:34 -0500
Subject: ext4: Fix lockdep warning

We should not call ext4_mb_add_n_trim while holding alloc_semp.

    =============================================
    [ INFO: possible recursive locking detected ]
    2.6.29-rc4-git1-dirty #124
    ---------------------------------------------
    ffsb/3116 is trying to acquire lock:
     (&meta_group_info[i]->alloc_sem){----}, at: [<ffffffff8035a6e8>]
     ext4_mb_load_buddy+0xd2/0x343

    but task is already holding lock:
     (&meta_group_info[i]->alloc_sem){----}, at: [<ffffffff8035a6e8>]
     ext4_mb_load_buddy+0xd2/0x343

http://bugzilla.kernel.org/show_bug.cgi?id=12672

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbed..c962d0690505 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4476,23 +4476,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 			pa->pa_free -= ac->ac_b_ex.fe_len;
 			pa->pa_len -= ac->ac_b_ex.fe_len;
 			spin_unlock(&pa->pa_lock);
-			/*
-			 * We want to add the pa to the right bucket.
-			 * Remove it from the list and while adding
-			 * make sure the list to which we are adding
-			 * doesn't grow big.
-			 */
-			if (likely(pa->pa_free)) {
-				spin_lock(pa->pa_obj_lock);
-				list_del_rcu(&pa->pa_inode_list);
-				spin_unlock(pa->pa_obj_lock);
-				ext4_mb_add_n_trim(ac);
-			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->alloc_semp)
 		up_read(ac->alloc_semp);
+	if (pa) {
+		/*
+		 * We want to add the pa to the right bucket.
+		 * Remove it from the list and while adding
+		 * make sure the list to which we are adding
+		 * doesn't grow big.  We need to release
+		 * alloc_semp before calling ext4_mb_add_n_trim()
+		 */
+		if (pa->pa_linear && likely(pa->pa_free)) {
+			spin_lock(pa->pa_obj_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			spin_unlock(pa->pa_obj_lock);
+			ext4_mb_add_n_trim(ac);
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
+	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
-- 
cgit v1.2.3


From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 10 Feb 2009 14:02:27 +0000
Subject: Do not account for the address space used by hugetlbfs using
 VM_ACCOUNT

When overcommit is disabled, the core VM accounts for pages used by anonymous
shared, private mappings and special mappings. It keeps track of VMAs that
should be accounted for with VM_ACCOUNT and VMAs that never had a reserve
with VM_NORESERVE.

Overcommit for hugetlbfs is much riskier than overcommit for base pages
due to contiguity requirements. It avoids overcommitting on both shared and
private mappings using reservation counters that are checked and updated
during mmap(). This ensures (within limits) that hugepages exist in the
future when faults occur; otherwise it is too easy for applications to be
SIGKILLed.

As hugetlbfs makes its own reservations of a different unit to the base page
size, VM_ACCOUNT should never be set. Even if the units were correct, we would
double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may
be set because an application can request no reserves be made for hugetlbfs
at the risk of getting killed later.

With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and
VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This
breaks the accounting for both the core VM and hugetlbfs, can trigger an
OOM storm when hugepage pools are too small, and can otherwise cause lockups
and corrupted counters. This patch brings hugetlbfs more in line with how the
core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma))
+				len >> huge_page_shift(h), vma,
+				vma->vm_flags))
 		goto out;
 
 	ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
 			can_do_mlock());
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 
 	error = -ENOMEM;
 	if (hugetlb_reserve_pages(inode, 0,
-			size >> huge_page_shift(hstate_inode(inode)), NULL))
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
-- 
cgit v1.2.3


From 8fe4cd0dc5ea43760c59eb256404188272cc95dd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 11 Feb 2009 13:04:25 -0800
Subject: jbd: fix return value of journal_start_commit()

journal_start_commit() returns 1 if either a transaction is committing or
the function has queued a transaction commit.  But it returns 0 if we
raced with somebody queueing the transaction commit as well.  This
resulted in ext3_sync_fs() not functioning correctly (description from
Arthur Jones): In the case of a data=ordered umount with pending long
symlinks which are delayed due to a long list of other I/O on the backing
block device, this causes the buffer associated with the long symlinks to
not be moved to the inode dirty list in the second phase of fsync_super.
Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT
flag and the dirty pages are never written to the backing block device,
causing long symlink corruption and exposing new or previously freed block
data to userspace.

This can be reproduced with a script created by Eric Sandeen
<sandeen@redhat.com>:

        #!/bin/bash

        umount /mnt/test2
        mount /dev/sdb4 /mnt/test2
        rm -f /mnt/test2/*
        dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512
        touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
        ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
        /mnt/test2/link
        umount /mnt/test2
        mount /dev/sdb4 /mnt/test2
        ls /mnt/test2/

This patch fixes journal_start_commit() to always return 1 when there's
a transaction committing or queued for commit.

Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Mike Snitzer <snitzer@gmail.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/journal.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52d7dc8..e79c07812afa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
  */
 int __log_start_commit(journal_t *journal, tid_t target)
 {
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
 
 /*
  * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
  */
 int journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
-		ret = __log_start_commit(journal, tid);
-		if (ret && ptid)
+		__log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
 			*ptid = tid;
-	} else if (journal->j_committing_transaction && ptid) {
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
 		/*
 		 * If ext3_write_super() recently started a commit, then we
 		 * have to wait for completion of that transaction
 		 */
-		*ptid = journal->j_committing_transaction->t_tid;
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
 	spin_unlock(&journal->j_state_lock);
-- 
cgit v1.2.3


From 02ac597c9b86af49b2016aa98aee20ab59dbf0d2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 11 Feb 2009 13:04:26 -0800
Subject: ext3: revert "ext3: wait on all pending commits in ext3_sync_fs"

This reverts commit c87591b719737b4e91eb1a9fa8fd55a4ff1886d6.

Since journal_start_commit() is now fixed to return 1 when we started a
transaction commit, there's some transaction waiting to be committed or
there's a transaction already committing, we don't need to call
ext3_force_commit() in ext3_sync_fs().  Furthermore ext3_force_commit()
can unnecessarily create a sync transaction, which is expensive, so it's
worthwhile to remove it when we can.

Cc: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e08a3c..4a970411a458 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb)
 
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-	sb->s_dirt = 0;
-	if (wait)
-		ext3_force_commit(sb);
-	else
-		journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
+	tid_t target;
 
+	sb->s_dirt = 0;
+	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
+		if (wait)
+			log_wait_commit(EXT3_SB(sb)->s_journal, target);
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0e4a9b59282914fe057ab17027f55123964bc2e2 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Wed, 11 Feb 2009 13:04:37 -0800
Subject: ext2/xip: refuse to change xip flag during remount with busy inodes

For a reason that I was unable to understand in three months of debugging,
mount ext2 -o remount stopped working properly when remounting from
regular operation to xip, or the other way around.  According to a git
bisect search, the problem was introduced with the VM_MIXEDMAP/PTE_SPECIAL
rework in the vm:

commit 70688e4dd1647f0ceb502bbd5964fa344c5eb411
Author: Nick Piggin <npiggin@suse.de>
Date:   Mon Apr 28 02:13:02 2008 -0700

    xip: support non-struct page backed memory

In the failing scenario, the filesystem is mounted read only via root=
kernel parameter on s390x.  During remount (in rc.sysinit), the inodes of
the bash binary and its libraries are busy and cannot be invalidated (the
bash which is running rc.sysinit resides on subject filesystem).
Afterwards, another bash process (running ifup-eth) recurses into a
subshell, runs dup_mm (via fork).  Some of the mappings in this bash
process were created from inodes that could not be invalidated during
remount.

Both parent and child process crash some time later due to inconsistencies
in their address spaces.  The issue seems to be timing sensitive, various
attempts to recreate it have failed.

This patch refuses to change the xip flag during remount in case some
inodes cannot be invalidated.  This patch keeps users from running into
that issue.

[akpm@linux-foundation.org: cleanup]
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/super.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6d..7c6e3606f0ec 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	es = sbi->s_es;
 	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
 	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
-	    invalidate_inodes(sb))
-		ext2_warning(sb, __func__, "busy inodes while remounting "\
-			     "xip remain in cache (no functional problem)");
+	    invalidate_inodes(sb)) {
+		ext2_warning(sb, __func__, "refusing change of xip flag "
+			     "with busy inodes while remounting");
+		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
+		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+	}
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (*flags & MS_RDONLY) {
-- 
cgit v1.2.3


From eb099670895f22970cd143875467c2768d6d87e5 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 12 Feb 2009 09:27:38 -0500
Subject: Btrfs: make sure all pending extent operations are complete

There's a slight problem with finish_current_insert, if we set all to 1 and then
go through and don't actually skip any of the extents on the pending list, we
could exit right after we've added new extents.

This is a problem because by inserting the new extents we could have gotten new
COW's to happen and such, so we may have some pending updates to do or even
more inserts to do after that.

So this patch will only exit if we have never skipped any of the extents in the
pending list, and we have no extents to insert, this will make sure that all of
the pending work is truly done before we return.  I've been running with this
patch for a few days with all of my other testing and have not seen issues.
Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523c2d2d..376656f65b33 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root)
 {
-	finish_current_insert(trans, root->fs_info->extent_root, 1);
-	del_pending_extents(trans, root->fs_info->extent_root, 1);
+	u64 start;
+	u64 end;
+	int ret;
+
+	while(1) {
+		finish_current_insert(trans, root->fs_info->extent_root, 1);
+		del_pending_extents(trans, root->fs_info->extent_root, 1);
+
+		/* is there more work to do? */
+		ret = find_first_extent_bit(&root->fs_info->pending_del,
+					    0, &start, &end, EXTENT_WRITEBACK);
+		if (!ret)
+			continue;
+		ret = find_first_extent_bit(&root->fs_info->extent_ins,
+					    0, &start, &end, EXTENT_WRITEBACK);
+		if (!ret)
+			continue;
+		break;
+	}
 	return 0;
 }
 
@@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 end;
 	u64 priv;
 	u64 search = 0;
-	u64 skipped = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
 	struct pending_extent_op *extent_op, *tmp;
 	struct list_head insert_list, update_list;
 	int ret;
-	int num_inserts = 0, max_inserts;
+	int num_inserts = 0, max_inserts, restart = 0;
 
 	path = btrfs_alloc_path();
 	INIT_LIST_HEAD(&insert_list);
@@ -2233,19 +2249,19 @@ again:
 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
-			if (skipped && all && !num_inserts &&
+			if (restart && !num_inserts &&
 			    list_empty(&update_list)) {
-				skipped = 0;
+				restart = 0;
 				search = 0;
 				continue;
 			}
-			mutex_unlock(&info->extent_ins_mutex);
 			break;
 		}
 
 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
 		if (!ret) {
-			skipped = 1;
+			if (all)
+				restart = 1;
 			search = end + 1;
 			if (need_resched()) {
 				mutex_unlock(&info->extent_ins_mutex);
@@ -2264,7 +2280,7 @@ again:
 			list_add_tail(&extent_op->list, &insert_list);
 			search = end + 1;
 			if (num_inserts == max_inserts) {
-				mutex_unlock(&info->extent_ins_mutex);
+				restart = 1;
 				break;
 			}
 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2280,7 +2296,6 @@ again:
 	 * somebody marked this thing for deletion then just unlock it and be
 	 * done, the free_extents will handle it
 	 */
-	mutex_lock(&info->extent_ins_mutex);
 	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
 				  extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2302,6 +2317,10 @@ again:
 	if (!list_empty(&update_list)) {
 		ret = update_backrefs(trans, extent_root, path, &update_list);
 		BUG_ON(ret);
+
+		/* we may have COW'ed new blocks, so lets start over */
+		if (all)
+			restart = 1;
 	}
 
 	/*
@@ -2309,9 +2328,9 @@ again:
 	 * need to make sure everything is cleaned then reset everything and
 	 * go back to the beginning
 	 */
-	if (!num_inserts && all && skipped) {
+	if (!num_inserts && restart) {
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		INIT_LIST_HEAD(&update_list);
 		INIT_LIST_HEAD(&insert_list);
 		goto again;
@@ -2368,27 +2387,19 @@ again:
 	BUG_ON(ret);
 
 	/*
-	 * if we broke out of the loop in order to insert stuff because we hit
-	 * the maximum number of inserts at a time we can handle, then loop
-	 * back and pick up where we left off
+	 * if restart is set for whatever reason we need to go back and start
+	 * searching through the pending list again.
+	 *
+	 * We just inserted some extents, which could have resulted in new
+	 * blocks being allocated, which would result in new blocks needing
+	 * updates, so if all is set we _must_ restart to get the updated
+	 * blocks.
 	 */
-	if (num_inserts == max_inserts) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		num_inserts = 0;
-		goto again;
-	}
-
-	/*
-	 * again, if we need to make absolutely sure there are no more pending
-	 * extent operations left and we know that we skipped some, go back to
-	 * the beginning and do it all again
-	 */
-	if (all && skipped) {
+	if (restart || all) {
 		INIT_LIST_HEAD(&insert_list);
 		INIT_LIST_HEAD(&update_list);
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		num_inserts = 0;
 		goto again;
 	}
@@ -2709,6 +2720,8 @@ again:
 		goto again;
 	}
 
+	if (!err)
+		finish_current_insert(trans, extent_root, 0);
 	return err;
 }
 
-- 
cgit v1.2.3


From b288052e1779261ae80138074989ef50358c4e58 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 09:37:35 -0500
Subject: Btrfs: process mount options on mount -o remount,

Btrfs wasn't parsing any new mount options during remount, making it
difficult to set mount options on a root drive.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc38..66b8341e2dba 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -511,6 +511,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
+	ret = btrfs_parse_options(root, data);
+	if (ret)
+		return -EINVAL;
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 
-- 
cgit v1.2.3


From 536ac8ae86e68bb5574d7cc81c7d229a86b82601 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 09:41:38 -0500
Subject: Btrfs: use larger metadata clusters in ssd mode

Larger metadata clusters can significantly improve writeback performance
on ssd drives with large erasure blocks.  The larger clusters make it
more likely a given IO will completely overwrite the ssd block, so it
doesn't have to do an internal rmw cycle.

On spinning media, larger metadata clusters end up spreading out the
metadata more over time, which makes fsck slower, so we don't want this
to be the default.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 376656f65b33..c59e12036e20 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2872,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 64 * 1024;
+		if (!btrfs_test_opt(root, SSD))
+			empty_cluster = 64 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-- 
cgit v1.2.3


From e1df36d2f18254d0690a0fbe036cece74ec311b8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 09:45:08 -0500
Subject: Btrfs: don't clean old snapshots on sync(1)

Cleaning old snapshots can make sync(1) somewhat slow, and some users
and applications still use it in a global fsync kind of workload.

This patch changes btrfs not to clean old snapshots during sync, which is
safe from a FS consistency point of view.  The major downside is that it
makes it difficult to tell when old snapshots have been reaped and
the space they were using has been reclaimed.  A new ioctl will be added
for this purpose instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 66b8341e2dba..19a4daf03ccb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_start_delalloc_inodes(root);
 	btrfs_wait_ordered_extents(root, 0);
 
-	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
-- 
cgit v1.2.3


From b335b0034e252e79ec2e9c6697f5d663c4627bec Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 12 Feb 2009 10:06:04 -0500
Subject: Btrfs: Avoid using __GFP_HIGHMEM with slab allocator

btrfs_releasepage may call kmem_cache_alloc indirectly,
and passes the same GFP flags it receives on to kmem_cache_alloc.
So it's possible to end up using __GFP_HIGHMEM with the slab
allocator.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f0706210a47..638bcb5e49f6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4263,7 +4263,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	if (PageWriteback(page) || PageDirty(page))
 		return 0;
-	return __btrfs_releasepage(page, gfp_flags);
+	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
-- 
cgit v1.2.3


From 7951f3cefbd711f4429a0cd014aa83a844c399a0 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 12 Feb 2009 10:06:15 -0500
Subject: Btrfs: balance_level checks !child after access

The BUG_ON() is in the wrong spot.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc4b9a9..6674692f7023 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -917,9 +917,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
+		BUG_ON(!child);
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
-		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
 
-- 
cgit v1.2.3


From e00f7308658622fbd483cb0d9fe41165bf9050d0 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 12 Feb 2009 14:11:25 -0500
Subject: Btrfs: remove btrfs_init_path

btrfs_init_path was initially used when the path objects were on the
stack.  Now all the work is done by btrfs_alloc_path and btrfs_init_path
isn't required.

This patch removes it, and just uses kmem_cache_zalloc to zero out the object.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 11 ++---------
 fs/btrfs/ctree.h     |  1 -
 fs/btrfs/inode-map.c |  1 -
 fs/btrfs/inode.c     |  2 --
 4 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6674692f7023..c8f4c540cc2c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
-inline void btrfs_init_path(struct btrfs_path *p)
-{
-	memset(p, 0, sizeof(*p));
-}
-
 struct btrfs_path *btrfs_alloc_path(void)
 {
 	struct btrfs_path *path;
-	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
-	if (path) {
-		btrfs_init_path(path);
+	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+	if (path)
 		path->reada = 1;
-	}
 	return path;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db112c8bd..3f7a8058df2b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1834,7 +1834,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
 void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	search_key.type = 0;
 	search_key.offset = 0;
 
-	btrfs_init_path(path);
 	start_found = 0;
 	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
 	if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 638bcb5e49f6..3cee77ae03c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
 
-	btrfs_init_path(path);
-
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
-- 
cgit v1.2.3


From a48ddf08ba9bab91efd95e458737afa9d7699623 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Thu, 12 Feb 2009 14:25:23 -0500
Subject: Btrfs: remove unused code in split_state()

These two lines are not used, remove them.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b516b79..ebe6b29e6069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 
 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
-- 
cgit v1.2.3


From 3f3420df505e47751ef76a652b5cb660e5360d6f Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 12 Feb 2009 10:16:03 -0500
Subject: Btrfs: fs/btrfs/volumes.c: remove useless kzalloc

The call to kzalloc is followed by a kmalloc whose result is stored in the
same variable.

The semantic match that finds the problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
statement S;
expression E;
identifier f,l;
position p1,p2;
expression *ptr != NULL;
@@

(
if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S
|
x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...);
...
if (x == NULL) S
)
<... when != x
     when != if (...) { <+...x...+> }
x->f = E
...>
(
 return \(0\|<+...x...+>\|ptr\);
|
 return@p2 ...;
)

@script:python@
p1 << r.p1;
p2 << r.p2;
@@

print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14ebccae1..c793b6f50d8d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		free_extent_map(em);
 	}
 
-	map = kzalloc(sizeof(*map), GFP_NOFS);
-	if (!map)
-		return -ENOMEM;
-
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 4008c04a07c73ec3cb1be4c1391d2159a8f75d6d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 14:09:45 -0500
Subject: Btrfs: make a lockdep class for the extent buffer locks

Btrfs is currently using spin_lock_nested with a nested value based
on the tree depth of the block.  But, this doesn't quite work because
the max tree depth is bigger than what spin_lock_nested can deal with,
and because locks are sometimes taken before the level field is filled in.

The solution here is to use lockdep_set_class_and_name instead, and to
set the class before unlocking the pages when the block is read from the
disk and just after init of a freshly allocated tree block.

btrfs_clear_path_blocking is also changed to take the locks in the proper
order, and it also makes sure all the locks currently held are properly
set to blocking before it tries to retake the spinlocks.  Otherwise, lockdep
gets upset about bad lock ordering.

The lockdep magic came from Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 45 ++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ctree.h       | 10 +++-------
 fs/btrfs/disk-io.c     | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/disk-io.h     | 10 ++++++++++
 fs/btrfs/extent-tree.c |  7 +++++--
 fs/btrfs/locking.c     | 11 -----------
 fs/btrfs/volumes.c     |  2 ++
 7 files changed, 99 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c8f4c540cc2c..42491d728e99 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -62,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 
 /*
  * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held
  */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+					struct extent_buffer *held)
 {
 	int i;
-	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/* lockdep really cares that we take all of these spinlocks
+	 * in the right order.  If any of the locks in the path are not
+	 * currently blocking, it is going to complain.  So, make really
+	 * really sure by forcing the path to blocking before we clear
+	 * the path blocking.
+	 */
+	if (held)
+		btrfs_set_lock_blocking(held);
+	btrfs_set_path_blocking(p);
+#endif
+
+	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
 		if (p->nodes[i] && p->locks[i])
 			btrfs_clear_lock_blocking(p->nodes[i]);
 	}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (held)
+		btrfs_clear_lock_blocking(held);
+#endif
 }
 
 /* this also releases the path */
@@ -279,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 						  trans->transid, level, &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-					    buf->len);
+					    buf->len, level);
 	} else {
 		cow = btrfs_alloc_free_block(trans, root, buf->len,
 					     parent_start,
@@ -1559,7 +1583,7 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
-		btrfs_clear_path_blocking(p);
+		btrfs_clear_path_blocking(p, NULL);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1598,7 +1622,7 @@ cow_done:
 
 				btrfs_set_path_blocking(p);
 				sret = split_node(trans, root, p, level);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				BUG_ON(sret > 0);
 				if (sret) {
@@ -1618,7 +1642,7 @@ cow_done:
 
 				btrfs_set_path_blocking(p);
 				sret = balance_level(trans, root, p, level);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				if (sret) {
 					ret = sret;
@@ -1681,13 +1705,13 @@ cow_done:
 			if (!p->skip_locking) {
 				int lret;
 
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 				lret = btrfs_try_spin_lock(b);
 
 				if (!lret) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p);
+					btrfs_clear_path_blocking(p, b);
 				}
 			}
 		} else {
@@ -1699,7 +1723,7 @@ cow_done:
 				btrfs_set_path_blocking(p);
 				sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				BUG_ON(sret > 0);
 				if (sret) {
@@ -3919,7 +3943,6 @@ find_next_key:
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
-				btrfs_clear_path_blocking(path);
 				goto out;
 			}
 		}
@@ -3939,7 +3962,7 @@ find_next_key:
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path);
+		btrfs_clear_path_blocking(path, NULL);
 	}
 out:
 	if (ret == 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f7a8058df2b..766b31ae3186 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-#ifdef CONFIG_LOCKDEP
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
+#define BTRFS_MAX_LEVEL 8
 
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr, u32 blocksize,
+					    int level);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1835,7 +1832,6 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd71193..adda739a0215 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -75,6 +75,40 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+	/* leaf */
+	"btrfs-extent-00",
+	"btrfs-extent-01",
+	"btrfs-extent-02",
+	"btrfs-extent-03",
+	"btrfs-extent-04",
+	"btrfs-extent-05",
+	"btrfs-extent-06",
+	"btrfs-extent-07",
+	/* highest possible level */
+	"btrfs-extent-08",
+};
+#endif
+
 /*
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
@@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+	lockdep_set_class_and_name(&eb->lock,
+			   &btrfs_eb_class[level],
+			   btrfs_eb_name[level]);
+}
+#endif
+
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	found_level = btrfs_header_level(eb);
 
+	btrfs_set_buffer_lockdep_class(eb, found_level);
+
 	ret = csum_tree_block(root, eb, 1);
 	if (ret)
 		ret = -EIO;
@@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	dev_root->track_dirty = 1;
-
 	if (ret)
 		goto fail_extent_root;
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56eb2986..95029db227be 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+						 int level)
+{
+}
+#endif
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c59e12036e20..cd86bffbdc9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3416,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize)
+					    u64 bytenr, u32 blocksize,
+					    int level)
 {
 	struct extent_buffer *buf;
 
@@ -3424,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_set_buffer_lockdep_class(buf, level);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 
@@ -3467,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
-	buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+				    blocksize, level);
 	return buf;
 }
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe9385129b..85506c4a3af7 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
 #include "extent_io.h"
 #include "locking.h"
 
-/*
- * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * on
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void spin_nested(struct extent_buffer *eb)
-{
-	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
-}
-#else
 static inline void spin_nested(struct extent_buffer *eb)
 {
 	spin_lock(&eb->lock);
 }
-#endif
 
 /*
  * Setting a lock to blocking will drop the spinlock and set the
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c793b6f50d8d..1316139bf9e8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3102,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
+	btrfs_set_buffer_lockdep_class(sb, 0);
+
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
-- 
cgit v1.2.3


From 2456242530a21cfee82646ebeeda65d3f74faa4c Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 12 Feb 2009 14:14:53 -0500
Subject: Btrfs: hold trans_mutex when using btrfs_record_root_in_trans

btrfs_record_root_in_trans needs the trans_mutex held to make sure two
callers don't race to setup the root in a given transaction.  This adds
it to all the places that were missing it.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 fs/btrfs/transaction.c | 2 ++
 fs/btrfs/tree-log.c    | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd86bffbdc9f..0a5d796c9f7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5658,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 			prev_block = block_start;
 		}
 
+		mutex_lock(&extent_root->fs_info->trans_mutex);
 		btrfs_record_root_in_trans(found_root);
+		mutex_unlock(&extent_root->fs_info->trans_mutex);
 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			/*
 			 * try to update data extent references while
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de5c9a..4112d53d4f4d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
+			mutex_lock(&root->fs_info->trans_mutex);
 			btrfs_record_root_in_trans(root);
+			mutex_unlock(&root->fs_info->trans_mutex);
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 20794290256b..9c462fbd60fa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2832,7 +2832,9 @@ again:
 		BUG_ON(!wc.replay_dest);
 
 		wc.replay_dest->log_root = log;
+		mutex_lock(&fs_info->trans_mutex);
 		btrfs_record_root_in_trans(wc.replay_dest);
+		mutex_unlock(&fs_info->trans_mutex);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
 
-- 
cgit v1.2.3


From efab0b5d3eed6aa71f8e3233e4e11774eedc04dc Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Wed, 11 Feb 2009 13:27:02 -0800
Subject: [JFFS2] force the jffs2 GC daemon to behave a bit better

I've noticed some pretty poor behavior on OLPC machines after bootup, when
gdm/X are starting.  The GCD monopolizes the scheduler (which in turn
means it gets to do more nand i/o), which results in processes taking much
much longer than they should to start.

As an example, on an OLPC machine going from OFW to a usable X (via
auto-login gdm) takes 2m 30s.  The majority of this time is consumed by
the switch into graphical mode.  With this patch, we cut a full 60s off of
bootup time.  After bootup, things are much snappier as well.

Note that we have seen a CRC node error with this patch that causes the machine
to fail to boot, but we've also seen that problem without this patch.

Signed-off-by: Andres Salomon <dilinger@debian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/background.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3cceef4ad2b7..e9580104b6ba 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c)
 			spin_unlock(&c->erase_completion_lock);
 			
 
-		/* This thread is purely an optimisation. But if it runs when
-		   other things could be running, it actually makes things a
-		   lot worse. Use yield() and put it at the back of the runqueue
-		   every time. Especially during boot, pulling an inode in
-		   with read_inode() is much preferable to having the GC thread
-		   get there first. */
-		yield();
+		/* Problem - immediately after bootup, the GCD spends a lot
+		 * of time in places like jffs2_kill_fragtree(); so much so
+		 * that userspace processes (like gdm and X) are starved
+		 * despite plenty of cond_resched()s and renicing.  Yield()
+		 * doesn't help, either (presumably because userspace and GCD
+		 * are generally competing for a higher latency resource -
+		 * disk).
+		 * This forces the GCD to slow the hell down.   Pulling an
+		 * inode in with read_inode() is much preferable to having
+		 * the GC thread get there first. */
+		schedule_timeout_interruptible(msecs_to_jiffies(50));
 
 		/* Put_super will send a SIGKILL and then wait on the sem.
 		 */
-- 
cgit v1.2.3


From d794bf8e0936dce45104565cd48c571061f4c1e3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 14 Feb 2009 10:31:16 -0500
Subject: ext4: Initialize preallocation list_head's properly

When creating a new ext4_prealloc_space structure, we have to
initialize its list_head pointers before we add them to any prealloc
lists.  Otherwise, with list debug enabled, we will get list
corruption warnings.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c962d0690505..4415beeb0b62 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 0;
 
@@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
 	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 1;
 
-- 
cgit v1.2.3


From 2acf2c261b823d9d9ed954f348b97620297a36b5 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 14 Feb 2009 10:42:58 -0500
Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of
 write_cache_pages

With delayed allocation we lock the page in write_cache_pages() and
try to build an in memory extent of contiguous blocks.  This is needed
so that we can get large contiguous block requests.  If range_cyclic
mode is enabled, write_cache_pages() will loop back to the 0 index if
no I/O has been done yet, and try to start writing from the beginning
of the range.  That causes an attempt to take the page lock of lower
index page while holding the page lock of higher index page, which can
cause a deadlock with another writeback thread.

The solution is to implement the range_cyclic behavior in
ext4_da_writepages() instead.

http://bugzilla.kernel.org/show_bug.cgi?id=12579

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 658c4a7f2578..cbd2ca99d113 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2439,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
+	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
@@ -2490,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
-	if (wbc->range_cyclic)
+	range_cyclic = wbc->range_cyclic;
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index;
-	else
+		if (index)
+			cycled = 0;
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_end  = LLONG_MAX;
+		wbc->range_cyclic = 0;
+	} else
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
 	mpd.wbc = wbc;
@@ -2506,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	wbc->no_nrwrite_index_update = 1;
 	pages_skipped = wbc->pages_skipped;
 
+retry:
 	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
@@ -2548,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 			pages_written += mpd.pages_written;
 			wbc->pages_skipped = pages_skipped;
 			ret = 0;
+			io_done = 1;
 		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
@@ -2556,6 +2565,13 @@ static int ext4_da_writepages(struct address_space *mapping,
 			 */
 			break;
 	}
+	if (!io_done && !cycled) {
+		cycled = 1;
+		index = 0;
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_end  = mapping->writeback_index - 1;
+		goto retry;
+	}
 	if (pages_skipped != wbc->pages_skipped)
 		printk(KERN_EMERG "This should not happen leaving %s "
 				"with nr_to_write = %ld ret = %d\n",
@@ -2563,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 
 	/* Update index */
 	index += pages_written;
+	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
 		 * set the writeback_index so that range_cyclic
-- 
cgit v1.2.3


From 090542641de833c6f756895fc2f139f046e298f9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sun, 15 Feb 2009 20:02:19 -0500
Subject: ext4: Fix NULL dereference in ext4_ext_migrate()'s error handling

This was found through a code checker (http://repo.or.cz/w/smatch.git/).
It looks like you might be able to trigger the error by trying to migrate
a readonly file system.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/migrate.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e35..fe64d9f79852 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
 					+ 1);
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
-		goto err_out;
+		return retval;
 	}
 	tmp_inode = ext4_new_inode(handle,
 				inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
 	if (IS_ERR(tmp_inode)) {
 		retval = -ENOMEM;
 		ext4_journal_stop(handle);
-		tmp_inode = NULL;
-		goto err_out;
+		return retval;
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
@@ -618,8 +617,7 @@ err_out:
 
 	ext4_journal_stop(handle);
 
-	if (tmp_inode)
-		iput(tmp_inode);
+	iput(tmp_inode);
 
 	return retval;
 }
-- 
cgit v1.2.3


From 1a88b5364b535edaa321d70a566e358390ff0872 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 16 Feb 2009 02:38:12 +0000
Subject: Fix incomplete __mntput locking

Getting this wrong caused

	WARNING: at fs/namespace.c:636 mntput_no_expire+0xac/0xf2()

due to optimistically checking cpu_writer->mnt outside the spinlock.

Here's what we really want:
 * we know that nobody will set cpu_writer->mnt to mnt from now on
 * all changes to that sucker are done under cpu_writer->lock
 * we want the laziest equivalent of
	spin_lock(&cpu_writer->lock);
	if (likely(cpu_writer->mnt != mnt)) {
		spin_unlock(&cpu_writer->lock);
		continue;
	}
	/* do stuff */
  that would make sure we won't miss earlier setting of ->mnt done by
  another CPU.

Anyway, for now we just move the spin_lock() earlier and move the test
into the properly locked region.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-and-tested-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd18..06f8e63f6cb1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		if (cpu_writer->mnt != mnt)
-			continue;
 		spin_lock(&cpu_writer->lock);
+		if (cpu_writer->mnt != mnt) {
+			spin_unlock(&cpu_writer->lock);
+			continue;
+		}
 		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
 		cpu_writer->count = 0;
 		/*
-- 
cgit v1.2.3


From a60e78e57a17d55bbd5a96da16fe9649d364b987 Mon Sep 17 00:00:00 2001
From: Subhash Peddamallu <subhash.peddamallu@gmail.com>
Date: Mon, 16 Feb 2009 10:27:07 +0100
Subject: fs/bio: bio_alloc_bioset: pass right object ptr to mempool_free

When freeing back to the bio pool, pass the pointer that was originally
allocated (which accounts for bs->front_pad) instead of the bio pointer.

Signed-off-by: Subhash Peddamallu <subhash.peddamallu@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 062299acbccd..72ab251cdb9c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,9 +302,10 @@ void bio_init(struct bio *bio)
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
 	struct bio *bio = NULL;
+	void *p;
 
 	if (bs) {
-		void *p = mempool_alloc(bs->bio_pool, gfp_mask);
+		p = mempool_alloc(bs->bio_pool, gfp_mask);
 
 		if (p)
 			bio = p + bs->front_pad;
@@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 			}
 			if (unlikely(!bvl)) {
 				if (bs)
-					mempool_free(bio, bs->bio_pool);
+					mempool_free(p, bs->bio_pool);
 				else
 					kfree(bio);
 				bio = NULL;
-- 
cgit v1.2.3


From 78f707bfc723552e8309b7c38a8d0cc51012e813 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 17 Feb 2009 13:59:08 +0100
Subject: block: revert part of 18ce3751ccd488c78d3827e9f6bf54e6322676fb

The above commit added WRITE_SYNC and switched various places to using
that for committing writes that will be waited upon immediately after
submission. However, this causes a performance regression with AS and CFQ
for ext3 at least, since sync_dirty_buffer() will submit some writes with
WRITE_SYNC while ext3 has submitted other dependent writes without the sync
flag set. This causes excessive anticipation/idling in the IO scheduler
because sync and async writes get interleaved, causing a big performance
regression for the below test case (which is meant to simulate sqlite
like behaviour).

---- test case ----

int main(int argc, char **argv)
{

	int fdes, i;
	FILE *fp;
	struct timeval start;
	struct timeval end;
	struct timeval res;

	gettimeofday(&start, NULL);
	for (i=0; i<ROWS; i++) {
		fp = fopen("test_file", "a");
		fprintf(fp, "Some Text Data\n");
		fdes = fileno(fp);
		fsync(fdes);
		fclose(fp);
	}
	gettimeofday(&end, NULL);

	timersub(&end, &start, &res);
	fprintf(stdout, "time to write %d lines is %ld(msec)\n", ROWS,
			(res.tv_sec*1000000 + res.tv_usec)/1000);

	return 0;
}

-------------------

Thanks to Sean.White@APCC.com for tracking down this performance
regression and providing a test case.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25bc..62b57e330b69 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3108,7 +3108,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
-- 
cgit v1.2.3


From 8f19d472935c83d823fa4cf02bcc0a7b9952db30 Mon Sep 17 00:00:00 2001
From: Eric Biederman <ebiederm@xmission.com>
Date: Wed, 18 Feb 2009 14:48:16 -0800
Subject: seq_file: properly cope with pread

Currently seq_read assumes that the offset passed to it is always the
offset it passed to user space.  In the case of pread this assumption is
broken and we do the wrong thing when presented with pread.

To solve this I introduce an offset cache inside of struct seq_file so we
know where our logical file position is.  Then in seq_read if we try to
read from another offset we reset our data structures and attempt to go to
the offset user space wanted.

[akpm@linux-foundation.org: restore FMODE_PWRITE]
[pjt@google.com: seq_open needs its fmode opened up to take advantage of this]
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Paul Turner <pjt@google.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5267098532bf..a1a4cfe19210 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op)
 	 */
 	file->f_version = 0;
 
-	/* SEQ files support lseek, but not pread/pwrite */
-	file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+	/*
+	 * seq_files support lseek() and pread().  They do not implement
+	 * write() at all, but we clear FMODE_PWRITE here for historical
+	 * reasons.
+	 *
+	 * If a client of seq_files a) implements file.write() and b) wishes to
+	 * support pwrite() then that client will need to implement its own
+	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+	 */
+	file->f_mode &= ~FMODE_PWRITE;
 	return 0;
 }
 EXPORT_SYMBOL(seq_open);
@@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 	int err = 0;
 
 	mutex_lock(&m->lock);
+
+	/* Don't assume *ppos is where we left it */
+	if (unlikely(*ppos != m->read_pos)) {
+		m->read_pos = *ppos;
+		while ((err = traverse(m, *ppos)) == -EAGAIN)
+			;
+		if (err) {
+			/* With prejudice... */
+			m->read_pos = 0;
+			m->version = 0;
+			m->index = 0;
+			m->count = 0;
+			goto Done;
+		}
+	}
+
 	/*
 	 * seq_file->op->..m_start/m_stop/m_next may do special actions
 	 * or optimisations based on the file->f_version, so we want to
@@ -230,8 +254,10 @@ Fill:
 Done:
 	if (!copied)
 		copied = err;
-	else
+	else {
 		*ppos += copied;
+		m->read_pos += copied;
+	}
 	file->f_version = m->version;
 	mutex_unlock(&m->lock);
 	return copied;
@@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 			if (offset < 0)
 				break;
 			retval = offset;
-			if (offset != file->f_pos) {
+			if (offset != m->read_pos) {
 				while ((retval=traverse(m, offset)) == -EAGAIN)
 					;
 				if (retval) {
 					/* with extreme prejudice... */
 					file->f_pos = 0;
+					m->read_pos = 0;
 					m->version = 0;
 					m->index = 0;
 					m->count = 0;
 				} else {
+					m->read_pos = offset;
 					retval = file->f_pos = offset;
 				}
 			}
-- 
cgit v1.2.3


From 610d18f4128ebbd88845d0fc60cce67b49af881e Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Wed, 18 Feb 2009 14:48:18 -0800
Subject: timerfd: add flags check

As requested by Michael, add a missing check for valid flags in
timerfd_settime(), and make it return EINVAL in case some extra bits are
set.

Michael said:
If this is to be any use to userland apps that want to check flag
support (perhaps it is too late already), then the sooner we get it
into the kernel the better: 2.6.29 would be good; earlier stables as
well would be even better.

[akpm@linux-foundation.org: remove unused TFD_FLAGS_SET]
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: <stable@kernel.org>		[2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/timerfd.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f5..b042bd7034b1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
-		return -EINVAL;
-	if (clockid != CLOCK_MONOTONIC &&
-	    clockid != CLOCK_REALTIME)
+	if ((flags & ~TFD_CREATE_FLAGS) ||
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME))
 		return -EINVAL;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-			       flags & (O_CLOEXEC | O_NONBLOCK));
+			       flags & TFD_SHARED_FCNTL_FLAGS);
 	if (ufd < 0)
 		kfree(ctx);
 
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
 
-	if (!timespec_valid(&ktmr.it_value) ||
+	if ((flags & ~TFD_SETTIME_FLAGS) ||
+	    !timespec_valid(&ktmr.it_value) ||
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 18 Feb 2009 14:48:18 -0800
Subject: mm: task dirty accounting fix

YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for
cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty).

Additionally, there is some inconsistency about when task_dirty_inc is
called.  It is used for dirty balancing, however it even gets called for
__set_page_dirty_no_writeback.

So rather than increment it in a set_page_dirty wrapper, move it down to
exactly where the dirty page accounting stats are incremented.

Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25bc..ff4d1cdd779b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page,
 			__inc_zone_page_state(page, NR_FILE_DIRTY);
 			__inc_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
+			task_dirty_inc(current);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
-- 
cgit v1.2.3


From ada723dcd681e2dffd7d73345cc8fda0eb0df9bd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 18 Feb 2009 14:48:30 -0800
Subject: fs/super.c: add lockdep annotation to s_umount

Li Zefan said:

Thread 1:
  for ((; ;))
  {
      mount -t cpuset xxx /mnt > /dev/null 2>&1
      cat /mnt/cpus > /dev/null 2>&1
      umount /mnt > /dev/null 2>&1
  }

Thread 2:
  for ((; ;))
  {
      mount -t cpuset xxx /mnt > /dev/null 2>&1
      umount /mnt > /dev/null 2>&1
  }

(Note: It is irrelevant which cgroup subsys is used.)

After a while a lockdep warning showed up:

=============================================
[ INFO: possible recursive locking detected ]
2.6.28 #479
---------------------------------------------
mount/13554 is trying to acquire lock:
 (&type->s_umount_key#19){--..}, at: [<c049d888>] sget+0x5e/0x321

but task is already holding lock:
 (&type->s_umount_key#19){--..}, at: [<c049da0c>] sget+0x1e2/0x321

other info that might help us debug this:
1 lock held by mount/13554:
 #0:  (&type->s_umount_key#19){--..}, at: [<c049da0c>] sget+0x1e2/0x321

stack backtrace:
Pid: 13554, comm: mount Not tainted 2.6.28-mc #479
Call Trace:
 [<c044ad2e>] validate_chain+0x4c6/0xbbd
 [<c044ba9b>] __lock_acquire+0x676/0x700
 [<c044bb82>] lock_acquire+0x5d/0x7a
 [<c049d888>] ? sget+0x5e/0x321
 [<c061b9b8>] down_write+0x34/0x50
 [<c049d888>] ? sget+0x5e/0x321
 [<c049d888>] sget+0x5e/0x321
 [<c045a2e7>] ? cgroup_set_super+0x0/0x3e
 [<c045959f>] ? cgroup_test_super+0x0/0x2f
 [<c045bcea>] cgroup_get_sb+0x98/0x2e7
 [<c045cfb6>] cpuset_get_sb+0x4a/0x5f
 [<c049dfa4>] vfs_kern_mount+0x40/0x7b
 [<c049e02d>] do_kern_mount+0x37/0xbf
 [<c04af4a0>] do_mount+0x5c3/0x61a
 [<c04addd2>] ? copy_mount_options+0x2c/0x111
 [<c04af560>] sys_mount+0x69/0xa0
 [<c0403251>] sysenter_do_call+0x12/0x31

The cause is after alloc_super() and then retry, an old entry in list
fs_supers is found, so grab_super(old) is called, but both functions hold
s_umount lock:

struct super_block *sget(...)
{
	...
retry:
	spin_lock(&sb_lock);
	if (test) {
		list_for_each_entry(old, &type->fs_supers, s_instances) {
			if (!test(old, data))
				continue;
			if (!grab_super(old))  <--- 2nd: down_write(&old->s_umount);
				goto retry;
			if (s)
				destroy_super(s);
			return old;
		}
	}
	if (!s) {
		spin_unlock(&sb_lock);
		s = alloc_super(type);   <--- 1st: down_write(&s->s_umount)
		if (!s)
			return ERR_PTR(-ENOMEM);
		goto retry;
	}
	...
}

It seems like a false positive, and it seems that the VFS, not cgroup,
needs to be fixed.

Peter said:

We can simply put the new s_umount instance in a different subclass;
lockdep doesn't particularly care about subclass order.

If there's any issue with the callers of sget() assuming the s_umount lock
being of subclass 0, then there is another annotation we can use to fix
that, but lets not bother with that if this is sufficient.

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12673

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Paul Menage <menage@google.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 61dce001dd57..8349ed6b1412 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		 * lock ordering than usbfs:
 		 */
 		lockdep_set_class(&s->s_lock, &type->s_lock_key);
-		down_write(&s->s_umount);
+		/*
+		 * sget() can have s_umount recursion.
+		 *
+		 * When it cannot find a suitable sb, it allocates a new
+		 * one (this one), and tries again to find a suitable old
+		 * one.
+		 *
+		 * In case that succeeds, it will acquire the s_umount
+		 * lock of the old one. Since these are clearly distinct
+		 * locks, and this object isn't exposed yet, there's no
+		 * risk of deadlocks.
+		 *
+		 * Annotate this by putting this lock in a different
+		 * subclass.
+		 */
+		down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
 		s->s_count = S_BIAS;
 		atomic_set(&s->s_active, 1);
 		mutex_init(&s->s_vfs_rename_mutex);
-- 
cgit v1.2.3


From 2db69a9340da12a4db44edb7506dd68799aeff55 Mon Sep 17 00:00:00 2001
From: Bill Nottingham <notting@redhat.com>
Date: Wed, 18 Feb 2009 14:48:39 -0800
Subject: vt: Declare PIO_CMAP/GIO_CMAP as compatible ioctls.

Otherwise, these don't work when called from 32-bit userspace on 64-bit
kernels.

Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9c6d815dd191..39bd4d38e889 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1938,6 +1938,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
 /* Big K */
 COMPATIBLE_IOCTL(PIO_FONT)
 COMPATIBLE_IOCTL(GIO_FONT)
+COMPATIBLE_IOCTL(PIO_CMAP)
+COMPATIBLE_IOCTL(GIO_CMAP)
 ULONG_IOCTL(KDSIGACCEPT)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
-- 
cgit v1.2.3


From f04b30de3c82528f1ab4c58b3dd4c975f5341901 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Feb 2009 14:48:43 -0800
Subject: inotify: fix GFP_KERNEL related deadlock

Enhanced lockdep coverage of __GFP_NOFS turned up this new lockdep
assert:

[ 1093.677775]
[ 1093.677781] =================================
[ 1093.680031] [ INFO: inconsistent lock state ]
[ 1093.680031] 2.6.29-rc5-tip-01504-gb49eca1-dirty #1
[ 1093.680031] ---------------------------------
[ 1093.680031] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage.
[ 1093.680031] kswapd0/308 [HC0[0]:SC0[0]:HE1:SE1] takes:
[ 1093.680031]  (&inode->inotify_mutex){+.+.?.}, at: [<c0205942>] inotify_inode_is_dead+0x20/0x80
[ 1093.680031] {RECLAIM_FS-ON-W} state was registered at:
[ 1093.680031]   [<c01696b9>] mark_held_locks+0x43/0x5b
[ 1093.680031]   [<c016baa4>] lockdep_trace_alloc+0x6c/0x6e
[ 1093.680031]   [<c01cf8b0>] kmem_cache_alloc+0x20/0x150
[ 1093.680031]   [<c040d0ec>] idr_pre_get+0x27/0x6c
[ 1093.680031]   [<c02056e3>] inotify_handle_get_wd+0x25/0xad
[ 1093.680031]   [<c0205f43>] inotify_add_watch+0x7a/0x129
[ 1093.680031]   [<c020679e>] sys_inotify_add_watch+0x20f/0x250
[ 1093.680031]   [<c010389e>] sysenter_do_call+0x12/0x35
[ 1093.680031]   [<ffffffff>] 0xffffffff
[ 1093.680031] irq event stamp: 60417
[ 1093.680031] hardirqs last  enabled at (60417): [<c018d5f5>] call_rcu+0x53/0x59
[ 1093.680031] hardirqs last disabled at (60416): [<c018d5b9>] call_rcu+0x17/0x59
[ 1093.680031] softirqs last  enabled at (59656): [<c0146229>] __do_softirq+0x157/0x16b
[ 1093.680031] softirqs last disabled at (59651): [<c0106293>] do_softirq+0x74/0x15d
[ 1093.680031]
[ 1093.680031] other info that might help us debug this:
[ 1093.680031] 2 locks held by kswapd0/308:
[ 1093.680031]  #0:  (shrinker_rwsem){++++..}, at: [<c01b0502>] shrink_slab+0x36/0x189
[ 1093.680031]  #1:  (&type->s_umount_key#4){+++++.}, at: [<c01e6d77>] shrink_dcache_memory+0x110/0x1fb
[ 1093.680031]
[ 1093.680031] stack backtrace:
[ 1093.680031] Pid: 308, comm: kswapd0 Not tainted 2.6.29-rc5-tip-01504-gb49eca1-dirty #1
[ 1093.680031] Call Trace:
[ 1093.680031]  [<c016947a>] valid_state+0x12a/0x13d
[ 1093.680031]  [<c016954e>] mark_lock+0xc1/0x1e9
[ 1093.680031]  [<c016a5b4>] ? check_usage_forwards+0x0/0x3f
[ 1093.680031]  [<c016ab74>] __lock_acquire+0x2c6/0xac8
[ 1093.680031]  [<c01688d9>] ? register_lock_class+0x17/0x228
[ 1093.680031]  [<c016b3d3>] lock_acquire+0x5d/0x7a
[ 1093.680031]  [<c0205942>] ? inotify_inode_is_dead+0x20/0x80
[ 1093.680031]  [<c08824c4>] __mutex_lock_common+0x3a/0x4cb
[ 1093.680031]  [<c0205942>] ? inotify_inode_is_dead+0x20/0x80
[ 1093.680031]  [<c08829ed>] mutex_lock_nested+0x2e/0x36
[ 1093.680031]  [<c0205942>] ? inotify_inode_is_dead+0x20/0x80
[ 1093.680031]  [<c0205942>] inotify_inode_is_dead+0x20/0x80
[ 1093.680031]  [<c01e6672>] dentry_iput+0x90/0xc2
[ 1093.680031]  [<c01e67a3>] d_kill+0x21/0x45
[ 1093.680031]  [<c01e6a46>] __shrink_dcache_sb+0x27f/0x355
[ 1093.680031]  [<c01e6dc5>] shrink_dcache_memory+0x15e/0x1fb
[ 1093.680031]  [<c01b05ed>] shrink_slab+0x121/0x189
[ 1093.680031]  [<c01b0d12>] kswapd+0x39f/0x561
[ 1093.680031]  [<c01ae499>] ? isolate_pages_global+0x0/0x233
[ 1093.680031]  [<c0157eae>] ? autoremove_wake_function+0x0/0x43
[ 1093.680031]  [<c01b0973>] ? kswapd+0x0/0x561
[ 1093.680031]  [<c0157daf>] kthread+0x41/0x82
[ 1093.680031]  [<c0157d6e>] ? kthread+0x0/0x82
[ 1093.680031]  [<c01043ab>] kernel_thread_helper+0x7/0x10

inotify_handle_get_wd() does idr_pre_get() which does a
kmem_cache_alloc() without GFP_NOFS - and is hence deadlockable under
extreme MM pressure.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: MinChan Kim <minchan.kim@gmail.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..331f2e88e284 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
 			return -ENOSPC;
 		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
-- 
cgit v1.2.3


From 7fdf582447aa01658b624adc0a51a31e4278b68c Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Wed, 18 Feb 2009 15:41:28 -0600
Subject: Revert "[XFS] use scalable vmap API"

This reverts commit 95f8e302c04c0b0c6de35ab399a5551605eeb006.

This commit caused a regression. We'll try to fix use of new
vmap API for next release.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44e21ed..0b2177a9fbdc 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -264,7 +264,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
+                       vunmap(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -386,8 +386,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                       -1, PAGE_KERNEL);
+		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
+					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
-- 
cgit v1.2.3


From 27e88bf6af7d42adf790f7b2ed7d65475f191cf2 Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Wed, 18 Feb 2009 15:56:51 -0600
Subject: Revert "[XFS] remove old vmap cache"

This reverts commit d2859751cd0bf586941ffa7308635a293f943c17.

This commit caused a regression. We'll try to fix use of new
vmap API for next release.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 0b2177a9fbdc..cb329edc925b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -165,6 +165,75 @@ test_page_region(
 	return (mask && (page_private(page) & mask) == mask);
 }
 
+/*
+ *	Mapping of multi-page buffers into contiguous virtual space
+ */
+
+typedef struct a_list {
+	void		*vm_addr;
+	struct a_list	*next;
+} a_list_t;
+
+static a_list_t		*as_free_head;
+static int		as_list_len;
+static DEFINE_SPINLOCK(as_lock);
+
+/*
+ *	Try to batch vunmaps because they are costly.
+ */
+STATIC void
+free_address(
+	void		*addr)
+{
+	a_list_t	*aentry;
+
+#ifdef CONFIG_XEN
+	/*
+	 * Xen needs to be able to make sure it can get an exclusive
+	 * RO mapping of pages it wants to turn into a pagetable.  If
+	 * a newly allocated page is also still being vmap()ed by xfs,
+	 * it will cause pagetable construction to fail.  This is a
+	 * quick workaround to always eagerly unmap pages so that Xen
+	 * is happy.
+	 */
+	vunmap(addr);
+	return;
+#endif
+
+	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
+	if (likely(aentry)) {
+		spin_lock(&as_lock);
+		aentry->next = as_free_head;
+		aentry->vm_addr = addr;
+		as_free_head = aentry;
+		as_list_len++;
+		spin_unlock(&as_lock);
+	} else {
+		vunmap(addr);
+	}
+}
+
+STATIC void
+purge_addresses(void)
+{
+	a_list_t	*aentry, *old;
+
+	if (as_free_head == NULL)
+		return;
+
+	spin_lock(&as_lock);
+	aentry = as_free_head;
+	as_free_head = NULL;
+	as_list_len = 0;
+	spin_unlock(&as_lock);
+
+	while ((old = aentry) != NULL) {
+		vunmap(aentry->vm_addr);
+		aentry = aentry->next;
+		kfree(old);
+	}
+}
+
 /*
  *	Internal xfs_buf_t object manipulation
  */
@@ -264,7 +333,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vunmap(bp->b_addr - bp->b_offset);
+			free_address(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -386,6 +455,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
+		if (as_list_len > 64)
+			purge_addresses();
 		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
 					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
@@ -1672,6 +1743,8 @@ xfsbufd(
 			count++;
 		}
 
+		if (as_list_len > 0)
+			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
-- 
cgit v1.2.3


From 2cfbd50b536c878e58ab3681c4e944fa3d99b415 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 20 Feb 2009 10:55:10 -0500
Subject: Btrfs: check file pointer in btrfs_sync_file

fsync can be called by NFS with a null file pointer, and btrfs was
oopsing in this case.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023efaff7..872f104576e5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1222,7 +1222,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file->private_data)
+	if (file && file->private_data)
 		btrfs_ioctl_trans_end(file);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1231,7 +1231,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0)
 		goto out;
 
@@ -1245,7 +1245,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
-	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
@@ -1253,7 +1253,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		btrfs_sync_log(trans, root);
 		ret = btrfs_end_transaction(trans, root);
 	}
-	mutex_lock(&file->f_dentry->d_inode->i_mutex);
+	mutex_lock(&dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
-- 
cgit v1.2.3


From 6a63209fc02d5483371f07e4913ee8abad608051 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 20 Feb 2009 11:00:09 -0500
Subject: Btrfs: add better -ENOSPC handling

This is a step in the direction of better -ENOSPC handling.  Instead of
checking the global bytes counter we check the space_info bytes counters to
make sure we have enough space.

If we don't, we go ahead and try to allocate a new chunk, and if that fails
we return -ENOSPC.  This patch adds two counters to btrfs_space_info,
bytes_delalloc and bytes_may_use.

bytes_delalloc accounts for extents we've actually set up for delalloc and that
will be allocated at some point down the line.

bytes_may_use is to keep track of how many bytes we may use for delalloc at
some point.  When we actually set the extent_bit for the delalloc bytes we
subtract the reserved bytes from the bytes_may_use counter.  This keeps us from
ending up unable to actually allocate space for the delalloc bytes.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/btrfs_inode.h |   8 ++
 fs/btrfs/ctree.h       |  40 ++++++---
 fs/btrfs/extent-tree.c | 215 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/file.c        |  16 +++-
 fs/btrfs/inode.c       |  62 ++++----------
 fs/btrfs/ioctl.c       |   6 +-
 6 files changed, 271 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75ac..72677ce2b74f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
+	/* the space_info for where this inode's data allocations are done */
+	struct btrfs_space_info *space_info;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
@@ -94,6 +97,11 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
+	/* total number of bytes that may be used for this inode for
+	 * delalloc
+	 */
+	u64 reserved_bytes;
+
 	/*
 	 * the size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 766b31ae3186..82491ba8fa40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -596,13 +596,27 @@ struct btrfs_block_group_item {
 
 struct btrfs_space_info {
 	u64 flags;
-	u64 total_bytes;
-	u64 bytes_used;
-	u64 bytes_pinned;
-	u64 bytes_reserved;
-	u64 bytes_readonly;
-	int full;
-	int force_alloc;
+
+	u64 total_bytes;	/* total bytes in the space */
+	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
+				   transaction finishes */
+	u64 bytes_reserved;	/* total bytes the allocator has reserved for
+				   current allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	/* delalloc accounting */
+	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
+				   this space is not necessarily reserved yet
+				   by the allocator */
+	u64 bytes_may_use;	/* number of bytes that may be used for
+				   delalloc */
+
+	int full;		/* indicates that we cannot allocate any more
+				   chunks for this space */
+	int force_alloc;	/* set if we need to force a chunk alloc for
+				   this space */
+
 	struct list_head list;
 
 	/* for block groups in our same type */
@@ -1782,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes);
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes);
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				 u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
@@ -2027,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0a5d796c9f7e..e11875e97c2f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
 
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 alloc_bytes,
+			  u64 flags, int force);
+
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -1909,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
+	found->bytes_delalloc = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
@@ -1972,6 +1977,196 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 alloc_profile;
+
+	if (data) {
+		alloc_profile = info->avail_data_alloc_bits &
+			info->data_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+	} else if (root == root->fs_info->chunk_root) {
+		alloc_profile = info->avail_system_alloc_bits &
+			info->system_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+	} else {
+		alloc_profile = info->avail_metadata_alloc_bits &
+			info->metadata_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+	}
+
+	return btrfs_reduce_alloc_profile(root, data);
+}
+
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+	u64 alloc_target;
+
+	alloc_target = btrfs_get_alloc_profile(root, 1);
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       alloc_target);
+}
+
+/*
+ * for now this just makes sure we have at least 5% of our metadata space free
+ * for use.
+ */
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 alloc_target, thresh;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+	/*
+	 * if the metadata area isn't maxed out then there is no sense in
+	 * checking how much is used, since we can always allocate a new chunk
+	 */
+	if (!meta_sinfo->full)
+		return 0;
+
+	spin_lock(&meta_sinfo->lock);
+	thresh = meta_sinfo->total_bytes * 95;
+
+	do_div(thresh, 100);
+
+	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+		spin_unlock(&meta_sinfo->lock);
+		return -ENOSPC;
+	}
+	spin_unlock(&meta_sinfo->lock);
+
+	return 0;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+	int ret = 0;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+again:
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+	    data_sinfo->bytes_may_use < bytes) {
+		/*
+		 * if we don't have enough free bytes in this space then we need
+		 * to alloc a new chunk.
+		 */
+		if (!data_sinfo->full) {
+			u64 alloc_target;
+			struct btrfs_trans_handle *trans;
+
+			data_sinfo->force_alloc = 1;
+			spin_unlock(&data_sinfo->lock);
+
+			alloc_target = btrfs_get_alloc_profile(root, 1);
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     bytes + 2 * 1024 * 1024,
+					     alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+		spin_unlock(&data_sinfo->lock);
+		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+		       ", %llu bytes_used, %llu bytes_reserved, "
+		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
+		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+		return -ENOSPC;
+	}
+	data_sinfo->bytes_may_use += bytes;
+	BTRFS_I(inode)->reserved_bytes += bytes;
+	spin_unlock(&data_sinfo->lock);
+
+	return btrfs_check_metadata_free_space(root);
+}
+
+/*
+ * if there was an error for whatever reason after calling
+ * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_may_use -= bytes;
+	BTRFS_I(inode)->reserved_bytes -= bytes;
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are adding a delalloc extent to the inode's io_tree */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				  u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* get the space info for where this inode will be storing its data */
+	data_sinfo = BTRFS_I(inode)->space_info;
+
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_delalloc += bytes;
+
+	/*
+	 * we are adding a delalloc extent without calling
+	 * btrfs_check_data_free_space first.  This happens on a weird
+	 * writepage condition, but shouldn't hurt our accounting
+	 */
+	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
+		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
+		BTRFS_I(inode)->reserved_bytes = 0;
+	} else {
+		data_sinfo->bytes_may_use -= bytes;
+		BTRFS_I(inode)->reserved_bytes -= bytes;
+	}
+
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are clearing an delalloc extent from the inode's io_tree */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes)
+{
+	struct btrfs_space_info *info;
+
+	info = BTRFS_I(inode)->space_info;
+
+	spin_lock(&info->lock);
+	info->bytes_delalloc -= bytes;
+	spin_unlock(&info->lock);
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
@@ -3105,6 +3300,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved),
 	       (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+	       " may_use=%llu, used=%llu\n", info->total_bytes,
+	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
+	       info->bytes_used);
 
 	down_read(&info->groups_sem);
 	list_for_each_entry(cache, &info->block_groups, list) {
@@ -3131,24 +3330,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 search_start = 0;
-	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
 
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-			info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-			info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-			info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
+	data = btrfs_get_alloc_profile(root, data);
 again:
-	data = btrfs_reduce_alloc_profile(root, data);
 	/*
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 872f104576e5..dc78954861b3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1091,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
-		ret = btrfs_check_free_space(root, write_bytes, 0);
+		ret = btrfs_check_data_free_space(root, inode, write_bytes);
 		if (ret)
 			goto out;
 
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			btrfs_drop_pages(pages, num_pages);
 			goto out;
 		}
@@ -1111,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1136,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
+	if (ret)
+		err = ret;
 
 out_nolock:
 	kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3cee77ae03c8..7d4f948bc22a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -101,34 +101,6 @@ static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
 	return err;
 }
 
-/*
- * a very lame attempt at stopping writes when the FS is 85% full.  There
- * are countless ways this is incorrect, but it is better than nothing.
- */
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del)
-{
-	u64 total;
-	u64 used;
-	u64 thresh;
-	int ret = 0;
-
-	spin_lock(&root->fs_info->delalloc_lock);
-	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
-	if (for_del)
-		thresh = total * 90;
-	else
-		thresh = total * 85;
-
-	do_div(thresh, 100);
-
-	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
-		ret = -ENOSPC;
-	spin_unlock(&root->fs_info->delalloc_lock);
-	return ret;
-}
-
 /*
  * this does all the hard work for inserting an inline extent into
  * the btree.  The caller should have done a btrfs_drop_extents so that
@@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 */
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			       (unsigned long long)end - start + 1,
 			       (unsigned long long)
 			       root->fs_info->delalloc_bytes);
+			btrfs_delalloc_free_space(root, inode, (u64)-1);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
+			btrfs_delalloc_free_space(root, inode,
+						  end - start + 1);
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
@@ -2245,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	root = BTRFS_I(dir)->root;
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
@@ -2261,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2284,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -ENOTEMPTY;
 	}
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2304,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 fail_trans:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 
 	if (ret && !err)
@@ -2818,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	if (size <= hole_start)
 		return 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		return err;
 
@@ -3014,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
+	bi->reserved_bytes = 0;
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
@@ -3035,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	init_btrfs_i(inode);
 	BTRFS_I(inode)->root = args->root;
+	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
 }
 
@@ -3455,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
+	btrfs_set_inode_space_info(root, inode);
 
 	if (mode & S_IFDIR)
 		owner = 0;
@@ -3602,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 
@@ -3665,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	u64 index = 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	trans = btrfs_start_transaction(root, 1);
@@ -3733,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -ENOENT;
 
 	btrfs_inc_nlink(inode);
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	err = btrfs_set_inode_index(dir, &index);
@@ -3779,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_unlock;
 
@@ -4336,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
@@ -4349,6 +4318,7 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -4631,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -EXDEV;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto out_unlock;
 
@@ -4749,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_fail;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 988fdc8b49eb..bca729fc80c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_commit;
 
@@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_unlock;
 
@@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_free_space(root, inode->i_size, 0);
+	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
 	if (ret)
 		return -ENOSPC;
 
-- 
cgit v1.2.3


From 4e06bdd6cbd5105376e7caf4e683ed131e777389 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 20 Feb 2009 10:59:53 -0500
Subject: Btrfs: try committing transaction before returning ENOSPC

This fixes a problem where we could return -ENOSPC when we may actually have
plenty of space that is merely pinned.  Instead of returning -ENOSPC
immediately, commit the transaction first and then try and do the allocation
again.

This patch also does chunk allocation for metadata if we pass the 80%
threshold for metadata space.  This will help with stack usage since the chunk
allocation will happen early on, instead of when the allocation is happening.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e11875e97c2f..6b5966aacf44 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2017,26 +2017,49 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_space_info *meta_sinfo;
 	u64 alloc_target, thresh;
+	int committed = 0, ret;
 
 	/* get the space info for where the metadata will live */
 	alloc_target = btrfs_get_alloc_profile(root, 0);
 	meta_sinfo = __find_space_info(info, alloc_target);
 
-	/*
-	 * if the metadata area isn't maxed out then there is no sense in
-	 * checking how much is used, since we can always allocate a new chunk
-	 */
-	if (!meta_sinfo->full)
-		return 0;
-
+again:
 	spin_lock(&meta_sinfo->lock);
-	thresh = meta_sinfo->total_bytes * 95;
+	if (!meta_sinfo->full)
+		thresh = meta_sinfo->total_bytes * 80;
+	else
+		thresh = meta_sinfo->total_bytes * 95;
 
 	do_div(thresh, 100);
 
 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
 	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+		struct btrfs_trans_handle *trans;
+		if (!meta_sinfo->full) {
+			meta_sinfo->force_alloc = 1;
+			spin_unlock(&meta_sinfo->lock);
+
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     2 * 1024 * 1024, alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			goto again;
+		}
 		spin_unlock(&meta_sinfo->lock);
+
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
 		return -ENOSPC;
 	}
 	spin_unlock(&meta_sinfo->lock);
@@ -2052,7 +2075,7 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
-	int ret = 0;
+	int ret = 0, committed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -2065,13 +2088,14 @@ again:
 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
 	    data_sinfo->bytes_may_use < bytes) {
+		struct btrfs_trans_handle *trans;
+
 		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
 		if (!data_sinfo->full) {
 			u64 alloc_target;
-			struct btrfs_trans_handle *trans;
 
 			data_sinfo->force_alloc = 1;
 			spin_unlock(&data_sinfo->lock);
@@ -2090,6 +2114,19 @@ again:
 			goto again;
 		}
 		spin_unlock(&data_sinfo->lock);
+
+		/* commit the current transaction and try again */
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+
 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
 		       ", %llu bytes_used, %llu bytes_reserved, "
 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
-- 
cgit v1.2.3


From e4cce94c9c8797b08faf6a79396df4d175e377fa Mon Sep 17 00:00:00 2001
From: Igor Mammedov <niallain@gmail.com>
Date: Tue, 10 Feb 2009 14:10:26 +0300
Subject: [CIFS] Prevent OOPs when mounting with remote prefixpath.

Fixes OOPs with message 'kernel BUG at fs/cifs/cifs_dfs_ref.c:274!'.
Checks if the prefixpath is accessible while we are still in cifs_mount
and fails, reporting an error, if we can't access the prefixpath.

Should fix Samba bugs 6086 and 5861 and kernel bug 12192

Signed-off-by: Igor Mammedov <niallain@gmail.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  1 +
 fs/cifs/connect.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/inode.c     |  4 ++--
 3 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 382ba6298809..ec9f9c1c7d88 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -42,6 +42,7 @@ extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
 #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 005df85219a8..da0f4ffa0613 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2180,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			   "mount option supported"));
 }
 
+static int
+is_path_accessible(int xid, struct cifsTconInfo *tcon,
+		   struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+	int rc;
+	__u64 inode_num;
+	FILE_ALL_INFO *pfile_info;
+
+	rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
+				   cifs_sb->local_nls,
+				   cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != -EOPNOTSUPP)
+		return rc;
+
+	pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (pfile_info == NULL)
+		return -ENOMEM;
+
+	rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
+			      0 /* not legacy */, cifs_sb->local_nls,
+			      cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	kfree(pfile_info);
+	return rc;
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -2190,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct cifsSesInfo *pSesInfo = NULL;
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
+	char   *full_path;
 
 	xid = GetXid();
 
@@ -2426,6 +2454,23 @@ mount_fail_check:
 		cifs_sb->rsize = min(cifs_sb->rsize,
 			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
+	if (!rc && cifs_sb->prepathlen) {
+		/* build_path_to_root works only when we have a valid tcon */
+		full_path = cifs_build_path_to_root(cifs_sb);
+		if (full_path == NULL) {
+			rc = -ENOMEM;
+			goto mount_fail_check;
+		}
+		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
+		if (rc) {
+			cERROR(1, ("Path %s in not accessible: %d",
+						full_path, rc));
+			kfree(full_path);
+			goto mount_fail_check;
+		}
+		kfree(full_path);
+	}
+
 	/* volume_info->password is freed above when existing session found
 	(in which case it is not needed anymore) but when new sesion is created
 	the password ptr is put in the new session structure (in which case the
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bcf7b5184664..7342bfb02ae0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -621,7 +621,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
 	.lookup = cifs_lookup,
 };
 
-static char *build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 {
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
@@ -678,7 +678,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	full_path = build_path_to_root(cifs_sb);
+	full_path = cifs_build_path_to_root(cifs_sb);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.2.3


From 132ac7b77cc95a22d6118d327c96586759fbf006 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 10 Feb 2009 07:33:57 -0500
Subject: cifs: refactor new_inode() calls and inode initialization

Move new inode creation into a separate routine and refactor the
callers to take advantage of it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/inode.c     | 96 +++++++++++++++++++++++++++++++++--------------------
 fs/cifs/readdir.c   | 54 ++++++++++++++----------------
 3 files changed, 86 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ec9f9c1c7d88..62fd5bd499f6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -92,6 +92,8 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 
+extern struct inode *cifs_new_inode(struct super_block *sb,
+				    unsigned long *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7342bfb02ae0..c7674f595adb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
 	pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
 }
 
+/**
+ * cifs_new inode - create new inode, initialize, and hash it
+ * @sb - pointer to superblock
+ * @inum - if valid pointer and serverino is enabled, replace i_ino with val
+ *
+ * Create a new inode, initialize it for CIFS and hash it. Returns the new
+ * inode or NULL if one couldn't be allocated.
+ *
+ * If the share isn't mounted with "serverino" or inum is a NULL pointer then
+ * we'll just use the inode number assigned by new_inode(). Note that this can
+ * mean i_ino collisions since the i_ino assigned by new_inode is not
+ * guaranteed to be unique.
+ */
+struct inode *
+cifs_new_inode(struct super_block *sb, unsigned long *inum)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode == NULL)
+		return NULL;
+
+	/*
+	 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
+	 *     stop passing inum as ptr. Are there sanity checks we can use to
+	 *     ensure that the server is really filling in that field? Also,
+	 *     if serverino is disabled, perhaps we should be using iunique()?
+	 */
+	if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+		inode->i_ino = *inum;
+
+	/*
+	 * must set this here instead of cifs_alloc_inode since VFS will
+	 * clobber i_flags
+	 */
+	if (sb->s_flags & MS_NOATIME)
+		inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 	const unsigned char *full_path, struct super_block *sb, int xid)
 {
@@ -233,22 +276,12 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = new_inode(sb);
+		*pinode = cifs_new_inode(sb, (unsigned long *)
+						&find_data.UniqueId);
 		if (*pinode == NULL) {
 			rc = -ENOMEM;
 			goto cgiiu_exit;
 		}
-		/* Is an i_ino of zero legal? */
-		/* note ino incremented to unique num in new_inode */
-		/* Are there sanity checks we can use to ensure that
-		   the server is really filling in that field? */
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-			(*pinode)->i_ino = (unsigned long)find_data.UniqueId;
-
-		if (sb->s_flags & MS_NOATIME)
-			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-
-		insert_inode_hash(*pinode);
 	}
 
 	inode = *pinode;
@@ -465,11 +498,8 @@ int cifs_get_inode_info(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = new_inode(sb);
-		if (*pinode == NULL) {
-			rc = -ENOMEM;
-			goto cgii_exit;
-		}
+		__u64 inode_num;
+
 		/* Is an i_ino of zero legal? Can we use that to check
 		   if the server supports returning inode numbers?  Are
 		   there other sanity checks we can use to ensure that
@@ -486,7 +516,6 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 			int rc1 = 0;
-			__u64 inode_num;
 
 			rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
 					full_path, &inode_num,
@@ -496,12 +525,17 @@ int cifs_get_inode_info(struct inode **pinode,
 			if (rc1) {
 				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
 				/* BB EOPNOSUPP disable SERVER_INUM? */
-			} else /* do we need cast or hash to ino? */
-				(*pinode)->i_ino = inode_num;
-		} /* else ino incremented to unique num in new_inode*/
-		if (sb->s_flags & MS_NOATIME)
-			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-		insert_inode_hash(*pinode);
+			}
+			*pinode = cifs_new_inode(sb, (unsigned long *)
+							&inode_num);
+		} else {
+			*pinode = cifs_new_inode(sb, NULL);
+		}
+
+		if (*pinode == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 	}
 	inode = *pinode;
 	cifsInfo = CIFS_I(inode);
@@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			else
 				direntry->d_op = &cifs_dentry_ops;
 
-			newinode = new_inode(inode->i_sb);
+			newinode = cifs_new_inode(inode->i_sb, (unsigned long *)
+							&pInfo->UniqueId);
 			if (newinode == NULL) {
 				kfree(pInfo);
 				goto mkdir_get_info;
 			}
 
-			/* Is an i_ino of zero legal? */
-			/* Are there sanity checks we can use to ensure that
-			   the server is really filling in that field? */
-			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-				newinode->i_ino =
-					(unsigned long)pInfo->UniqueId;
-			} /* note ino incremented to unique num in new_inode */
-			if (inode->i_sb->s_flags & MS_NOATIME)
-				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
 			newinode->i_nlink = 2;
-
-			insert_inode_hash(newinode);
 			d_instantiate(direntry, newinode);
 
 			/* we already checked in POSIXCreate whether
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9bf0292..02a20221e841 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
 
-/* Returns one if new inode created (which therefore needs to be hashed) */
+/* Returns 1 if new inode created, 2 if both dentry and inode were */
 /* Might check in the future if inode number changed so we can rehash inode */
-static int construct_dentry(struct qstr *qstring, struct file *file,
-	struct inode **ptmp_inode, struct dentry **pnew_dentry)
+static int
+construct_dentry(struct qstr *qstring, struct file *file,
+		 struct inode **ptmp_inode, struct dentry **pnew_dentry,
+		 unsigned long *inum)
 {
-	struct dentry *tmp_dentry;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct dentry *tmp_dentry = NULL;
+	struct super_block *sb = file->f_path.dentry->d_sb;
 	int rc = 0;
 
 	cFYI(1, ("For %s", qstring->name));
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	qstring->hash = full_name_hash(qstring->name, qstring->len);
 	tmp_dentry = d_lookup(file->f_path.dentry, qstring);
 	if (tmp_dentry) {
+		/* BB: overwrite old name? i.e. tmp_dentry->d_name and
+		 * tmp_dentry->d_name.len??
+		 */
 		cFYI(0, ("existing dentry with inode 0x%p",
 			 tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
-/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
 		if (*ptmp_inode == NULL) {
-			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+			*ptmp_inode = cifs_new_inode(sb, inum);
 			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
 		if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			return rc;
 		}
 
-		*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-		if (pTcon->nocase)
+		if (CIFS_SB(sb)->tcon->nocase)
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
+
+		*ptmp_inode = cifs_new_inode(sb, inum);
 		if (*ptmp_inode == NULL)
 			return rc;
-		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		rc = 2;
 	}
 
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 			len = strnlen(filename, PATH_MAX);
 		}
 
-		/* BB fixme - hash low and high 32 bits if not 64 bit arch BB */
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-			*pinum = pFindData->UniqueId;
+		*pinum = pFindData->UniqueId;
 	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO *pFindData =
 			(FILE_DIRECTORY_INFO *)current_entry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if (rc)
 		return rc;
 
-	rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry);
+	/* only these two infolevels return valid inode numbers */
+	if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
+	    pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+		rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+					&inum);
+	else
+		rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+					NULL);
+
 	if ((tmp_inode == NULL) || (tmp_dentry == NULL))
 		return -ENOMEM;
 
-	if (rc) {
-		/* inode created, we need to hash it with right inode number */
-		if (inum != 0) {
-			/* BB fixme - hash the 2 32 quantities bits together if
-			 *  necessary BB */
-			tmp_inode->i_ino = inum;
-		}
-		insert_inode_hash(tmp_inode);
-	}
-
 	/* we pass in rc below, indicating whether it is a new inode,
 	   so we can figure out whether to invalidate the inode cached
 	   data if the file has changed */
-- 
cgit v1.2.3


From 950ec52880fab89b957c7dc45e8b8476dd63741f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 11 Feb 2009 08:08:26 -0500
Subject: cifs: properly handle case where CIFSGetSrvInodeNumber fails

...if it does then we pass a pointer to an uninitialized variable for
the inode number to cifs_new_inode. Have it pass a NULL pointer instead.

Also tweak the function prototypes to reduce the amount of casting.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  3 +--
 fs/cifs/inode.c     | 20 ++++++++++----------
 fs/cifs/readdir.c   |  6 +++---
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 62fd5bd499f6..446e62cbece9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -92,8 +92,7 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 
-extern struct inode *cifs_new_inode(struct super_block *sb,
-				    unsigned long *inum);
+extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index c7674f595adb..475115c7cc79 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -213,7 +213,7 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
  * guaranteed to be unique.
  */
 struct inode *
-cifs_new_inode(struct super_block *sb, unsigned long *inum)
+cifs_new_inode(struct super_block *sb, __u64 *inum)
 {
 	struct inode *inode;
 
@@ -228,7 +228,7 @@ cifs_new_inode(struct super_block *sb, unsigned long *inum)
 	 *     if serverino is disabled, perhaps we should be using iunique()?
 	 */
 	if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
-		inode->i_ino = *inum;
+		inode->i_ino = (unsigned long) *inum;
 
 	/*
 	 * must set this here instead of cifs_alloc_inode since VFS will
@@ -276,8 +276,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = cifs_new_inode(sb, (unsigned long *)
-						&find_data.UniqueId);
+		*pinode = cifs_new_inode(sb, &find_data.UniqueId);
 		if (*pinode == NULL) {
 			rc = -ENOMEM;
 			goto cgiiu_exit;
@@ -499,6 +498,7 @@ int cifs_get_inode_info(struct inode **pinode,
 	/* get new inode */
 	if (*pinode == NULL) {
 		__u64 inode_num;
+		__u64 *pinum = &inode_num;
 
 		/* Is an i_ino of zero legal? Can we use that to check
 		   if the server supports returning inode numbers?  Are
@@ -518,20 +518,20 @@ int cifs_get_inode_info(struct inode **pinode,
 			int rc1 = 0;
 
 			rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
-					full_path, &inode_num,
+					full_path, pinum,
 					cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (rc1) {
 				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+				pinum = NULL;
 				/* BB EOPNOSUPP disable SERVER_INUM? */
 			}
-			*pinode = cifs_new_inode(sb, (unsigned long *)
-							&inode_num);
 		} else {
-			*pinode = cifs_new_inode(sb, NULL);
+			pinum = NULL;
 		}
 
+		*pinode = cifs_new_inode(sb, pinum);
 		if (*pinode == NULL) {
 			rc = -ENOMEM;
 			goto cgii_exit;
@@ -1148,8 +1148,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			else
 				direntry->d_op = &cifs_dentry_ops;
 
-			newinode = cifs_new_inode(inode->i_sb, (unsigned long *)
-							&pInfo->UniqueId);
+			newinode = cifs_new_inode(inode->i_sb,
+						  &pInfo->UniqueId);
 			if (newinode == NULL) {
 				kfree(pInfo);
 				goto mkdir_get_info;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 02a20221e841..c2c01ff4c32c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -61,7 +61,7 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 static int
 construct_dentry(struct qstr *qstring, struct file *file,
 		 struct inode **ptmp_inode, struct dentry **pnew_dentry,
-		 unsigned long *inum)
+		 __u64 *inum)
 {
 	struct dentry *tmp_dentry = NULL;
 	struct super_block *sb = file->f_path.dentry->d_sb;
@@ -820,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
 	char *current_entry, __u16 level, unsigned int unicode,
-	struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum)
+	struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
 {
 	int rc = 0;
 	unsigned int len = 0;
@@ -903,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	struct qstr qstring;
 	struct cifsFileInfo *pCifsF;
 	unsigned int obj_type;
-	ino_t  inum;
+	__u64  inum;
 	struct cifs_sb_info *cifs_sb;
 	struct inode *tmp_inode;
 	struct dentry *tmp_dentry;
-- 
cgit v1.2.3


From 44f68fadd865bb288ebdcea2b602f0b1cab27a0c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 11 Feb 2009 08:08:28 -0500
Subject: cifs: posix fill in inode needed by posix open

Make posix_fill_in_inode() non-static and declare it in cifsproto.h;
the function is needed to prepare for posix open.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h | 2 ++
 fs/cifs/inode.c     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 446e62cbece9..083dfc57c7a3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -92,6 +92,8 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 
+extern void posix_fill_in_inode(struct inode *tmp_inode,
+				FILE_UNIX_BASIC_INFO *pData, int isNewInode);
 extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 475115c7cc79..4690a360c855 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1051,7 +1051,7 @@ out_reval:
 	return rc;
 }
 
-static void posix_fill_in_inode(struct inode *tmp_inode,
+void posix_fill_in_inode(struct inode *tmp_inode,
 	FILE_UNIX_BASIC_INFO *pData, int isNewInode)
 {
 	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
-- 
cgit v1.2.3


From 69765529d701c838df19ea1f5ad2f33a528261ae Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 17 Feb 2009 01:29:40 +0000
Subject: [CIFS] Fix oops in cifs_strfromUCS_le mounting to servers which do
 not specify their OS

Fixes kernel bug #10451 http://bugzilla.kernel.org/show_bug.cgi?id=10451

Certain NAS appliances do not set the operating system or network operating system
fields in the session setup response on the wire.  cifs was oopsing on the unexpected
zero length response fields (when trying to null terminate a zero length field).

This fixes the oops.

Acked-by: Jeff Layton <jlayton@redhat.com>
CC: stable <stable@kernel.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES | 3 ++-
 fs/cifs/sess.c  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 73ac7ebd1dfc..1cfa72ef1f37 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -7,7 +7,8 @@ specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
 Samba servers (worked to Windows).  Fix rmdir so that pending search
 (readdir) requests do not get invalid results which include the now
-removed directory.
+removed directory.  Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
+when using DFS.
 
 Version 1.55
 ------------
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7b79a9..b234407a3007 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -228,7 +228,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
 
 	kfree(ses->serverOS);
 	/* UTF-8 string will not grow more than four times as big as UCS-16 */
-	ses->serverOS = kzalloc(4 * len, GFP_KERNEL);
+	ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
 	if (ses->serverOS != NULL)
 		cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
 	data += 2 * (len + 1);
@@ -241,7 +241,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
 		return rc;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */
+	ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
 	if (ses->serverNOS != NULL) {
 		cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
 				   nls_cp);
-- 
cgit v1.2.3


From c3b2a0c640bff7df85d79fb4f89674949a267ec2 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 20 Feb 2009 04:32:45 +0000
Subject: [CIFS] improve posix semantics of file create

Samba server added support for a new posix open/create/mkdir operation
a year or so ago, and we added support to cifs for mkdir to use it,
but had not added the corresponding code to file create.

The following patch helps improve the performance of the cifs create
path (to Samba and servers which support the cifs posix protocol
extensions).  Using Connectathon basic test1, with 2000 files, the
performance improved about 15%, and also helped reduce network traffic
(17% fewer SMBs sent over the wire) due to saving a network round trip
for the SetPathInfo on every file create.

It should also help the semantics (and probably the performance) of
write (e.g. when posix byte range locks are on the file) on file
handles opened with posix create, and adds support for a few flags
which would have to be ignored otherwise.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES |   4 +-
 fs/cifs/dir.c   | 307 +++++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 208 insertions(+), 103 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 1cfa72ef1f37..72063f5e56b1 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -8,7 +8,9 @@ top of the share.  Fix problem in 2.6.28 resolving DFS paths to
 Samba servers (worked to Windows).  Fix rmdir so that pending search
 (readdir) requests do not get invalid results which include the now
 removed directory.  Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
-when using DFS.
+when using DFS.  Add better file create support to servers which support
+the CIFS POSIX protocol extensions (this adds support for new flags
+on create, and improves semantics for write of locked ranges).
 
 Version 1.55
 ------------
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 964aad03c5ad..89fb72832652 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with dentries
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,78 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+static int cifs_posix_open(char *full_path, struct inode **pinode,
+		    struct super_block *sb, int mode, int oflags,
+		    int *poplock, __u16 *pnetfid, int xid)
+{
+	int rc;
+	__u32 oplock;
+	FILE_UNIX_BASIC_INFO *presp_data;
+	__u32 posix_flags = 0;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	cFYI(1, ("posix open %s", full_path));
+
+	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (presp_data == NULL)
+		return -ENOMEM;
+
+/* So far cifs posix extensions can only map the following flags.
+   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
+   so far we do not seem to need them, and we can treat them as local only */
+	if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
+		(FMODE_READ | FMODE_WRITE))
+		posix_flags = SMB_O_RDWR;
+	else if (oflags & FMODE_READ)
+		posix_flags = SMB_O_RDONLY;
+	else if (oflags & FMODE_WRITE)
+		posix_flags = SMB_O_WRONLY;
+	if (oflags & O_CREAT)
+		posix_flags |= SMB_O_CREAT;
+	if (oflags & O_EXCL)
+		posix_flags |= SMB_O_EXCL;
+	if (oflags & O_TRUNC)
+		posix_flags |= SMB_O_TRUNC;
+	if (oflags & O_APPEND)
+		posix_flags |= SMB_O_APPEND;
+	if (oflags & O_SYNC)
+		posix_flags |= SMB_O_SYNC;
+	if (oflags & O_DIRECTORY)
+		posix_flags |= SMB_O_DIRECTORY;
+	if (oflags & O_NOFOLLOW)
+		posix_flags |= SMB_O_NOFOLLOW;
+	if (oflags & O_DIRECT)
+		posix_flags |= SMB_O_DIRECT;
+
+
+	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
+			pnetfid, presp_data, &oplock, full_path,
+			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc)
+		goto posix_open_ret;
+
+	if (presp_data->Type == cpu_to_le32(-1))
+		goto posix_open_ret; /* open ok, caller does qpathinfo */
+
+	/* get new inode and set it up */
+	if (!pinode)
+		goto posix_open_ret; /* caller does not need info */
+
+	*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+
+	/* We do not need to close the file if new_inode fails since
+	   the caller will retry qpathinfo as long as inode is null */
+	if (*pinode == NULL)
+		goto posix_open_ret;
+
+	posix_fill_in_inode(*pinode, presp_data, 1);
+
+posix_open_ret:
+	kfree(presp_data);
+	return rc;
+}
+
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct dentry *direntry,
 			      struct inode *newinode)
@@ -150,7 +222,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int xid;
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
-	/* BB below access is too much for the mknod to request */
+	int oflags;
+	/*
+	 * BB below access is probably too much for mknod to request
+	 *    but we have to do query and setpathinfo so requesting
+	 *    less could fail (unless we want to request getatr and setatr
+	 *    permissions (only).  At least for POSIX we do not have to
+	 *    request so much.
+	 */
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
@@ -174,13 +253,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	}
 
 	mode &= ~current->fs->umask;
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
 
-	if (nd && (nd->flags & LOOKUP_OPEN)) {
-		int oflags = nd->intent.open.flags;
+	if (nd && (nd->flags & LOOKUP_OPEN))
+		oflags = nd->intent.open.flags;
+	else
+		oflags = FMODE_READ;
+
+	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+				     mode, oflags, &oplock, &fileHandle, xid);
+		/* EIO could indicate that (posix open) operation is not
+		   supported, despite what server claimed in capability
+		   negotation.  EREMOTE indicates DFS junction, which is not
+		   handled in posix open */
+
+		if ((rc == 0) && (newinode == NULL))
+			goto cifs_create_get_file_info; /* query inode info */
+		else if (rc == 0) /* success, no need to query */
+			goto cifs_create_set_dentry;
+		else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto cifs_create_out;
+		/* else fallthrough to retry, using older open call, this is
+		   case where server does not support this SMB level, and
+		   falsely claims capability (also get here for DFS case
+		   which should be rare for path not covered on files) */
+	}
 
+	if (nd && (nd->flags & LOOKUP_OPEN)) {
+		/* if the file is going to stay open, then we
+		   need to set the desired access properly */
 		desiredAccess = 0;
 		if (oflags & FMODE_READ)
-			desiredAccess |= GENERIC_READ;
+			desiredAccess |= GENERIC_READ; /* is this too little? */
 		if (oflags & FMODE_WRITE) {
 			desiredAccess |= GENERIC_WRITE;
 			if (!(oflags & FMODE_READ))
@@ -199,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	/* BB add processing to set equivalent of mode - e.g. via CreateX with
 	   ACLs */
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
@@ -233,116 +340,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	}
 	if (rc) {
 		cFYI(1, ("cifs_create returned 0x%x", rc));
-	} else {
-		/* If Open reported that we actually created a file
-		then we now have to set the mode if possible */
-		if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
-			struct cifs_unix_set_info_args args = {
+		goto cifs_create_out;
+	}
+
+	/* If Open reported that we actually created a file
+	   then we now have to set the mode if possible */
+	if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+		struct cifs_unix_set_info_args args = {
 				.mode	= mode,
 				.ctime	= NO_CHANGE_64,
 				.atime	= NO_CHANGE_64,
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
-			};
+		};
 
-			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-				args.uid = (__u64) current_fsuid();
-				if (inode->i_mode & S_ISGID)
-					args.gid = (__u64) inode->i_gid;
-				else
-					args.gid = (__u64) current_fsgid();
-			} else {
-				args.uid = NO_CHANGE_64;
-				args.gid = NO_CHANGE_64;
-			}
-			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
-				cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = (__u64) current_fsuid();
+			if (inode->i_mode & S_ISGID)
+				args.gid = (__u64) inode->i_gid;
+			else
+				args.gid = (__u64) current_fsgid();
 		} else {
-			/* BB implement mode setting via Windows security
-			   descriptors e.g. */
-			/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-
-			/* Could set r/o dos attribute if mode & 0222 == 0 */
+			args.uid = NO_CHANGE_64;
+			args.gid = NO_CHANGE_64;
 		}
+		CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+			cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	} else {
+		/* BB implement mode setting via Windows security
+		   descriptors e.g. */
+		/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
 
-		/* server might mask mode so we have to query for it */
-		if (tcon->unix_ext)
-			rc = cifs_get_inode_info_unix(&newinode, full_path,
-						 inode->i_sb, xid);
-		else {
-			rc = cifs_get_inode_info(&newinode, full_path,
-						 buf, inode->i_sb, xid,
-						 &fileHandle);
-			if (newinode) {
-				if (cifs_sb->mnt_cifs_flags &
-				    CIFS_MOUNT_DYNPERM)
-					newinode->i_mode = mode;
-				if ((oplock & CIFS_CREATE_ACTION) &&
-				    (cifs_sb->mnt_cifs_flags &
-				     CIFS_MOUNT_SET_UID)) {
-					newinode->i_uid = current_fsuid();
-					if (inode->i_mode & S_ISGID)
-						newinode->i_gid =
-							inode->i_gid;
-					else
-						newinode->i_gid =
-							current_fsgid();
-				}
+		/* Could set r/o dos attribute if mode & 0222 == 0 */
+	}
+
+cifs_create_get_file_info:
+	/* server might mask mode so we have to query for it */
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
+					      inode->i_sb, xid);
+	else {
+		rc = cifs_get_inode_info(&newinode, full_path, buf,
+					 inode->i_sb, xid, &fileHandle);
+		if (newinode) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+				newinode->i_mode = mode;
+			if ((oplock & CIFS_CREATE_ACTION) &&
+			    (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+				newinode->i_uid = current_fsuid();
+				if (inode->i_mode & S_ISGID)
+					newinode->i_gid = inode->i_gid;
+				else
+					newinode->i_gid = current_fsgid();
 			}
 		}
+	}
 
-		if (rc != 0) {
-			cFYI(1, ("Create worked, get_inode_info failed rc = %d",
-				 rc));
-		} else
-			setup_cifs_dentry(tcon, direntry, newinode);
-
-		if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
-			(!(nd->flags & LOOKUP_OPEN))) {
-			/* mknod case - do not leave file open */
-			CIFSSMBClose(xid, tcon, fileHandle);
-		} else if (newinode) {
-			struct cifsFileInfo *pCifsFile =
-			   kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
-			if (pCifsFile == NULL)
-				goto cifs_create_out;
-			pCifsFile->netfid = fileHandle;
-			pCifsFile->pid = current->tgid;
-			pCifsFile->pInode = newinode;
-			pCifsFile->invalidHandle = false;
-			pCifsFile->closePend     = false;
-			init_MUTEX(&pCifsFile->fh_sem);
-			mutex_init(&pCifsFile->lock_mutex);
-			INIT_LIST_HEAD(&pCifsFile->llist);
-			atomic_set(&pCifsFile->wrtPending, 0);
-
-			/* set the following in open now
+cifs_create_set_dentry:
+	if (rc == 0)
+		setup_cifs_dentry(tcon, direntry, newinode);
+	else
+		cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
+
+	/* nfsd case - nfs srv does not set nd */
+	if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
+		/* mknod case - do not leave file open */
+		CIFSSMBClose(xid, tcon, fileHandle);
+	} else if (newinode) {
+		struct cifsFileInfo *pCifsFile =
+			kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+		if (pCifsFile == NULL)
+			goto cifs_create_out;
+		pCifsFile->netfid = fileHandle;
+		pCifsFile->pid = current->tgid;
+		pCifsFile->pInode = newinode;
+		pCifsFile->invalidHandle = false;
+		pCifsFile->closePend     = false;
+		init_MUTEX(&pCifsFile->fh_sem);
+		mutex_init(&pCifsFile->lock_mutex);
+		INIT_LIST_HEAD(&pCifsFile->llist);
+		atomic_set(&pCifsFile->wrtPending, 0);
+
+		/* set the following in open now
 				pCifsFile->pfile = file; */
-			write_lock(&GlobalSMBSeslock);
-			list_add(&pCifsFile->tlist, &tcon->openFileList);
-			pCifsInode = CIFS_I(newinode);
-			if (pCifsInode) {
-				/* if readable file instance put first in list*/
-				if (write_only) {
-					list_add_tail(&pCifsFile->flist,
-						&pCifsInode->openFileList);
-				} else {
-					list_add(&pCifsFile->flist,
-						&pCifsInode->openFileList);
-				}
-				if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-					pCifsInode->clientCanCacheAll = true;
-					pCifsInode->clientCanCacheRead = true;
-					cFYI(1, ("Exclusive Oplock inode %p",
-						newinode));
-				} else if ((oplock & 0xF) == OPLOCK_READ)
-					pCifsInode->clientCanCacheRead = true;
+		write_lock(&GlobalSMBSeslock);
+		list_add(&pCifsFile->tlist, &tcon->openFileList);
+		pCifsInode = CIFS_I(newinode);
+		if (pCifsInode) {
+			/* if readable file instance put first in list*/
+			if (write_only) {
+				list_add_tail(&pCifsFile->flist,
+					      &pCifsInode->openFileList);
+			} else {
+				list_add(&pCifsFile->flist,
+					 &pCifsInode->openFileList);
 			}
-			write_unlock(&GlobalSMBSeslock);
+			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+				pCifsInode->clientCanCacheAll = true;
+				pCifsInode->clientCanCacheRead = true;
+				cFYI(1, ("Exclusive Oplock inode %p",
+					newinode));
+			} else if ((oplock & 0xF) == OPLOCK_READ)
+				pCifsInode->clientCanCacheRead = true;
 		}
+		write_unlock(&GlobalSMBSeslock);
 	}
 cifs_create_out:
 	kfree(buf);
-- 
cgit v1.2.3


From eca6acf91552a9b2e997cc76339115c95eac0217 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 20 Feb 2009 05:43:09 +0000
Subject: [CIFS] Fix multiuser mounts so server does not invalidate earlier
 security contexts

When two different users mount the same Windows 2003 Server share using CIFS,
the first session mounted can be invalidated.  Some servers invalidate the first
smb session when a second similar user (e.g. two users who get mapped by server to "guest")
authenticates an smb session from the same client.

Setting the 2nd and subsequent vc numbers to nonzero values ensures that we
will not have this problem.

Fixes Samba bug 6004, problem description follows:
How to reproduce:

- configure an "open share" (full permissions to Guest user) on Windows 2003
Server (I couldn't reproduce the problem with Samba server or Windows older
than 2003)
- mount the share twice with different users who will be authenticated as guest.

 noacl,noperm,user=john,dir_mode=0700,domain=DOMAIN,rw
 noacl,noperm,user=jeff,dir_mode=0700,domain=DOMAIN,rw

Result:

- just the mount point mounted last is accessible:

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES    | 10 +++++++
 fs/cifs/cifsfs.h   |  2 +-
 fs/cifs/cifsglob.h |  6 +++-
 fs/cifs/cifssmb.c  |  7 +++--
 fs/cifs/sess.c     | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 105 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 72063f5e56b1..851388fafc73 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session.  This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts.
+
 Version 1.56
 ------------
 Add "forcemandatorylock" mount option to allow user to use mandatory
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac481841f87..2b1d28a9ee28 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.56"
+#define CIFS_VERSION   "1.57"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0ec953..e004f6db5fc8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
 	/* multiplexed reads or writes */
 	unsigned int maxBuf;	/* maxBuf specifies the maximum */
 	/* message size the server can send or receive for non-raw SMBs */
-	unsigned int maxRw;	/* maxRw specifies the maximum */
+	unsigned int max_rw;	/* max_rw specifies the maximum */
 	/* message size the server can send or receive for */
 	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
+	unsigned int max_vcs;	/* maximum number of smb sessions, at least
+				   those that can be specified uniquely with
+				   vcnumbers */
 	char sessid[4];		/* unique token id for this session */
 	/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
 	__u16 flags;
+	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a507c4..939e2f76b959 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
 		server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
 				(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 		GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
 		/* even though we do not use raw we might as well set this
 		accurately, in case we ever find a need for it */
 		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-			server->maxRw = 0xFF00;
+			server->max_rw = 0xFF00;
 			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
 		} else {
-			server->maxRw = 0;/* we do not need to use raw anyway */
+			server->max_rw = 0;/* do not need to use raw anyway */
 			server->capabilities = CAP_MPX_MODE;
 		}
 		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-	server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
 	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index b234407a3007..5c68b4282be9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
+/* Returns true when every smb session on this server's smb_ses_list still
+   has need_reconnect set, ie none is connected yet (so vc 0 can be used).
+   Called while holding the cifs_tcp_ses_lock, so do not block */
+static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
+{
+	struct list_head *tmp;
+	struct cifsSesInfo *tmp_ses;
+
+	list_for_each(tmp, &ses->server->smb_ses_list) {
+		tmp_ses = list_entry(tmp, struct cifsSesInfo,
+				     smb_ses_list);
+		if (tmp_ses->need_reconnect == false)
+			return false;
+	}
+	/* no session on the list has need_reconnect cleared, so
+	   this must be the first one we are reconnecting */
+	return true;
+}
+
+/*
+ *	vc number 0 is treated specially by some servers, and should be the
+ *	first one we request.  After that we can use vcnumbers up to maxvcs,
+ *	one for each smb session (some Windows versions set maxvcs incorrectly
+ *	so maxvc=1 can be ignored).  If we have too many vcs, we reuse vc 1,
+ *	not zero (some servers reset the connection on vcnum zero).  Returns
+ *	the chosen vcnum in little-endian (wire) order.
+ */
+static __le16 get_next_vcnum(struct cifsSesInfo *ses)
+{
+	__u16 vcnum = 0;
+	struct list_head *tmp;
+	struct cifsSesInfo *tmp_ses;
+	__u16 max_vcs = ses->server->max_vcs;
+	__u16 i;
+	int free_vc_found = 0;
+
+	/* Quoting the MS-SMB specification: "Windows-based SMB servers set this
+	field to one but do not enforce this limit, which allows an SMB client
+	to establish more virtual circuits than allowed by this value ... but
+	other server implementations can enforce this limit." */
+	if (max_vcs < 2)
+		max_vcs = 0xFFFF;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
+		goto get_vc_num_exit;  /* vcnum will be zero */
+	for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
+		if (i == 0) /* this is the only connection, use vc 0 */
+			break;
+
+		free_vc_found = 1;
+
+		list_for_each(tmp, &ses->server->smb_ses_list) {
+			tmp_ses = list_entry(tmp, struct cifsSesInfo,
+					     smb_ses_list);
+			if (tmp_ses->vcnum == i) {
+				free_vc_found = 0;
+				break; /* found duplicate, try next vcnum */
+			}
+		}
+		if (free_vc_found)
+			break; /* we found a vcnumber that will work - use it */
+	}
+
+	if (i == 0)
+		vcnum = 0; /* for most common case, ie if one smb session, use
+			      vc zero (the loop above stops at i == 0 only
+			      when srv_count is 1) */
+	else if (free_vc_found == 0)
+		vcnum = 1;  /* we can not reuse vc=0 safely, since some servers
+				reset all uids on that, but 1 is ok. */
+	else
+		vcnum = i;
+	ses->vcnum = vcnum;
+get_vc_num_exit:
+	write_unlock(&cifs_tcp_ses_lock);
+
+	return cpu_to_le16(vcnum); /* host order -> wire (little endian) */
+}
+
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 {
 	__u32 capabilities = 0;
 
 	/* init fields common to all four types of SessSetup */
-	/* note that header is initialized to zero in header_assemble */
+	/* Note that offsets for first seven fields in req struct are same  */
+	/*	in CIFS Specs so does not matter which of 3 forms of struct */
+	/*	that we use in next few lines                               */
+	/* Note that header is initialized to zero in header_assemble */
 	pSMB->req.AndXCommand = 0xFF;
 	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
 	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+	pSMB->req.VcNumber = get_next_vcnum(ses);
 
 	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
 
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 	if (ses->capabilities & CAP_UNIX)
 		capabilities |= CAP_UNIX;
 
-	/* BB check whether to init vcnum BB */
 	return capabilities;
 }
 
-- 
cgit v1.2.3


From 4c41bd0ec953954158f92bed5d3062645062b98e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Feb 2009 21:29:31 +0100
Subject: [JFFS2] fix mount crash caused by removed nodes

At scan time we observed following scenario:

   node A inserted
   node B inserted
   node C inserted -> sets overlapped flag on node B

   node A is removed due to CRC failure -> overlapped flag on node B remains

   while (tn->overlapped)
   	 tn = tn_prev(tn);

   ==> crash, when tn_prev(B) is referenced.

When the ultimate node is removed at scan time and the overlapped flag
is set on the penultimate node, then nothing updates the overlapped
flag of that node. The overlapped iterators blindly expect that the
ultimate node does not have the overlapped flag set, which causes the
scan code to crash.

It would be a huge overhead to go through the node chain on node
removal and fix up the overlapped flags, so detecting such a case on
the fly in the overlapped iterators is a simpler and reliable
solution.

Cc: stable@kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/readinode.c | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 6ca08ad887c0..1fc1e92356ee 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 				struct jffs2_tmp_dnode_info *tn)
 {
 	uint32_t fn_end = tn->fn->ofs + tn->fn->size;
-	struct jffs2_tmp_dnode_info *this;
+	struct jffs2_tmp_dnode_info *this, *ptn;
 
 	dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
 
@@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 	if (this) {
 		/* If the node is coincident with another at a lower address,
 		   back up until the other node is found. It may be relevant */
-		while (this->overlapped)
-			this = tn_prev(this);
-
-		/* First node should never be marked overlapped */
-		BUG_ON(!this);
+		while (this->overlapped) {
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
+		}
 		dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
 	}
 
@@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 			}
 			if (!this->overlapped)
 				break;
-			this = tn_prev(this);
+
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
 		}
 	}
 
@@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
 		eat_last(&rii->tn_root, &last->rb);
 		ver_insert(&ver_root, last);
 
-		if (unlikely(last->overlapped))
-			continue;
+		if (unlikely(last->overlapped)) {
+			if (pen)
+				continue;
+			/*
+			 * We killed a node which set the overlapped
+			 * flags during the scan. Fix it up.
+			 */
+			last->overlapped = 0;
+		}
 
 		/* Now we have a bunch of nodes in reverse version
 		   order, in the tree at ver_root. Most of the time,
-- 
cgit v1.2.3


From 05bf9e839d9de4e8a094274a0a2fd07beb47eaf1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 21 Feb 2009 12:13:24 -0500
Subject: ext4: Add fallback for find_group_flex

This is a workaround for find_group_flex() which badly needs to be
replaced.  One of its problems (besides ignoring the Orlov algorithm)
is that it is a bit hyperactive about returning failure under
suspicious circumstances.  This can lead to spurious ENOSPC failures
even when there are inodes still available.

Work around this for now by retrying the search using
find_group_other() if find_group_flex() returns -1.  If
find_group_other() succeeds when find_group_flex() has failed, log a
warning message.

A better block/inode allocator that will fix this problem for real has
been queued up for the next merge window.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4fb86a0061d0..f18a919be70b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -715,6 +715,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 
 	if (sbi->s_log_groups_per_flex) {
 		ret2 = find_group_flex(sb, dir, &group);
+		if (ret2 == -1) {
+			ret2 = find_group_other(sb, dir, &group);
+			if (ret2 == 0 && printk_ratelimit())
+				printk(KERN_NOTICE "ext4: find_group_flex "
+				       "failed, fallback succeeded dir %lu\n",
+				       dir->i_ino);
+		}
 		goto got_group;
 	}
 
-- 
cgit v1.2.3


From ebd3610b110bbb18ea6f9f2aeed1e1068c537227 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 22 Feb 2009 21:09:59 -0500
Subject: ext4: Fix deadlock in ext4_write_begin() and ext4_da_write_begin()

Functions ext4_write_begin() and ext4_da_write_begin() call
grab_cache_page_write_begin() without AOP_FLAG_NOFS. Thus it
can happen that page reclaim is triggered in that function
and it recurses back into the filesystem (or some other filesystem).
But this can lead to various problems as a transaction is already
started at that point. Add the necessary flag.

http://bugzilla.kernel.org/show_bug.cgi?id=11688

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cbd2ca99d113..51cdd13e1c31 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1368,6 +1368,10 @@ retry:
 		goto out;
 	}
 
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
@@ -1377,7 +1381,7 @@ retry:
 	*pagep = page;
 
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext4_get_block);
+				ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -2667,6 +2671,9 @@ retry:
 		ret = PTR_ERR(handle);
 		goto out;
 	}
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
-- 
cgit v1.2.3


From cac711211a039ae2e2dc6322ffb3c2279d093bf1 Mon Sep 17 00:00:00 2001
From: Krzysztof Sachanowicz <analyzer1@gmail.com>
Date: Mon, 23 Feb 2009 22:21:55 +0100
Subject: proc: proc_get_inode should de_put when inode already initialized

de_get is called before every proc_get_inode, but corresponding de_put is
called only when dropping last reference to an inode. This might cause
something like
remove_proc_entry: /proc/stats busy, count=14496
to be printed to the syslog.

The fix is to call de_put in case of an already initialized inode in
proc_get_inode.

Signed-off-by: Krzysztof Sachanowicz <analyzer1@gmail.com>
Tested-by: Marcin Pilipczuk <marcin.pilipczuk@gmail.com>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9b3ad6..d8bb5c671f42 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 			}
 		}
 		unlock_new_inode(inode);
-	} else
+	} else {
 	       module_put(de->owner);
+	       de_put(de);
+	}
 	return inode;
 
 out_ino:
-- 
cgit v1.2.3


From e07a4b9217d1e97d2f3a62b6b070efdc61212110 Mon Sep 17 00:00:00 2001
From: Helge Bahmann <helge.bahmann@secunet.com>
Date: Fri, 20 Feb 2009 16:24:12 +0300
Subject: proc: fix PG_locked reporting in /proc/kpageflags

Expr always evaluates to zero.

Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/page.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 767d95a6d1b1..2d1345112a42 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 		else
 			kflags = ppage->flags;
 
-		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+		uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
 			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
 			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
 			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
-- 
cgit v1.2.3


From 8b1a8ff8b321a9384304aeea4dbdb9747daf7ee8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 28 Feb 2009 00:08:53 -0500
Subject: ext4: Remove duplicate call to ext4_commit_super() in ext4_freeze()

Commit c4be0c1d added error checking to ext4_freeze() when calling
ext4_commit_super().  Unfortunately the patch failed to remove the
original call to ext4_commit_super(), with the net result that when
freezing the filesystem, the superblock gets written twice, the first
time without error checking.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a5732c58f676..39d1993cfa13 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3091,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb)
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
 		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
 		if (error)
 			goto out;
-- 
cgit v1.2.3


From d8ae4601a4b7ea1fa17fa395c3468c0e144d1275 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 28 Feb 2009 09:50:01 -0500
Subject: ext4: Reorder fs/Makefile so that ext2 root fs's are mounted using
 ext2

In fs/Makefile, ext3 was placed before ext2 so that a root filesystem
that possessed a journal would be mounted as ext3 instead of ext2.
This was necessary because a cleanly unmounted ext3 filesystem was
fully backwards compatible with ext2, and could be mounted by ext2 ---
but it was desirable that it be mounted with ext3 so that the
journaling would be enabled.

The ext4 filesystem supports new incompatible features, so there is no
danger of an ext4 filesystem being mistaken for an ext2 filesystem.
At that point, the relative ordering of ext4 with respect to ext2
didn't matter until ext4 gained the ability to mount filesystems
without a journal starting in 2.6.29-rc1.  Now that this is the case,
given that ext4 is before ext2, it means that root filesystems that
were using the plain-jane ext2 format are getting mounted using the
ext4 filesystem driver, which is a change in behavior which could be
surprising to users.

It's doubtful that there are that many ext2-only root filesystem users
that would also have ext4 compiled into the kernel, but to adhere to
the principle of least surprise, the correct ordering in fs/Makefile
is ext3, followed by ext2, and finally ext4.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/Makefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index 38bc735c67ad..dc20db348679 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,10 +69,12 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4
+obj-$(CONFIG_EXT2_FS)		+= ext2/
+# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
+# unless explicitly requested by rootfstype
+obj-$(CONFIG_EXT4_FS)		+= ext4/
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
-obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
-- 
cgit v1.2.3


From 8f64b32eb73fbfe9f38c4123121b63ee409278a7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 26 Feb 2009 00:57:35 -0500
Subject: ext4: don't call jbd2_journal_force_commit_nested without journal

Running without a journal, I oopsed when I ran out of space,
because we called jbd2_journal_force_commit_nested() from
ext4_should_retry_alloc() without a journal.

This should take care of it, I think.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 4 +++-
 fs/ext4/inode.c  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9a50b8052dcf..de9459b4cb94 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	    (*retries)++ > 3 ||
+	    !EXT4_SB(sb)->s_journal)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 51cdd13e1c31..c7fed5b18745 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2544,7 +2544,7 @@ retry:
 
 		ext4_journal_stop(handle);
 
-		if (mpd.retval == -ENOSPC) {
+		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
 			/* commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
-- 
cgit v1.2.3


From b2bf96833c5782befc3e7700f791fde754a47b01 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Feb 2009 08:50:26 +0100
Subject: block: fix bogus gcc warning for uninitialized var usage

Newer gcc versions throw this warning:

        fs/bio.c: In function 'bio_alloc_bioset':
        fs/bio.c:305: warning: 'p' may be used uninitialized in this function

since it cannot figure out that 'p' is only ever used if 'bs' is non-NULL.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 72ab251cdb9c..124b95c4d582 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,7 +302,7 @@ void bio_init(struct bio *bio)
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
 	struct bio *bio = NULL;
-	void *p;
+	void *uninitialized_var(p);
 
 	if (bs) {
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
-- 
cgit v1.2.3


From 47be12e4eec84c1846f29af64fe25a396b57a026 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 9 Jan 2009 07:32:48 +0800
Subject: ocfs2: Access and dirty the buffer_head in mark_written.

In __ocfs2_mark_extent_written, when c_split_covers_rec is set, the
old code just replaces the extent record and forgets to access and
dirty the buffer_head. This causes a problem when the unwritten
extent is in an extent block. So access and dirty it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 60fe74035db5..3a9e5deed74d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4796,6 +4796,29 @@ out:
 	return ret;
 }
 
+static int ocfs2_replace_extent_rec(struct inode *inode,
+				    handle_t *handle,
+				    struct ocfs2_path *path,
+				    struct ocfs2_extent_list *el,
+				    int split_index,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	int ret;
+
+	ret = ocfs2_path_bh_journal_access(handle, inode, path,
+					   path_num_items(path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el->l_recs[split_index] = *split_rec;
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+out:
+	return ret;
+}
+
 /*
  * Mark part or all of the extent record at split_index in the leaf
  * pointed to by path as written. This removes the unwritten
@@ -4885,7 +4908,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 
 	if (ctxt.c_contig_type == CONTIG_NONE) {
 		if (ctxt.c_split_covers_rec)
-			el->l_recs[split_index] = *split_rec;
+			ret = ocfs2_replace_extent_rec(inode, handle,
+						       path, el,
+						       split_index, split_rec);
 		else
 			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
-- 
cgit v1.2.3


From 7dc102b737e9f49dac426161294cb2d326a97d8e Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 3 Feb 2009 12:37:13 -0800
Subject: ocfs2/dlm: Retract fix for race between purge and migrate

Mainline commit d4f7e650e55af6b235871126f747da88600e8040 attempts to delay
the dlm_thread from sending the drop ref message if the lockres is being
migrated. The problem is that we make the dlm_thread wait for the migration
to complete. This causes a deadlock as dlm_thread also participates in the
lockres migration process.

A better fix for the original oss bugzilla#1012 is in testing.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmthread.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d1295203029f..4060bb328bc8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 		spin_lock(&res->spinlock);
 		/* This ensures that clear refmap is sent after the set */
-		__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
-						  DLM_LOCK_RES_MIGRATING));
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 		spin_unlock(&res->spinlock);
 
 		/* clear our bit from the master's refmap, ignore errors */
-- 
cgit v1.2.3


From c74ff8bb2235d848beb67fcfddae71ecbe3f92b1 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 3 Feb 2009 12:37:14 -0800
Subject: ocfs2: Cleanup the lockname print in dlmglue.c

The dentry lock has a different format than other locks. This patch fixes
ocfs2_log_dlm_error() macro to make it print the dentry lock correctly.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 206a2370876a..7219a86d34cc 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert);
-#define ocfs2_log_dlm_error(_func, _err, _lockres) do {			\
-	mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
-	     _err, _func, _lockres->l_name);				\
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {					\
+	if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY)				\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n",	\
+		     _err, _func, _lockres->l_name);					\
+	else										\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n",	\
+		     _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name,	\
+		     (unsigned int)ocfs2_get_dentry_lock_ino(_lockres));		\
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
-- 
cgit v1.2.3


From dabc47de7a23f57522dc762d9d2ad875700d3497 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 3 Feb 2009 12:37:15 -0800
Subject: ocfs2/dlm: Use ast_lock to protect ast_list

The code was using dlm->spinlock instead of dlm->ast_lock to protect the
ast_list. This patch fixes the issue.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmunlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 86ca085ef324..fcf879ed6930 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	else
 		BUG_ON(res->owner == dlm->node_num);
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->ast_lock);
 	/* We want to be sure that we're not freeing a lock
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
-	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm->ast_lock);
 	if (in_use) {
 	       mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		    "while waiting for an ast!", res->lockname.len,
-- 
cgit v1.2.3


From 53ecd25e148615e0ed2a72635cc76f4773f97f90 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 3 Feb 2009 12:37:16 -0800
Subject: ocfs2/dlm: Make dlm_assert_master_handler() kill itself instead of
 the asserter

In dlm_assert_master_handler(), if we get an incorrect assert master from a
node, we reply with EINVAL asking the asserter to die. The problem is that an
assert is sent only after so many hoops that it is invariably the node that
thinks the asserter is wrong that is actually wrong. So instead of killing the
asserter, this patch kills the assertee.

This patch papers over a race that is still being addressed.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 54e182a27caf..0a2813947853 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 		if (!mle) {
 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
 			    res->owner != assert->node_idx) {
-				mlog(ML_ERROR, "assert_master from "
-					  "%u, but current owner is "
-					  "%u! (%.*s)\n",
-				       assert->node_idx, res->owner,
-				       namelen, name);
-				goto kill;
+				mlog(ML_ERROR, "DIE! Mastery assert from %u, "
+				     "but current owner is %u! (%.*s)\n",
+				     assert->node_idx, res->owner, namelen,
+				     name);
+				__dlm_print_one_lock_resource(res);
+				BUG();
 			}
 		} else if (mle->type != DLM_MLE_MIGRATION) {
 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
-- 
cgit v1.2.3


From 89a907afe073b8971a83d0ad54f391542b64d327 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 17 Feb 2009 04:39:28 +0800
Subject: ocfs2: Use the right access_* method in ctime update of xattr.

When updating the ctime for an xattr change, the code uses the wrong type
of journal access for the inode, so use ocfs2_journal_access_di instead.

Reported-and-Tested-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 915039fffe6e..e3933158e1d7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2592,8 +2592,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 
 	if (!ret) {
 		/* Update inode ctime. */
-		ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_journal_access_di(ctxt->handle, inode,
+					      xis->inode_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
-- 
cgit v1.2.3


From c8b9cf9a7cd25ba65166116d0a958f0bc709f0a7 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 24 Feb 2009 17:40:26 -0800
Subject: ocfs2: lock the metaecc process for xattr bucket

For other metadata in ocfs2, metaecc is checked in ocfs2_read_blocks
with io_mutex held. For an xattr bucket, however, it is calculated over
the whole bucket. So we have to add a spin_lock to prevent multiple
processes from calculating metaecc at the same time.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Tested-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h | 3 +++
 fs/ocfs2/super.c | 1 +
 fs/ocfs2/xattr.c | 4 ++++
 3 files changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 077384135f4e..946d3c34b90b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -341,6 +341,9 @@ struct ocfs2_super
 	struct ocfs2_node_map		osb_recovering_orphan_dirs;
 	unsigned int			*osb_orphan_wipes;
 	wait_queue_head_t		osb_wipe_event;
+
+	/* used to protect metaecc calculation check of xattr. */
+	spinlock_t osb_xattr_lock;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b1cb38fbe807..1c3acc4654d8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1747,6 +1747,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_LIST_HEAD(&osb->blocked_lock_list);
 	osb->blocked_lock_count = 0;
 	spin_lock_init(&osb->osb_lock);
+	spin_lock_init(&osb->osb_xattr_lock);
 	ocfs2_init_inode_steal_slot(osb);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e3933158e1d7..a7c167905c56 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -274,10 +274,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 			       bucket->bu_blocks, bucket->bu_bhs, 0,
 			       NULL);
 	if (!rc) {
+		spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 		rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
 						 bucket->bu_bhs,
 						 bucket->bu_blocks,
 						 &bucket_xh(bucket)->xh_check);
+		spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 		if (rc)
 			mlog_errno(rc);
 	}
@@ -310,9 +312,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
 {
 	int i;
 
+	spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 	ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
 				   bucket->bu_bhs, bucket->bu_blocks,
 				   &bucket_xh(bucket)->xh_check);
+	spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 
 	for (i = 0; i < bucket->bu_blocks; i++)
 		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
-- 
cgit v1.2.3


From 4442f518269c6b3686fcbcadad22dc4475309b16 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 20 Feb 2009 11:11:50 +0800
Subject: ocfs2: set gap to separate entry and value when xattr in bucket

This patch sets a gap (4 bytes) between the xattr entries and the
name/value area when the xattr is in a bucket. This gap is used to
separate the entries from the name/value area when a bucket is full.
It was already being set when the xattr is in an inode/block.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a7c167905c56..4ddd788add67 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt {
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_XATTR_HEADER_GAP	4
 #define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
 					 - sizeof(struct ocfs2_xattr_header) \
-					 - sizeof(__u32))
+					 - OCFS2_XATTR_HEADER_GAP)
 #define OCFS2_XATTR_FREE_IN_BLOCK(ptr)	((ptr)->i_sb->s_blocksize \
 					 - sizeof(struct ocfs2_xattr_block) \
 					 - sizeof(struct ocfs2_xattr_header) \
-					 - sizeof(__u32))
+					 - OCFS2_XATTR_HEADER_GAP)
 
 static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
@@ -1511,7 +1512,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		last += 1;
 	}
 
-	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
 	if (free < 0)
 		return -EIO;
 
@@ -2194,7 +2195,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 		last += 1;
 	}
 
-	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
 	if (free < 0)
 		return 0;
 
@@ -5065,8 +5066,8 @@ try_again:
 	xh_free_start = le16_to_cpu(xh->xh_free_start);
 	header_size = sizeof(struct ocfs2_xattr_header) +
 			count * sizeof(struct ocfs2_xattr_entry);
-	max_free = OCFS2_XATTR_BUCKET_SIZE -
-		le16_to_cpu(xh->xh_name_value_len) - header_size;
+	max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
+		le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
@@ -5099,7 +5100,7 @@ try_again:
 			need = 0;
 	}
 
-	free = xh_free_start - header_size;
+	free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
 	/*
 	 * We need to make sure the new name/value pair
 	 * can exist in the same block.
@@ -5132,7 +5133,8 @@ try_again:
 			}
 
 			xh_free_start = le16_to_cpu(xh->xh_free_start);
-			free = xh_free_start - header_size;
+			free = xh_free_start - header_size
+				- OCFS2_XATTR_HEADER_GAP;
 			if (xh_free_start % blocksize < need)
 				free -= xh_free_start % blocksize;
 
-- 
cgit v1.2.3


From 28d57d437786eb3e44f1ca3f0f41e7cfe29c6dd4 Mon Sep 17 00:00:00 2001
From: wengang wang <wen.gang.wang@oracle.com>
Date: Fri, 13 Feb 2009 10:11:47 +0800
Subject: ocfs2: add IO error check in ocfs2_get_sector()

Check for IO error in ocfs2_get_sector().

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/super.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1c3acc4654d8..7ac83a81ee55 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb,
 	unlock_buffer(*bh);
 	ll_rw_block(READ, 1, bh);
 	wait_on_buffer(*bh);
+	if (!buffer_uptodate(*bh)) {
+		mlog_errno(-EIO);
+		brelse(*bh);
+		*bh = NULL;
+		return -EIO;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From adc487204a9373d2b5a535412466326036147a72 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 27 Feb 2009 14:02:59 -0800
Subject: EXPORT_SYMBOL(d_obtain_alias) rather than EXPORT_SYMBOL_GPL

Commit 4ea3ada2955e4519befa98ff55dd62d6dfbd1705 declares d_obtain_alias()
as EXPORT_SYMBOL_GPL where it's supposed to replace d_alloc_anon which was
previously declared as EXPORT_SYMBOL and thus available to any loadable
module.

This patch reverts that.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 937df0fb0da5..07e2d4a44bda 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	iput(inode);
 	return res;
 }
-EXPORT_SYMBOL_GPL(d_obtain_alias);
+EXPORT_SYMBOL(d_obtain_alias);
 
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
-- 
cgit v1.2.3


From 5cf8cf4146de03de67d1a8aefbece66b65f255cc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 26 Feb 2009 21:32:51 +0100
Subject: Fix FREEZE/THAW compat_ioctl regression

Commit 8e961870bb9804110d5c8211d5d9d500451c4518 removed the FREEZE/THAW
handling in xfs_compat_ioctl but never added any compat handler back, so
now any freeze/thaw request from a 32-bit binary on 64-bit userspace
will fail.

As these ioctls are 32/64-bit compatible two simple COMPATIBLE_IOCTL
entries in fs/compat_ioctl.c will do the job.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 39bd4d38e889..45e59d3c7f1f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1913,6 +1913,9 @@ COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
+/* 'X' - originally XFS but some now in the VFS */
+COMPATIBLE_IOCTL(FIFREEZE)
+COMPATIBLE_IOCTL(FITHAW)
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
-- 
cgit v1.2.3