From 0f7a622ca61621f951af01448b956f2ecf5fad99 Mon Sep 17 00:00:00 2001
From: Chris Perl <chris.perl@gmail.com>
Date: Fri, 5 Sep 2014 15:40:21 -0400
Subject: rpc: xs_bind - do not bind when requesting a random ephemeral port

When attempting to establish a local ephemeral endpoint for a TCP or UDP
socket, do not explicitly call bind, instead let it happen implicilty when the
socket is first used.

The main motivating factor for this change is when TCP runs out of unique
ephemeral ports (i.e.  cannot find any ephemeral ports which are not a part of
*any* TCP connection).  In this situation if you explicitly call bind, then the
call will fail with EADDRINUSE.  However, if you allow the allocation of an
ephemeral port to happen implicitly as part of connect (or other functions),
then ephemeral ports can be reused, so long as the combination of (local_ip,
local_port, remote_ip, remote_port) is unique for TCP sockets on the system.

This doesn't matter for UDP sockets, but it seemed easiest to treat TCP and UDP
sockets the same.

This can allow mount.nfs(8) to continue to function successfully, even in the
face of misbehaving applications which are creating a large number of TCP
connections.

Signed-off-by: Chris Perl <chris.perl@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 43cd89eacfab..7ed47b4943da 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1746,13 +1746,29 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 	unsigned short port = xs_get_srcport(transport);
 	unsigned short last;
 
+	/*
+	 * If we are asking for any ephemeral port (i.e. port == 0 &&
+	 * transport->xprt.resvport == 0), don't bind.  Let the local
+	 * port selection happen implicitly when the socket is used
+	 * (for example at connect time).
+	 *
+	 * This ensures that we can continue to establish TCP
+	 * connections even when all local ephemeral ports are already
+	 * a part of some TCP connection.  This makes no difference
+	 * for UDP sockets, but also doens't harm them.
+	 *
+	 * If we're asking for any reserved port (i.e. port == 0 &&
+	 * transport->xprt.resvport == 1) xs_get_srcport above will
+	 * ensure that port is non-zero and we will bind as needed.
+	 */
+	if (port == 0)
+		return 0;
+
 	memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
 	do {
 		rpc_set_port((struct sockaddr *)&myaddr, port);
 		err = kernel_bind(sock, (struct sockaddr *)&myaddr,
 				transport->xprt.addrlen);
-		if (port == 0)
-			break;
 		if (err == 0) {
 			transport->srcport = port;
 			break;
-- 
cgit v1.2.3


From 61beef75cc5bae119e500c9f25daad8596c7cbe4 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 3 Sep 2014 14:15:40 -0400
Subject: NFS: Clear up state owner lock usage

can_open_cached() reads values out of the state structure, meaning that
we need the so_lock to have a correct return value.  As a bonus, this
helps clear up some potentially confusing code.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7dd8aca31c29..18eb31c1ea0b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1307,15 +1307,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	int ret = -EAGAIN;
 
 	for (;;) {
+		spin_lock(&state->owner->so_lock);
 		if (can_open_cached(state, fmode, open_mode)) {
-			spin_lock(&state->owner->so_lock);
-			if (can_open_cached(state, fmode, open_mode)) {
-				update_open_stateflags(state, fmode);
-				spin_unlock(&state->owner->so_lock);
-				goto out_return_state;
-			}
+			update_open_stateflags(state, fmode);
 			spin_unlock(&state->owner->so_lock);
+			goto out_return_state;
 		}
+		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
 		if (!can_open_delegated(delegation, fmode)) {
-- 
cgit v1.2.3


From 378520b837cf4da769600b83690d8e825f16a611 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Thu, 7 Aug 2014 10:15:02 +0800
Subject: nfs41: add a helper function to set layoutcommit after commit

Track lwb in nfs_commit_data so that we can use it to setup
layoutcommit in commit_done callback.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c           | 29 +++++++++++++++++++++++++++++
 fs/nfs/pnfs.h           |  1 +
 fs/nfs/write.c          | 15 +++++++++++++++
 include/linux/nfs_xdr.h |  1 +
 4 files changed, 46 insertions(+)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..a6586cdee211 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1797,6 +1797,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+	struct inode *inode = data->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	bool mark_as_dirty = false;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		mark_as_dirty = true;
+		dprintk("%s: Set layoutcommit for inode %lu ",
+			__func__, inode->i_ino);
+	}
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+		/* references matched in nfs4_layoutcommit_release */
+		pnfs_get_lseg(data->lseg);
+	}
+	if (data->lwb > nfsi->layout->plh_lwb)
+		nfsi->layout->plh_lwb = data->lwb;
+	spin_unlock(&inode->i_lock);
+	dprintk("%s: lseg %p end_pos %llu\n",
+		__func__, data->lseg, nfsi->layout->plh_lwb);
+
+	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+	if (mark_as_dirty)
+		mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 {
 	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..79c63114ce77 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -219,6 +219,7 @@ void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
 void pnfs_set_layoutcommit(struct nfs_pgio_header *);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 175d5d073ccf..3c5638f381cd 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1538,6 +1538,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 }
 EXPORT_SYMBOL_GPL(nfs_initiate_commit);
 
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+	loff_t lwb = 0;
+	struct nfs_page *req;
+
+	list_for_each_entry(req, head, wb_list)
+		if (lwb < (req_offset(req) + req->wb_bytes))
+			lwb = req_offset(req) + req->wb_bytes;
+
+	return lwb;
+}
+
 /*
  * Set up the argument/result storage required for the RPC call.
  */
@@ -1557,6 +1569,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
 	data->inode	  = inode;
 	data->cred	  = first->wb_context->cred;
 	data->lseg	  = lseg; /* reference transferred */
+	/* only set lwb for pnfs commit */
+	if (lseg)
+		data->lwb = nfs_get_lwb(&data->pages);
 	data->mds_ops     = &nfs_commit_ops;
 	data->completion_ops = cinfo->completion_ops;
 	data->dreq	  = cinfo->dreq;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0040629894df..e563b2c976ef 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1328,6 +1328,7 @@ struct nfs_commit_data {
 	struct pnfs_layout_segment *lseg;
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 	int			ds_commit_index;
+	loff_t			lwb;
 	const struct rpc_call_ops *mds_ops;
 	const struct nfs_commit_completion_ops *completion_ops;
 	int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
-- 
cgit v1.2.3


From bc7d4b8fd0917b4b84a8911185d34a97c696f219 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Thu, 7 Aug 2014 10:15:03 +0800
Subject: nfs/filelayout: set layoutcommit depending on write verifier

Following http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
Don't set layoutcommit for commit_through_mds case.
For FILE_SYNC writes, don't set layoutcommit.
For DATA_SYNC wirtes, set layout commit right after wirtes done.
For UNSTABLE writes, set layout commit when commit done.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/filelayout/filelayout.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 90978075f730..163cad1d4127 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
 
 	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-	    hdr->res.verf->committed == NFS_FILE_SYNC)
+	    hdr->res.verf->committed != NFS_DATA_SYNC)
 		return;
 
 	pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 		return -EAGAIN;
 	}
 
+	if (data->verf.committed == NFS_UNSTABLE)
+		pnfs_commit_set_layoutcommit(data);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2e11f8296d22134c4fca7eb022eea2b09facd307 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:17 -0500
Subject: nfs: cap request size to fit a kmalloced page array

pNFS servers may return arbitrarily large layouts.  Trim back the I/O size
to one that we can at least allocate the page array for.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index be7cbce6e4c7..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 		return 0;
 	}
 
+	/*
+	 * Limit the request size so that we can still allocate a page array
+	 * for it without upsetting the slab allocator.
+	 */
+	if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+			sizeof(struct page) > PAGE_SIZE)
+		return 0;
+
 	return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
-- 
cgit v1.2.3


From 1013df61150e56f775ccacdaaeee66042f1e6eb6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:18 -0500
Subject: pnfs: do not pass uninitialized lsegs to ->free_lseg

Ensure the lsegs are initialized early so that we don't pass an unitialized
one back to ->free_lseg during error processing.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a6586cdee211..0e0364ca877d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1358,6 +1358,9 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out;
 	}
 
+	init_lseg(lo, lseg);
+	lseg->pls_range = res->range;
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s forget reply due to recall\n", __func__);
@@ -1375,8 +1378,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	/* Done processing layoutget. Set the layout stateid */
 	pnfs_set_layout_stateid(lo, &res->stateid, false);
 
-	init_lseg(lo, lseg);
-	lseg->pls_range = res->range;
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg);
 
-- 
cgit v1.2.3


From 362f74745c15fb9acad577ab8e1342ee5313a2a5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:20 -0500
Subject: pnfs: don't check sequence on new stateids in layoutget

When layoutget returns an entirely new layout stateid it should not
check the generation counter as the new stateid will start with a new
counter entirely unrelated to old one.

The current behavior causes constant layoutget failures against a block
server which allocates a new stateid after an recall that removed all
outstanding layouts.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0e0364ca877d..ff7fabe33529 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -682,17 +682,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
 	return (s32)(s1 - s2) > 0;
 }
 
-static void
-pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
-		const nfs4_stateid *new,
-		struct list_head *free_me_list)
-{
-	if (nfs4_stateid_match_other(&lo->plh_stateid, new))
-		return;
-	/* Layout is new! Kill existing layout segments */
-	pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
-}
-
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -1367,16 +1356,29 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out_forget_reply;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, 1) ||
-	    pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+	if (pnfs_layoutgets_blocked(lo, 1)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
 
-	/* Check that the new stateid matches the old stateid */
-	pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
-	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid, false);
+	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+		/* existing state ID, make sure the sequence number matches. */
+		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+			dprintk("%s forget reply due to sequence\n", __func__);
+			goto out_forget_reply;
+		}
+		pnfs_set_layout_stateid(lo, &res->stateid, false);
+	} else {
+		/*
+		 * We got an entirely new state ID.  Mark all segments for the
+		 * inode invalid, and don't bother validating the stateid
+		 * sequence number.
+		 */
+		pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+	}
 
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg);
-- 
cgit v1.2.3


From defb8460881cbf0a5890bc7a63b42c0cfbed721d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:21 -0500
Subject: pnfs: retry after a bad stateid error from layoutget

Currently we fall through to nfs4_async_handle_error when we get
a bad stateid error back from layoutget.  nfs4_async_handle_error
with a NULL state argument will never retry the operations but return
the error to higher layer, causing an avoiable fallback to MDS I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 18eb31c1ea0b..47fa67a3a8cf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7577,11 +7577,16 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		} else {
 			LIST_HEAD(head);
 
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
 			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
 			spin_unlock(&inode->i_lock);
-			/* Mark the bad layout state as invalid, then
-			 * retry using the open stateid. */
 			pnfs_free_lseg_list(&head);
+	
+			task->tk_status = 0;
+			rpc_restart_call_prepare(task);
 		}
 	}
 	if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
-- 
cgit v1.2.3


From 47abadefad213bb7de9592d2e09a8bd282ddc3de Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:22 -0500
Subject: pnfs: avoid using stale stateids after layoutreturn

After we issued a layoutreturn operations the may free the layout stateid
and will thus cause bad stateid error when the client uses it again.

We currently try to avoid this case by chosing the open stateid if not
lsegs are present for this inode.  But various places can hold refererence
on lsegs and thus cause the list not to be empty shortly after a layout
return.  Add an explicit flag to mark the current layout stateid invalid
and force usage of the openstateid after we did a full file layoutreturn.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c | 7 ++++++-
 fs/nfs/pnfs.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ff7fabe33529..57b5728e0b8e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -738,7 +738,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 		status = -EAGAIN;
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 		status = -EBADF;
-	} else if (list_empty(&lo->plh_segs)) {
+	} else if (list_empty(&lo->plh_segs) ||
+		   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
 		int seq;
 
 		do {
@@ -860,6 +861,8 @@ _pnfs_return_layout(struct inode *ino)
 		dprintk("NFS: %s no layout segments to return\n", __func__);
 		goto out;
 	}
+
+	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 	lo->plh_block_lgets++;
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
@@ -1380,6 +1383,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
 	}
 
+	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg);
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 79c63114ce77..1dd8a5e96c9f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,6 +65,7 @@ enum {
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
+	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 };
 
 enum layoutdriver_policy_flags {
-- 
cgit v1.2.3


From 5f919c9f10c1cf821ee5f414683214a361a1b98c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:25 -0500
Subject: pnfs: allow splicing pre-encoded pages into the layoutcommit args

Currently there is no XDR buffer space allocated for the per-layout driver
layoutcommit payload, which leads to server buffer overflows in the
blocklayout driver even under simple workloads.  As we can't do per-layout
sizes for XDR operations we'll have to splice a previously encoded list
of pages into the XDR stream, similar to how we handle ACL buffers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4xdr.c        | 18 +++++++++++++-----
 fs/nfs/pnfs.c           | 15 +++++++++++++++
 fs/nfs/pnfs.h           |  4 ++--
 include/linux/nfs_xdr.h |  3 +++
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..f2cd957adb90 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -395,7 +395,10 @@ static int nfs4_stat_to_errno(int);
 				2 /* last byte written */ + \
 				1 /* nt_timechanged (false) */ + \
 				1 /* layoutupdate4 layout type */ + \
-				1 /* NULL filelayout layoutupdate4 payload */)
+				1 /* layoutupdate4 opaqueue len */)
+				  /* the actual content of layoutupdate4 should
+				     be allocated by drivers and spliced in
+				     using xdr_write_pages */
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
 				encode_stateid_maxsz + \
@@ -1990,7 +1993,7 @@ encode_layoutget(struct xdr_stream *xdr,
 static int
 encode_layoutcommit(struct xdr_stream *xdr,
 		    struct inode *inode,
-		    const struct nfs4_layoutcommit_args *args,
+		    struct nfs4_layoutcommit_args *args,
 		    struct compound_hdr *hdr)
 {
 	__be32 *p;
@@ -2011,11 +2014,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
 	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
 
-	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
 		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
 			NFS_I(inode)->layout, xdr, args);
-	else
-		encode_uint32(xdr, 0); /* no layout-type payload */
+	} else {
+		encode_uint32(xdr, args->layoutupdate_len);
+		if (args->layoutupdate_pages) {
+			xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+					args->layoutupdate_len);
+		}
+	}
 
 	return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 57b5728e0b8e..8827ab130ed3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1854,6 +1854,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 int
 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs4_layoutcommit_data *data;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos;
@@ -1904,6 +1905,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	data->args.lastbytewritten = end_pos - 1;
 	data->res.server = NFS_SERVER(inode);
 
+	if (ld->prepare_layoutcommit) {
+		status = ld->prepare_layoutcommit(&data->args);
+		if (status) {
+			spin_lock(&inode->i_lock);
+			if (end_pos < nfsi->layout->plh_lwb)
+				nfsi->layout->plh_lwb = end_pos;
+			spin_unlock(&inode->i_lock);
+			put_rpccred(data->cred);
+			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+			goto clear_layoutcommitting;
+		}
+	}
+
+
 	status = nfs4_proc_layoutcommit(data, sync);
 out:
 	if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1dd8a5e96c9f..8835b5a320cc 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -128,8 +128,8 @@ struct pnfs_layoutdriver_type {
 				     const struct nfs4_layoutreturn_args *args);
 
 	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
-	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+	int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutcommit_args *args);
 };
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e563b2c976ef..f4092c6b90fb 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -279,6 +279,9 @@ struct nfs4_layoutcommit_args {
 	__u64 lastbytewritten;
 	struct inode *inode;
 	const u32 *bitmask;
+	size_t layoutupdate_len;
+	struct page *layoutupdate_page;
+	struct page **layoutupdate_pages;
 };
 
 struct nfs4_layoutcommit_res {
-- 
cgit v1.2.3


From e3aaf7f2b8e9e858c0d6626577d3456bf6070e5a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:26 -0500
Subject: pnfs/blocklayout: reject pnfs blocksize larger than page size

The Linux VM subsystem can't support block sizes larger than page size
for block based filesystems very well.  While this can be hacked around
to some extent for simple filesystems the read-modify-write cycles
required for pnfs block invalid extents are extremly deadlock prone
when operating on multiple pages.  Reject this case early on instead
of pretending to support it (badly).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..6c1a4212919d 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1115,6 +1115,12 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 		dprintk("%s Server did not return blksize\n", __func__);
 		return -EINVAL;
 	}
+	if (server->pnfs_blksize > PAGE_SIZE) {
+		printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+			__func__, server->pnfs_blksize);
+		return -EINVAL;
+	}
+
 	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
 	if (!b_mt_id) {
 		status = -ENOMEM;
-- 
cgit v1.2.3


From 72c5e59f63ceaa604936b1693ba8c4b6cc2b114d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:27 -0500
Subject: pnfs/blocklayout: improve GETDEVICEINFO error reporting

Tell userspace what stage of GETDEVICEINFO failed so that there is a chance
to debug it, especially with the userspace daemon clusterf***k in the block
layout driver.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayoutdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index 04303b5c9361..63f77925aa87 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -150,7 +150,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	remove_wait_queue(&nn->bl_wq, &wq);
 
 	if (reply->status != BL_DEVICE_REQUEST_PROC) {
-		dprintk("%s failed to open device: %d\n",
+		printk(KERN_WARNING "%s failed to decode device: %d\n",
 			__func__, reply->status);
 		rv = ERR_PTR(-EINVAL);
 		goto out;
@@ -159,7 +159,8 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
 			       FMODE_READ, NULL);
 	if (IS_ERR(bd)) {
-		dprintk("%s failed to open device : %ld\n", __func__,
+		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
+			__func__, reply->major, reply->minor,
 			PTR_ERR(bd));
 		rv = ERR_CAST(bd);
 		goto out;
-- 
cgit v1.2.3


From be98fd0ac3dd45c1aa404b101caa37f9b317ab57 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:28 -0500
Subject: pnfs/blocklayout: plug block queues

Make sure the block queue is plugged when performing pNFS blocklayout I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6c1a4212919d..5427ae766f06 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -255,6 +255,7 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 	struct page **pages = hdr->args.pages;
 	int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
+	struct blk_plug plug;
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
 		hdr->page_array.npages, f_offset,
@@ -266,6 +267,8 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 	par->pnfs_callback = bl_end_par_io_read;
 	/* At this point, we can no longer jump to use_mds */
 
+	blk_start_plug(&plug);
+
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
 	for (i = pg_index; i < hdr->page_array.npages; i++) {
@@ -342,6 +345,7 @@ out:
 	bl_put_extent(be);
 	bl_put_extent(cow_read);
 	bl_submit_bio(READ, bio);
+	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
 
@@ -688,9 +692,12 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	u64 temp;
 	int npg_per_block =
 	    NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+	struct blk_plug plug;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 
+	blk_start_plug(&plug);
+
 	if (header->dreq != NULL &&
 	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
 	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
@@ -894,9 +901,11 @@ out:
 	bl_put_extent(be);
 	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
+	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
 out_mds:
+	blk_finish_plug(&plug);
 	bl_put_extent(be);
 	bl_put_extent(cow_read);
 	kfree(par);
-- 
cgit v1.2.3


From 921b81a8cd5a4acc2a009778d13eedd377362c4c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 11:09:29 -0500
Subject: pnfs/blocklayout: correctly decrement extent length

When we do non-page sized reads we can underflow the extent_length variable
and read incorrect data.  Fix the extent_length calculation and change to
defensive <= checks for the extent length in the read and write path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5427ae766f06..87a633d03507 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -272,7 +272,7 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
 	for (i = pg_index; i < hdr->page_array.npages; i++) {
-		if (!extent_length) {
+		if (extent_length <= 0) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
 			bl_put_extent(cow_read);
@@ -303,6 +303,7 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			f_offset += pg_len;
 			bytes_left -= pg_len;
 			isect += (pg_offset >> SECTOR_SHIFT);
+			extent_length -= (pg_offset >> SECTOR_SHIFT);
 		} else {
 			pg_offset = 0;
 			pg_len = PAGE_CACHE_SIZE;
@@ -333,7 +334,7 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			}
 		}
 		isect += (pg_len >> SECTOR_SHIFT);
-		extent_length -= PAGE_CACHE_SECTORS;
+		extent_length -= (pg_len >> SECTOR_SHIFT);
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		hdr->res.eof = 1;
@@ -797,7 +798,7 @@ next_page:
 	/* Middle pages */
 	pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	for (i = pg_index; i < header->page_array.npages; i++) {
-		if (!extent_length) {
+		if (extent_length <= 0) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
 			bl_put_extent(cow_read);
-- 
cgit v1.2.3


From 3a3908c8b09d5ec19d543836d4f38d240ae27fe8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 8 Sep 2014 22:21:00 -0700
Subject: NFS: Fix a compile warning when !(CONFIG_NFS_V3 || CONFIG_NFS_V4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc reports:

linux/fs/nfs/write.c: In function ‘nfs_page_find_head_request_locked.isra.17’:
linux/fs/nfs/write.c:121:64: warning: ‘cinfo.mds’ may be used uninitialized in this function [-Wmaybe-uninitialized]
  list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
                                                                  ^
linux/fs/nfs/write.c:110:25: note: ‘cinfo.mds’ was declared here
  struct nfs_commit_info cinfo;

Reported-by: Anna Schumaker <Anna.Schumaker@netapp.com>
Cc: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 74 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3c5638f381cd..128d97f01d1c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_clear_request_commit(struct nfs_page *req);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 				      struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -94,38 +97,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
-/*
- * nfs_page_search_commits_for_head_request_locked
- *
- * Search through commit lists on @inode for the head request for @page.
- * Must be called while holding the inode (which is cinfo) lock.
- *
- * Returns the head request if found, or NULL if not found.
- */
-static struct nfs_page *
-nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
-						struct page *page)
-{
-	struct nfs_page *freq, *t;
-	struct nfs_commit_info cinfo;
-	struct inode *inode = &nfsi->vfs_inode;
-
-	nfs_init_cinfo_from_inode(&cinfo, inode);
-
-	/* search through pnfs commit lists */
-	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
-	if (freq)
-		return freq->wb_head;
-
-	/* Linearly search the commit list for the correct request */
-	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
-		if (freq->wb_page == page)
-			return freq->wb_head;
-	}
-
-	return NULL;
-}
-
 /*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
@@ -750,6 +721,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
 }
 
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct nfs_commit_info cinfo;
+	struct inode *inode = &nfsi->vfs_inode;
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	/* search through pnfs commit lists */
+	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+	if (freq)
+		return freq->wb_head;
+
+	/* Linearly search the commit list for the correct request */
+	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+		if (freq->wb_page == page)
+			return freq->wb_head;
+	}
+
+	return NULL;
+}
+
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
@@ -868,6 +871,13 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 }
 
 #else
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page)
+{
+	return NULL;
+}
+
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 				      struct inode *inode)
 {
-- 
cgit v1.2.3


From 7c5d187581f57657de79e795602d9f1a0254c88c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:29 -0700
Subject: pnfs: force a layout commit when encountering busy segments during
 recall

Expedite layout recall processing by forcing a layout commit when
we see busy segments.  Without it the layout recall might have to wait
until the VM decided to start writeback for the file, which can introduce
long delays.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c | 8 +++++++-
 fs/nfs/pnfs.c          | 3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..0d269515c64d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,6 +171,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		goto out;
 
 	ino = lo->plh_inode;
+
+	spin_lock(&ino->i_lock);
+	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+	spin_unlock(&ino->i_lock);
+
+	pnfs_layoutcommit_inode(ino, false);
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
@@ -178,7 +185,6 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		rv = NFS4ERR_DELAY;
 	else
 		rv = NFS4ERR_NOMATCHING_LAYOUT;
-	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8827ab130ed3..ce5d1ea695d1 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 		dprintk("%s freeing layout for inode %lu\n", __func__,
 			lo->plh_inode->i_ino);
 		inode = lo->plh_inode;
+
+		pnfs_layoutcommit_inode(inode, false);
+
 		spin_lock(&inode->i_lock);
 		list_del_init(&lo->plh_bulk_destroy);
 		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-- 
cgit v1.2.3


From 612aa983a041056c3368aacfdc9febd406d245a8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:30 -0700
Subject: pnfs: add flag to force read-modify-write in ->write_begin

Like all block based filesystems, the pNFS block layout driver can't read
or write at a byte granularity and thus has to perform read-modify-write
cycles on writes smaller than this granularity.

Add a flag so that the core NFS code always reads a whole page when
starting a smaller write, so that we can do it in the place where the VFS
expects it instead of doing in very deadlock prone way in the writeback
handler.

Note that in theory we could do less than page size reads here for disks
that have a smaller sector size which are served by a server with a smaller
pnfs block size.  But so far that doesn't seem like a worthwhile
optimization.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c |  7 +++++++
 fs/nfs/pnfs.h | 15 +++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d1898..05ef7dcc5f21 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #include "nfstrace.h"
 
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
 	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned int end = offset + len;
 
+	if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+		if (!PageUptodate(page))
+			return 1;
+		return 0;
+	}
+
 	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
 	    !PageUptodate(page) &&		/* Uptodate? */
 	    !PagePrivate(page) &&		/* i/o request already? */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8835b5a320cc..a4c530e52149 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -72,6 +72,7 @@ enum layoutdriver_policy_flags {
 	/* Should the pNFS client commit and return the layout upon a setattr */
 	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
 	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
+	PNFS_READ_WHOLE_PAGE		= 1 << 2,
 };
 
 struct nfs4_deviceid_node;
@@ -369,6 +370,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 		PNFS_LAYOUTRET_ON_SETATTR;
 }
 
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return false;
+	return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
 static inline bool
 pnfs_layoutcommit_outstanding(struct inode *inode)
 {
@@ -444,6 +453,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 	return false;
 }
 
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	return false;
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
-- 
cgit v1.2.3


From c88953d87f5c8cd95bebcbd6d15f2f0cdd348136 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:31 -0700
Subject: pnfs: add return_range method

If a layout driver keeps per-inode state outside of the layout segments it
needs to be notified of any layout returns or recalls on an inode, and not
just about the freeing of layout segments.  Add a method to acomplish this,
which will allow the block layout driver to handle the case of truncated
and re-expanded files properly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c | 12 +++++++++---
 fs/nfs/pnfs.c          | 10 ++++++++++
 fs/nfs/pnfs.h          |  3 +++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0d269515c64d..56bd0eaf645e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -181,10 +181,16 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-					&args->cbl_range))
+					&args->cbl_range)) {
 		rv = NFS4ERR_DELAY;
-	else
-		rv = NFS4ERR_NOMATCHING_LAYOUT;
+		goto unlock;
+	}
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+			&args->cbl_range);
+	}
+unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ce5d1ea695d1..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -857,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
 	empty = list_empty(&lo->plh_segs);
 	pnfs_clear_layoutcommit(ino, &tmp_list);
 	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		struct pnfs_layout_range range = {
+			.iomode		= IOMODE_ANY,
+			.offset		= 0,
+			.length		= NFS4_MAX_UINT64,
+		};
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+	}
+
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
 		spin_unlock(&ino->i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a4c530e52149..2c2494225930 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,6 +94,9 @@ struct pnfs_layoutdriver_type {
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
+	void (*return_range) (struct pnfs_layout_hdr *lo,
+			      struct pnfs_layout_range *range);
+
 	/* test for nfs page cache coalescing */
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
-- 
cgit v1.2.3


From 3a6fd1f004fcaf3dd1c28a7cd16406c8318eb64a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:32 -0700
Subject: pnfs/blocklayout: remove read-modify-write handling in
 bl_write_pagelist

Use the new PNFS_READ_WHOLE_PAGE flag to offload read-modify-write
handling to core nfs code, and remove a huge chunk of deadlock prone
mess from the block layout writeback path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 498 +++++----------------------------------
 1 file changed, 63 insertions(+), 435 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 87a633d03507..cf87254b6cd1 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/bio.h>		/* struct bio */
-#include <linux/buffer_head.h>	/* various write calls */
 #include <linux/prefetch.h>
 #include <linux/pagevec.h>
 
@@ -188,16 +187,6 @@ retry:
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
-				      sector_t isect, struct page *page,
-				      struct pnfs_block_extent *be,
-				      void (*end_io)(struct bio *, int err),
-				      struct parallel_io *par)
-{
-	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
-				  end_io, par, 0, PAGE_CACHE_SIZE);
-}
-
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -293,8 +282,8 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			}
 		}
 
+		pg_offset = f_offset & ~PAGE_CACHE_MASK;
 		if (is_dio) {
-			pg_offset = f_offset & ~PAGE_CACHE_MASK;
 			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
 				pg_len = PAGE_CACHE_SIZE - pg_offset;
 			else
@@ -305,7 +294,7 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			isect += (pg_offset >> SECTOR_SHIFT);
 			extent_length -= (pg_offset >> SECTOR_SHIFT);
 		} else {
-			pg_offset = 0;
+			BUG_ON(pg_offset != 0);
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
@@ -383,29 +372,6 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 	}
 }
 
-static void bl_end_io_write_zero(struct bio *bio, int err)
-{
-	struct parallel_io *par = bio->bi_private;
-	struct bio_vec *bvec;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		/* This is the zeroing page we added */
-		end_page_writeback(bvec->bv_page);
-		page_cache_release(bvec->bv_page);
-	}
-
-	if (unlikely(err)) {
-		struct nfs_pgio_header *header = par->data;
-
-		if (!header->pnfs_error)
-			header->pnfs_error = -EIO;
-		pnfs_set_lo_fail(header->lseg);
-	}
-	bio_put(bio);
-	put_parallel(par);
-}
-
 static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
@@ -455,256 +421,22 @@ static void bl_end_par_io_write(void *data, int num_se)
 	schedule_work(&hdr->task.u.tk_work);
 }
 
-/* FIXME STUB - mark intersection of layout and page as bad, so is not
- * used again.
- */
-static void mark_bad_read(void)
-{
-	return;
-}
-
-/*
- * map_block:  map a requested I/0 block (isect) into an offset in the LVM
- * block_device
- */
-static void
-map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
-{
-	dprintk("%s enter be=%p\n", __func__, be);
-
-	set_buffer_mapped(bh);
-	bh->b_bdev = be->be_mdev;
-	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
-	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
-
-	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
-		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
-		bh->b_size);
-	return;
-}
-
-static void
-bl_read_single_end_io(struct bio *bio, int error)
-{
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct page *page = bvec->bv_page;
-
-	/* Only one page in bvec */
-	unlock_page(page);
-}
-
-static int
-bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
-		    unsigned int offset, unsigned int len)
-{
-	struct bio *bio;
-	struct page *shadow_page;
-	sector_t isect;
-	char *kaddr, *kshadow_addr;
-	int ret = 0;
-
-	dprintk("%s: offset %u len %u\n", __func__, offset, len);
-
-	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (shadow_page == NULL)
-		return -ENOMEM;
-
-	bio = bio_alloc(GFP_NOIO, 1);
-	if (bio == NULL)
-		return -ENOMEM;
-
-	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
-		(offset / SECTOR_SIZE);
-
-	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
-	bio->bi_bdev = be->be_mdev;
-	bio->bi_end_io = bl_read_single_end_io;
-
-	lock_page(shadow_page);
-	if (bio_add_page(bio, shadow_page,
-			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
-		unlock_page(shadow_page);
-		bio_put(bio);
-		return -EIO;
-	}
-
-	submit_bio(READ, bio);
-	wait_on_page_locked(shadow_page);
-	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
-		ret = -EIO;
-	} else {
-		kaddr = kmap_atomic(page);
-		kshadow_addr = kmap_atomic(shadow_page);
-		memcpy(kaddr + offset, kshadow_addr + offset, len);
-		kunmap_atomic(kshadow_addr);
-		kunmap_atomic(kaddr);
-	}
-	__free_page(shadow_page);
-	bio_put(bio);
-
-	return ret;
-}
-
-static int
-bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
-			  unsigned int dirty_offset, unsigned int dirty_len,
-			  bool full_page)
-{
-	int ret = 0;
-	unsigned int start, end;
-
-	if (full_page) {
-		start = 0;
-		end = PAGE_CACHE_SIZE;
-	} else {
-		start = round_down(dirty_offset, SECTOR_SIZE);
-		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
-	}
-
-	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
-	if (!be) {
-		zero_user_segments(page, start, dirty_offset,
-				   dirty_offset + dirty_len, end);
-		if (start == 0 && end == PAGE_CACHE_SIZE &&
-		    trylock_page(page)) {
-			SetPageUptodate(page);
-			unlock_page(page);
-		}
-		return ret;
-	}
-
-	if (start != dirty_offset)
-		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
-
-	if (!ret && (dirty_offset + dirty_len < end))
-		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
-					  end - dirty_offset - dirty_len);
-
-	return ret;
-}
-
-/* Given an unmapped page, zero it or read in page for COW, page is locked
- * by caller.
- */
-static int
-init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
-{
-	struct buffer_head *bh = NULL;
-	int ret = 0;
-	sector_t isect;
-
-	dprintk("%s enter, %p\n", __func__, page);
-	BUG_ON(PageUptodate(page));
-	if (!cow_read) {
-		zero_user_segment(page, 0, PAGE_SIZE);
-		SetPageUptodate(page);
-		goto cleanup;
-	}
-
-	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
-	if (!bh) {
-		ret = -ENOMEM;
-		goto cleanup;
-	}
-
-	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
-	map_block(bh, isect, cow_read);
-	if (!bh_uptodate_or_lock(bh))
-		ret = bh_submit_read(bh);
-	if (ret)
-		goto cleanup;
-	SetPageUptodate(page);
-
-cleanup:
-	if (bh)
-		free_buffer_head(bh);
-	if (ret) {
-		/* Need to mark layout with bad read...should now
-		 * just use nfs4 for reads and writes.
-		 */
-		mark_bad_read();
-	}
-	return ret;
-}
-
-/* Find or create a zeroing page marked being writeback.
- * Return ERR_PTR on error, NULL to indicate skip this page and page itself
- * to indicate write out.
- */
-static struct page *
-bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
-			struct pnfs_block_extent *cow_read)
-{
-	struct page *page;
-	int locked = 0;
-	page = find_get_page(inode->i_mapping, index);
-	if (page)
-		goto check_page;
-
-	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
-	if (unlikely(!page)) {
-		dprintk("%s oom\n", __func__);
-		return ERR_PTR(-ENOMEM);
-	}
-	locked = 1;
-
-check_page:
-	/* PageDirty: Other will write this out
-	 * PageWriteback: Other is writing this out
-	 * PageUptodate: It was read before
-	 */
-	if (PageDirty(page) || PageWriteback(page)) {
-		print_page(page);
-		if (locked)
-			unlock_page(page);
-		page_cache_release(page);
-		return NULL;
-	}
-
-	if (!locked) {
-		lock_page(page);
-		locked = 1;
-		goto check_page;
-	}
-	if (!PageUptodate(page)) {
-		/* New page, readin or zero it */
-		init_page_for_write(page, cow_read);
-	}
-	set_page_writeback(page);
-	unlock_page(page);
-
-	return page;
-}
-
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	int i, ret, npg_zero, pg_index, last = 0;
+	int i, ret;
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
-	sector_t isect, last_isect = 0, extent_length = 0;
+	struct pnfs_block_extent *be = NULL;
+	sector_t isect, extent_length = 0;
 	struct parallel_io *par = NULL;
 	loff_t offset = header->args.offset;
 	size_t count = header->args.count;
-	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = header->args.pages;
-	struct page *page;
-	pgoff_t index;
-	u64 temp;
-	int npg_per_block =
-	    NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	struct blk_plug plug;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 
-	blk_start_plug(&plug);
-
-	if (header->dreq != NULL &&
-	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
-	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
-		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
-		goto out_mds;
-	}
 	/* At this point, header->page_aray is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
@@ -715,97 +447,20 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	par->pnfs_callback = bl_end_par_io_write;
 	/* At this point, have to be more careful with error handling */
 
-	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
-	if (!be || !is_writable(be, isect)) {
-		dprintk("%s no matching extents!\n", __func__);
-		goto out_mds;
-	}
+	blk_start_plug(&plug);
 
-	/* First page inside INVALID extent */
-	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		if (likely(!bl_push_one_short_extent(be->be_inval)))
-			par->bse_count++;
-		else
-			goto out_mds;
-		temp = offset >> PAGE_CACHE_SHIFT;
-		npg_zero = do_div(temp, npg_per_block);
-		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
-				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-		extent_length = be->be_length - (isect - be->be_f_offset);
-
-fill_invalid_ext:
-		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
-		for (;npg_zero > 0; npg_zero--) {
-			if (bl_is_sector_init(be->be_inval, isect)) {
-				dprintk("isect %llu already init\n",
-					(unsigned long long)isect);
-				goto next_page;
-			}
-			/* page ref released in bl_end_io_write_zero */
-			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
-			dprintk("%s zero %dth page: index %lu isect %llu\n",
-				__func__, npg_zero, index,
-				(unsigned long long)isect);
-			page = bl_find_get_zeroing_page(header->inode, index,
-							cow_read);
-			if (unlikely(IS_ERR(page))) {
-				header->pnfs_error = PTR_ERR(page);
-				goto out;
-			} else if (page == NULL)
-				goto next_page;
+	/* we always write out the whole page */
+	offset = offset & (loff_t)PAGE_CACHE_MASK;
+	isect = offset >> SECTOR_SHIFT;
 
-			ret = bl_mark_sectors_init(be->be_inval, isect,
-						       PAGE_CACHE_SECTORS);
-			if (unlikely(ret)) {
-				dprintk("%s bl_mark_sectors_init fail %d\n",
-					__func__, ret);
-				end_page_writeback(page);
-				page_cache_release(page);
-				header->pnfs_error = ret;
-				goto out;
-			}
-			if (likely(!bl_push_one_short_extent(be->be_inval)))
-				par->bse_count++;
-			else {
-				end_page_writeback(page);
-				page_cache_release(page);
-				header->pnfs_error = -ENOMEM;
-				goto out;
-			}
-			/* FIXME: This should be done in bi_end_io */
-			mark_extents_written(BLK_LSEG2EXT(header->lseg),
-					     page->index << PAGE_CACHE_SHIFT,
-					     PAGE_CACHE_SIZE);
-
-			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
-						 isect, page, be,
-						 bl_end_io_write_zero, par);
-			if (IS_ERR(bio)) {
-				header->pnfs_error = PTR_ERR(bio);
-				bio = NULL;
-				goto out;
-			}
-next_page:
-			isect += PAGE_CACHE_SECTORS;
-			extent_length -= PAGE_CACHE_SECTORS;
-		}
-		if (last)
-			goto write_done;
-	}
-	bio = bl_submit_bio(WRITE, bio);
-
-	/* Middle pages */
-	pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
-			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, &cow_read);
+					     isect, NULL);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -823,25 +478,10 @@ next_page:
 			    (isect - be->be_f_offset);
 		}
 
-		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
-		pg_offset = offset & ~PAGE_CACHE_MASK;
-		if (pg_offset + count > PAGE_CACHE_SIZE)
-			pg_len = PAGE_CACHE_SIZE - pg_offset;
-		else
-			pg_len = count;
+		BUG_ON(offset & ~PAGE_CACHE_MASK);
 
-		saved_len = pg_len;
 		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
 		    !bl_is_sector_init(be->be_inval, isect)) {
-			ret = bl_read_partial_page_sync(pages[i], cow_read,
-							pg_offset, pg_len, true);
-			if (ret) {
-				dprintk("%s bl_read_partial_page_sync fail %d\n",
-					__func__, ret);
-				header->pnfs_error = ret;
-				goto out;
-			}
-
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -850,66 +490,31 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
-
-			/* Expand to full page write */
-			pg_offset = 0;
-			pg_len = PAGE_CACHE_SIZE;
-		} else if  ((pg_offset & (SECTOR_SIZE - 1)) ||
-			    (pg_len & (SECTOR_SIZE - 1))){
-			/* ahh, nasty case. We have to do sync full sector
-			 * read-modify-write cycles.
-			 */
-			unsigned int saved_offset = pg_offset;
-			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
-							pg_len, false);
-			pg_offset = round_down(pg_offset, SECTOR_SIZE);
-			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
-				 - pg_offset;
 		}
 
-
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
-					 WRITE,
-					 isect, pages[i], be,
+					 WRITE, isect, pages[i], be,
 					 bl_end_io_write, par,
-					 pg_offset, pg_len);
+					 0, PAGE_CACHE_SIZE);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
-		offset += saved_len;
-		count -= saved_len;
+		offset += PAGE_CACHE_SIZE;
+		count -= PAGE_CACHE_SIZE;
 		isect += PAGE_CACHE_SECTORS;
-		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
 
-	/* Last page inside INVALID extent */
-	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		bio = bl_submit_bio(WRITE, bio);
-		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
-		npg_zero = npg_per_block - do_div(temp, npg_per_block);
-		if (npg_zero < npg_per_block) {
-			last = 1;
-			goto fill_invalid_ext;
-		}
-	}
-
-write_done:
 	header->res.count = header->args.count;
 out:
 	bl_put_extent(be);
-	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
 out_mds:
-	blk_finish_plug(&plug);
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
-	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
 
@@ -1188,20 +793,45 @@ bl_clear_layoutdriver(struct nfs_server *server)
 }
 
 static bool
-is_aligned_req(struct nfs_page *req, unsigned int alignment)
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+		struct nfs_page *req, unsigned int alignment)
 {
-	return IS_ALIGNED(req->wb_offset, alignment) &&
-	       IS_ALIGNED(req->wb_bytes, alignment);
+	/*
+	 * Always accept buffered writes, higher layers take care of the
+	 * right alignment.
+	 */
+	if (pgio->pg_dreq == NULL)
+		return true;
+
+	if (!IS_ALIGNED(req->wb_offset, alignment))
+		return false;
+
+	if (IS_ALIGNED(req->wb_bytes, alignment))
+		return true;
+
+	if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+		/*
+		 * If the write goes up to the inode size, just write
+		 * the full page.  Data past the inode size is
+		 * guaranteed to be zeroed by the higher level client
+		 * code, and this behaviour is mandated by RFC 5663
+		 * section 2.3.2.
+		 */
+		return true;
+	}
+
+	return false;
 }
 
 static void
 bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, SECTOR_SIZE))
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
 		nfs_pageio_reset_read_mds(pgio);
-	else
-		pnfs_generic_pg_init_read(pgio, req);
+		return;
+	}
+
+	pnfs_generic_pg_init_read(pgio, req);
 }
 
 /*
@@ -1212,10 +842,8 @@ static size_t
 bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, SECTOR_SIZE))
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE))
 		return 0;
-
 	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
@@ -1245,19 +873,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 static void
 bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, PAGE_CACHE_SIZE)) {
+	u64 wb_size;
+
+	if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
 		nfs_pageio_reset_write_mds(pgio);
-	} else {
-		u64 wb_size;
-		if (pgio->pg_dreq == NULL)
-			wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
-						      req->wb_index);
-		else
-			wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
-		pnfs_generic_pg_init_write(pgio, req, wb_size);
+		return;
 	}
+
+	if (pgio->pg_dreq == NULL)
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+					      req->wb_index);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
 }
 
 /*
@@ -1268,10 +897,8 @@ static size_t
 bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		 struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, PAGE_CACHE_SIZE))
+	if (!is_aligned_req(pgio, req, PAGE_SIZE))
 		return 0;
-
 	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
@@ -1291,6 +918,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.id				= LAYOUT_BLOCK_VOLUME,
 	.name				= "LAYOUT_BLOCK_VOLUME",
 	.owner				= THIS_MODULE,
+	.flags				= PNFS_READ_WHOLE_PAGE,
 	.read_pagelist			= bl_read_pagelist,
 	.write_pagelist			= bl_write_pagelist,
 	.alloc_layout_hdr		= bl_alloc_layout_hdr,
-- 
cgit v1.2.3


From 8c792ea940499153732adea2ea4ca37f6999778f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:33 -0700
Subject: pnfs/blocklayout: don't set pages uptodate

The core nfs code handles setting pages uptodate on reads, no need to mess
with the pageflags outselves.  Also remove a debug function to dump page
flags.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cf87254b6cd1..5aa23750a149 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -49,20 +49,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
 
-static void print_page(struct page *page)
-{
-	dprintk("PRINTPAGE page %p\n", page);
-	dprintk("	PagePrivate %d\n", PagePrivate(page));
-	dprintk("	PageUptodate %d\n", PageUptodate(page));
-	dprintk("	PageError %d\n", PageError(page));
-	dprintk("	PageDirty %d\n", PageDirty(page));
-	dprintk("	PageReferenced %d\n", PageReferenced(page));
-	dprintk("	PageLocked %d\n", PageLocked(page));
-	dprintk("	PageWriteback %d\n", PageWriteback(page));
-	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
-	dprintk("\n");
-}
-
 /* Given the be associated with isect, determine if page data needs to be
  * initialized.
  */
@@ -187,16 +173,9 @@ retry:
 	return bio;
 }
 
-/* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
-	struct bio_vec *bvec;
-	int i;
-
-	if (!err)
-		bio_for_each_segment_all(bvec, bio, i)
-			SetPageUptodate(bvec->bv_page);
 
 	if (err) {
 		struct nfs_pgio_header *header = par->data;
@@ -205,6 +184,7 @@ static void bl_end_io_read(struct bio *bio, int err)
 			header->pnfs_error = -EIO;
 		pnfs_set_lo_fail(header->lseg);
 	}
+
 	bio_put(bio);
 	put_parallel(par);
 }
@@ -304,8 +284,6 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
 			zero_user_segment(pages[i], pg_offset, pg_len);
-			print_page(pages[i]);
-			SetPageUptodate(pages[i]);
 		} else {
 			struct pnfs_block_extent *be_read;
 
-- 
cgit v1.2.3


From 8067253c8cc531b6f367b9f5942bdc6168385701 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:34 -0700
Subject: pnfs/blocklayout: rewrite extent tracking

Currently the block layout driver tracks extents in three separate
data structures:

 - the two list of pnfs_block_extent structures returned by the server
 - the list of sectors that were in invalid state but have been written to
 - a list of pnfs_block_short_extent structures for LAYOUTCOMMIT

All of these share the property that they are not only highly inefficient
data structures, but also that operations on them are even more inefficient
than nessecary.

In addition there are various implementation defects like:

 - using an int to track sectors, causing corruption for large offsets
 - incorrect normalization of page or block granularity ranges
 - insufficient error handling
 - incorrect synchronization as extents can be modified while they are in
   use

This patch replace all three data with a single unified rbtree structure
tracking all extents, as well as their in-memory state, although we still
need to instance for read-only and read-write extent due to the arcane
client side COW feature in the block layouts spec.

To fix the problem of extent possibly being modified while in use we make
sure to return a copy of the extent for use in the write path - the
extent can only be invalidated by a layout recall or return which has
to wait until the I/O operations finished due to refcounts on the layout
segment.

The new extent tree work similar to the schemes used by block based
filesystems like XFS or ext4.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/Makefile         |   3 +-
 fs/nfs/blocklayout/blocklayout.c    | 258 +++-------
 fs/nfs/blocklayout/blocklayout.h    | 112 +----
 fs/nfs/blocklayout/blocklayoutdev.c |  35 +-
 fs/nfs/blocklayout/extent_tree.c    | 547 ++++++++++++++++++++++
 fs/nfs/blocklayout/extents.c        | 908 ------------------------------------
 6 files changed, 651 insertions(+), 1212 deletions(-)
 create mode 100644 fs/nfs/blocklayout/extent_tree.c
 delete mode 100644 fs/nfs/blocklayout/extents.c

diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3fa5ec780a8e 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
 # Makefile for the pNFS block layout driver kernel module
 #
 obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
+	extent_tree.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5aa23750a149..8502e620f644 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -49,26 +49,16 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
 
-/* Given the be associated with isect, determine if page data needs to be
- * initialized.
- */
-static int is_hole(struct pnfs_block_extent *be, sector_t isect)
-{
-	if (be->be_state == PNFS_BLOCK_NONE_DATA)
-		return 1;
-	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
-		return 0;
-	else
-		return !bl_is_sector_init(be->be_inval, isect);
-}
-
-/* Given the be associated with isect, determine if page data can be
- * written to disk.
- */
-static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+static bool is_hole(struct pnfs_block_extent *be)
 {
-	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-		be->be_state == PNFS_BLOCK_INVALID_DATA);
+	switch (be->be_state) {
+	case PNFS_BLOCK_NONE_DATA:
+		return true;
+	case PNFS_BLOCK_INVALID_DATA:
+		return be->be_tag ? false : true;
+	default:
+		return false;
+	}
 }
 
 /* The data we are handed might be spread across several bios.  We need
@@ -76,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
 	struct kref refcnt;
-	void (*pnfs_callback) (void *data, int num_se);
+	void (*pnfs_callback) (void *data);
 	void *data;
-	int bse_count;
 };
 
 static inline struct parallel_io *alloc_parallel(void *data)
@@ -89,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
 	if (rv) {
 		rv->data = data;
 		kref_init(&rv->refcnt);
-		rv->bse_count = 0;
 	}
 	return rv;
 }
@@ -104,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
 	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
 
 	dprintk("%s enter\n", __func__);
-	p->pnfs_callback(p->data, p->bse_count);
+	p->pnfs_callback(p->data);
 	kfree(p);
 }
 
@@ -200,7 +188,7 @@ static void bl_read_cleanup(struct work_struct *work)
 }
 
 static void
-bl_end_par_io_read(void *data, int unused)
+bl_end_par_io_read(void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
@@ -210,56 +198,46 @@ bl_end_par_io_read(void *data, int unused)
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_header *hdr)
+bl_read_pagelist(struct nfs_pgio_header *header)
 {
-	struct nfs_pgio_header *header = hdr;
-	int i, hole;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
-	loff_t f_offset = hdr->args.offset;
-	size_t bytes_left = hdr->args.count;
+	loff_t f_offset = header->args.offset;
+	size_t bytes_left = header->args.count;
 	unsigned int pg_offset, pg_len;
-	struct page **pages = hdr->args.pages;
-	int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
+	struct page **pages = header->args.pages;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
 	struct blk_plug plug;
+	int i;
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-		hdr->page_array.npages, f_offset,
-		(unsigned int)hdr->args.count);
+		header->page_array.npages, f_offset,
+		(unsigned int)header->args.count);
 
-	par = alloc_parallel(hdr);
+	par = alloc_parallel(header);
 	if (!par)
-		goto use_mds;
+		return PNFS_NOT_ATTEMPTED;
 	par->pnfs_callback = bl_end_par_io_read;
-	/* At this point, we can no longer jump to use_mds */
 
 	blk_start_plug(&plug);
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < hdr->page_array.npages; i++) {
+	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bl_put_extent(be);
-			bl_put_extent(cow_read);
 			bio = bl_submit_bio(READ, bio);
+
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, &cow_read);
-			if (!be) {
+			if (!ext_tree_lookup(bl, isect, &be, false)) {
 				header->pnfs_error = -EIO;
 				goto out;
 			}
-			extent_length = be->be_length -
-				(isect - be->be_f_offset);
-			if (cow_read) {
-				sector_t cow_length = cow_read->be_length -
-					(isect - cow_read->be_f_offset);
-				extent_length = min(extent_length, cow_length);
-			}
+			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
 		pg_offset = f_offset & ~PAGE_CACHE_MASK;
@@ -278,20 +256,16 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
-		hole = is_hole(be, isect);
-		if (hole && !cow_read) {
+		if (is_hole(&be)) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
 			zero_user_segment(pages[i], pg_offset, pg_len);
 		} else {
-			struct pnfs_block_extent *be_read;
-
-			be_read = (hole && cow_read) ? cow_read : be;
 			bio = do_add_page_to_bio(bio,
-						 hdr->page_array.npages - i,
+						 header->page_array.npages - i,
 						 READ,
-						 isect, pages[i], be_read,
+						 isect, pages[i], &be,
 						 bl_end_io_read, par,
 						 pg_offset, pg_len);
 			if (IS_ERR(bio)) {
@@ -304,50 +278,16 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 		extent_length -= (pg_len >> SECTOR_SHIFT);
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-		hdr->res.eof = 1;
-		hdr->res.count = header->inode->i_size - hdr->args.offset;
+		header->res.eof = 1;
+		header->res.count = header->inode->i_size - header->args.offset;
 	} else {
-		hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
+		header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
 	}
 out:
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
 	bl_submit_bio(READ, bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
-
- use_mds:
-	dprintk("Giving up and using normal NFS\n");
-	return PNFS_NOT_ATTEMPTED;
-}
-
-static void mark_extents_written(struct pnfs_block_layout *bl,
-				 __u64 offset, __u32 count)
-{
-	sector_t isect, end;
-	struct pnfs_block_extent *be;
-	struct pnfs_block_short_extent *se;
-
-	dprintk("%s(%llu, %u)\n", __func__, offset, count);
-	if (count == 0)
-		return;
-	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
-	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
-	end >>= SECTOR_SHIFT;
-	while (isect < end) {
-		sector_t len;
-		be = bl_find_get_extent(bl, isect, NULL);
-		BUG_ON(!be); /* FIXME */
-		len = min(end, be->be_f_offset + be->be_length) - isect;
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-			se = bl_pop_one_short_extent(be->be_inval);
-			BUG_ON(!se);
-			bl_mark_for_commit(be, isect, len, se);
-		}
-		isect += len;
-		bl_put_extent(be);
-	}
 }
 
 static void bl_end_io_write(struct bio *bio, int err)
@@ -370,29 +310,30 @@ static void bl_end_io_write(struct bio *bio, int err)
  */
 static void bl_write_cleanup(struct work_struct *work)
 {
-	struct rpc_task *task;
-	struct nfs_pgio_header *hdr;
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+	struct nfs_pgio_header *hdr =
+			container_of(task, struct nfs_pgio_header, task);
+
 	dprintk("%s enter\n", __func__);
-	task = container_of(work, struct rpc_task, u.tk_work);
-	hdr = container_of(task, struct nfs_pgio_header, task);
+
 	if (likely(!hdr->pnfs_error)) {
-		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
-				     hdr->args.offset, hdr->args.count);
+		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+		u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+		u64 end = (hdr->args.offset + hdr->args.count +
+			PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+
+		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+					(end - start) >> SECTOR_SHIFT);
 	}
+
 	pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data, int num_se)
+static void bl_end_par_io_write(void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(hdr->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
-					num_se);
-	}
-
 	hdr->task.tk_status = hdr->pnfs_error;
 	hdr->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
@@ -402,9 +343,9 @@ static void bl_end_par_io_write(void *data, int num_se)
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	int i, ret;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL;
+	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par = NULL;
 	loff_t offset = header->args.offset;
@@ -412,6 +353,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	struct page **pages = header->args.pages;
 	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	struct blk_plug plug;
+	int i;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 
@@ -421,9 +363,8 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	 */
 	par = alloc_parallel(header);
 	if (!par)
-		goto out_mds;
+		return PNFS_NOT_ATTEMPTED;
 	par->pnfs_callback = bl_end_par_io_write;
-	/* At this point, have to be more careful with error handling */
 
 	blk_start_plug(&plug);
 
@@ -434,44 +375,18 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bl_put_extent(be);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, NULL);
-			if (!be || !is_writable(be, isect)) {
+			if (!ext_tree_lookup(bl, isect, &be, true)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
 			}
-			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-				if (likely(!bl_push_one_short_extent(
-								be->be_inval)))
-					par->bse_count++;
-				else {
-					header->pnfs_error = -ENOMEM;
-					goto out;
-				}
-			}
-			extent_length = be->be_length -
-			    (isect - be->be_f_offset);
-		}
 
-		BUG_ON(offset & ~PAGE_CACHE_MASK);
-
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
-		    !bl_is_sector_init(be->be_inval, isect)) {
-			ret = bl_mark_sectors_init(be->be_inval, isect,
-						       PAGE_CACHE_SECTORS);
-			if (unlikely(ret)) {
-				dprintk("%s bl_mark_sectors_init fail %d\n",
-					__func__, ret);
-				header->pnfs_error = ret;
-				goto out;
-			}
+			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
-					 WRITE, isect, pages[i], be,
+					 WRITE, isect, pages[i], &be,
 					 bl_end_io_write, par,
 					 0, PAGE_CACHE_SIZE);
 		if (IS_ERR(bio)) {
@@ -487,60 +402,22 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 
 	header->res.count = header->args.count;
 out:
-	bl_put_extent(be);
 	bl_submit_bio(WRITE, bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
-out_mds:
-	return PNFS_NOT_ATTEMPTED;
-}
-
-/* FIXME - range ignored */
-static void
-release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
-{
-	int i;
-	struct pnfs_block_extent *be;
-
-	spin_lock(&bl->bl_ext_lock);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		while (!list_empty(&bl->bl_extents[i])) {
-			be = list_first_entry(&bl->bl_extents[i],
-					      struct pnfs_block_extent,
-					      be_node);
-			list_del(&be->be_node);
-			bl_put_extent(be);
-		}
-	}
-	spin_unlock(&bl->bl_ext_lock);
-}
-
-static void
-release_inval_marks(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_inval_tracking *pos, *temp;
-	struct pnfs_block_short_extent *se, *stemp;
-
-	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
-		list_del(&pos->it_link);
-		kfree(pos);
-	}
-
-	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
-		list_del(&se->bse_node);
-		kfree(se);
-	}
-	return;
 }
 
 static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int err;
 
 	dprintk("%s enter\n", __func__);
-	release_extents(bl, NULL);
-	release_inval_marks(&bl->bl_inval);
+
+	err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+	WARN_ON(err);
+
 	kfree(bl);
 }
 
@@ -553,14 +430,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
 	bl = kzalloc(sizeof(*bl), gfp_flags);
 	if (!bl)
 		return NULL;
+
+	bl->bl_ext_rw = RB_ROOT;
+	bl->bl_ext_ro = RB_ROOT;
 	spin_lock_init(&bl->bl_ext_lock);
-	INIT_LIST_HEAD(&bl->bl_extents[0]);
-	INIT_LIST_HEAD(&bl->bl_extents[1]);
-	INIT_LIST_HEAD(&bl->bl_commit);
-	INIT_LIST_HEAD(&bl->bl_committing);
-	bl->bl_count = 0;
-	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
-	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+
 	return &bl->bl_layout;
 }
 
@@ -600,7 +474,7 @@ bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
 		       const struct nfs4_layoutcommit_args *arg)
 {
 	dprintk("%s enter\n", __func__);
-	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+	ext_tree_encode_commit(BLK_LO2EXT(lo), xdr);
 }
 
 static void
@@ -609,7 +483,7 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
 	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
 
 	dprintk("%s enter\n", __func__);
-	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
+	ext_tree_mark_committed(BLK_LO2EXT(lo), lcdata->res.status);
 }
 
 static void free_blk_mountid(struct block_mount_id *mid)
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..b4f66d875f12 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -63,82 +63,28 @@ enum exstate4 {
 	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
 };
 
-#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
-
-struct my_tree {
-	sector_t		mtt_step_size;	/* Internal sector alignment */
-	struct list_head	mtt_stub; /* Should be a radix tree */
-};
-
-struct pnfs_inval_markings {
-	spinlock_t	im_lock;
-	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */
-	sector_t	im_block_size;	/* Server blocksize in sectors */
-	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */
-};
-
-struct pnfs_inval_tracking {
-	struct list_head it_link;
-	int		 it_sector;
-	int		 it_tags;
-};
-
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
-	struct kref	be_refcnt;
-	struct list_head be_node;	/* link into lseg list */
-	struct nfs4_deviceid be_devid;  /* FIXME: could use device cache instead */
+	union {
+		struct rb_node	be_node;
+		struct list_head be_list;
+	};
+	struct nfs4_deviceid be_devid;	/* FIXME: could use device cache instead */
 	struct block_device *be_mdev;
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
 	enum exstate4	be_state;	/* the state of this extent */
-	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
-};
-
-/* Shortened extent used by LAYOUTCOMMIT */
-struct pnfs_block_short_extent {
-	struct list_head bse_node;
-	struct nfs4_deviceid bse_devid;
-	struct block_device *bse_mdev;
-	sector_t	bse_f_offset;	/* the starting offset in the file */
-	sector_t	bse_length;	/* the size of the extent */
+#define EXTENT_WRITTEN		1
+#define EXTENT_COMMITTING	2
+	unsigned int	be_tag;
 };
 
-static inline void
-BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-{
-	spin_lock_init(&marks->im_lock);
-	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
-	INIT_LIST_HEAD(&marks->im_extents);
-	marks->im_block_size = blocksize;
-	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
-					   blocksize);
-}
-
-enum extentclass4 {
-	RW_EXTENT       = 0, /* READWRTE and INVAL */
-	RO_EXTENT       = 1, /* READ and NONE */
-	EXTENT_LISTS    = 2,
-};
-
-static inline int bl_choose_list(enum exstate4 state)
-{
-	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
-		return RO_EXTENT;
-	else
-		return RW_EXTENT;
-}
-
 struct pnfs_block_layout {
-	struct pnfs_layout_hdr bl_layout;
-	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+	struct pnfs_layout_hdr	bl_layout;
+	struct rb_root		bl_ext_rw;
+	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
-	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
-	struct list_head	bl_commit;	/* Needs layout commit */
-	struct list_head	bl_committing;	/* Layout committing */
-	unsigned int		bl_count;	/* entries in bl_commit */
-	sector_t		bl_blocksize;  /* Server blocksize in sectors */
 };
 
 #define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
@@ -183,29 +129,17 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 /* blocklayoutdm.c */
 void bl_free_block_dev(struct pnfs_block_dev *bdev);
 
-/* extents.c */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-		struct pnfs_block_extent **cow_read);
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			     sector_t offset, sector_t length);
-void bl_put_extent(struct pnfs_block_extent *be);
-struct pnfs_block_extent *bl_alloc_extent(void);
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
-int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-				   struct xdr_stream *xdr,
-				   const struct nfs4_layoutcommit_args *arg);
-void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-				   const struct nfs4_layoutcommit_args *arg,
-				   int status);
-int bl_add_merge_extent(struct pnfs_block_layout *bl,
-			 struct pnfs_block_extent *new);
-int bl_mark_for_commit(struct pnfs_block_extent *be,
-			sector_t offset, sector_t length,
-			struct pnfs_block_short_extent *new);
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+		struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+		sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent *ret, bool rw);
+int ext_tree_encode_commit(struct pnfs_block_layout *bl,
+		struct xdr_stream *xdr);
+void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index 63f77925aa87..cd71b5e231ec 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -309,7 +309,7 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	 * recovery easier.
 	 */
 	for (i = 0; i < count; i++) {
-		be = bl_alloc_extent();
+		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
 		if (!be) {
 			status = -ENOMEM;
 			goto out_err;
@@ -330,13 +330,11 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 		if (decode_sector_number(&p, &be->be_v_offset) < 0)
 			goto out_err;
 		be->be_state = be32_to_cpup(p++);
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-			be->be_inval = &bl->bl_inval;
 		if (verify_extent(be, &lv)) {
 			dprintk("%s verify failed\n", __func__);
 			goto out_err;
 		}
-		list_add_tail(&be->be_node, &extents);
+		list_add_tail(&be->be_list, &extents);
 	}
 	if (lgr->range.offset + lgr->range.length !=
 			lv.start << SECTOR_SHIFT) {
@@ -352,21 +350,13 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	/* Extents decoded properly, now try to merge them in to
 	 * existing layout extents.
 	 */
-	spin_lock(&bl->bl_ext_lock);
-	list_for_each_entry_safe(be, save, &extents, be_node) {
-		list_del(&be->be_node);
-		status = bl_add_merge_extent(bl, be);
-		if (status) {
-			spin_unlock(&bl->bl_ext_lock);
-			/* This is a fairly catastrophic error, as the
-			 * entire layout extent lists are now corrupted.
-			 * We should have some way to distinguish this.
-			 */
-			be = NULL;
-			goto out_err;
-		}
+	list_for_each_entry_safe(be, save, &extents, be_list) {
+		list_del(&be->be_list);
+
+		status = ext_tree_insert(bl, be);
+		if (status)
+			goto out_free_list;
 	}
-	spin_unlock(&bl->bl_ext_lock);
 	status = 0;
  out:
 	__free_page(scratch);
@@ -374,12 +364,13 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	return status;
 
  out_err:
-	bl_put_extent(be);
+	kfree(be);
+ out_free_list:
 	while (!list_empty(&extents)) {
 		be = list_first_entry(&extents, struct pnfs_block_extent,
-				      be_node);
-		list_del(&be->be_node);
-		bl_put_extent(be);
+				      be_list);
+		list_del(&be->be_list);
+		kfree(be);
 	}
 	goto out;
 }
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..c8c59a5b1a8f
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+	return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *node = rb_first(root);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_prev(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_next(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	return be->be_f_offset + be->be_length;
+}
+
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+	struct rb_node *node = root->rb_node;
+	struct pnfs_block_extent *be = NULL;
+
+	while (node) {
+		be = ext_node(node);
+		if (start < be->be_f_offset)
+			node = node->rb_left;
+		else if (start >= ext_f_end(be))
+			node = node->rb_right;
+		else
+			return be;
+	}
+
+	if (be) {
+		if (start < be->be_f_offset)
+			return be;
+
+		if (start >= ext_f_end(be))
+			return ext_tree_next(be);
+	}
+
+	return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	if (be1->be_state != be2->be_state)
+		return false;
+	if (be1->be_mdev != be2->be_mdev)
+		return false;
+
+	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
+		return false;
+
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
+		return false;
+
+	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
+	    be1->be_tag != be2->be_tag)
+		return false;
+
+	return true;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *left = ext_tree_prev(be);
+
+	if (left && ext_can_merge(left, be)) {
+		left->be_length += be->be_length;
+		rb_erase(&be->be_node, root);
+		kfree(be);
+		return left;
+	}
+
+	return be;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *right = ext_tree_next(be);
+
+	if (right && ext_can_merge(be, right)) {
+		be->be_length += right->be_length;
+		rb_erase(&right->be_node, root);
+		kfree(right);
+	}
+
+	return be;
+}
+
+static void
+__ext_tree_insert(struct rb_root *root,
+		struct pnfs_block_extent *new, bool merge_ok)
+{
+	struct rb_node **p = &root->rb_node, *parent = NULL;
+	struct pnfs_block_extent *be;
+
+	while (*p) {
+		parent = *p;
+		be = ext_node(parent);
+
+		if (new->be_f_offset < be->be_f_offset) {
+			if (merge_ok && ext_can_merge(new, be)) {
+				be->be_f_offset = new->be_f_offset;
+				if (be->be_state != PNFS_BLOCK_NONE_DATA)
+					be->be_v_offset = new->be_v_offset;
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_left(root, be);
+				kfree(new);
+				return;
+			}
+			p = &(*p)->rb_left;
+		} else if (new->be_f_offset >= ext_f_end(be)) {
+			if (merge_ok && ext_can_merge(be, new)) {
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_right(root, be);
+				kfree(new);
+				return;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->be_node, parent, p);
+	rb_insert_color(&new->be_node, root);
+}
+
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+	struct pnfs_block_extent *be;
+	sector_t len1 = 0, len2 = 0;
+	sector_t orig_f_offset;
+	sector_t orig_v_offset;
+	sector_t orig_len;
+
+	be = __ext_tree_search(root, start);
+	if (!be)
+		return 0;
+	if (be->be_f_offset >= end)
+		return 0;
+
+	orig_f_offset = be->be_f_offset;
+	orig_v_offset = be->be_v_offset;
+	orig_len = be->be_length;
+
+	if (start > be->be_f_offset)
+		len1 = start - be->be_f_offset;
+	if (ext_f_end(be) > end)
+		len2 = ext_f_end(be) - end;
+
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct pnfs_block_extent *new;
+
+			new = kzalloc(sizeof(*new), GFP_ATOMIC);
+			if (!new)
+				return -ENOMEM;
+
+			be->be_length = len1;
+
+			new->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				new->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			new->be_length = len2;
+			new->be_state = be->be_state;
+			new->be_tag = be->be_tag;
+			new->be_mdev = be->be_mdev;
+			memcpy(&new->be_devid, &be->be_devid,
+				sizeof(struct nfs4_deviceid));
+
+			__ext_tree_insert(root, new, true);
+		} else {
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				be->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			be->be_length = len2;
+		}
+	} else {
+		if (len1 > 0) {
+			be->be_length = len1;
+			be = ext_tree_next(be);
+		}
+
+		while (be && ext_f_end(be) <= end) {
+			struct pnfs_block_extent *next = ext_tree_next(be);
+
+			rb_erase(&be->be_node, root);
+			kfree(be);
+			be = next;
+		}
+
+		if (be && be->be_f_offset < end) {
+			len1 = ext_f_end(be) - end;
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA)
+				be->be_v_offset += be->be_length - len1;
+			be->be_length = len1;
+		}
+	}
+
+	return 0;
+}
+
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			kfree(new);
+		} else {
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		__ext_tree_insert(root, split, true);
+
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *node;
+	struct pnfs_block_extent *be;
+
+	node = root->rb_node;
+	while (node) {
+		be = ext_node(node);
+		if (isect < be->be_f_offset)
+			node = node->rb_left;
+		else if (isect >= ext_f_end(be))
+			node = node->rb_right;
+		else {
+			*ret = *be;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+	    struct pnfs_block_extent *ret, bool rw)
+{
+	bool found = false;
+
+	spin_lock(&bl->bl_ext_lock);
+	if (!rw)
+		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+	if (!found)
+		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return found;
+}
+
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+		sector_t start, sector_t end)
+{
+	int err, err2;
+
+	spin_lock(&bl->bl_ext_lock);
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (rw) {
+		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+		if (!err)
+			err = err2;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	return err;
+}
+
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+		sector_t split)
+{
+	struct pnfs_block_extent *new;
+	sector_t orig_len = be->be_length;
+
+	dprintk("%s: need split for 0x%lx:0x%lx at 0x%lx\n",
+		__func__, be->be_f_offset, ext_f_end(be), split);
+
+	new = kzalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		return -ENOMEM;
+
+	be->be_length = split - be->be_f_offset;
+
+	new->be_f_offset = split;
+	if (be->be_state != PNFS_BLOCK_NONE_DATA)
+		new->be_v_offset = be->be_v_offset + be->be_length;
+	new->be_length = orig_len - be->be_length;
+	new->be_state = be->be_state;
+	new->be_tag = be->be_tag;
+
+	new->be_mdev = be->be_mdev;
+	memcpy(&new->be_devid, &be->be_devid, sizeof(struct nfs4_deviceid));
+
+	dprintk("%s: got 0x%lx:0x%lx!\n",
+		__func__, be->be_f_offset, ext_f_end(be));
+	dprintk("%s: got 0x%lx:0x%lx!\n",
+		__func__, new->be_f_offset, ext_f_end(new));
+
+	__ext_tree_insert(root, new, false);
+	return 0;
+}
+
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	sector_t end = start + len;
+	struct pnfs_block_extent *be;
+	int err = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	/*
+	 * First remove all COW extents or holes from written to range.
+	 */
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (err)
+		goto out;
+
+	/*
+	 * Then mark all invalid extents in the range as written to.
+	 */
+	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+		if (be->be_f_offset >= end)
+			break;
+
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+			continue;
+
+		if (be->be_f_offset < start) {
+			struct pnfs_block_extent *left = ext_tree_prev(be);
+
+			if (left && ext_can_merge(left, be)) {
+				sector_t diff = start - be->be_f_offset;
+
+				left->be_length += diff;
+
+				be->be_f_offset += diff;
+				be->be_v_offset += diff;
+				be->be_length -= diff;
+			} else {
+				err = ext_tree_split(root, be, start);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (ext_f_end(be) > end) {
+			struct pnfs_block_extent *right = ext_tree_next(be);
+
+			if (right && ext_can_merge(be, right)) {
+				sector_t diff = end - be->be_f_offset;
+
+				be->be_length -= diff;
+
+				right->be_f_offset -= diff;
+				right->be_v_offset -= diff;
+				right->be_length += diff;
+			} else {
+				err = ext_tree_split(root, be, end);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+			be->be_tag = EXTENT_WRITTEN;
+			be = ext_try_to_merge_left(root, be);
+			be = ext_try_to_merge_right(root, be);
+		}
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+int
+ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
+{
+	struct pnfs_block_extent *be;
+	unsigned int count = 0;
+	__be32 *p, *xdr_start;
+	int ret = 0;
+
+	dprintk("%s enter\n", __func__);
+
+	xdr_start = xdr_reserve_space(xdr, 8);
+	if (!xdr_start)
+		return -ENOSPC;
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		p = xdr_reserve_space(xdr, 7 * sizeof(__be32) +
+					NFS4_DEVICEID4_SIZE);
+		if (!p) {
+			printk("%s: out of space for extent list\n", __func__);
+			ret = -ENOSPC;
+			break;
+		}
+
+		p = xdr_encode_opaque_fixed(p, be->be_devid.data,
+				NFS4_DEVICEID4_SIZE);
+		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, 0LL);
+		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+		be->be_tag = EXTENT_COMMITTING;
+		count++;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
+	xdr_start[1] = cpu_to_be32(count);
+
+	dprintk("%s found %i ranges\n", __func__, count);
+	return ret;
+}
+
+void
+ext_tree_mark_committed(struct pnfs_block_layout *bl, int status)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s status %d\n", __func__, status);
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_COMMITTING)
+			continue;
+
+		if (status) {
+			/*
+			 * Mark as written and try again.
+			 *
+			 * XXX: some real error handling here wouldn't hurt..
+			 */
+			be->be_tag = EXTENT_WRITTEN;
+		} else {
+			be->be_state = PNFS_BLOCK_READWRITE_DATA;
+			be->be_tag = 0;
+		}
+
+		be = ext_try_to_merge_left(root, be);
+		be = ext_try_to_merge_right(root, be);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- *  linux/fs/nfs/blocklayout/blocklayout.h
- *
- *  Module for the NFSv4.1 pNFS block layout driver.
- *
- *  Copyright (c) 2006 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@citi.umich.edu>
- *  Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization.  if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose.  the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include "blocklayout.h"
-#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-
-/* Bit numbers */
-#define EXTENT_INITIALIZED 0
-#define EXTENT_WRITTEN     1
-#define EXTENT_IN_COMMIT   2
-#define INTERNAL_EXISTS    MY_MAX_TAGS
-#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
-
-/* Returns largest t<=s s.t. t%base==0 */
-static inline sector_t normalize(sector_t s, int base)
-{
-	sector_t tmp = s; /* Since do_div modifies its argument */
-	return s - sector_div(tmp, base);
-}
-
-static inline sector_t normalize_up(sector_t s, int base)
-{
-	return normalize(s + base - 1, base);
-}
-
-/* Complete stub using list while determine API wanted */
-
-/* Returns tags, or negative */
-static int32_t _find_entry(struct my_tree *tree, u64 s)
-{
-	struct pnfs_inval_tracking *pos;
-
-	dprintk("%s(%llu) enter\n", __func__, s);
-	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-		if (pos->it_sector > s)
-			continue;
-		else if (pos->it_sector == s)
-			return pos->it_tags & INTERNAL_MASK;
-		else
-			break;
-	}
-	return -ENOENT;
-}
-
-static inline
-int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
-{
-	int32_t tags;
-
-	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
-	s = normalize(s, tree->mtt_step_size);
-	tags = _find_entry(tree, s);
-	if ((tags < 0) || !(tags & (1 << tag)))
-		return 0;
-	else
-		return 1;
-}
-
-/* Creates entry with tag, or if entry already exists, unions tag to it.
- * If storage is not NULL, newly created entry will use it.
- * Returns number of entries added, or negative on error.
- */
-static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
-		      struct pnfs_inval_tracking *storage)
-{
-	int found = 0;
-	struct pnfs_inval_tracking *pos;
-
-	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
-	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-		if (pos->it_sector > s)
-			continue;
-		else if (pos->it_sector == s) {
-			found = 1;
-			break;
-		} else
-			break;
-	}
-	if (found) {
-		pos->it_tags |= (1 << tag);
-		return 0;
-	} else {
-		struct pnfs_inval_tracking *new;
-		new = storage;
-		new->it_sector = s;
-		new->it_tags = (1 << tag);
-		list_add(&new->it_link, &pos->it_link);
-		return 1;
-	}
-}
-
-/* XXXX Really want option to not create */
-/* Over range, unions tag with existing entries, else creates entry with tag */
-static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
-{
-	u64 i;
-
-	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
-	for (i = normalize(s, tree->mtt_step_size); i < s + length;
-	     i += tree->mtt_step_size)
-		if (_add_entry(tree, i, tag, NULL))
-			return -ENOMEM;
-	return 0;
-}
-
-/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct pnfs_inval_markings *marks,
-		u64 offset, u64 length)
-{
-	u64 start, end, s;
-	int count, i, used = 0, status = -ENOMEM;
-	struct pnfs_inval_tracking **storage;
-	struct my_tree  *tree = &marks->im_tree;
-
-	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
-	start = normalize(offset, tree->mtt_step_size);
-	end = normalize_up(offset + length, tree->mtt_step_size);
-	count = (int)(end - start) / (int)tree->mtt_step_size;
-
-	/* Pre-malloc what memory we might need */
-	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
-	if (!storage)
-		return -ENOMEM;
-	for (i = 0; i < count; i++) {
-		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
-				     GFP_NOFS);
-		if (!storage[i])
-			goto out_cleanup;
-	}
-
-	spin_lock_bh(&marks->im_lock);
-	for (s = start; s < end; s += tree->mtt_step_size)
-		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
-	spin_unlock_bh(&marks->im_lock);
-
-	status = 0;
-
- out_cleanup:
-	for (i = used; i < count; i++) {
-		if (!storage[i])
-			break;
-		kfree(storage[i]);
-	}
-	kfree(storage);
-	return status;
-}
-
-/* We are relying on page lock to serialize this */
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
-{
-	int rv;
-
-	spin_lock_bh(&marks->im_lock);
-	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
-	spin_unlock_bh(&marks->im_lock);
-	return rv;
-}
-
-/* Assume start, end already sector aligned */
-static int
-_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
-{
-	struct pnfs_inval_tracking *pos;
-	u64 expect = 0;
-
-	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
-	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-		if (pos->it_sector >= end)
-			continue;
-		if (!expect) {
-			if ((pos->it_sector == end - tree->mtt_step_size) &&
-			    (pos->it_tags & (1 << tag))) {
-				expect = pos->it_sector - tree->mtt_step_size;
-				if (pos->it_sector < tree->mtt_step_size || expect < start)
-					return 1;
-				continue;
-			} else {
-				return 0;
-			}
-		}
-		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
-			return 0;
-		expect -= tree->mtt_step_size;
-		if (expect < start)
-			return 1;
-	}
-	return 0;
-}
-
-static int is_range_written(struct pnfs_inval_markings *marks,
-			    sector_t start, sector_t end)
-{
-	int rv;
-
-	spin_lock_bh(&marks->im_lock);
-	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
-	spin_unlock_bh(&marks->im_lock);
-	return rv;
-}
-
-/* Marks sectors in [offest, offset_length) as having been initialized.
- * All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Currently assumes offset is page-aligned
- */
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			     sector_t offset, sector_t length)
-{
-	sector_t start, end;
-
-	dprintk("%s(offset=%llu,len=%llu) enter\n",
-		__func__, (u64)offset, (u64)length);
-
-	start = normalize(offset, marks->im_block_size);
-	end = normalize_up(offset + length, marks->im_block_size);
-	if (_preload_range(marks, start, end - start))
-		goto outerr;
-
-	spin_lock_bh(&marks->im_lock);
-	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
-		goto out_unlock;
-	spin_unlock_bh(&marks->im_lock);
-
-	return 0;
-
-out_unlock:
-	spin_unlock_bh(&marks->im_lock);
-outerr:
-	return -ENOMEM;
-}
-
-/* Marks sectors in [offest, offset+length) as having been written to disk.
- * All lengths should be block aligned.
- */
-static int mark_written_sectors(struct pnfs_inval_markings *marks,
-				sector_t offset, sector_t length)
-{
-	int status;
-
-	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
-		(u64)offset, (u64)length);
-	spin_lock_bh(&marks->im_lock);
-	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
-	spin_unlock_bh(&marks->im_lock);
-	return status;
-}
-
-static void print_short_extent(struct pnfs_block_short_extent *be)
-{
-	dprintk("PRINT SHORT EXTENT extent %p\n", be);
-	if (be) {
-		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
-		dprintk("        be_length   %llu\n", (u64)be->bse_length);
-	}
-}
-
-static void print_clist(struct list_head *list, unsigned int count)
-{
-	struct pnfs_block_short_extent *be;
-	unsigned int i = 0;
-
-	ifdebug(FACILITY) {
-		printk(KERN_DEBUG "****************\n");
-		printk(KERN_DEBUG "Extent list looks like:\n");
-		list_for_each_entry(be, list, bse_node) {
-			i++;
-			print_short_extent(be);
-		}
-		if (i != count)
-			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
-		printk(KERN_DEBUG "****************\n");
-	}
-}
-
-/* Note: In theory, we should do more checking that devid's match between
- * old and new, but if they don't, the lists are too corrupt to salvage anyway.
- */
-/* Note this is very similar to bl_add_merge_extent */
-static void add_to_commitlist(struct pnfs_block_layout *bl,
-			      struct pnfs_block_short_extent *new)
-{
-	struct list_head *clist = &bl->bl_commit;
-	struct pnfs_block_short_extent *old, *save;
-	sector_t end = new->bse_f_offset + new->bse_length;
-
-	dprintk("%s enter\n", __func__);
-	print_short_extent(new);
-	print_clist(clist, bl->bl_count);
-	bl->bl_count++;
-	/* Scan for proper place to insert, extending new to the left
-	 * as much as possible.
-	 */
-	list_for_each_entry_safe(old, save, clist, bse_node) {
-		if (new->bse_f_offset < old->bse_f_offset)
-			break;
-		if (end <= old->bse_f_offset + old->bse_length) {
-			/* Range is already in list */
-			bl->bl_count--;
-			kfree(new);
-			return;
-		} else if (new->bse_f_offset <=
-				old->bse_f_offset + old->bse_length) {
-			/* new overlaps or abuts existing be */
-			if (new->bse_mdev == old->bse_mdev) {
-				/* extend new to fully replace old */
-				new->bse_length += new->bse_f_offset -
-						old->bse_f_offset;
-				new->bse_f_offset = old->bse_f_offset;
-				list_del(&old->bse_node);
-				bl->bl_count--;
-				kfree(old);
-			}
-		}
-	}
-	/* Note that if we never hit the above break, old will not point to a
-	 * valid extent.  However, in that case &old->bse_node==list.
-	 */
-	list_add_tail(&new->bse_node, &old->bse_node);
-	/* Scan forward for overlaps.  If we find any, extend new and
-	 * remove the overlapped extent.
-	 */
-	old = list_prepare_entry(new, clist, bse_node);
-	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
-		if (end < old->bse_f_offset)
-			break;
-		/* new overlaps or abuts old */
-		if (new->bse_mdev == old->bse_mdev) {
-			if (end < old->bse_f_offset + old->bse_length) {
-				/* extend new to fully cover old */
-				end = old->bse_f_offset + old->bse_length;
-				new->bse_length = end - new->bse_f_offset;
-			}
-			list_del(&old->bse_node);
-			bl->bl_count--;
-			kfree(old);
-		}
-	}
-	dprintk("%s: after merging\n", __func__);
-	print_clist(clist, bl->bl_count);
-}
-
-/* Note the range described by offset, length is guaranteed to be contained
- * within be.
- * new will be freed, either by this function or add_to_commitlist if they
- * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
-int bl_mark_for_commit(struct pnfs_block_extent *be,
-		    sector_t offset, sector_t length,
-		    struct pnfs_block_short_extent *new)
-{
-	sector_t new_end, end = offset + length;
-	struct pnfs_block_layout *bl = container_of(be->be_inval,
-						    struct pnfs_block_layout,
-						    bl_inval);
-
-	mark_written_sectors(be->be_inval, offset, length);
-	/* We want to add the range to commit list, but it must be
-	 * block-normalized, and verified that the normalized range has
-	 * been entirely written to disk.
-	 */
-	new->bse_f_offset = offset;
-	offset = normalize(offset, bl->bl_blocksize);
-	if (offset < new->bse_f_offset) {
-		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
-			new->bse_f_offset = offset;
-		else
-			new->bse_f_offset = offset + bl->bl_blocksize;
-	}
-	new_end = normalize_up(end, bl->bl_blocksize);
-	if (end < new_end) {
-		if (is_range_written(be->be_inval, end, new_end))
-			end = new_end;
-		else
-			end = new_end - bl->bl_blocksize;
-	}
-	if (end <= new->bse_f_offset) {
-		kfree(new);
-		return 0;
-	}
-	new->bse_length = end - new->bse_f_offset;
-	new->bse_devid = be->be_devid;
-	new->bse_mdev = be->be_mdev;
-
-	spin_lock(&bl->bl_ext_lock);
-	add_to_commitlist(bl, new);
-	spin_unlock(&bl->bl_ext_lock);
-	return 0;
-}
-
-static void print_bl_extent(struct pnfs_block_extent *be)
-{
-	dprintk("PRINT EXTENT extent %p\n", be);
-	if (be) {
-		dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset);
-		dprintk("        be_length   %llu\n", (u64)be->be_length);
-		dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset);
-		dprintk("        be_state    %d\n", be->be_state);
-	}
-}
-
-static void
-destroy_extent(struct kref *kref)
-{
-	struct pnfs_block_extent *be;
-
-	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
-	dprintk("%s be=%p\n", __func__, be);
-	kfree(be);
-}
-
-void
-bl_put_extent(struct pnfs_block_extent *be)
-{
-	if (be) {
-		dprintk("%s enter %p (%i)\n", __func__, be,
-			atomic_read(&be->be_refcnt.refcount));
-		kref_put(&be->be_refcnt, destroy_extent);
-	}
-}
-
-struct pnfs_block_extent *bl_alloc_extent(void)
-{
-	struct pnfs_block_extent *be;
-
-	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
-	if (!be)
-		return NULL;
-	INIT_LIST_HEAD(&be->be_node);
-	kref_init(&be->be_refcnt);
-	be->be_inval = NULL;
-	return be;
-}
-
-static void print_elist(struct list_head *list)
-{
-	struct pnfs_block_extent *be;
-	dprintk("****************\n");
-	dprintk("Extent list looks like:\n");
-	list_for_each_entry(be, list, be_node) {
-		print_bl_extent(be);
-	}
-	dprintk("****************\n");
-}
-
-static inline int
-extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
-{
-	/* Note this assumes new->be_f_offset >= old->be_f_offset */
-	return (new->be_state == old->be_state) &&
-		((new->be_state == PNFS_BLOCK_NONE_DATA) ||
-		 ((new->be_v_offset - old->be_v_offset ==
-		   new->be_f_offset - old->be_f_offset) &&
-		  new->be_mdev == old->be_mdev));
-}
-
-/* Adds new to appropriate list in bl, modifying new and removing existing
- * extents as appropriate to deal with overlaps.
- *
- * See bl_find_get_extent for list constraints.
- *
- * Refcount on new is already set.  If end up not using it, or error out,
- * need to put the reference.
- *
- * bl->bl_ext_lock is held by caller.
- */
-int
-bl_add_merge_extent(struct pnfs_block_layout *bl,
-		     struct pnfs_block_extent *new)
-{
-	struct pnfs_block_extent *be, *tmp;
-	sector_t end = new->be_f_offset + new->be_length;
-	struct list_head *list;
-
-	dprintk("%s enter with be=%p\n", __func__, new);
-	print_bl_extent(new);
-	list = &bl->bl_extents[bl_choose_list(new->be_state)];
-	print_elist(list);
-
-	/* Scan for proper place to insert, extending new to the left
-	 * as much as possible.
-	 */
-	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
-		if (new->be_f_offset >= be->be_f_offset + be->be_length)
-			break;
-		if (new->be_f_offset >= be->be_f_offset) {
-			if (end <= be->be_f_offset + be->be_length) {
-				/* new is a subset of existing be*/
-				if (extents_consistent(be, new)) {
-					dprintk("%s: new is subset, ignoring\n",
-						__func__);
-					bl_put_extent(new);
-					return 0;
-				} else {
-					goto out_err;
-				}
-			} else {
-				/* |<--   be   -->|
-				 *          |<--   new   -->| */
-				if (extents_consistent(be, new)) {
-					/* extend new to fully replace be */
-					new->be_length += new->be_f_offset -
-						be->be_f_offset;
-					new->be_f_offset = be->be_f_offset;
-					new->be_v_offset = be->be_v_offset;
-					dprintk("%s: removing %p\n", __func__, be);
-					list_del(&be->be_node);
-					bl_put_extent(be);
-				} else {
-					goto out_err;
-				}
-			}
-		} else if (end >= be->be_f_offset + be->be_length) {
-			/* new extent overlap existing be */
-			if (extents_consistent(be, new)) {
-				/* extend new to fully replace be */
-				dprintk("%s: removing %p\n", __func__, be);
-				list_del(&be->be_node);
-				bl_put_extent(be);
-			} else {
-				goto out_err;
-			}
-		} else if (end > be->be_f_offset) {
-			/*           |<--   be   -->|
-			 *|<--   new   -->| */
-			if (extents_consistent(new, be)) {
-				/* extend new to fully replace be */
-				new->be_length += be->be_f_offset + be->be_length -
-					new->be_f_offset - new->be_length;
-				dprintk("%s: removing %p\n", __func__, be);
-				list_del(&be->be_node);
-				bl_put_extent(be);
-			} else {
-				goto out_err;
-			}
-		}
-	}
-	/* Note that if we never hit the above break, be will not point to a
-	 * valid extent.  However, in that case &be->be_node==list.
-	 */
-	list_add(&new->be_node, &be->be_node);
-	dprintk("%s: inserting new\n", __func__);
-	print_elist(list);
-	/* FIXME - The per-list consistency checks have all been done,
-	 * should now check cross-list consistency.
-	 */
-	return 0;
-
- out_err:
-	bl_put_extent(new);
-	return -EIO;
-}
-
-/* Returns extent, or NULL.  If a second READ extent exists, it is returned
- * in cow_read, if given.
- *
- * The extents are kept in two seperate ordered lists, one for READ and NONE,
- * one for READWRITE and INVALID.  Within each list, we assume:
- * 1. Extents are ordered by file offset.
- * 2. For any given isect, there is at most one extents that matches.
- */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-	    struct pnfs_block_extent **cow_read)
-{
-	struct pnfs_block_extent *be, *cow, *ret;
-	int i;
-
-	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
-	cow = ret = NULL;
-	spin_lock(&bl->bl_ext_lock);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
-			if (isect >= be->be_f_offset + be->be_length)
-				break;
-			if (isect >= be->be_f_offset) {
-				/* We have found an extent */
-				dprintk("%s Get %p (%i)\n", __func__, be,
-					atomic_read(&be->be_refcnt.refcount));
-				kref_get(&be->be_refcnt);
-				if (!ret)
-					ret = be;
-				else if (be->be_state != PNFS_BLOCK_READ_DATA)
-					bl_put_extent(be);
-				else
-					cow = be;
-				break;
-			}
-		}
-		if (ret &&
-		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
-			break;
-	}
-	spin_unlock(&bl->bl_ext_lock);
-	if (cow_read)
-		*cow_read = cow;
-	print_bl_extent(ret);
-	return ret;
-}
-
-/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
-static struct pnfs_block_extent *
-bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
-{
-	struct pnfs_block_extent *be, *ret = NULL;
-	int i;
-
-	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		if (ret)
-			break;
-		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
-			if (isect >= be->be_f_offset + be->be_length)
-				break;
-			if (isect >= be->be_f_offset) {
-				/* We have found an extent */
-				dprintk("%s Get %p (%i)\n", __func__, be,
-					atomic_read(&be->be_refcnt.refcount));
-				kref_get(&be->be_refcnt);
-				ret = be;
-				break;
-			}
-		}
-	}
-	print_bl_extent(ret);
-	return ret;
-}
-
-int
-encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-			       struct xdr_stream *xdr,
-			       const struct nfs4_layoutcommit_args *arg)
-{
-	struct pnfs_block_short_extent *lce, *save;
-	unsigned int count = 0;
-	__be32 *p, *xdr_start;
-
-	dprintk("%s enter\n", __func__);
-	/* BUG - creation of bl_commit is buggy - need to wait for
-	 * entire block to be marked WRITTEN before it can be added.
-	 */
-	spin_lock(&bl->bl_ext_lock);
-	/* Want to adjust for possible truncate */
-	/* We now want to adjust argument range */
-
-	/* XDR encode the ranges found */
-	xdr_start = xdr_reserve_space(xdr, 8);
-	if (!xdr_start)
-		goto out;
-	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
-		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
-		if (!p)
-			break;
-		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
-		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, 0LL);
-		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-		list_move_tail(&lce->bse_node, &bl->bl_committing);
-		bl->bl_count--;
-		count++;
-	}
-	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
-	xdr_start[1] = cpu_to_be32(count);
-out:
-	spin_unlock(&bl->bl_ext_lock);
-	dprintk("%s found %i ranges\n", __func__, count);
-	return 0;
-}
-
-/* Helper function to set_to_rw that initialize a new extent */
-static void
-_prep_new_extent(struct pnfs_block_extent *new,
-		 struct pnfs_block_extent *orig,
-		 sector_t offset, sector_t length, int state)
-{
-	kref_init(&new->be_refcnt);
-	/* don't need to INIT_LIST_HEAD(&new->be_node) */
-	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
-	new->be_mdev = orig->be_mdev;
-	new->be_f_offset = offset;
-	new->be_length = length;
-	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
-	new->be_state = state;
-	new->be_inval = orig->be_inval;
-}
-
-/* Tries to merge be with extent in front of it in list.
- * Frees storage if not used.
- */
-static struct pnfs_block_extent *
-_front_merge(struct pnfs_block_extent *be, struct list_head *head,
-	     struct pnfs_block_extent *storage)
-{
-	struct pnfs_block_extent *prev;
-
-	if (!storage)
-		goto no_merge;
-	if (&be->be_node == head || be->be_node.prev == head)
-		goto no_merge;
-	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
-	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
-	    !extents_consistent(prev, be))
-		goto no_merge;
-	_prep_new_extent(storage, prev, prev->be_f_offset,
-			 prev->be_length + be->be_length, prev->be_state);
-	list_replace(&prev->be_node, &storage->be_node);
-	bl_put_extent(prev);
-	list_del(&be->be_node);
-	bl_put_extent(be);
-	return storage;
-
- no_merge:
-	kfree(storage);
-	return be;
-}
-
-static u64
-set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
-{
-	u64 rv = offset + length;
-	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
-	struct pnfs_block_extent *children[3];
-	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
-	int i = 0, j;
-
-	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
-	/* Create storage for up to three new extents e1, e2, e3 */
-	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
-	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
-	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
-	/* BUG - we are ignoring any failure */
-	if (!e1 || !e2 || !e3)
-		goto out_nosplit;
-
-	spin_lock(&bl->bl_ext_lock);
-	be = bl_find_get_extent_locked(bl, offset);
-	rv = be->be_f_offset + be->be_length;
-	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
-		spin_unlock(&bl->bl_ext_lock);
-		goto out_nosplit;
-	}
-	/* Add e* to children, bumping e*'s krefs */
-	if (be->be_f_offset != offset) {
-		_prep_new_extent(e1, be, be->be_f_offset,
-				 offset - be->be_f_offset,
-				 PNFS_BLOCK_INVALID_DATA);
-		children[i++] = e1;
-		print_bl_extent(e1);
-	} else
-		merge1 = e1;
-	_prep_new_extent(e2, be, offset,
-			 min(length, be->be_f_offset + be->be_length - offset),
-			 PNFS_BLOCK_READWRITE_DATA);
-	children[i++] = e2;
-	print_bl_extent(e2);
-	if (offset + length < be->be_f_offset + be->be_length) {
-		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
-				 be->be_f_offset + be->be_length -
-				 offset - length,
-				 PNFS_BLOCK_INVALID_DATA);
-		children[i++] = e3;
-		print_bl_extent(e3);
-	} else
-		merge2 = e3;
-
-	/* Remove be from list, and insert the e* */
-	/* We don't get refs on e*, since this list is the base reference
-	 * set when init'ed.
-	 */
-	if (i < 3)
-		children[i] = NULL;
-	new = children[0];
-	list_replace(&be->be_node, &new->be_node);
-	bl_put_extent(be);
-	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
-	for (j = 1; j < i; j++) {
-		old = new;
-		new = children[j];
-		list_add(&new->be_node, &old->be_node);
-	}
-	if (merge2) {
-		/* This is a HACK, should just create a _back_merge function */
-		new = list_entry(new->be_node.next,
-				 struct pnfs_block_extent, be_node);
-		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
-	}
-	spin_unlock(&bl->bl_ext_lock);
-
-	/* Since we removed the base reference above, be is now scheduled for
-	 * destruction.
-	 */
-	bl_put_extent(be);
-	dprintk("%s returns %llu after split\n", __func__, rv);
-	return rv;
-
- out_nosplit:
-	kfree(e1);
-	kfree(e2);
-	kfree(e3);
-	dprintk("%s returns %llu without splitting\n", __func__, rv);
-	return rv;
-}
-
-void
-clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-			      const struct nfs4_layoutcommit_args *arg,
-			      int status)
-{
-	struct pnfs_block_short_extent *lce, *save;
-
-	dprintk("%s status %d\n", __func__, status);
-	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
-		if (likely(!status)) {
-			u64 offset = lce->bse_f_offset;
-			u64 end = offset + lce->bse_length;
-
-			do {
-				offset = set_to_rw(bl, offset, end - offset);
-			} while (offset < end);
-			list_del(&lce->bse_node);
-
-			kfree(lce);
-		} else {
-			list_del(&lce->bse_node);
-			spin_lock(&bl->bl_ext_lock);
-			add_to_commitlist(bl, lce);
-			spin_unlock(&bl->bl_ext_lock);
-		}
-	}
-}
-
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_block_short_extent *new;
-
-	new = kmalloc(sizeof(*new), GFP_NOFS);
-	if (unlikely(!new))
-		return -ENOMEM;
-
-	spin_lock_bh(&marks->im_lock);
-	list_add(&new->bse_node, &marks->im_extents);
-	spin_unlock_bh(&marks->im_lock);
-
-	return 0;
-}
-
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_block_short_extent *rv = NULL;
-
-	spin_lock_bh(&marks->im_lock);
-	if (!list_empty(&marks->im_extents)) {
-		rv = list_entry((&marks->im_extents)->next,
-				struct pnfs_block_short_extent, bse_node);
-		list_del_init(&rv->bse_node);
-	}
-	spin_unlock_bh(&marks->im_lock);
-
-	return rv;
-}
-
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
-{
-	struct pnfs_block_short_extent *se = NULL, *tmp;
-
-	if (num_to_free <= 0)
-		return;
-
-	spin_lock(&marks->im_lock);
-	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
-		list_del(&se->bse_node);
-		kfree(se);
-		if (--num_to_free == 0)
-			break;
-	}
-	spin_unlock(&marks->im_lock);
-
-	BUG_ON(num_to_free > 0);
-}
-- 
cgit v1.2.3


From 71d5b76302e21390b4ab747875de6bd5cfbca979 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:35 -0700
Subject: pnfs/blocklayout: implement the return_range method

This allows removing extents from the extent tree especially on truncate
operations, and thus fixing reads from truncated and re-extended that
previously returned stale data.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 8502e620f644..d3c3c5c972c3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -469,6 +469,35 @@ static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
 	return lseg;
 }
 
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_range *range)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	sector_t offset = range->offset >> SECTOR_SHIFT, end;
+	int err;
+
+	if (range->offset % 8) {
+		dprintk("%s: offset %lld not block size aligned\n",
+			__func__, range->offset);
+		return;
+	}
+
+	if (range->length != NFS4_MAX_UINT64) {
+		if (range->length % 8) {
+			dprintk("%s: length %lld not block size aligned\n",
+				__func__, range->length);
+			return;
+		}
+
+		end = offset + (range->length >> SECTOR_SHIFT);
+	} else {
+		end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+	}
+
+	err = ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
+}
+
 static void
 bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
 		       const struct nfs4_layoutcommit_args *arg)
@@ -777,6 +806,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.free_layout_hdr		= bl_free_layout_hdr,
 	.alloc_lseg			= bl_alloc_lseg,
 	.free_lseg			= bl_free_lseg,
+	.return_range			= bl_return_range,
 	.encode_layoutcommit		= bl_encode_layoutcommit,
 	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
 	.set_layoutdriver		= bl_set_layoutdriver,
-- 
cgit v1.2.3


From 848746bd247cdc3ce1d103e92913316445763778 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 08:23:36 -0700
Subject: pnfs/blocklayout: return layouts on setattr

This speads up truncate-heavy workloads like fsx by multiple orders of
magnitude.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index d3c3c5c972c3..bdd73fbacf48 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -799,7 +799,8 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.id				= LAYOUT_BLOCK_VOLUME,
 	.name				= "LAYOUT_BLOCK_VOLUME",
 	.owner				= THIS_MODULE,
-	.flags				= PNFS_READ_WHOLE_PAGE,
+	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
+					  PNFS_READ_WHOLE_PAGE,
 	.read_pagelist			= bl_read_pagelist,
 	.write_pagelist			= bl_write_pagelist,
 	.alloc_layout_hdr		= bl_alloc_layout_hdr,
-- 
cgit v1.2.3


From 661373b13d0490ff410a2133d4a7a117f2dd037e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Sep 2014 21:27:57 -0700
Subject: pnfs: factor GETDEVICEINFO implementations

Add support to the common pNFS core to issue GETDEVICEINFO calls on
a device ID cache miss.  The code is taken from the well debugged
file layout implementation and calls out to the layoutdriver through
a new alloc_deviceid_node method.  The calling conventions for
nfs4_find_get_deviceid are changed so that all information needed to
send a GETDEVICEINFO request is passed to the common code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/filelayout/filelayout.c    |  29 +++++---
 fs/nfs/filelayout/filelayout.h    |   7 +-
 fs/nfs/filelayout/filelayoutdev.c | 108 ++-------------------------
 fs/nfs/objlayout/objio_osd.c      | 113 +++++++++++------------------
 fs/nfs/objlayout/objlayout.c      |  70 ------------------
 fs/nfs/objlayout/objlayout.h      |   5 --
 fs/nfs/pnfs.h                     |  13 +++-
 fs/nfs/pnfs_dev.c                 | 149 ++++++++++++++++++++++++++------------
 8 files changed, 182 insertions(+), 312 deletions(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 163cad1d4127..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -649,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	}
 
 	/* find and reference the deviceid */
-	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
-				   NFS_SERVER(lo->plh_inode)->nfs_client, id);
-	if (d == NULL) {
-		dsaddr = filelayout_get_device_info(lo->plh_inode, id,
-				lo->plh_lc_cred, gfp_flags);
-		if (dsaddr == NULL)
-			goto out;
-	} else
-		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+			lo->plh_lc_cred, gfp_flags);
+	if (d == NULL)
+		goto out;
+
+	dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
 	/* Found deviceid is unavailable */
 	if (filelayout_test_devid_unavailable(&dsaddr->id_node))
-			goto out_put;
+		goto out_put;
 
 	fl->dsaddr = dsaddr;
 
@@ -1371,6 +1368,17 @@ out:
 	cinfo->ds->ncommitting = 0;
 	return PNFS_ATTEMPTED;
 }
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+
+	dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+	if (!dsaddr)
+		return NULL;
+	return &dsaddr->id_node;
+}
 
 static void
 filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1423,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.commit_pagelist	= filelayout_commit_pagelist,
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
+	.alloc_deviceid_node	= filelayout_alloc_deviceid_node,
 	.free_deviceid_node	= filelayout_free_deveiceid_node,
 };
 
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
 u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 					u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+	struct pnfs_device *pdev, gfp_t gfp_flags);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
-		struct rpc_cred *cred, gfp_t gfp_flags);
 
 #endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
 }
 
 /* Decode opaque device data and return the result */
-static struct nfs4_file_layout_dsaddr*
-decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_flags)
 {
 	int i;
 	u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	dsaddr->stripe_indices = stripe_indices;
 	stripe_indices = NULL;
 	dsaddr->ds_num = num;
-	nfs4_init_deviceid_node(&dsaddr->id_node,
-				NFS_SERVER(ino)->pnfs_curr_ld,
-				NFS_SERVER(ino)->nfs_client,
-				&pdev->dev_id);
+	nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
 
 	INIT_LIST_HEAD(&dsaddrs);
 
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 		mp_count = be32_to_cpup(p); /* multipath count */
 		for (j = 0; j < mp_count; j++) {
-			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+			da = decode_ds_addr(server->nfs_client->cl_net,
 					    &stream, gfp_flags);
 			if (da)
 				list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
 	return NULL;
 }
 
-/*
- * Decode the opaque device specified in 'dev' and add it to the cache of
- * available devices.
- */
-static struct nfs4_file_layout_dsaddr *
-decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
-{
-	struct nfs4_deviceid_node *d;
-	struct nfs4_file_layout_dsaddr *n, *new;
-
-	new = decode_device(inode, dev, gfp_flags);
-	if (!new) {
-		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
-			__func__);
-		return NULL;
-	}
-
-	d = nfs4_insert_deviceid_node(&new->id_node);
-	n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
-	if (n != new) {
-		nfs4_fl_free_deviceid(new);
-		return n;
-	}
-
-	return new;
-}
-
-/*
- * Retrieve the information for dev_id, add it to the list
- * of available devices, and return it.
- */
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode,
-		struct nfs4_deviceid *dev_id,
-		struct rpc_cred *cred,
-		gfp_t gfp_flags)
-{
-	struct pnfs_device *pdev = NULL;
-	u32 max_resp_sz;
-	int max_pages;
-	struct page **pages = NULL;
-	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
-	int rc, i;
-	struct nfs_server *server = NFS_SERVER(inode);
-
-	/*
-	 * Use the session max response size as the basis for setting
-	 * GETDEVICEINFO's maxcount
-	 */
-	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-	max_pages = nfs_page_array_len(0, max_resp_sz);
-	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
-		__func__, inode, max_resp_sz, max_pages);
-
-	pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
-	if (pdev == NULL)
-		return NULL;
-
-	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
-	if (pages == NULL) {
-		kfree(pdev);
-		return NULL;
-	}
-	for (i = 0; i < max_pages; i++) {
-		pages[i] = alloc_page(gfp_flags);
-		if (!pages[i])
-			goto out_free;
-	}
-
-	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
-	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
-	pdev->pages = pages;
-	pdev->pgbase = 0;
-	pdev->pglen = max_resp_sz;
-	pdev->mincount = 0;
-	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
-	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
-	dprintk("%s getdevice info returns %d\n", __func__, rc);
-	if (rc)
-		goto out_free;
-
-	/*
-	 * Found new device, need to decode it and then add it to the
-	 * list of known devices for this mountpoint.
-	 */
-	dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
-out_free:
-	for (i = 0; i < max_pages; i++)
-		__free_page(pages[i]);
-	kfree(pages);
-	kfree(pdev);
-	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
-	return dsaddr;
-}
-
 void
 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c502f640209b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
 	kfree(de);
 }
 
-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
-	const struct nfs4_deviceid *d_id)
-{
-	struct nfs4_deviceid_node *d;
-	struct objio_dev_ent *de;
-
-	d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
-	if (!d)
-		return NULL;
-
-	de = container_of(d, struct objio_dev_ent, id_node);
-	return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
-	const struct nfs4_deviceid *d_id, struct osd_dev *od,
-	gfp_t gfp_flags)
-{
-	struct nfs4_deviceid_node *d;
-	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
-	struct objio_dev_ent *n;
-
-	if (!de) {
-		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
-		return NULL;
-	}
-
-	dprintk("%s: Adding od=%p\n", __func__, od);
-	nfs4_init_deviceid_node(&de->id_node,
-				nfss->pnfs_curr_ld,
-				nfss->nfs_client,
-				d_id);
-	de->od.od = od;
-
-	d = nfs4_insert_deviceid_node(&de->id_node);
-	n = container_of(d, struct objio_dev_ent, id_node);
-	if (n != de) {
-		dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
-		objio_free_deviceid_node(&de->id_node);
-		de = n;
-	}
-
-	return de;
-}
-
 struct objio_segment {
 	struct pnfs_layout_segment lseg;
 
@@ -130,29 +84,24 @@ struct objio_state {
 
 /* Send and wait for a get_device_info of devices in the layout,
    then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
-	struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
-	gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags)
 {
 	struct pnfs_osd_deviceaddr *deviceaddr;
-	struct objio_dev_ent *ode;
+	struct objio_dev_ent *ode = NULL;
 	struct osd_dev *od;
 	struct osd_dev_info odi;
 	bool retry_flag = true;
+	u32 *p;
 	int err;
 
-	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
-	if (ode) {
-		objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
-		return 0;
-	}
+	deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+	if (!deviceaddr)
+		return NULL;
 
-	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
-	if (unlikely(err)) {
-		dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
-			__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
-		return err;
-	}
+	p = page_address(pdev->pages[0]);
+	pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
 
 	odi.systemid_len = deviceaddr->oda_systemid.len;
 	if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
 		goto out;
 	}
 
-	ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
-			    gfp_flags);
-	objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
 	dprintk("Adding new dev_id(%llx:%llx)\n",
-		_DEVID_LO(d_id), _DEVID_HI(d_id));
+		_DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+	ode = kzalloc(sizeof(*ode), gfp_flags);
+	if (!ode) {
+		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+		goto out;
+	}
+
+	nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+	kfree(deviceaddr);
+
+	ode->od.od = od;
+	return &ode->id_node;
+
 out:
-	objlayout_put_deviceinfo(deviceaddr);
-	return err;
+	kfree(deviceaddr);
+	return NULL;
 }
 
 static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	struct xdr_stream *xdr,
 	gfp_t gfp_flags)
 {
+	struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
 	struct objio_segment *objio_seg;
 	struct pnfs_osd_xdr_decode_layout_iter iter;
 	struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	objio_seg->oc.first_dev = layout.olo_comps_index;
 	cur_comp = 0;
 	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+		struct nfs4_deviceid_node *d;
+		struct objio_dev_ent *ode;
+
 		copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
-		err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
-					   &src_comp.oc_object_id.oid_device_id,
-					   gfp_flags);
-		if (err)
+
+		d = nfs4_find_get_deviceid(server,
+				&src_comp.oc_object_id.oid_device_id,
+				pnfslay->plh_lc_cred, gfp_flags);
+		if (!d) {
+			err = -ENXIO;
 			goto err;
-		++cur_comp;
+		}
+
+		ode = container_of(d, struct objio_dev_ent, id_node);
+		objio_seg->oc.ods[cur_comp++] = &ode->od;
 	}
 	/* pnfs_osd_xdr_decode_layout_comp returns false on error */
 	if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
 				   PNFS_LAYOUTRET_ON_ERROR,
 
+	.max_deviceinfo_size	 = PAGE_SIZE,
 	.owner		       	 = THIS_MODULE,
 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 	.free_layout_hdr         = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
 	dprintk("%s: Return\n", __func__);
 }
 
-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
-	struct page *page;
-	struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
-	gfp_t gfp_flags)
-{
-	struct objlayout_deviceinfo *odi;
-	struct pnfs_device pd;
-	struct page *page, **pages;
-	u32 *p;
-	int err;
-
-	page = alloc_page(gfp_flags);
-	if (!page)
-		return -ENOMEM;
-
-	pages = &page;
-	pd.pages = pages;
-
-	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
-	pd.layout_type = LAYOUT_OSD2_OBJECTS;
-	pd.pages = &page;
-	pd.pgbase = 0;
-	pd.pglen = PAGE_SIZE;
-	pd.mincount = 0;
-	pd.maxcount = PAGE_SIZE;
-
-	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
-			pnfslay->plh_lc_cred);
-	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
-	if (err)
-		goto err_out;
-
-	p = page_address(page);
-	odi = kzalloc(sizeof(*odi), gfp_flags);
-	if (!odi) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
-	odi->page = page;
-	*deviceaddr = &odi->da;
-	return 0;
-
-err_out:
-	__free_page(page);
-	return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
-	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
-						struct objlayout_deviceinfo,
-						da);
-
-	__free_page(odi->page);
-	kfree(odi);
-}
-
 enum {
 	OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
 	OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
 extern void objlayout_write_done(struct objlayout_io_res *oir,
 				 ssize_t status, bool sync);
 
-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
-	gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
 /*
  * exported generic objects function vectors
  */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 2c2494225930..ce89ae364bb8 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -84,6 +84,7 @@ struct pnfs_layoutdriver_type {
 	const char *name;
 	struct module *owner;
 	unsigned flags;
+	unsigned max_deviceinfo_size;
 
 	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
 	int (*clear_layoutdriver) (struct nfs_server *);
@@ -126,6 +127,9 @@ struct pnfs_layoutdriver_type {
 	enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+	struct nfs4_deviceid_node * (*alloc_deviceid_node)
+			(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags);
 
 	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
 				     struct xdr_stream *xdr,
@@ -261,11 +265,12 @@ struct nfs4_deviceid_node {
 	atomic_t			ref;
 };
 
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		gfp_t gfp_mask);
 void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
-			     const struct pnfs_layoutdriver_type *,
-			     const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
 			     const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..791f8b3c4cff 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
  */
 
 #include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
 #include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
 	return NULL;
 }
 
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+		const struct nfs4_deviceid *dev_id,
+		struct rpc_cred *cred, gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d = NULL;
+	struct pnfs_device *pdev = NULL;
+	struct page **pages = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	int rc, i;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	if (server->pnfs_curr_ld->max_deviceinfo_size &&
+	    server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+		max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+	max_pages = nfs_page_array_len(0, max_resp_sz);
+	dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+		__func__, server, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(*pdev), gfp_flags);
+	if (!pdev)
+		return NULL;
+
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+	if (!pages)
+		goto out_free_pdev;
+
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i])
+			goto out_free_pages;
+	}
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = server->pnfs_curr_ld->id;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = max_resp_sz;
+	pdev->mincount = 0;
+	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free_pages;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+			gfp_flags);
+
+out_free_pages:
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+out_free_pdev:
+	kfree(pdev);
+	dprintk("<-- %s d %p\n", __func__, d);
+	return d;
+}
+
 /*
  * Lookup a deviceid in cache and get a reference count on it if found
  *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
  * @id deviceid to look up
  */
 static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
-		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
-		   long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, long hash)
 {
 	struct nfs4_deviceid_node *d;
 
 	rcu_read_lock();
-	d = _lookup_deviceid(ld, clp, id, hash);
+	d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+			hash);
 	if (d != NULL)
 		atomic_inc(&d->ref);
 	rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
 }
 
 struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
-		       const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		gfp_t gfp_mask)
 {
-	return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+	long hash = nfs4_deviceid_hash(id);
+	struct nfs4_deviceid_node *d, *new;
+
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d)
+		return d;
+
+	new = nfs4_get_device_info(server, id, cred, gfp_mask);
+	if (!new)
+		return new;
+
+	spin_lock(&nfs4_deviceid_lock);
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		server->pnfs_curr_ld->free_deviceid_node(new);
+		return d;
+	}
+	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+	atomic_inc(&new->ref);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	return new;
 }
 EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
 
@@ -151,54 +245,19 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
 EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
 
 void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
-			const struct pnfs_layoutdriver_type *ld,
-			const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
 			const struct nfs4_deviceid *id)
 {
 	INIT_HLIST_NODE(&d->node);
 	INIT_HLIST_NODE(&d->tmpnode);
-	d->ld = ld;
-	d->nfs_client = nfs_client;
+	d->ld = server->pnfs_curr_ld;
+	d->nfs_client = server->nfs_client;
 	d->flags = 0;
 	d->deviceid = *id;
 	atomic_set(&d->ref, 1);
 }
 EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
 
-/*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- *      Note that the caller must set up the following members:
- *        new->ld
- *        new->nfs_client
- *        new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
-	struct nfs4_deviceid_node *d;
-	long hash;
-
-	spin_lock(&nfs4_deviceid_lock);
-	hash = nfs4_deviceid_hash(&new->deviceid);
-	d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
-	if (d) {
-		spin_unlock(&nfs4_deviceid_lock);
-		return d;
-	}
-
-	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
-	spin_unlock(&nfs4_deviceid_lock);
-	atomic_inc(&new->ref);
-
-	return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
 /*
  * Dereference a deviceid node and delete it when its reference count drops
  * to zero.
-- 
cgit v1.2.3


From 9dd2fcd32f488ea89c2227cc56069446147376e8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Sep 2014 21:27:58 -0700
Subject: pnfs: add a common GETDEVICELIST implementation

At a simple helper to issue a GETDEVICELIST operation and pre-load
the device id cache based on the result.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.h     |  2 ++
 fs/nfs/pnfs_dev.c | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ce89ae364bb8..3eeca49d9ca2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -277,6 +277,8 @@ bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
 void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
 bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
+int nfs4_deviceid_getdevicelist(struct nfs_server *server,
+		const struct nfs_fh *fh);
 
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 791f8b3c4cff..82c2836fdb38 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -359,3 +359,32 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 	rcu_read_unlock();
 }
 
+int
+nfs4_deviceid_getdevicelist(struct nfs_server *server,
+		const struct nfs_fh *fh)
+{
+	struct pnfs_devicelist *dlist;
+	struct nfs4_deviceid_node *d;
+	int error = 0, i;
+
+	dlist = kzalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
+	if (!dlist)
+		return -ENOMEM;
+
+	while (!dlist->eof) {
+		error = nfs4_proc_getdevicelist(server, fh, dlist);
+		if (error)
+			break;
+
+		for (i = 0; i < dlist->num_devs; i++) {
+			d = nfs4_find_get_deviceid(server, &dlist->dev_id[i],
+					NULL, GFP_NOFS);
+			if (d)
+				nfs4_put_deviceid_node(d);
+		}
+	}
+
+	kfree(dlist);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs4_deviceid_getdevicelist);
-- 
cgit v1.2.3


From 30ff0603ca4d66c8244efc80ea8470d3d04aee8b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Sep 2014 21:27:59 -0700
Subject: pnfs: add a nfs4_get_deviceid helper

This will be used by the block layout driver when splitting extents.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3eeca49d9ca2..d84fe29fc88b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -280,6 +280,13 @@ void nfs4_deviceid_purge_client(const struct nfs_client *);
 int nfs4_deviceid_getdevicelist(struct nfs_server *server,
 		const struct nfs_fh *fh);
 
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+	atomic_inc(&d->ref);
+	return d;
+}
+
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
-- 
cgit v1.2.3


From 20d655d6197d02e98574208839da11684dc2ad1f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Sep 2014 21:28:00 -0700
Subject: pnfs/blocklayout: use the device id cache

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c    | 149 ++----------------------------------
 fs/nfs/blocklayout/blocklayout.h    |  25 ++----
 fs/nfs/blocklayout/blocklayoutdev.c |  88 +++++++++------------
 fs/nfs/blocklayout/blocklayoutdm.c  |  26 +------
 fs/nfs/blocklayout/extent_tree.c    |  27 ++++---
 5 files changed, 65 insertions(+), 250 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index bdd73fbacf48..25ba9e0e6fff 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -119,6 +119,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 				     void (*end_io)(struct bio *, int err),
 				     struct parallel_io *par)
 {
+	struct pnfs_block_dev *dev =
+		container_of(be->be_device, struct pnfs_block_dev, d_node);
 	struct bio *bio;
 
 	npg = min(npg, BIO_MAX_PAGES);
@@ -131,7 +133,7 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	if (bio) {
 		bio->bi_iter.bi_sector = isect - be->be_f_offset +
 			be->be_v_offset;
-		bio->bi_bdev = be->be_mdev;
+		bio->bi_bdev = dev->d_bdev;
 		bio->bi_end_io = end_io;
 		bio->bi_private = par;
 	}
@@ -515,96 +517,9 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
 	ext_tree_mark_committed(BLK_LO2EXT(lo), lcdata->res.status);
 }
 
-static void free_blk_mountid(struct block_mount_id *mid)
-{
-	if (mid) {
-		struct pnfs_block_dev *dev, *tmp;
-
-		/* No need to take bm_lock as we are last user freeing bm_devlist */
-		list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
-			list_del(&dev->bm_node);
-			bl_free_block_dev(dev);
-		}
-		kfree(mid);
-	}
-}
-
-/* This is mostly copied from the filelayout_get_device_info function.
- * It seems much of this should be at the generic pnfs level.
- */
-static struct pnfs_block_dev *
-nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
-			struct nfs4_deviceid *d_id)
-{
-	struct pnfs_device *dev;
-	struct pnfs_block_dev *rv;
-	u32 max_resp_sz;
-	int max_pages;
-	struct page **pages = NULL;
-	int i, rc;
-
-	/*
-	 * Use the session max response size as the basis for setting
-	 * GETDEVICEINFO's maxcount
-	 */
-	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-	max_pages = nfs_page_array_len(0, max_resp_sz);
-	dprintk("%s max_resp_sz %u max_pages %d\n",
-		__func__, max_resp_sz, max_pages);
-
-	dev = kmalloc(sizeof(*dev), GFP_NOFS);
-	if (!dev) {
-		dprintk("%s kmalloc failed\n", __func__);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
-	if (pages == NULL) {
-		kfree(dev);
-		return ERR_PTR(-ENOMEM);
-	}
-	for (i = 0; i < max_pages; i++) {
-		pages[i] = alloc_page(GFP_NOFS);
-		if (!pages[i]) {
-			rv = ERR_PTR(-ENOMEM);
-			goto out_free;
-		}
-	}
-
-	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
-	dev->layout_type = LAYOUT_BLOCK_VOLUME;
-	dev->pages = pages;
-	dev->pgbase = 0;
-	dev->pglen = PAGE_SIZE * max_pages;
-	dev->mincount = 0;
-	dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
-	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
-	rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
-	dprintk("%s getdevice info returns %d\n", __func__, rc);
-	if (rc) {
-		rv = ERR_PTR(rc);
-		goto out_free;
-	}
-
-	rv = nfs4_blk_decode_device(server, dev);
- out_free:
-	for (i = 0; i < max_pages; i++)
-		__free_page(pages[i]);
-	kfree(pages);
-	kfree(dev);
-	return rv;
-}
-
 static int
 bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 {
-	struct block_mount_id *b_mt_id = NULL;
-	struct pnfs_devicelist *dlist = NULL;
-	struct pnfs_block_dev *bdev;
-	LIST_HEAD(block_disklist);
-	int status, i;
-
 	dprintk("%s enter\n", __func__);
 
 	if (server->pnfs_blksize == 0) {
@@ -617,60 +532,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 		return -EINVAL;
 	}
 
-	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
-	if (!b_mt_id) {
-		status = -ENOMEM;
-		goto out_error;
-	}
-	/* Initialize nfs4 block layout mount id */
-	spin_lock_init(&b_mt_id->bm_lock);
-	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
-
-	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
-	if (!dlist) {
-		status = -ENOMEM;
-		goto out_error;
-	}
-	dlist->eof = 0;
-	while (!dlist->eof) {
-		status = nfs4_proc_getdevicelist(server, fh, dlist);
-		if (status)
-			goto out_error;
-		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
-			__func__, dlist->num_devs, dlist->eof);
-		for (i = 0; i < dlist->num_devs; i++) {
-			bdev = nfs4_blk_get_deviceinfo(server, fh,
-						       &dlist->dev_id[i]);
-			if (IS_ERR(bdev)) {
-				status = PTR_ERR(bdev);
-				goto out_error;
-			}
-			spin_lock(&b_mt_id->bm_lock);
-			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
-			spin_unlock(&b_mt_id->bm_lock);
-		}
-	}
-	dprintk("%s SUCCESS\n", __func__);
-	server->pnfs_ld_data = b_mt_id;
-
- out_return:
-	kfree(dlist);
-	return status;
-
- out_error:
-	free_blk_mountid(b_mt_id);
-	goto out_return;
-}
-
-static int
-bl_clear_layoutdriver(struct nfs_server *server)
-{
-	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
-
-	dprintk("%s enter\n", __func__);
-	free_blk_mountid(b_mt_id);
-	dprintk("%s RETURNS\n", __func__);
-	return 0;
+	return nfs4_deviceid_getdevicelist(server, fh);
 }
 
 static bool
@@ -811,7 +673,8 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.encode_layoutcommit		= bl_encode_layoutcommit,
 	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
 	.set_layoutdriver		= bl_set_layoutdriver,
-	.clear_layoutdriver		= bl_clear_layoutdriver,
+	.alloc_deviceid_node		= bl_alloc_deviceid_node,
+	.free_deviceid_node		= bl_free_deviceid_node,
 	.pg_read_ops			= &bl_pg_read_ops,
 	.pg_write_ops			= &bl_pg_write_ops,
 };
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index b4f66d875f12..19fae5e4c90b 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,16 +44,9 @@
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
 #define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
-struct block_mount_id {
-	spinlock_t			bm_lock;    /* protects list */
-	struct list_head		bm_devlist; /* holds pnfs_block_dev */
-};
-
 struct pnfs_block_dev {
-	struct list_head		bm_node;
-	struct nfs4_deviceid		bm_mdevid;    /* associated devid */
-	struct block_device		*bm_mdev;     /* meta device itself */
-	struct net			*net;
+	struct nfs4_deviceid_node	d_node;
+	struct block_device		*d_bdev;
 };
 
 enum exstate4 {
@@ -69,8 +62,7 @@ struct pnfs_block_extent {
 		struct rb_node	be_node;
 		struct list_head be_list;
 	};
-	struct nfs4_deviceid be_devid;	/* FIXME: could use device cache instead */
-	struct block_device *be_mdev;
+	struct nfs4_deviceid_node *be_device;
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
@@ -87,8 +79,6 @@ struct pnfs_block_layout {
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
 };
 
-#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
-
 static inline struct pnfs_block_layout *
 BLK_LO2EXT(struct pnfs_layout_hdr *lo)
 {
@@ -120,14 +110,15 @@ struct bl_msg_hdr {
 /* blocklayoutdev.c */
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-void nfs4_blkdev_put(struct block_device *bdev);
-struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
-						struct pnfs_device *dev);
 int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 				struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
 /* blocklayoutdm.c */
-void bl_free_block_dev(struct pnfs_block_dev *bdev);
+void bl_dm_remove(struct net *net, dev_t dev);
 
 /* extent_tree.c */
 int ext_tree_insert(struct pnfs_block_layout *bl,
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index cd71b5e231ec..d6527d20c508 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -53,16 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
 	return 0;
 }
 
-/*
- * Release the block device
- */
-void nfs4_blkdev_put(struct block_device *bdev)
-{
-	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
-			MINOR(bdev->bd_dev));
-	blkdev_put(bdev, FMODE_READ);
-}
-
 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
 			 size_t mlen)
 {
@@ -92,12 +82,12 @@ void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 /*
  * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
  */
-struct pnfs_block_dev *
-nfs4_blk_decode_device(struct nfs_server *server,
-		       struct pnfs_device *dev)
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
+		gfp_t gfp_mask)
 {
 	struct pnfs_block_dev *rv;
-	struct block_device *bd = NULL;
+	struct block_device *bd;
 	struct bl_pipe_msg bl_pipe_msg;
 	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
 	struct bl_msg_hdr bl_msg = {
@@ -117,11 +107,9 @@ nfs4_blk_decode_device(struct nfs_server *server,
 
 	bl_pipe_msg.bl_wq = &nn->bl_wq;
 	memset(msg, 0, sizeof(*msg));
-	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
-	if (!msg->data) {
-		rv = ERR_PTR(-ENOMEM);
+	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
+	if (!msg->data)
 		goto out;
-	}
 
 	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
 	dataptr = (uint8_t *) msg->data;
@@ -140,7 +128,6 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
 	if (rc < 0) {
 		remove_wait_queue(&nn->bl_wq, &wq);
-		rv = ERR_PTR(rc);
 		goto out;
 	}
 
@@ -152,7 +139,6 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	if (reply->status != BL_DEVICE_REQUEST_PROC) {
 		printk(KERN_WARNING "%s failed to decode device: %d\n",
 			__func__, reply->status);
-		rv = ERR_PTR(-EINVAL);
 		goto out;
 	}
 
@@ -162,51 +148,40 @@ nfs4_blk_decode_device(struct nfs_server *server,
 		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
 			__func__, reply->major, reply->minor,
 			PTR_ERR(bd));
-		rv = ERR_CAST(bd);
 		goto out;
 	}
 
-	rv = kzalloc(sizeof(*rv), GFP_NOFS);
-	if (!rv) {
-		rv = ERR_PTR(-ENOMEM);
+	rv = kzalloc(sizeof(*rv), gfp_mask);
+	if (!rv)
 		goto out;
-	}
 
-	rv->bm_mdev = bd;
-	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
-	rv->net = net;
+	nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
+	rv->d_bdev = bd;
+
 	dprintk("%s Created device %s with bd_block_size %u\n",
 		__func__,
 		bd->bd_disk->disk_name,
 		bd->bd_block_size);
 
+	kfree(msg->data);
+	return &rv->d_node;
+
 out:
 	kfree(msg->data);
-	return rv;
+	return NULL;
 }
 
-/* Map deviceid returned by the server to constructed block_device */
-static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
-					    struct nfs4_deviceid *id)
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
 {
-	struct block_device *rv = NULL;
-	struct block_mount_id *mid;
-	struct pnfs_block_dev *dev;
-
-	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
-	mid = BLK_ID(lo);
-	spin_lock(&mid->bm_lock);
-	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
-		if (memcmp(id->data, dev->bm_mdevid.data,
-			   NFS4_DEVICEID4_SIZE) == 0) {
-			rv = dev->bm_mdev;
-			goto out;
-		}
-	}
- out:
-	spin_unlock(&mid->bm_lock);
-	dprintk("%s returning %p\n", __func__, rv);
-	return rv;
+	struct pnfs_block_dev *dev =
+		container_of(d, struct pnfs_block_dev, d_node);
+	struct net *net = d->nfs_client->cl_net;
+
+	blkdev_put(dev->d_bdev, FMODE_READ);
+	bl_dm_remove(net, dev->d_bdev->bd_dev);
+
+	kfree(dev);
 }
 
 /* Tracks info needed to ensure extents in layout obey constraints of spec */
@@ -309,15 +284,20 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	 * recovery easier.
 	 */
 	for (i = 0; i < count; i++) {
+		struct nfs4_deviceid id;
+
 		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
 		if (!be) {
 			status = -ENOMEM;
 			goto out_err;
 		}
-		memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
+		memcpy(&id, p, NFS4_DEVICEID4_SIZE);
 		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-		be->be_mdev = translate_devid(lo, &be->be_devid);
-		if (!be->be_mdev)
+
+		be->be_device =
+			nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+						lo->plh_lc_cred, gfp_flags);
+		if (!be->be_device)
 			goto out_err;
 
 		/* The next three values are read in as bytes,
@@ -364,12 +344,14 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
 	return status;
 
  out_err:
+	nfs4_put_deviceid_node(be->be_device);
 	kfree(be);
  out_free_list:
 	while (!list_empty(&extents)) {
 		be = list_first_entry(&extents, struct pnfs_block_extent,
 				      be_list);
 		list_del(&be->be_list);
+		nfs4_put_deviceid_node(be->be_device);
 		kfree(be);
 	}
 	goto out;
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index 8999cfddd866..abc2e9e45610 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -38,7 +38,7 @@
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
-static void dev_remove(struct net *net, dev_t dev)
+void bl_dm_remove(struct net *net, dev_t dev)
 {
 	struct bl_pipe_msg bl_pipe_msg;
 	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
@@ -82,27 +82,3 @@ static void dev_remove(struct net *net, dev_t dev)
 out:
 	kfree(msg->data);
 }
-
-/*
- * Release meta device
- */
-static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-{
-	dprintk("%s Releasing\n", __func__);
-	nfs4_blkdev_put(bdev->bm_mdev);
-	dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
-}
-
-void bl_free_block_dev(struct pnfs_block_dev *bdev)
-{
-	if (bdev) {
-		if (bdev->bm_mdev) {
-			dprintk("%s Removing DM device: %d:%d\n",
-				__func__,
-				MAJOR(bdev->bm_mdev->bd_dev),
-				MINOR(bdev->bm_mdev->bd_dev));
-			nfs4_blk_metadev_release(bdev);
-		}
-		kfree(bdev);
-	}
-}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c8c59a5b1a8f..f34f61dd0d0b 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -71,7 +71,7 @@ ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
 {
 	if (be1->be_state != be2->be_state)
 		return false;
-	if (be1->be_mdev != be2->be_mdev)
+	if (be1->be_device != be2->be_device)
 		return false;
 
 	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
@@ -96,6 +96,7 @@ ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
 	if (left && ext_can_merge(left, be)) {
 		left->be_length += be->be_length;
 		rb_erase(&be->be_node, root);
+		nfs4_put_deviceid_node(be->be_device);
 		kfree(be);
 		return left;
 	}
@@ -111,6 +112,7 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
 	if (right && ext_can_merge(be, right)) {
 		be->be_length += right->be_length;
 		rb_erase(&right->be_node, root);
+		nfs4_put_deviceid_node(right->be_device);
 		kfree(right);
 	}
 
@@ -135,16 +137,14 @@ __ext_tree_insert(struct rb_root *root,
 					be->be_v_offset = new->be_v_offset;
 				be->be_length += new->be_length;
 				be = ext_try_to_merge_left(root, be);
-				kfree(new);
-				return;
+				goto free_new;
 			}
 			p = &(*p)->rb_left;
 		} else if (new->be_f_offset >= ext_f_end(be)) {
 			if (merge_ok && ext_can_merge(be, new)) {
 				be->be_length += new->be_length;
 				be = ext_try_to_merge_right(root, be);
-				kfree(new);
-				return;
+				goto free_new;
 			}
 			p = &(*p)->rb_right;
 		} else {
@@ -154,6 +154,10 @@ __ext_tree_insert(struct rb_root *root,
 
 	rb_link_node(&new->be_node, parent, p);
 	rb_insert_color(&new->be_node, root);
+	return;
+free_new:
+	nfs4_put_deviceid_node(new->be_device);
+	kfree(new);
 }
 
 static int
@@ -198,9 +202,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
 			new->be_length = len2;
 			new->be_state = be->be_state;
 			new->be_tag = be->be_tag;
-			new->be_mdev = be->be_mdev;
-			memcpy(&new->be_devid, &be->be_devid,
-				sizeof(struct nfs4_deviceid));
+			new->be_device = nfs4_get_deviceid(be->be_device);
 
 			__ext_tree_insert(root, new, true);
 		} else {
@@ -221,6 +223,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
 			struct pnfs_block_extent *next = ext_tree_next(be);
 
 			rb_erase(&be->be_node, root);
+			nfs4_put_deviceid_node(be->be_device);
 			kfree(be);
 			be = next;
 		}
@@ -265,6 +268,7 @@ retry:
 		__ext_tree_insert(root, new, true);
 	} else if (new->be_f_offset >= be->be_f_offset) {
 		if (ext_f_end(new) <= ext_f_end(be)) {
+			nfs4_put_deviceid_node(new->be_device);
 			kfree(new);
 		} else {
 			sector_t new_len = ext_f_end(new) - ext_f_end(be);
@@ -290,6 +294,7 @@ retry:
 		}
 
 		split->be_length = be->be_f_offset - split->be_f_offset;
+		split->be_device = nfs4_get_deviceid(new->be_device);
 		__ext_tree_insert(root, split, true);
 
 		new->be_f_offset += diff;
@@ -380,9 +385,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 	new->be_length = orig_len - be->be_length;
 	new->be_state = be->be_state;
 	new->be_tag = be->be_tag;
-
-	new->be_mdev = be->be_mdev;
-	memcpy(&new->be_devid, &be->be_devid, sizeof(struct nfs4_deviceid));
+	new->be_device = nfs4_get_deviceid(be->be_device);
 
 	dprintk("%s: got 0x%lx:0x%lx!\n",
 		__func__, be->be_f_offset, ext_f_end(be));
@@ -495,7 +498,7 @@ ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
 			break;
 		}
 
-		p = xdr_encode_opaque_fixed(p, be->be_devid.data,
+		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
 				NFS4_DEVICEID4_SIZE);
 		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
-- 
cgit v1.2.3


From 08a899d5d9532efb7dea99aad44dc9af39627a92 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 7 Sep 2014 08:36:40 -0700
Subject: nfs: setattr can only change regular file sizes

The VFS never calls setattr with ATTR_SIZE on anything but regular
files.  Remove the if check and turn it into an assert similar to
what some other file systems do.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode))
+		BUG_ON(!S_ISREG(inode->i_mode));
+
+		if (attr->ia_size == i_size_read(inode))
 			attr->ia_valid &= ~ATTR_SIZE;
 	}
 
-- 
cgit v1.2.3


From dad2b015bb85799f8005da637954f8eafb83f34c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Wed, 10 Sep 2014 09:03:55 -0400
Subject: nfs: fix RCU cl_xprt handling in nfs_swap_activate/deactivate

sparse says:

fs/nfs/file.c:543:60: warning: incorrect type in argument 1 (different address spaces)
fs/nfs/file.c:543:60:    expected struct rpc_xprt *xprt
fs/nfs/file.c:543:60:    got struct rpc_xprt [noderef] <asn:4>*cl_xprt
fs/nfs/file.c:548:53: warning: incorrect type in argument 1 (different address spaces)
fs/nfs/file.c:548:53:    expected struct rpc_xprt *xprt
fs/nfs/file.c:548:53:    got struct rpc_xprt [noderef] <asn:4>*cl_xprt

cl_xprt is RCU-managed, so we need to take care to dereference and use
it while holding the RCU read lock.

Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05ef7dcc5f21..23e5f0ea5c83 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -546,13 +546,25 @@ static int nfs_launder_page(struct page *page)
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 						sector_t *span)
 {
+	int ret;
+	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
 	*span = sis->pages;
-	return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+
+	rcu_read_lock();
+	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
-	xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+	rcu_read_lock();
+	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+	rcu_read_unlock();
 }
 #endif
 
-- 
cgit v1.2.3


From 8d11620e1e43f829721aa1e76bd9dc2da079df9e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Wed, 10 Sep 2014 09:04:27 -0400
Subject: nfs: add __acquires and __releases annotations to seqfile start/stop
 routines

To make sparse happy...

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6a4f3666e273..8052020a0d0d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the server list and return the first item
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_server_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the volume list and return the first item
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_volume_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
-- 
cgit v1.2.3


From 3e3f6b4e2613627d4e971c44eec35e945b39e5e5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:36:28 -0700
Subject: pnfs/blocklayout: remove some debugging

The kbuild test robot complained that we got the printk format wrong.
Let's just kill these printks instead of fixing them as there is not
point after the initial tree algorithm debugging.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/extent_tree.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index f34f61dd0d0b..43e891b3e0b6 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -370,9 +370,6 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 	struct pnfs_block_extent *new;
 	sector_t orig_len = be->be_length;
 
-	dprintk("%s: need split for 0x%lx:0x%lx at 0x%lx\n",
-		__func__, be->be_f_offset, ext_f_end(be), split);
-
 	new = kzalloc(sizeof(*new), GFP_ATOMIC);
 	if (!new)
 		return -ENOMEM;
@@ -387,11 +384,6 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 	new->be_tag = be->be_tag;
 	new->be_device = nfs4_get_deviceid(be->be_device);
 
-	dprintk("%s: got 0x%lx:0x%lx!\n",
-		__func__, be->be_f_offset, ext_f_end(be));
-	dprintk("%s: got 0x%lx:0x%lx!\n",
-		__func__, new->be_f_offset, ext_f_end(new));
-
 	__ext_tree_insert(root, new, false);
 	return 0;
 }
-- 
cgit v1.2.3


From fd41b4748b3b6c1220f926427bf63bef456034a5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:36:29 -0700
Subject: pnfs/objlayout: fix endianess annotation in objio_alloc_deviceid_node

The kbuild test robot complained about a new sparse warning in
objio_alloc_deviceid_node, but it turns out that this was just a moved
reference to an existing variable.  Fix it to have the right big endian
annotated type.

Note that there are some other endianess issues in this file that I didn't
bother to sort out as they involve global headers.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/objlayout/objio_osd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c502f640209b..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -93,7 +93,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 	struct osd_dev *od;
 	struct osd_dev_info odi;
 	bool retry_flag = true;
-	u32 *p;
+	__be32 *p;
 	int err;
 
 	deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
-- 
cgit v1.2.3


From d4b18c3e00b8d18fbd316abe9639b91ad416e1f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:36:31 -0700
Subject: pnfs: remove GETDEVICELIST implementation

The current GETDEVICELIST implementation is buggy in that it doesn't handle
cursors correctly, and in that it returns an error if the server returns
NFSERR_NOTSUPP.  Given that there is no actual need for GETDEVICELIST,
it has various issues and might get removed for NFSv4.2 stop using it in
the blocklayout driver, and thus the Linux NFS client as whole.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c |   2 +-
 fs/nfs/nfs4proc.c                |  48 ---------------
 fs/nfs/nfs4xdr.c                 | 130 ---------------------------------------
 fs/nfs/pnfs.h                    |   5 --
 fs/nfs/pnfs_dev.c                |  30 ---------
 include/linux/nfs_xdr.h          |  11 ----
 6 files changed, 1 insertion(+), 225 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 25ba9e0e6fff..3e1f1afc6db4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -532,7 +532,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 		return -EINVAL;
 	}
 
-	return nfs4_deviceid_getdevicelist(server, fh);
+	return 0;
 }
 
 static bool
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47fa67a3a8cf..0b711227cfa5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7808,54 +7808,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 	return status;
 }
 
-/*
- * Retrieve the list of Data Server devices from the MDS.
- */
-static int _nfs4_getdevicelist(struct nfs_server *server,
-				    const struct nfs_fh *fh,
-				    struct pnfs_devicelist *devlist)
-{
-	struct nfs4_getdevicelist_args args = {
-		.fh = fh,
-		.layoutclass = server->pnfs_curr_ld->id,
-	};
-	struct nfs4_getdevicelist_res res = {
-		.devlist = devlist,
-	};
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-	};
-	int status;
-
-	dprintk("--> %s\n", __func__);
-	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
-	dprintk("<-- %s status=%d\n", __func__, status);
-	return status;
-}
-
-int nfs4_proc_getdevicelist(struct nfs_server *server,
-			    const struct nfs_fh *fh,
-			    struct pnfs_devicelist *devlist)
-{
-	struct nfs4_exception exception = { };
-	int err;
-
-	do {
-		err = nfs4_handle_exception(server,
-				_nfs4_getdevicelist(server, fh, devlist),
-				&exception);
-	} while (exception.retry);
-
-	dprintk("%s: err=%d, num_devs=%u\n", __func__,
-		err, devlist->num_devs);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 		struct pnfs_device *pdev,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f2cd957adb90..b8165eab0a32 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,17 +362,6 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
-#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
-				encode_verifier_maxsz)
-#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
-				2 /* nfs_cookie4 gdlr_cookie */ + \
-				decode_verifier_maxsz \
-				  /* verifier4 gdlr_verifier */ + \
-				1 /* gdlr_deviceid_list count */ + \
-				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
-					    NFS4_DEVICEID4_SIZE) \
-				  /* gdlr_deviceid_list */ + \
-				1 /* bool gdlr_eof */)
 #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
 				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -812,14 +801,6 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
 					 decode_sequence_maxsz + \
 					 decode_reclaim_complete_maxsz)
-#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
-				encode_sequence_maxsz + \
-				encode_putfh_maxsz + \
-				encode_getdevicelist_maxsz)
-#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
-				decode_sequence_maxsz + \
-				decode_putfh_maxsz + \
-				decode_getdevicelist_maxsz)
 #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
 				encode_sequence_maxsz +\
 				encode_getdeviceinfo_maxsz)
@@ -1929,24 +1910,6 @@ static void encode_sequence(struct xdr_stream *xdr,
 }
 
 #ifdef CONFIG_NFS_V4_1
-static void
-encode_getdevicelist(struct xdr_stream *xdr,
-		     const struct nfs4_getdevicelist_args *args,
-		     struct compound_hdr *hdr)
-{
-	__be32 *p;
-	nfs4_verifier dummy = {
-		.data = "dummmmmy",
-	};
-
-	encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
-	p = reserve_space(xdr, 16);
-	*p++ = cpu_to_be32(args->layoutclass);
-	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
-	xdr_encode_hyper(p, 0ULL);                          /* cookie */
-	encode_nfs4_verifier(xdr, &dummy);
-}
-
 static void
 encode_getdeviceinfo(struct xdr_stream *xdr,
 		     const struct nfs4_getdeviceinfo_args *args,
@@ -2900,24 +2863,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
-/*
- * Encode GETDEVICELIST request
- */
-static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
-				       struct xdr_stream *xdr,
-				       struct nfs4_getdevicelist_args *args)
-{
-	struct compound_hdr hdr = {
-		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-	};
-
-	encode_compound_hdr(xdr, req, &hdr);
-	encode_sequence(xdr, &args->seq_args, &hdr);
-	encode_putfh(xdr, args->fh, &hdr);
-	encode_getdevicelist(xdr, args, &hdr);
-	encode_nops(&hdr);
-}
-
 /*
  * Encode GETDEVICEINFO request
  */
@@ -5773,54 +5718,6 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- * TODO: Need to handle case when EOF != true;
- */
-static int decode_getdevicelist(struct xdr_stream *xdr,
-				struct pnfs_devicelist *res)
-{
-	__be32 *p;
-	int status, i;
-	nfs4_verifier verftemp;
-
-	status = decode_op_hdr(xdr, OP_GETDEVICELIST);
-	if (status)
-		return status;
-
-	p = xdr_inline_decode(xdr, 8 + 8 + 4);
-	if (unlikely(!p))
-		goto out_overflow;
-
-	/* TODO: Skip cookie for now */
-	p += 2;
-
-	/* Read verifier */
-	p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
-
-	res->num_devs = be32_to_cpup(p);
-
-	dprintk("%s: num_dev %d\n", __func__, res->num_devs);
-
-	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
-		printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
-				__func__, res->num_devs);
-		return -EIO;
-	}
-
-	p = xdr_inline_decode(xdr,
-			      res->num_devs * NFS4_DEVICEID4_SIZE + 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	for (i = 0; i < res->num_devs; i++)
-		p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
-					    NFS4_DEVICEID4_SIZE);
-	res->eof = be32_to_cpup(p);
-	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
-}
-
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
 				struct pnfs_device *pdev)
 {
@@ -7104,32 +7001,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
 	return status;
 }
 
-/*
- * Decode GETDEVICELIST response
- */
-static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
-				      struct xdr_stream *xdr,
-				      struct nfs4_getdevicelist_res *res)
-{
-	struct compound_hdr hdr;
-	int status;
-
-	dprintk("encoding getdevicelist!\n");
-
-	status = decode_compound_hdr(xdr, &hdr);
-	if (status != 0)
-		goto out;
-	status = decode_sequence(xdr, &res->seq_res, rqstp);
-	if (status != 0)
-		goto out;
-	status = decode_putfh(xdr);
-	if (status != 0)
-		goto out;
-	status = decode_getdevicelist(xdr, res->devlist);
-out:
-	return status;
-}
-
 /*
  * Decode GETDEVINFO response
  */
@@ -7498,7 +7369,6 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name),
 	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
 	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
-	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
 	PROC(BIND_CONN_TO_SESSION,
 			enc_bind_conn_to_session, dec_bind_conn_to_session),
 	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d84fe29fc88b..b10f12fc6818 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -180,9 +180,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
 /* nfs4proc.c */
-extern int nfs4_proc_getdevicelist(struct nfs_server *server,
-				   const struct nfs_fh *fh,
-				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev,
 				   struct rpc_cred *cred);
@@ -277,8 +274,6 @@ bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
 void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
 bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
-int nfs4_deviceid_getdevicelist(struct nfs_server *server,
-		const struct nfs_fh *fh);
 
 static inline struct nfs4_deviceid_node *
 nfs4_get_deviceid(struct nfs4_deviceid_node *d)
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 82c2836fdb38..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -358,33 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 	}
 	rcu_read_unlock();
 }
-
-int
-nfs4_deviceid_getdevicelist(struct nfs_server *server,
-		const struct nfs_fh *fh)
-{
-	struct pnfs_devicelist *dlist;
-	struct nfs4_deviceid_node *d;
-	int error = 0, i;
-
-	dlist = kzalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
-	if (!dlist)
-		return -ENOMEM;
-
-	while (!dlist->eof) {
-		error = nfs4_proc_getdevicelist(server, fh, dlist);
-		if (error)
-			break;
-
-		for (i = 0; i < dlist->num_devs; i++) {
-			d = nfs4_find_get_deviceid(server, &dlist->dev_id[i],
-					NULL, GFP_NOFS);
-			if (d)
-				nfs4_put_deviceid_node(d);
-		}
-	}
-
-	kfree(dlist);
-	return error;
-}
-EXPORT_SYMBOL_GPL(nfs4_deviceid_getdevicelist);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f4092c6b90fb..7ae249ccb78d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -252,17 +252,6 @@ struct nfs4_layoutget {
 	gfp_t gfp_flags;
 };
 
-struct nfs4_getdevicelist_args {
-	struct nfs4_sequence_args seq_args;
-	const struct nfs_fh *fh;
-	u32 layoutclass;
-};
-
-struct nfs4_getdevicelist_res {
-	struct nfs4_sequence_res seq_res;
-	struct pnfs_devicelist *devlist;
-};
-
 struct nfs4_getdeviceinfo_args {
 	struct nfs4_sequence_args seq_args;
 	struct pnfs_device *pdev;
-- 
cgit v1.2.3


From 34dc93c2fc04da0d01acf8a1660b4ab276208af7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:36:30 -0700
Subject: pnfs/blocklayout: allocate separate pages for the layoutcommit
 payload

Instead of overflowing the XDR send buffer with our extent list allocate
pages and pre-encode the layoutupdate payload into them.  We optimistically
allocate a single page use alloc_page and only switch to vmalloc when we
have more extents outstanding.  Currently there is only a single testcase
(xfstests generic/113) which can reproduce large enough extent lists for
this to occur.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c |  15 ++----
 fs/nfs/blocklayout/blocklayout.h |   8 +--
 fs/nfs/blocklayout/extent_tree.c | 102 +++++++++++++++++++++++++++++++--------
 3 files changed, 91 insertions(+), 34 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3e1f1afc6db4..cf10a6e291e4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -500,21 +500,16 @@ bl_return_range(struct pnfs_layout_hdr *lo,
 	err = ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
 }
 
-static void
-bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
-		       const struct nfs4_layoutcommit_args *arg)
+static int
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
 {
-	dprintk("%s enter\n", __func__);
-	ext_tree_encode_commit(BLK_LO2EXT(lo), xdr);
+	return ext_tree_prepare_commit(arg);
 }
 
 static void
 bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
 {
-	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
-
-	dprintk("%s enter\n", __func__);
-	ext_tree_mark_committed(BLK_LO2EXT(lo), lcdata->res.status);
+	ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
 }
 
 static int
@@ -670,7 +665,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.alloc_lseg			= bl_alloc_lseg,
 	.free_lseg			= bl_free_lseg,
 	.return_range			= bl_return_range,
-	.encode_layoutcommit		= bl_encode_layoutcommit,
+	.prepare_layoutcommit		= bl_prepare_layoutcommit,
 	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
 	.set_layoutdriver		= bl_set_layoutdriver,
 	.alloc_deviceid_node		= bl_alloc_deviceid_node,
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 19fae5e4c90b..9757f3eabdd2 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -72,6 +72,9 @@ struct pnfs_block_extent {
 	unsigned int	be_tag;
 };
 
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
 struct pnfs_block_layout {
 	struct pnfs_layout_hdr	bl_layout;
 	struct rb_root		bl_ext_rw;
@@ -129,8 +132,7 @@ int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 		sector_t len);
 bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 		struct pnfs_block_extent *ret, bool rw);
-int ext_tree_encode_commit(struct pnfs_block_layout *bl,
-		struct xdr_stream *xdr);
-void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 43e891b3e0b6..1b6009ee75ce 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,19 +462,25 @@ out:
 	return err;
 }
 
-int
-ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+		size_t buffer_size)
 {
-	struct pnfs_block_extent *be;
-	unsigned int count = 0;
-	__be32 *p, *xdr_start;
-	int ret = 0;
+	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
 
-	dprintk("%s enter\n", __func__);
+		for (i = 0; i < nr_pages; i++)
+			put_page(arg->layoutupdate_pages[i]);
+		kfree(arg->layoutupdate_pages);
+	} else {
+		put_page(arg->layoutupdate_page);
+	}
+}
 
-	xdr_start = xdr_reserve_space(xdr, 8);
-	if (!xdr_start)
-		return -ENOSPC;
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+		size_t buffer_size, size_t *count)
+{
+	struct pnfs_block_extent *be;
+	int ret = 0;
 
 	spin_lock(&bl->bl_ext_lock);
 	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
@@ -482,12 +488,11 @@ ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
 		    be->be_tag != EXTENT_WRITTEN)
 			continue;
 
-		p = xdr_reserve_space(xdr, 7 * sizeof(__be32) +
-					NFS4_DEVICEID4_SIZE);
-		if (!p) {
-			printk("%s: out of space for extent list\n", __func__);
+		(*count)++;
+		if (*count * BL_EXTENT_SIZE > buffer_size) {
+			/* keep counting.. */
 			ret = -ENOSPC;
-			break;
+			continue;
 		}
 
 		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
@@ -498,25 +503,80 @@ ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
 		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
 
 		be->be_tag = EXTENT_COMMITTING;
-		count++;
 	}
 	spin_unlock(&bl->bl_ext_lock);
 
-	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
-	xdr_start[1] = cpu_to_be32(count);
-
-	dprintk("%s found %i ranges\n", __func__, count);
 	return ret;
 }
 
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	size_t count = 0, buffer_size = PAGE_SIZE;
+	__be32 *start_p;
+	int ret;
+
+	dprintk("%s enter\n", __func__);
+
+	arg->layoutupdate_page = alloc_page(GFP_NOFS);
+	if (!arg->layoutupdate_page)
+		return -ENOMEM;
+	start_p = page_address(arg->layoutupdate_page);
+	arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	if (unlikely(ret)) {
+		ext_tree_free_commitdata(arg, buffer_size);
+
+		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		count = 0;
+
+		arg->layoutupdate_pages =
+			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+				sizeof(struct page *), GFP_NOFS);
+		if (!arg->layoutupdate_pages)
+			return -ENOMEM;
+
+		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+		if (!start_p) {
+			kfree(arg->layoutupdate_pages);
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	*start_p = cpu_to_be32(count);
+	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+		__be32 *p = start_p;
+		int i = 0;
+
+		for (p = start_p;
+		     p < start_p + arg->layoutupdate_len;
+		     p += PAGE_SIZE) {
+			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+		}
+	}
+
+	dprintk("%s found %zu ranges\n", __func__, count);
+	return 0;
+}
+
 void
-ext_tree_mark_committed(struct pnfs_block_layout *bl, int status)
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
 {
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 	struct rb_root *root = &bl->bl_ext_rw;
 	struct pnfs_block_extent *be;
 
 	dprintk("%s status %d\n", __func__, status);
 
+	ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
 	spin_lock(&bl->bl_ext_lock);
 	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
 		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
-- 
cgit v1.2.3


From 9cc475411779d635619c2d414da0769e3cbf796b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:37:24 -0700
Subject: pnfs/blocklayout: move extent processing to blocklayout.c

This isn't device(id) related, so move it into the main file.  Simple move
for now, the next commit will clean it up a bit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c    | 186 ++++++++++++++++++++++++++++++++++++
 fs/nfs/blocklayout/blocklayout.h    |   2 -
 fs/nfs/blocklayout/blocklayoutdev.c | 186 ------------------------------------
 3 files changed, 186 insertions(+), 188 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cf10a6e291e4..61a858cd54fe 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,6 +446,192 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
 	kfree(lseg);
 }
 
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+	u32 mode;	/* R or RW */
+	u64 start;	/* Expected start of next non-COW extent */
+	u64 inval;	/* Start of INVAL coverage */
+	u64 cowread;	/* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+			 struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	}
+	/* lv->mode == IOMODE_RW */
+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		if (lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+		if (be->be_f_offset > lv->start)
+			return -EIO;
+		if (be->be_f_offset < lv->inval)
+			return -EIO;
+		if (be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval = lv->inval + be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+	} else
+		return -EIO;
+}
+
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+	uint64_t s;
+
+	*rp = xdr_decode_hyper(*rp, &s);
+	if (s & 0x1ff) {
+		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+		return -1;
+	}
+	*sp = s >> SECTOR_SHIFT;
+	return 0;
+}
+
+/* XDR decode pnfs_block_layout4 structure */
+static int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int i, status = -EIO;
+	uint32_t count;
+	struct pnfs_block_extent *be = NULL, *save;
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	__be32 *p;
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> SECTOR_SHIFT,
+		.inval = lgr->range.offset >> SECTOR_SHIFT,
+		.cowread = lgr->range.offset >> SECTOR_SHIFT,
+	};
+	LIST_HEAD(extents);
+
+	dprintk("---> %s\n", __func__);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		return -ENOMEM;
+
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err;
+
+	count = be32_to_cpup(p++);
+
+	dprintk("%s enter, number of extents %i\n", __func__, count);
+	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
+	if (unlikely(!p))
+		goto out_err;
+
+	/* Decode individual extents, putting them in temporary
+	 * staging area until whole layout is decoded to make error
+	 * recovery easier.
+	 */
+	for (i = 0; i < count; i++) {
+		struct nfs4_deviceid id;
+
+		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
+		if (!be) {
+			status = -ENOMEM;
+			goto out_err;
+		}
+		memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		be->be_device =
+			nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+						lo->plh_lc_cred, gfp_flags);
+		if (!be->be_device)
+			goto out_err;
+
+		/* The next three values are read in as bytes,
+		 * but stored as 512-byte sector lengths
+		 */
+		if (decode_sector_number(&p, &be->be_f_offset) < 0)
+			goto out_err;
+		if (decode_sector_number(&p, &be->be_length) < 0)
+			goto out_err;
+		if (decode_sector_number(&p, &be->be_v_offset) < 0)
+			goto out_err;
+		be->be_state = be32_to_cpup(p++);
+		if (verify_extent(be, &lv)) {
+			dprintk("%s verify failed\n", __func__);
+			goto out_err;
+		}
+		list_add_tail(&be->be_list, &extents);
+	}
+	if (lgr->range.offset + lgr->range.length !=
+			lv.start << SECTOR_SHIFT) {
+		dprintk("%s Final length mismatch\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	/* Extents decoded properly, now try to merge them in to
+	 * existing layout extents.
+	 */
+	list_for_each_entry_safe(be, save, &extents, be_list) {
+		list_del(&be->be_list);
+
+		status = ext_tree_insert(bl, be);
+		if (status)
+			goto out_free_list;
+	}
+	status = 0;
+ out:
+	__free_page(scratch);
+	dprintk("%s returns %i\n", __func__, status);
+	return status;
+
+ out_err:
+	nfs4_put_deviceid_node(be->be_device);
+	kfree(be);
+ out_free_list:
+	while (!list_empty(&extents)) {
+		be = list_first_entry(&extents, struct pnfs_block_extent,
+				      be_list);
+		list_del(&be->be_list);
+		nfs4_put_deviceid_node(be->be_device);
+		kfree(be);
+	}
+	goto out;
+}
+
 /* We pretty much ignore lseg, and store all data layout wide, so we
  * can correctly merge.
  */
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9757f3eabdd2..00c11eb9d765 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -113,8 +113,6 @@ struct bl_msg_hdr {
 /* blocklayoutdev.c */
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-				struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 
 struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
 		struct pnfs_device *pdev, gfp_t gfp_mask);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index d6527d20c508..2b54e2940288 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -40,19 +40,6 @@
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
-static int decode_sector_number(__be32 **rp, sector_t *sp)
-{
-	uint64_t s;
-
-	*rp = xdr_decode_hyper(*rp, &s);
-	if (s & 0x1ff) {
-		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
-		return -1;
-	}
-	*sp = s >> SECTOR_SHIFT;
-	return 0;
-}
-
 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
 			 size_t mlen)
 {
@@ -183,176 +170,3 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d)
 
 	kfree(dev);
 }
-
-/* Tracks info needed to ensure extents in layout obey constraints of spec */
-struct layout_verification {
-	u32 mode;	/* R or RW */
-	u64 start;	/* Expected start of next non-COW extent */
-	u64 inval;	/* Start of INVAL coverage */
-	u64 cowread;	/* End of COW read coverage */
-};
-
-/* Verify the extent meets the layout requirements of the pnfs-block draft,
- * section 2.3.1.
- */
-static int verify_extent(struct pnfs_block_extent *be,
-			 struct layout_verification *lv)
-{
-	if (lv->mode == IOMODE_READ) {
-		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-		    be->be_state == PNFS_BLOCK_INVALID_DATA)
-			return -EIO;
-		if (be->be_f_offset != lv->start)
-			return -EIO;
-		lv->start += be->be_length;
-		return 0;
-	}
-	/* lv->mode == IOMODE_RW */
-	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
-		if (be->be_f_offset != lv->start)
-			return -EIO;
-		if (lv->cowread > lv->start)
-			return -EIO;
-		lv->start += be->be_length;
-		lv->inval = lv->start;
-		return 0;
-	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		if (be->be_f_offset != lv->start)
-			return -EIO;
-		lv->start += be->be_length;
-		return 0;
-	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
-		if (be->be_f_offset > lv->start)
-			return -EIO;
-		if (be->be_f_offset < lv->inval)
-			return -EIO;
-		if (be->be_f_offset < lv->cowread)
-			return -EIO;
-		/* It looks like you might want to min this with lv->start,
-		 * but you really don't.
-		 */
-		lv->inval = lv->inval + be->be_length;
-		lv->cowread = be->be_f_offset + be->be_length;
-		return 0;
-	} else
-		return -EIO;
-}
-
-/* XDR decode pnfs_block_layout4 structure */
-int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
-{
-	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
-	int i, status = -EIO;
-	uint32_t count;
-	struct pnfs_block_extent *be = NULL, *save;
-	struct xdr_stream stream;
-	struct xdr_buf buf;
-	struct page *scratch;
-	__be32 *p;
-	struct layout_verification lv = {
-		.mode = lgr->range.iomode,
-		.start = lgr->range.offset >> SECTOR_SHIFT,
-		.inval = lgr->range.offset >> SECTOR_SHIFT,
-		.cowread = lgr->range.offset >> SECTOR_SHIFT,
-	};
-	LIST_HEAD(extents);
-
-	dprintk("---> %s\n", __func__);
-
-	scratch = alloc_page(gfp_flags);
-	if (!scratch)
-		return -ENOMEM;
-
-	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
-	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
-	p = xdr_inline_decode(&stream, 4);
-	if (unlikely(!p))
-		goto out_err;
-
-	count = be32_to_cpup(p++);
-
-	dprintk("%s enter, number of extents %i\n", __func__, count);
-	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
-	if (unlikely(!p))
-		goto out_err;
-
-	/* Decode individual extents, putting them in temporary
-	 * staging area until whole layout is decoded to make error
-	 * recovery easier.
-	 */
-	for (i = 0; i < count; i++) {
-		struct nfs4_deviceid id;
-
-		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
-		if (!be) {
-			status = -ENOMEM;
-			goto out_err;
-		}
-		memcpy(&id, p, NFS4_DEVICEID4_SIZE);
-		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-
-		be->be_device =
-			nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
-						lo->plh_lc_cred, gfp_flags);
-		if (!be->be_device)
-			goto out_err;
-
-		/* The next three values are read in as bytes,
-		 * but stored as 512-byte sector lengths
-		 */
-		if (decode_sector_number(&p, &be->be_f_offset) < 0)
-			goto out_err;
-		if (decode_sector_number(&p, &be->be_length) < 0)
-			goto out_err;
-		if (decode_sector_number(&p, &be->be_v_offset) < 0)
-			goto out_err;
-		be->be_state = be32_to_cpup(p++);
-		if (verify_extent(be, &lv)) {
-			dprintk("%s verify failed\n", __func__);
-			goto out_err;
-		}
-		list_add_tail(&be->be_list, &extents);
-	}
-	if (lgr->range.offset + lgr->range.length !=
-			lv.start << SECTOR_SHIFT) {
-		dprintk("%s Final length mismatch\n", __func__);
-		be = NULL;
-		goto out_err;
-	}
-	if (lv.start < lv.cowread) {
-		dprintk("%s Final uncovered COW extent\n", __func__);
-		be = NULL;
-		goto out_err;
-	}
-	/* Extents decoded properly, now try to merge them in to
-	 * existing layout extents.
-	 */
-	list_for_each_entry_safe(be, save, &extents, be_list) {
-		list_del(&be->be_list);
-
-		status = ext_tree_insert(bl, be);
-		if (status)
-			goto out_free_list;
-	}
-	status = 0;
- out:
-	__free_page(scratch);
-	dprintk("%s returns %i\n", __func__, status);
-	return status;
-
- out_err:
-	nfs4_put_deviceid_node(be->be_device);
-	kfree(be);
- out_free_list:
-	while (!list_empty(&extents)) {
-		be = list_first_entry(&extents, struct pnfs_block_extent,
-				      be_list);
-		list_del(&be->be_list);
-		nfs4_put_deviceid_node(be->be_device);
-		kfree(be);
-	}
-	goto out;
-}
-- 
cgit v1.2.3


From ca0fe1dfa5acac6ec4ef5820d2eb5460b02648d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:37:25 -0700
Subject: pnfs/blocklayout: refactor extent processing

Factor out a helper for all per-extent work, and merge the now trivial
functions for lseg allocation and parsing.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 207 ++++++++++++++++++++-------------------
 1 file changed, 105 insertions(+), 102 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 61a858cd54fe..76ec017a6f0a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -513,144 +513,147 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
 	return 0;
 }
 
-/* XDR decode pnfs_block_layout4 structure */
 static int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+		struct layout_verification *lv, struct list_head *extents,
+		gfp_t gfp_mask)
 {
-	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
-	int i, status = -EIO;
-	uint32_t count;
-	struct pnfs_block_extent *be = NULL, *save;
-	struct xdr_stream stream;
-	struct xdr_buf buf;
-	struct page *scratch;
+	struct pnfs_block_extent *be;
+	struct nfs4_deviceid id;
+	int error;
 	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+	if (!p)
+		return -EIO;
+
+	be = kzalloc(sizeof(*be), GFP_NOFS);
+	if (!be)
+		return -ENOMEM;
+
+	memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+	error = -EIO;
+	be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+						lo->plh_lc_cred, gfp_mask);
+	if (!be->be_device)
+		goto out_free_be;
+
+	/*
+	 * The next three values are read in as bytes, but stored in the
+	 * extent structure in 512-byte granularity.
+	 */
+	if (decode_sector_number(&p, &be->be_f_offset) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_length) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_v_offset) < 0)
+		goto out_put_deviceid;
+	be->be_state = be32_to_cpup(p++);
+
+	error = verify_extent(be, lv);
+	if (error) {
+		dprintk("%s: extent verification failed\n", __func__);
+		goto out_put_deviceid;
+	}
+
+	list_add_tail(&be->be_list, extents);
+	return 0;
+
+out_put_deviceid:
+	nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+	kfree(be);
+	return error;
+}
+
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+		gfp_t gfp_mask)
+{
 	struct layout_verification lv = {
 		.mode = lgr->range.iomode,
 		.start = lgr->range.offset >> SECTOR_SHIFT,
 		.inval = lgr->range.offset >> SECTOR_SHIFT,
 		.cowread = lgr->range.offset >> SECTOR_SHIFT,
 	};
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	struct pnfs_layout_segment *lseg;
+	struct xdr_buf buf;
+	struct xdr_stream xdr;
+	struct page *scratch;
+	int status, i;
+	uint32_t count;
+	__be32 *p;
 	LIST_HEAD(extents);
 
 	dprintk("---> %s\n", __func__);
 
-	scratch = alloc_page(gfp_flags);
+	lseg = kzalloc(sizeof(*lseg), gfp_mask);
+	if (!lseg)
+		return ERR_PTR(-ENOMEM);
+
+	status = -ENOMEM;
+	scratch = alloc_page(gfp_mask);
 	if (!scratch)
-		return -ENOMEM;
+		goto out;
 
-	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
-	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+	xdr_init_decode_pages(&xdr, &buf,
+			lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
 
-	p = xdr_inline_decode(&stream, 4);
+	status = -EIO;
+	p = xdr_inline_decode(&xdr, 4);
 	if (unlikely(!p))
-		goto out_err;
+		goto out_free_scratch;
 
 	count = be32_to_cpup(p++);
+	dprintk("%s: number of extents %d\n", __func__, count);
 
-	dprintk("%s enter, number of extents %i\n", __func__, count);
-	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
-	if (unlikely(!p))
-		goto out_err;
-
-	/* Decode individual extents, putting them in temporary
-	 * staging area until whole layout is decoded to make error
-	 * recovery easier.
+	/*
+	 * Decode individual extents, putting them in temporary staging area
+	 * until whole layout is decoded to make error recovery easier.
 	 */
 	for (i = 0; i < count; i++) {
-		struct nfs4_deviceid id;
-
-		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
-		if (!be) {
-			status = -ENOMEM;
-			goto out_err;
-		}
-		memcpy(&id, p, NFS4_DEVICEID4_SIZE);
-		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-
-		be->be_device =
-			nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
-						lo->plh_lc_cred, gfp_flags);
-		if (!be->be_device)
-			goto out_err;
-
-		/* The next three values are read in as bytes,
-		 * but stored as 512-byte sector lengths
-		 */
-		if (decode_sector_number(&p, &be->be_f_offset) < 0)
-			goto out_err;
-		if (decode_sector_number(&p, &be->be_length) < 0)
-			goto out_err;
-		if (decode_sector_number(&p, &be->be_v_offset) < 0)
-			goto out_err;
-		be->be_state = be32_to_cpup(p++);
-		if (verify_extent(be, &lv)) {
-			dprintk("%s verify failed\n", __func__);
-			goto out_err;
-		}
-		list_add_tail(&be->be_list, &extents);
+		status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+		if (status)
+			goto process_extents;
 	}
+
 	if (lgr->range.offset + lgr->range.length !=
 			lv.start << SECTOR_SHIFT) {
 		dprintk("%s Final length mismatch\n", __func__);
-		be = NULL;
-		goto out_err;
+		status = -EIO;
+		goto process_extents;
 	}
+
 	if (lv.start < lv.cowread) {
 		dprintk("%s Final uncovered COW extent\n", __func__);
-		be = NULL;
-		goto out_err;
-	}
-	/* Extents decoded properly, now try to merge them in to
-	 * existing layout extents.
-	 */
-	list_for_each_entry_safe(be, save, &extents, be_list) {
-		list_del(&be->be_list);
-
-		status = ext_tree_insert(bl, be);
-		if (status)
-			goto out_free_list;
+		status = -EIO;
 	}
-	status = 0;
- out:
-	__free_page(scratch);
-	dprintk("%s returns %i\n", __func__, status);
-	return status;
 
- out_err:
-	nfs4_put_deviceid_node(be->be_device);
-	kfree(be);
- out_free_list:
+process_extents:
 	while (!list_empty(&extents)) {
-		be = list_first_entry(&extents, struct pnfs_block_extent,
-				      be_list);
+		struct pnfs_block_extent *be =
+			list_first_entry(&extents, struct pnfs_block_extent,
+					 be_list);
 		list_del(&be->be_list);
-		nfs4_put_deviceid_node(be->be_device);
-		kfree(be);
-	}
-	goto out;
-}
 
-/* We pretty much ignore lseg, and store all data layout wide, so we
- * can correctly merge.
- */
-static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
-						 struct nfs4_layoutget_res *lgr,
-						 gfp_t gfp_flags)
-{
-	struct pnfs_layout_segment *lseg;
-	int status;
+		if (!status)
+			status = ext_tree_insert(bl, be);
 
-	dprintk("%s enter\n", __func__);
-	lseg = kzalloc(sizeof(*lseg), gfp_flags);
-	if (!lseg)
-		return ERR_PTR(-ENOMEM);
-	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+		if (status) {
+			nfs4_put_deviceid_node(be->be_device);
+			kfree(be);
+		}
+	}
+
+out_free_scratch:
+	__free_page(scratch);
+out:
+	dprintk("%s returns %d\n", __func__, status);
 	if (status) {
-		/* We don't want to call the full-blown bl_free_lseg,
-		 * since on error extents were not touched.
-		 */
 		kfree(lseg);
 		return ERR_PTR(status);
 	}
-- 
cgit v1.2.3


From 871760ce97a9a544cfb1ae4589598b25b8570a25 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:37:26 -0700
Subject: pnfs/blocklayout: move all rpc_pipefs related code into a single file

Create a file to house all the rpc_pipefs boilerplate code instead of
sprinkling it over a few files.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/Makefile         |   4 +-
 fs/nfs/blocklayout/blocklayout.c    | 145 +--------------
 fs/nfs/blocklayout/blocklayout.h    |  19 +-
 fs/nfs/blocklayout/blocklayoutdev.c | 172 -----------------
 fs/nfs/blocklayout/blocklayoutdm.c  |  84 ---------
 fs/nfs/blocklayout/rpc_pipefs.c     | 362 ++++++++++++++++++++++++++++++++++++
 6 files changed, 378 insertions(+), 408 deletions(-)
 delete mode 100644 fs/nfs/blocklayout/blocklayoutdev.c
 delete mode 100644 fs/nfs/blocklayout/blocklayoutdm.c
 create mode 100644 fs/nfs/blocklayout/rpc_pipefs.c

diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index 3fa5ec780a8e..e177026e0119 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,5 +2,5 @@
 # Makefile for the pNFS block layout driver kernel module
 #
 obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
-	extent_tree.o
+
+blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 76ec017a6f0a..65a6b19b17a2 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -863,132 +863,6 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.pg_write_ops			= &bl_pg_write_ops,
 };
 
-static const struct rpc_pipe_ops bl_upcall_ops = {
-	.upcall		= rpc_pipe_generic_upcall,
-	.downcall	= bl_pipe_downcall,
-	.destroy_msg	= bl_pipe_destroy_msg,
-};
-
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
-					    struct rpc_pipe *pipe)
-{
-	struct dentry *dir, *dentry;
-
-	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
-	if (dir == NULL)
-		return ERR_PTR(-ENOENT);
-	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
-	dput(dir);
-	return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
-					  struct rpc_pipe *pipe)
-{
-	if (pipe->dentry)
-		rpc_unlink(pipe->dentry);
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
-			   void *ptr)
-{
-	struct super_block *sb = ptr;
-	struct net *net = sb->s_fs_info;
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct dentry *dentry;
-	int ret = 0;
-
-	if (!try_module_get(THIS_MODULE))
-		return 0;
-
-	if (nn->bl_device_pipe == NULL) {
-		module_put(THIS_MODULE);
-		return 0;
-	}
-
-	switch (event) {
-	case RPC_PIPEFS_MOUNT:
-		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
-		if (IS_ERR(dentry)) {
-			ret = PTR_ERR(dentry);
-			break;
-		}
-		nn->bl_device_pipe->dentry = dentry;
-		break;
-	case RPC_PIPEFS_UMOUNT:
-		if (nn->bl_device_pipe->dentry)
-			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
-		break;
-	default:
-		ret = -ENOTSUPP;
-		break;
-	}
-	module_put(THIS_MODULE);
-	return ret;
-}
-
-static struct notifier_block nfs4blocklayout_block = {
-	.notifier_call = rpc_pipefs_event,
-};
-
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
-						   struct rpc_pipe *pipe)
-{
-	struct super_block *pipefs_sb;
-	struct dentry *dentry;
-
-	pipefs_sb = rpc_get_sb_net(net);
-	if (!pipefs_sb)
-		return NULL;
-	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
-	rpc_put_sb_net(net);
-	return dentry;
-}
-
-static void nfs4blocklayout_unregister_net(struct net *net,
-					   struct rpc_pipe *pipe)
-{
-	struct super_block *pipefs_sb;
-
-	pipefs_sb = rpc_get_sb_net(net);
-	if (pipefs_sb) {
-		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
-		rpc_put_sb_net(net);
-	}
-}
-
-static int nfs4blocklayout_net_init(struct net *net)
-{
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct dentry *dentry;
-
-	init_waitqueue_head(&nn->bl_wq);
-	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
-	if (IS_ERR(nn->bl_device_pipe))
-		return PTR_ERR(nn->bl_device_pipe);
-	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
-	if (IS_ERR(dentry)) {
-		rpc_destroy_pipe_data(nn->bl_device_pipe);
-		return PTR_ERR(dentry);
-	}
-	nn->bl_device_pipe->dentry = dentry;
-	return 0;
-}
-
-static void nfs4blocklayout_net_exit(struct net *net)
-{
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
-	rpc_destroy_pipe_data(nn->bl_device_pipe);
-	nn->bl_device_pipe = NULL;
-}
-
-static struct pernet_operations nfs4blocklayout_net_ops = {
-	.init = nfs4blocklayout_net_init,
-	.exit = nfs4blocklayout_net_exit,
-};
-
 static int __init nfs4blocklayout_init(void)
 {
 	int ret;
@@ -998,20 +872,14 @@ static int __init nfs4blocklayout_init(void)
 	ret = pnfs_register_layoutdriver(&blocklayout_type);
 	if (ret)
 		goto out;
-
-	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
-	if (ret)
-		goto out_remove;
-	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	ret = bl_init_pipefs();
 	if (ret)
-		goto out_notifier;
-out:
-	return ret;
+		goto out_unregister;
+	return 0;
 
-out_notifier:
-	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-out_remove:
+out_unregister:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
 	return ret;
 }
 
@@ -1020,8 +888,7 @@ static void __exit nfs4blocklayout_exit(void)
 	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
 	       __func__);
 
-	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+	bl_cleanup_pipefs();
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 }
 
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 00c11eb9d765..c98d98a62664 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -110,17 +110,6 @@ struct bl_msg_hdr {
 #define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
-/* blocklayoutdev.c */
-ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-
-struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
-		struct pnfs_device *pdev, gfp_t gfp_mask);
-void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
-
-/* blocklayoutdm.c */
-void bl_dm_remove(struct net *net, dev_t dev);
-
 /* extent_tree.c */
 int ext_tree_insert(struct pnfs_block_layout *bl,
 		struct pnfs_block_extent *new);
@@ -133,4 +122,12 @@ bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
 void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
 
+/* rpc_pipefs.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
+
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 2b54e2940288..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- *  linux/fs/nfs/blocklayout/blocklayoutdev.c
- *
- *  Device operations for the pnfs nfs4 file layout driver.
- *
- *  Copyright (c) 2006 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@citi.umich.edu>
- *  Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization.  if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose.  the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-#include <linux/module.h>
-#include <linux/buffer_head.h> /* __bread */
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-
-ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
-			 size_t mlen)
-{
-	struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
-					 nfs_net_id);
-
-	if (mlen != sizeof (struct bl_dev_msg))
-		return -EINVAL;
-
-	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
-		return -EFAULT;
-
-	wake_up(&nn->bl_wq);
-
-	return mlen;
-}
-
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-{
-	struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
-
-	if (msg->errno >= 0)
-		return;
-	wake_up(bl_pipe_msg->bl_wq);
-}
-
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct nfs4_deviceid_node *
-bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
-		gfp_t gfp_mask)
-{
-	struct pnfs_block_dev *rv;
-	struct block_device *bd;
-	struct bl_pipe_msg bl_pipe_msg;
-	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
-	struct bl_msg_hdr bl_msg = {
-		.type = BL_DEVICE_MOUNT,
-		.totallen = dev->mincount,
-	};
-	uint8_t *dataptr;
-	DECLARE_WAITQUEUE(wq, current);
-	int offset, len, i, rc;
-	struct net *net = server->nfs_client->cl_net;
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct bl_dev_msg *reply = &nn->bl_mount_reply;
-
-	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
-	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
-		dev->mincount);
-
-	bl_pipe_msg.bl_wq = &nn->bl_wq;
-	memset(msg, 0, sizeof(*msg));
-	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
-	if (!msg->data)
-		goto out;
-
-	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
-	dataptr = (uint8_t *) msg->data;
-	len = dev->mincount;
-	offset = sizeof(bl_msg);
-	for (i = 0; len > 0; i++) {
-		memcpy(&dataptr[offset], page_address(dev->pages[i]),
-				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		offset += PAGE_CACHE_SIZE;
-	}
-	msg->len = sizeof(bl_msg) + dev->mincount;
-
-	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
-	add_wait_queue(&nn->bl_wq, &wq);
-	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
-	if (rc < 0) {
-		remove_wait_queue(&nn->bl_wq, &wq);
-		goto out;
-	}
-
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule();
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&nn->bl_wq, &wq);
-
-	if (reply->status != BL_DEVICE_REQUEST_PROC) {
-		printk(KERN_WARNING "%s failed to decode device: %d\n",
-			__func__, reply->status);
-		goto out;
-	}
-
-	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
-			       FMODE_READ, NULL);
-	if (IS_ERR(bd)) {
-		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
-			__func__, reply->major, reply->minor,
-			PTR_ERR(bd));
-		goto out;
-	}
-
-	rv = kzalloc(sizeof(*rv), gfp_mask);
-	if (!rv)
-		goto out;
-
-	nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
-	rv->d_bdev = bd;
-
-	dprintk("%s Created device %s with bd_block_size %u\n",
-		__func__,
-		bd->bd_disk->disk_name,
-		bd->bd_block_size);
-
-	kfree(msg->data);
-	return &rv->d_node;
-
-out:
-	kfree(msg->data);
-	return NULL;
-}
-
-void
-bl_free_deviceid_node(struct nfs4_deviceid_node *d)
-{
-	struct pnfs_block_dev *dev =
-		container_of(d, struct pnfs_block_dev, d_node);
-	struct net *net = d->nfs_client->cl_net;
-
-	blkdev_put(dev->d_bdev, FMODE_READ);
-	bl_dm_remove(net, dev->d_bdev->bd_dev);
-
-	kfree(dev);
-}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index abc2e9e45610..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- *  linux/fs/nfs/blocklayout/blocklayoutdm.c
- *
- *  Module for the NFSv4.1 pNFS block layout driver.
- *
- *  Copyright (c) 2007 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Fred Isaman <iisaman@umich.edu>
- *  Andy Adamson <andros@citi.umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization.  if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose.  the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-#include <linux/sched.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-
-void bl_dm_remove(struct net *net, dev_t dev)
-{
-	struct bl_pipe_msg bl_pipe_msg;
-	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
-	struct bl_dev_msg bl_umount_request;
-	struct bl_msg_hdr bl_msg = {
-		.type = BL_DEVICE_UMOUNT,
-		.totallen = sizeof(bl_umount_request),
-	};
-	uint8_t *dataptr;
-	DECLARE_WAITQUEUE(wq, current);
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	dprintk("Entering %s\n", __func__);
-
-	bl_pipe_msg.bl_wq = &nn->bl_wq;
-	memset(msg, 0, sizeof(*msg));
-	msg->len = sizeof(bl_msg) + bl_msg.totallen;
-	msg->data = kzalloc(msg->len, GFP_NOFS);
-	if (!msg->data)
-		goto out;
-
-	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
-	bl_umount_request.major = MAJOR(dev);
-	bl_umount_request.minor = MINOR(dev);
-
-	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
-	dataptr = (uint8_t *) msg->data;
-	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
-	add_wait_queue(&nn->bl_wq, &wq);
-	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
-		remove_wait_queue(&nn->bl_wq, &wq);
-		goto out;
-	}
-
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule();
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
-	kfree(msg->data);
-}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..bfb04861eb61
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,362 @@
+/*
+ *  Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+static void bl_dm_remove(struct net *net, dev_t dev)
+{
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_dev_msg bl_umount_request;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_UMOUNT,
+		.totallen = sizeof(bl_umount_request),
+	};
+	uint8_t *dataptr;
+	DECLARE_WAITQUEUE(wq, current);
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	dprintk("Entering %s\n", __func__);
+
+	bl_pipe_msg.bl_wq = &nn->bl_wq;
+	memset(msg, 0, sizeof(*msg));
+	msg->len = sizeof(bl_msg) + bl_msg.totallen;
+	msg->data = kzalloc(msg->len, GFP_NOFS);
+	if (!msg->data)
+		goto out;
+
+	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
+	bl_umount_request.major = MAJOR(dev);
+	bl_umount_request.minor = MINOR(dev);
+
+	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
+	dataptr = (uint8_t *) msg->data;
+	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
+
+	add_wait_queue(&nn->bl_wq, &wq);
+	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
+		remove_wait_queue(&nn->bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&nn->bl_wq, &wq);
+
+out:
+	kfree(msg->data);
+}
+
+/*
+ * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
+ */
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
+		gfp_t gfp_mask)
+{
+	struct pnfs_block_dev *rv;
+	struct block_device *bd;
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_MOUNT,
+		.totallen = dev->mincount,
+	};
+	uint8_t *dataptr;
+	DECLARE_WAITQUEUE(wq, current);
+	int offset, len, i, rc;
+	struct net *net = server->nfs_client->cl_net;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct bl_dev_msg *reply = &nn->bl_mount_reply;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+		dev->mincount);
+
+	bl_pipe_msg.bl_wq = &nn->bl_wq;
+	memset(msg, 0, sizeof(*msg));
+	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
+	if (!msg->data)
+		goto out;
+
+	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
+	dataptr = (uint8_t *) msg->data;
+	len = dev->mincount;
+	offset = sizeof(bl_msg);
+	for (i = 0; len > 0; i++) {
+		memcpy(&dataptr[offset], page_address(dev->pages[i]),
+				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		offset += PAGE_CACHE_SIZE;
+	}
+	msg->len = sizeof(bl_msg) + dev->mincount;
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	add_wait_queue(&nn->bl_wq, &wq);
+	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+	if (rc < 0) {
+		remove_wait_queue(&nn->bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&nn->bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		printk(KERN_WARNING "%s failed to decode device: %d\n",
+			__func__, reply->status);
+		goto out;
+	}
+
+	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
+			       FMODE_READ, NULL);
+	if (IS_ERR(bd)) {
+		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
+			__func__, reply->major, reply->minor,
+			PTR_ERR(bd));
+		goto out;
+	}
+
+	rv = kzalloc(sizeof(*rv), gfp_mask);
+	if (!rv)
+		goto out;
+
+	nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
+	rv->d_bdev = bd;
+
+	dprintk("%s Created device %s with bd_block_size %u\n",
+		__func__,
+		bd->bd_disk->disk_name,
+		bd->bd_block_size);
+
+	kfree(msg->data);
+	return &rv->d_node;
+
+out:
+	kfree(msg->data);
+	return NULL;
+}
+
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *dev =
+		container_of(d, struct pnfs_block_dev, d_node);
+	struct net *net = d->nfs_client->cl_net;
+
+	blkdev_put(dev->d_bdev, FMODE_READ);
+	bl_dm_remove(net, dev->d_bdev->bd_dev);
+
+	kfree(dev);
+}
+
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+			 size_t mlen)
+{
+	struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+					 nfs_net_id);
+
+	if (mlen != sizeof (struct bl_dev_msg))
+		return -EINVAL;
+
+	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+		return -EFAULT;
+
+	wake_up(&nn->bl_wq);
+
+	return mlen;
+}
+
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct bl_pipe_msg *bl_pipe_msg =
+		container_of(msg, struct bl_pipe_msg, msg);
+
+	if (msg->errno >= 0)
+		return;
+	wake_up(bl_pipe_msg->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.downcall	= bl_pipe_downcall,
+	.destroy_msg	= bl_pipe_destroy_msg,
+};
+
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+					    struct rpc_pipe *pipe)
+{
+	struct dentry *dir, *dentry;
+
+	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+	if (dir == NULL)
+		return ERR_PTR(-ENOENT);
+	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+	dput(dir);
+	return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+					  struct rpc_pipe *pipe)
+{
+	if (pipe->dentry)
+		rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+			   void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (nn->bl_device_pipe == NULL) {
+		module_put(THIS_MODULE);
+		return 0;
+	}
+
+	switch (event) {
+	case RPC_PIPEFS_MOUNT:
+		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			break;
+		}
+		nn->bl_device_pipe->dentry = dentry;
+		break;
+	case RPC_PIPEFS_UMOUNT:
+		if (nn->bl_device_pipe->dentry)
+			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+	.notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+						   struct rpc_pipe *pipe)
+{
+	struct super_block *pipefs_sb;
+	struct dentry *dentry;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (!pipefs_sb)
+		return NULL;
+	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+	rpc_put_sb_net(net);
+	return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+					   struct rpc_pipe *pipe)
+{
+	struct super_block *pipefs_sb;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (pipefs_sb) {
+		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+		rpc_put_sb_net(net);
+	}
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+
+	init_waitqueue_head(&nn->bl_wq);
+	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+	if (IS_ERR(nn->bl_device_pipe))
+		return PTR_ERR(nn->bl_device_pipe);
+	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+	if (IS_ERR(dentry)) {
+		rpc_destroy_pipe_data(nn->bl_device_pipe);
+		return PTR_ERR(dentry);
+	}
+	nn->bl_device_pipe->dentry = dentry;
+	return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+	rpc_destroy_pipe_data(nn->bl_device_pipe);
+	nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+	.init = nfs4blocklayout_net_init,
+	.exit = nfs4blocklayout_net_exit,
+};
+
+int __init bl_init_pipefs(void)
+{
+	int ret;
+
+	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+	if (ret)
+		goto out;
+	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	if (ret)
+		goto out_unregister_notifier;
+	return 0;
+
+out_unregister_notifier:
+	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+out:
+	return ret;
+}
+
+void __exit bl_cleanup_pipefs(void)
+{
+	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
-- 
cgit v1.2.3


From 5c83746a0cf2831d4b59f5cf99ef5fbf138564e4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:37:27 -0700
Subject: pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing

This patches moves parsing of the GETDEVICEINFO XDR to kernel space, as well
as the management of complex devices.  The reason for that is we might have
multiple outstanding complex devices after a NOTIFY_DEVICEID4_CHANGE, which
device mapper or md can't handle as they claim devices exclusively.

But as is turns out simple striping / concatenation is fairly trivial to
implement anyway, so we make our life simpler by reducing the reliance
on blkmapd.  For now we still use blkmapd by feeding it synthetic SIMPLE
device XDR to translate device signatures to device numbers, but in the
long runs I have plans to eliminate it entirely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/Makefile      |   2 +-
 fs/nfs/blocklayout/blocklayout.c |  92 ++++++----
 fs/nfs/blocklayout/blocklayout.h |  83 ++++++++-
 fs/nfs/blocklayout/dev.c         | 360 +++++++++++++++++++++++++++++++++++++++
 fs/nfs/blocklayout/rpc_pipefs.c  | 141 ++++-----------
 5 files changed, 530 insertions(+), 148 deletions(-)
 create mode 100644 fs/nfs/blocklayout/dev.c

diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index e177026e0119..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -3,4 +3,4 @@
 #
 obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
 
-blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 65a6b19b17a2..c41a718854e3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -114,13 +114,10 @@ bl_submit_bio(int rw, struct bio *bio)
 	return NULL;
 }
 
-static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
-				     struct pnfs_block_extent *be,
-				     void (*end_io)(struct bio *, int err),
-				     struct parallel_io *par)
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+		void (*end_io)(struct bio *, int err), struct parallel_io *par)
 {
-	struct pnfs_block_dev *dev =
-		container_of(be->be_device, struct pnfs_block_dev, d_node);
 	struct bio *bio;
 
 	npg = min(npg, BIO_MAX_PAGES);
@@ -131,32 +128,55 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	}
 
 	if (bio) {
-		bio->bi_iter.bi_sector = isect - be->be_f_offset +
-			be->be_v_offset;
-		bio->bi_bdev = dev->d_bdev;
+		bio->bi_iter.bi_sector = disk_sector;
+		bio->bi_bdev = bdev;
 		bio->bi_end_io = end_io;
 		bio->bi_private = par;
 	}
 	return bio;
 }
 
-static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
-				      sector_t isect, struct page *page,
-				      struct pnfs_block_extent *be,
-				      void (*end_io)(struct bio *, int err),
-				      struct parallel_io *par,
-				      unsigned int offset, int len)
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+		struct page *page, struct pnfs_block_dev_map *map,
+		struct pnfs_block_extent *be,
+		void (*end_io)(struct bio *, int err),
+		struct parallel_io *par, unsigned int offset, int *len)
 {
-	isect = isect + (offset >> SECTOR_SHIFT);
+	struct pnfs_block_dev *dev =
+		container_of(be->be_device, struct pnfs_block_dev, node);
+	u64 disk_addr, end;
+
 	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
-		npg, rw, (unsigned long long)isect, offset, len);
+		npg, rw, (unsigned long long)isect, offset, *len);
+
+	/* translate to device offset */
+	isect += be->be_v_offset;
+	isect -= be->be_f_offset;
+
+	/* translate to physical disk offset */
+	disk_addr = (u64)isect << SECTOR_SHIFT;
+	if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+		if (!dev->map(dev, disk_addr, map))
+			return ERR_PTR(-EIO);
+		bio = bl_submit_bio(rw, bio);
+	}
+	disk_addr += map->disk_offset;
+	disk_addr -= map->start;
+
+	/* limit length to what the device mapping allows */
+	end = disk_addr + *len;
+	if (end >= map->start + map->len)
+		*len = map->start + map->len - disk_addr;
+
 retry:
 	if (!bio) {
-		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+		bio = bl_alloc_init_bio(npg, map->bdev,
+				disk_addr >> SECTOR_SHIFT, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, len, offset) < len) {
+	if (bio_add_page(bio, page, *len, offset) < *len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
@@ -203,6 +223,7 @@ static enum pnfs_try_status
 bl_read_pagelist(struct nfs_pgio_header *header)
 {
 	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
 	struct bio *bio = NULL;
 	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
@@ -248,28 +269,29 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 				pg_len = PAGE_CACHE_SIZE - pg_offset;
 			else
 				pg_len = bytes_left;
-
-			f_offset += pg_len;
-			bytes_left -= pg_len;
-			isect += (pg_offset >> SECTOR_SHIFT);
-			extent_length -= (pg_offset >> SECTOR_SHIFT);
 		} else {
 			BUG_ON(pg_offset != 0);
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
+		isect += (pg_offset >> SECTOR_SHIFT);
+		extent_length -= (pg_offset >> SECTOR_SHIFT);
+
 		if (is_hole(&be)) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
 			zero_user_segment(pages[i], pg_offset, pg_len);
+
+			/* invalidate map */
+			map.start = NFS4_MAX_UINT64;
 		} else {
 			bio = do_add_page_to_bio(bio,
 						 header->page_array.npages - i,
 						 READ,
-						 isect, pages[i], &be,
+						 isect, pages[i], &map, &be,
 						 bl_end_io_read, par,
-						 pg_offset, pg_len);
+						 pg_offset, &pg_len);
 			if (IS_ERR(bio)) {
 				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
@@ -278,6 +300,8 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 		}
 		isect += (pg_len >> SECTOR_SHIFT);
 		extent_length -= (pg_len >> SECTOR_SHIFT);
+		f_offset += pg_len;
+		bytes_left -= pg_len;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		header->res.eof = 1;
@@ -346,6 +370,7 @@ static enum pnfs_try_status
 bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
 	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
 	struct bio *bio = NULL;
 	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
@@ -354,6 +379,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	size_t count = header->args.count;
 	struct page **pages = header->args.pages;
 	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	unsigned int pg_len;
 	struct blk_plug plug;
 	int i;
 
@@ -387,19 +413,21 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
+		pg_len = PAGE_CACHE_SIZE;
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
-					 WRITE, isect, pages[i], &be,
+					 WRITE, isect, pages[i], &map, &be,
 					 bl_end_io_write, par,
-					 0, PAGE_CACHE_SIZE);
+					 0, &pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
-		offset += PAGE_CACHE_SIZE;
-		count -= PAGE_CACHE_SIZE;
-		isect += PAGE_CACHE_SECTORS;
-		extent_length -= PAGE_CACHE_SECTORS;
+
+		offset += pg_len;
+		count -= pg_len;
+		isect += (pg_len >> SECTOR_SHIFT);
+		extent_length -= (pg_len >> SECTOR_SHIFT);
 	}
 
 	header->res.count = header->args.count;
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c98d98a62664..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,9 +44,77 @@
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
 #define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
+struct pnfs_block_dev;
+
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+#define PNFS_BLOCK_MAX_UUIDS	4
+#define PNFS_BLOCK_MAX_DEVICES	64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN	128
+
+
+struct pnfs_block_volume {
+	enum pnfs_block_volume_type	type;
+	union {
+		struct {
+			int		len;
+			int		nr_sigs;
+			struct {
+				u64		offset;
+				u32		sig_len;
+				u8		sig[PNFS_BLOCK_UUID_LEN];
+			} sigs[PNFS_BLOCK_MAX_UUIDS];
+		} simple;
+		struct {
+			u64		start;
+			u64		len;
+			u32		volume;
+		} slice;
+		struct {
+			u32		volumes_count;
+			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
+		} concat;
+		struct {
+			u64		chunk_size;
+			u32		volumes_count;
+			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
+		} stripe;
+	};
+};
+
+struct pnfs_block_dev_map {
+	sector_t			start;
+	sector_t			len;
+
+	sector_t			disk_offset;
+	struct block_device		*bdev;
+};
+
 struct pnfs_block_dev {
-	struct nfs4_deviceid_node	d_node;
-	struct block_device		*d_bdev;
+	struct nfs4_deviceid_node	node;
+
+	u64				start;
+	u64				len;
+
+	u32				nr_children;
+	struct pnfs_block_dev		*children;
+	u64				chunk_size;
+
+	struct block_device		*bdev;
+	u64				disk_offset;
+
+	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+			struct pnfs_block_dev_map *map);
 };
 
 enum exstate4 {
@@ -110,6 +178,11 @@ struct bl_msg_hdr {
 #define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
 /* extent_tree.c */
 int ext_tree_insert(struct pnfs_block_layout *bl,
 		struct pnfs_block_extent *new);
@@ -123,10 +196,8 @@ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
 void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
 
 /* rpc_pipefs.c */
-struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
-		struct pnfs_device *pdev, gfp_t gfp_mask);
-void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
-
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+		struct pnfs_block_volume *b, gfp_t gfp_mask);
 int __init bl_init_pipefs(void);
 void __exit bl_cleanup_pipefs(void);
 
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..00f159da06ee
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	if (dev->nr_children) {
+		int i;
+
+		for (i = 0; i < dev->nr_children; i++)
+			bl_free_device(&dev->children[i]);
+		kfree(dev->children);
+	} else {
+		if (dev->bdev)
+			blkdev_put(dev->bdev, FMODE_READ);
+	}
+}
+
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *dev =
+		container_of(d, struct pnfs_block_dev, node);
+
+	bl_free_device(dev);
+	kfree(dev);
+}
+
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		if (!b->simple.nr_sigs) {
+			dprintk("no signature\n");
+			return -EIO;
+		}
+
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->concat.volumes_count = be32_to_cpup(p++);
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	map->start = dev->start;
+	map->len = dev->len;
+	map->disk_offset = dev->disk_offset;
+	map->bdev = dev->bdev;
+	return true;
+}
+
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	int i;
+
+	for (i = 0; i < dev->nr_children; i++) {
+		struct pnfs_block_dev *child = &dev->children[i];
+
+		if (child->start > offset ||
+		    child->start + child->len <= offset)
+			continue;
+
+		child->map(child, offset - child->start, map);
+		return true;
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	struct pnfs_block_dev *child;
+	u64 chunk = (offset / dev->chunk_size);
+	int chunk_idx = chunk % dev->nr_children;
+	u64 disk_offset;
+
+	if (chunk_idx > dev->nr_children) {
+		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+			__func__, chunk_idx, offset, dev->chunk_size);
+		/* error, should not happen */
+		return false;
+	}
+
+	/* truncate offset to the beginning of the stripe */
+	offset = chunk * dev->chunk_size;
+
+	/* disk offset of the stripe */
+	disk_offset = offset / dev->nr_children;
+
+	child = &dev->children[chunk_idx];
+	child->map(child, disk_offset, map);
+
+	map->start += offset;
+	map->disk_offset += disk_offset;
+	map->len = dev->chunk_size;
+	return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	dev_t dev;
+
+	dev = bl_resolve_deviceid(server, v, gfp_mask);
+	if (!dev)
+		return -EIO;
+
+	d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (IS_ERR(d->bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
+		return PTR_ERR(d->bdev);
+	}
+
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		d->bdev->bd_disk->disk_name);
+	return 0;
+}
+
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	int ret;
+
+	ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
+	if (ret)
+		return ret;
+
+	d->disk_offset = v->slice.start;
+	d->len = v->slice.len;
+	return 0;
+}
+
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->concat.volumes_count,
+			sizeof(struct pnfs_block_dev), GFP_KERNEL);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->concat.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->concat.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		d->children[i].start += len;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->map = bl_map_concat;
+	return 0;
+}
+
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), GFP_KERNEL);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	switch (volumes[idx].type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_SLICE:
+		return bl_parse_slice(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+	default:
+		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		return -EIO;
+	}
+}
+
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			  gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+	if (ret) {
+		bl_free_device(top);
+		kfree(top);
+		goto out_free_volumes;
+	}
+
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index bfb04861eb61..8d04bda2bd2e 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -34,94 +34,53 @@
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
-static void bl_dm_remove(struct net *net, dev_t dev)
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
 {
-	struct bl_pipe_msg bl_pipe_msg;
-	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
-	struct bl_dev_msg bl_umount_request;
-	struct bl_msg_hdr bl_msg = {
-		.type = BL_DEVICE_UMOUNT,
-		.totallen = sizeof(bl_umount_request),
-	};
-	uint8_t *dataptr;
-	DECLARE_WAITQUEUE(wq, current);
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	dprintk("Entering %s\n", __func__);
-
-	bl_pipe_msg.bl_wq = &nn->bl_wq;
-	memset(msg, 0, sizeof(*msg));
-	msg->len = sizeof(bl_msg) + bl_msg.totallen;
-	msg->data = kzalloc(msg->len, GFP_NOFS);
-	if (!msg->data)
-		goto out;
-
-	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
-	bl_umount_request.major = MAJOR(dev);
-	bl_umount_request.minor = MINOR(dev);
-
-	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
-	dataptr = (uint8_t *) msg->data;
-	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
-	add_wait_queue(&nn->bl_wq, &wq);
-	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
-		remove_wait_queue(&nn->bl_wq, &wq);
-		goto out;
+	int i;
+
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(b->type);
+	*p++ = cpu_to_be32(b->simple.nr_sigs);
+	for (i = 0; i < b->simple.nr_sigs; i++) {
+		p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+		p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+					 b->simple.sigs[i].sig_len);
 	}
-
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule();
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
-	kfree(msg->data);
 }
 
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct nfs4_deviceid_node *
-bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 		gfp_t gfp_mask)
 {
-	struct pnfs_block_dev *rv;
-	struct block_device *bd;
-	struct bl_pipe_msg bl_pipe_msg;
-	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
-	struct bl_msg_hdr bl_msg = {
-		.type = BL_DEVICE_MOUNT,
-		.totallen = dev->mincount,
-	};
-	uint8_t *dataptr;
-	DECLARE_WAITQUEUE(wq, current);
-	int offset, len, i, rc;
 	struct net *net = server->nfs_client->cl_net;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct bl_dev_msg *reply = &nn->bl_mount_reply;
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_msg_hdr *bl_msg;
+	DECLARE_WAITQUEUE(wq, current);
+	dev_t dev = 0;
+	int rc;
 
 	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
-	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
-		dev->mincount);
 
 	bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+	b->simple.len += 4;	/* single volume */
+	if (b->simple.len > PAGE_SIZE)
+		return -EIO;
+
 	memset(msg, 0, sizeof(*msg));
-	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
+	msg->len = sizeof(*bl_msg) + b->simple.len;
+	msg->data = kzalloc(msg->len, gfp_mask);
 	if (!msg->data)
 		goto out;
 
-	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
-	dataptr = (uint8_t *) msg->data;
-	len = dev->mincount;
-	offset = sizeof(bl_msg);
-	for (i = 0; len > 0; i++) {
-		memcpy(&dataptr[offset], page_address(dev->pages[i]),
-				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		offset += PAGE_CACHE_SIZE;
-	}
-	msg->len = sizeof(bl_msg) + dev->mincount;
+	bl_msg = msg->data;
+	bl_msg->type = BL_DEVICE_MOUNT,
+	bl_msg->totallen = b->simple.len;
+	nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
 
 	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
 	add_wait_queue(&nn->bl_wq, &wq);
@@ -142,46 +101,10 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
 		goto out;
 	}
 
-	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
-			       FMODE_READ, NULL);
-	if (IS_ERR(bd)) {
-		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
-			__func__, reply->major, reply->minor,
-			PTR_ERR(bd));
-		goto out;
-	}
-
-	rv = kzalloc(sizeof(*rv), gfp_mask);
-	if (!rv)
-		goto out;
-
-	nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
-	rv->d_bdev = bd;
-
-	dprintk("%s Created device %s with bd_block_size %u\n",
-		__func__,
-		bd->bd_disk->disk_name,
-		bd->bd_block_size);
-
-	kfree(msg->data);
-	return &rv->d_node;
-
+	dev = MKDEV(reply->major, reply->minor);
 out:
 	kfree(msg->data);
-	return NULL;
-}
-
-void
-bl_free_deviceid_node(struct nfs4_deviceid_node *d)
-{
-	struct pnfs_block_dev *dev =
-		container_of(d, struct pnfs_block_dev, d_node);
-	struct net *net = d->nfs_client->cl_net;
-
-	blkdev_put(dev->d_bdev, FMODE_READ);
-	bl_dm_remove(net, dev->d_bdev->bd_dev);
-
-	kfree(dev);
+	return dev;
 }
 
 static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
-- 
cgit v1.2.3


From 84c9dee3adc2bc49a52af74f18378a4887448288 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Sep 2014 17:37:28 -0700
Subject: pnfs: enable CB_NOTIFY_DEVICEID support

This code has been around for a while, but never was enabled, although
it is in a working shape.

Note that we implement NOTIFY_DEVICEID4_CHANGE identical to
NOTIFY_DEVICEID4_DELETE.  Given that in either case we can't do anything
but preventing further lookups of a given device ID there isn't much difference
in semantics for the two.  For the delete case the server MUST ensure that
there are no outstanding layouts, while for the change case it doesn't, but
that has little relevance to the client.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c |  3 ---
 fs/nfs/nfs4xdr.c       | 31 +++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 56bd0eaf645e..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -289,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
 		}
 
 	found:
-		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
-				"deleting instead\n", __func__);
 		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
 	}
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b8165eab0a32..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,14 +362,19 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
-				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+				1 /* layout type */ + \
+				1 /* maxcount */ + \
+				1 /* bitmap size */ + \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap, word 0 */)
 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
 				1 /* layout type */ + \
 				1 /* opaque devaddr4 length */ + \
 				  /* devaddr4 payload is read into page */ \
 				1 /* notification bitmap length */ + \
-				1 /* notification bitmap */)
+				1 /* notification bitmap, word 0 */)
 #define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
 				encode_stateid_maxsz)
 #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
@@ -1918,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
-	p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+	p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
 	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
 				    NFS4_DEVICEID4_SIZE);
 	*p++ = cpu_to_be32(args->pdev->layout_type);
 	*p++ = cpu_to_be32(args->pdev->maxcount);	/* gdia_maxcount */
-	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
+
+	p = reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(1);			/* bitmap length */
+	*p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
 }
 
 static void
@@ -5767,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
 			goto out_overflow;
-		for (i = 0; i < len; i++, p++) {
-			if (be32_to_cpup(p)) {
-				dprintk("%s: notifications not supported\n",
+
+		if (be32_to_cpup(p++) &
+		    ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
+			dprintk("%s: unsupported notification\n",
+				__func__);
+		}
+
+		for (i = 1; i < len; i++) {
+			if (be32_to_cpup(p++)) {
+				dprintk("%s: unsupported notification\n",
 					__func__);
 				return -EIO;
 			}
-- 
cgit v1.2.3


From 164ae58c3c2a56e99d7ae207499f1fbd5e6f263d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 12 Sep 2014 13:25:14 -0400
Subject: pNFS/blocklayout: Remove a couple of unused variables

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 3 +--
 fs/nfs/blocklayout/extent_tree.c | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index c41a718854e3..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -694,7 +694,6 @@ bl_return_range(struct pnfs_layout_hdr *lo,
 {
 	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
 	sector_t offset = range->offset >> SECTOR_SHIFT, end;
-	int err;
 
 	if (range->offset % 8) {
 		dprintk("%s: offset %lld not block size aligned\n",
@@ -714,7 +713,7 @@ bl_return_range(struct pnfs_layout_hdr *lo,
 		end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
 	}
 
-	err = ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
+	ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
 }
 
 static int
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 1b6009ee75ce..93193616205a 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -165,7 +165,6 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
 {
 	struct pnfs_block_extent *be;
 	sector_t len1 = 0, len2 = 0;
-	sector_t orig_f_offset;
 	sector_t orig_v_offset;
 	sector_t orig_len;
 
@@ -175,7 +174,6 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
 	if (be->be_f_offset >= end)
 		return 0;
 
-	orig_f_offset = be->be_f_offset;
 	orig_v_offset = be->be_v_offset;
 	orig_len = be->be_length;
 
-- 
cgit v1.2.3


From f418c64b71590bac8fdebd0969a1eeaffaf036d2 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 3 Sep 2014 12:19:07 -0400
Subject: NFS: Unconditionally enable commit code

The goal is to create a generic NFS module with code that does not
depend on what versions of NFS are enabled.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/direct.c        | 14 ------------
 fs/nfs/write.c         | 61 --------------------------------------------------
 include/linux/nfs_fs.h |  8 -------
 3 files changed, 83 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 /*
  * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
  * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 	WARN_ON_ONCE(verfp->committed < 0);
 	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
 }
-#endif
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
 	return result;
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 }
 
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
-	nfs_direct_complete(dreq, true);
-}
-#endif
-
 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 128d97f01d1c..6786873a2901 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -720,7 +720,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
 	__set_page_dirty_nobuffers(req->wb_page);
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 /*
  * nfs_page_search_commits_for_head_request_locked
  *
@@ -870,43 +869,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 	return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
-#else
-static struct nfs_page *
-nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
-						struct page *page)
-{
-	return NULL;
-}
-
-static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
-				      struct inode *inode)
-{
-}
-
-void nfs_init_cinfo(struct nfs_commit_info *cinfo,
-		    struct inode *inode,
-		    struct nfs_direct_req *dreq)
-{
-}
-
-void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
-			struct nfs_commit_info *cinfo)
-{
-}
-
-static void
-nfs_clear_request_commit(struct nfs_page *req)
-{
-}
-
-int nfs_write_need_commit(struct nfs_pgio_header *hdr)
-{
-	return 0;
-}
-
-#endif
-
 static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_commit_info cinfo;
@@ -942,7 +904,6 @@ out:
 	hdr->release(hdr);
 }
 
-#if  IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 unsigned long
 nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
@@ -999,19 +960,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 	return ret;
 }
 
-#else
-unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
-{
-	return 0;
-}
-
-int nfs_scan_commit(struct inode *inode, struct list_head *dst,
-		    struct nfs_commit_info *cinfo)
-{
-	return 0;
-}
-#endif
-
 /*
  * Search for an existing write request, and attempt to update
  * it to reflect a new dirty region on a given page.
@@ -1404,7 +1352,6 @@ static int nfs_writeback_done(struct rpc_task *task,
 		return status;
 	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 	if (hdr->res.verf->committed < hdr->args.stable &&
 	    task->tk_status >= 0) {
 		/* We tried a write call, but the server did not
@@ -1426,7 +1373,6 @@ static int nfs_writeback_done(struct rpc_task *task,
 			complain = jiffies + 300 * HZ;
 		}
 	}
-#endif
 
 	/* Deal with the suid/sgid bit corner case */
 	if (nfs_should_remove_suid(inode))
@@ -1479,7 +1425,6 @@ static void nfs_writeback_result(struct rpc_task *task,
 }
 
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
 {
 	int ret;
@@ -1803,12 +1748,6 @@ out_mark_dirty:
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
-#else
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
-{
-	return 0;
-}
-#endif
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5180a7ededec..fd334d4b0d41 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -529,17 +529,9 @@ extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
 extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_commit_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_commit_data *data);
-#else
-static inline int
-nfs_commit_inode(struct inode *inode, int how)
-{
-	return 0;
-}
-#endif
 
 static inline int
 nfs_have_writebacks(struct inode *inode)
-- 
cgit v1.2.3


From 00a36a1090350995127c2a4bfac6be7fc85b5b81 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 3 Sep 2014 12:19:08 -0400
Subject: NFS: Move v3 declarations out of internal.h

I am generally against the "one big header file" approach, and
everything in the client includes this file.  Let's move all the NFS v3
declarations into a v3-only header file.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/internal.h |  7 -------
 fs/nfs/nfs3_fs.h  | 15 +++++++++++++++
 fs/nfs/nfs3proc.c |  1 +
 3 files changed, 16 insertions(+), 7 deletions(-)
 create mode 100644 fs/nfs/nfs3_fs.h

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9056622d2230..14ae6f20a172 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
 int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
 #endif
 
-/* nfs3client.c */
-#if IS_ENABLED(CONFIG_NFS_V3)
-struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
-struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
-				     struct nfs_fattr *, rpc_authflavor_t);
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..6599e0dcd69f
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+				     struct nfs_fattr *, rpc_authflavor_t);
+
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
 
 #include "iostat.h"
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
-- 
cgit v1.2.3


From f08460dc23db1e5cd6b7ab34a62ffea60f55725f Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 3 Sep 2014 12:19:09 -0400
Subject: NFS: Remove v3 not compiled check from validate_mount_data()

This check is already performed by the module loading code - if the
module can't be found then -EPROTONOSUPPORT will be returned.  Let's
handle v3 this way, too.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/super.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
 		return NFS_TEXT_DATA;
 	}
 
-#if !IS_ENABLED(CONFIG_NFS_V3)
-	if (args->version == 3)
-		goto out_v3_not_compiled;
-#endif /* !CONFIG_NFS_V3 */
-
 	return 0;
 
 out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
 	dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
 	return -EINVAL;
 
-#if !IS_ENABLED(CONFIG_NFS_V3)
-out_v3_not_compiled:
-	dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
-	return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V3 */
-
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
-- 
cgit v1.2.3


From cb8c20fa53ec28602793ee43ddc7e8883be62e69 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 3 Sep 2014 12:19:10 -0400
Subject: NFS: Move NFS v3 acl functions to nfs3_fs.h

This code is internal to the v3 module, so other parts of the client
shouldn't have any knowledge of it.

nfs3_getxattr(), nfs3_setxattr(), and nfs3_removexattr() no longer exist
anywhere so I remove the declarations while I'm here.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs3_fs.h       | 19 +++++++++++++++++++
 fs/nfs/nfs3super.c     |  1 +
 include/linux/nfs_fs.h | 33 ---------------------------------
 3 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 6599e0dcd69f..333ae4068506 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -6,6 +6,25 @@
 #ifndef __LINUX_FS_NFS_NFS3_FS_H
 #define __LINUX_FS_NFS_NFS3_FS_H
 
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
 /* nfs3client.c */
 struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
 struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/nfs_fs.h>
 #include "internal.h"
+#include "nfs3_fs.h"
 #include "nfs.h"
 
 static struct nfs_subversion nfs_v3 = {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fd334d4b0d41..28d649054d5f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -442,22 +442,6 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file)
 	return NULL;
 }
 
-/*
- * linux/fs/nfs/xattr.c
- */
-#ifdef CONFIG_NFS_V3_ACL
-extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
-extern ssize_t nfs3_getxattr(struct dentry *, const char *, void *, size_t);
-extern int nfs3_setxattr(struct dentry *, const char *,
-			const void *, size_t, int);
-extern int nfs3_removexattr (struct dentry *, const char *name);
-#else
-# define nfs3_listxattr NULL
-# define nfs3_getxattr NULL
-# define nfs3_setxattr NULL
-# define nfs3_removexattr NULL
-#endif
-
 /*
  * linux/fs/nfs/direct.c
  */
@@ -548,23 +532,6 @@ extern int  nfs_readpages(struct file *, struct address_space *,
 extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
 			       struct page *);
 
-/*
- * linux/fs/nfs3proc.c
- */
-#ifdef CONFIG_NFS_V3_ACL
-extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
-extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
-		struct posix_acl *dfacl);
-extern const struct xattr_handler *nfs3_xattr_handlers[];
-#else
-static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
-		struct posix_acl *dfacl)
-{
-	return 0;
-}
-#endif /* CONFIG_NFS_V3_ACL */
-
 /*
  * inline functions
  */
-- 
cgit v1.2.3


From 88ac815cdbef93dec8382b3531ef90474dd102f2 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Fri, 12 Sep 2014 11:04:10 +0800
Subject: nfs41: change PNFS_LAYOUTRET_ON_SETATTR to only return on truncation
 to smaller size

Both blocks layout and objects layout want to use it to avoid CB_LAYOUTRECALL
but that should only happen if client is doing truncation to a smaller size.
For other cases, we let server decide if it wants to recall client's layouts.
Change PNFS_LAYOUTRET_ON_SETATTR to follow the logic and not to send
layoutreturn unnecessarily.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 4 +++-
 fs/nfs/pnfs.h     | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0b711227cfa5..288be08bda95 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3211,7 +3211,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct nfs4_label *label = NULL;
 	int status;
 
-	if (pnfs_ld_layoutret_on_setattr(inode))
+	if (pnfs_ld_layoutret_on_setattr(inode) &&
+	    sattr->ia_valid & ATTR_SIZE &&
+	    sattr->ia_size < i_size_read(inode))
 		pnfs_commit_and_return_layout(inode);
 
 	nfs_fattr_init(fattr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index b10f12fc6818..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -69,7 +69,8 @@ enum {
 };
 
 enum layoutdriver_policy_flags {
-	/* Should the pNFS client commit and return the layout upon a setattr */
+	/* Should the pNFS client commit and return the layout upon truncate to
+	 * a smaller size */
 	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
 	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
 	PNFS_READ_WHOLE_PAGE		= 1 << 2,
-- 
cgit v1.2.3


From b262b35c2ceb989c6b89dae6b8e0c0ce466cc90d Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 15 Sep 2014 17:01:32 +1000
Subject: pnfs/blocklayout: include vmalloc.h for __vmalloc

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/extent_tree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 93193616205a..31d0b5e53dfd 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -2,6 +2,8 @@
  * Copyright (c) 2014 Christoph Hellwig.
  */
 
+#include <linux/vmalloc.h>
+
 #include "blocklayout.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
-- 
cgit v1.2.3


From 5466112f0935f079e225514905c57d5e5285a9b6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 18 Sep 2014 17:03:46 -0400
Subject: pnfs/blocklayout: Fix a 64-bit division/remainder issue in
 bl_map_stripe

kbuild test robot reports:

   fs/built-in.o: In function `bl_map_stripe':
   >> :(.text+0x965b4): undefined reference to `__aeabi_uldivmod'
   >> :(.text+0x965cc): undefined reference to `__aeabi_uldivmod'
   >> :(.text+0x96604): undefined reference to `__aeabi_uldivmod'

Fixes: 5c83746a0cf2 (pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing)
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/dev.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 00f159da06ee..5aed4f98df41 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -150,10 +150,13 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
 		struct pnfs_block_dev_map *map)
 {
 	struct pnfs_block_dev *child;
-	u64 chunk = (offset / dev->chunk_size);
-	int chunk_idx = chunk % dev->nr_children;
+	u64 chunk;
+	u32 chunk_idx;
 	u64 disk_offset;
 
+	chunk = div_u64(offset, dev->chunk_size);
+	div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
 	if (chunk_idx > dev->nr_children) {
 		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
 			__func__, chunk_idx, offset, dev->chunk_size);
@@ -165,7 +168,7 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
 	offset = chunk * dev->chunk_size;
 
 	/* disk offset of the stripe */
-	disk_offset = offset / dev->nr_children;
+	disk_offset = div_u64(offset, dev->nr_children);
 
 	child = &dev->children[chunk_idx];
 	child->map(child, disk_offset, map);
-- 
cgit v1.2.3


From 2f3169fb18f4643ac9a6a097a6a6c71f0b2cef75 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 24 Sep 2014 18:56:11 +0200
Subject: nfs: fix duplicate proc entries

Commit 65b38851a174
("NFS: Fix /proc/fs/nfsfs/servers and /proc/fs/nfsfs/volumes")

updated the following function:
static int nfs_volume_list_open(struct inode *inode, struct file *file)

it used &nfs_server_list_ops instead of &nfs_volume_list_ops
which means cat /proc/fs/nfsfs/volumes = /proc/fs/nfsfs/servers

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Fixes: 65b38851a174 (NFS: Fix /proc/fs/nfsfs/servers and...)
Cc: stable@vger.kernel.org # 3.4.x+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6a4f3666e273..94088517039f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1318,7 +1318,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
  */
 static int nfs_volume_list_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &nfs_server_list_ops,
+	return seq_open_net(inode, file, &nfs_volume_list_ops,
 			   sizeof(struct seq_net_private));
 }
 
-- 
cgit v1.2.3


From 8faaa6d5d48b201527e0451296d9e71d23afb362 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 24 Sep 2014 18:11:28 -0400
Subject: Fixing lease renewal

Commit c9fdeb28 removed a 'continue' after checking if the lease needs
to be renewed. However, if client hasn't moved, the code falls down to
starting reboot recovery erroneously (ie., sends open reclaim and gets
back stale_clientid error) before recovering from getting stale_clientid
on the renew operation.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Fixes: c9fdeb280b8c (NFS: Add basic migration support to state manager thread)
Cc: stable@vger.kernel.org # 3.13+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 22fe35104c0c..6678769e1a54 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2345,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_check_lease(clp);
 			if (status < 0)
 				goto out_error;
+			continue;
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
-- 
cgit v1.2.3


From a743419f420a64d442280845c0377a915b76644f Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Tue, 23 Sep 2014 12:26:19 -0400
Subject: SUNRPC: Don't wake tasks during connection abort

When aborting a connection to preserve source ports, don't wake the task in
xs_error_report.  This allows tasks with RPC_TASK_SOFTCONN to succeed if the
connection needs to be re-established since it preserves the task's status
instead of setting it to the status of the aborting kernel_connect().

This may also avoid a potential conflict on the socket's lock.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Cc: stable@vger.kernel.org # 3.14+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h | 1 +
 net/sunrpc/xprtsock.c       | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fcbfe8783243..cf391eef2e6d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -357,6 +357,7 @@ int			xs_swapper(struct rpc_xprt *xprt, int enable);
 #define XPRT_CONNECTION_ABORT	(7)
 #define XPRT_CONNECTION_CLOSE	(8)
 #define XPRT_CONGESTED		(9)
+#define XPRT_CONNECTION_REUSE	(10)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7ed47b4943da..bffba4e4bfc6 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -845,6 +845,8 @@ static void xs_error_report(struct sock *sk)
 	dprintk("RPC:       xs_error_report client %p, error=%d...\n",
 			xprt, -err);
 	trace_rpc_socket_error(xprt, sk->sk_socket, err);
+	if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state))
+		goto out;
 	xprt_wake_pending_tasks(xprt, err);
  out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -2261,7 +2263,9 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
 				&xprt->state);
 		/* "close" the socket, preserving the local port */
+		set_bit(XPRT_CONNECTION_REUSE, &xprt->state);
 		xs_tcp_reuse_connection(transport);
+		clear_bit(XPRT_CONNECTION_REUSE, &xprt->state);
 
 		if (abort_and_exit)
 			goto out_eagain;
-- 
cgit v1.2.3


From 173b3afceebe76fa2205b2c8808682d5b541fe3c Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Tue, 23 Sep 2014 12:26:20 -0400
Subject: lockd: Try to reconnect if statd has moved

If rpc.statd is restarted, upcalls to monitor hosts can fail with
ECONNREFUSED.  In that case force a lookup of statd's new port and retry the
upcall.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/lockd/mon.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
 
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+	if (status == -ECONNREFUSED) {
+		dprintk("lockd:	NSM upcall RPC failed, status=%d, forcing rebind\n",
+				status);
+		rpc_force_rebind(clnt);
+		status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+	}
 	if (status < 0)
 		dprintk("lockd: NSM upcall RPC failed, status=%d\n",
 				status);
-- 
cgit v1.2.3


From f279cd008fc9742f5ec294d9b8a793a7a0b163ef Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 24 Sep 2014 18:08:00 +0000
Subject: rpc: return sent and err from xs_sendpages()

If an error is returned after the first bits of a packet have already been
successfully queued, xs_sendpages() will return a positive 'int' value
indicating success. Callers seem to treat this as -EAGAIN.

However, there are cases where its not a question of waiting for the write
queue to drain. For example, when there is an iptables rule dropping packets
to the destination, the lower level code can return -EPERM only after parts
of the packet have been successfully queued. In this case, we can end up
continuously retrying resulting in a kernel softlockup.

This patch is intended to make no changes in behavior but is in preparation for
subsequent patches that can make decisions based on both on the number of bytes
sent by xs_sendpages() and any errors that may have be returned.

Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 81 ++++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 39 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bffba4e4bfc6..7d9ea6b7113d 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -399,13 +399,13 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
 	return kernel_sendmsg(sock, &msg, NULL, 0, 0);
 }
 
-static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
+static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy, int *sent_p)
 {
 	ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
 			int offset, size_t size, int flags);
 	struct page **ppage;
 	unsigned int remainder;
-	int err, sent = 0;
+	int err;
 
 	remainder = xdr->page_len - base;
 	base += xdr->page_base;
@@ -424,15 +424,15 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
 		err = do_sendpage(sock, *ppage, base, len, flags);
 		if (remainder == 0 || err != len)
 			break;
-		sent += err;
+		*sent_p += err;
 		ppage++;
 		base = 0;
 	}
-	if (sent == 0)
-		return err;
-	if (err > 0)
-		sent += err;
-	return sent;
+	if (err > 0) {
+		*sent_p += err;
+		err = 0;
+	}
+	return err;
 }
 
 /**
@@ -443,12 +443,14 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
  * @xdr: buffer containing this request
  * @base: starting position in the buffer
  * @zerocopy: true if it is safe to use sendpage()
+ * @sent_p: return the total number of bytes successfully queued for sending
  *
  */
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
+static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy, int *sent_p)
 {
 	unsigned int remainder = xdr->len - base;
-	int err, sent = 0;
+	int err = 0;
+	int sent = 0;
 
 	if (unlikely(!sock))
 		return -ENOTSOCK;
@@ -465,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
 		err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
 		if (remainder == 0 || err != len)
 			goto out;
-		sent += err;
+		*sent_p += err;
 		base = 0;
 	} else
 		base -= xdr->head[0].iov_len;
@@ -473,23 +475,23 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
 	if (base < xdr->page_len) {
 		unsigned int len = xdr->page_len - base;
 		remainder -= len;
-		err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
-		if (remainder == 0 || err != len)
+		err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy, &sent);
+		*sent_p += sent;
+		if (remainder == 0 || sent != len)
 			goto out;
-		sent += err;
 		base = 0;
 	} else
 		base -= xdr->page_len;
 
 	if (base >= xdr->tail[0].iov_len)
-		return sent;
+		return 0;
 	err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0);
 out:
-	if (sent == 0)
-		return err;
-	if (err > 0)
-		sent += err;
-	return sent;
+	if (err > 0) {
+		*sent_p += err;
+		err = 0;
+	}
+	return err;
 }
 
 static void xs_nospace_callback(struct rpc_task *task)
@@ -573,19 +575,20 @@ static int xs_local_send_request(struct rpc_task *task)
 				container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	int status;
+	int sent = 0;
 
 	xs_encode_stream_record_marker(&req->rq_snd_buf);
 
 	xs_pktdump("packet data:",
 			req->rq_svec->iov_base, req->rq_svec->iov_len);
 
-	status = xs_sendpages(transport->sock, NULL, 0,
-						xdr, req->rq_bytes_sent, true);
+	status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
+			      true, &sent);
 	dprintk("RPC:       %s(%u) = %d\n",
 			__func__, xdr->len - req->rq_bytes_sent, status);
-	if (likely(status >= 0)) {
-		req->rq_bytes_sent += status;
-		req->rq_xmit_bytes_sent += status;
+	if (likely(sent > 0) || status == 0) {
+		req->rq_bytes_sent += sent;
+		req->rq_xmit_bytes_sent += sent;
 		if (likely(req->rq_bytes_sent >= req->rq_slen)) {
 			req->rq_bytes_sent = 0;
 			return 0;
@@ -626,6 +629,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
+	int sent = 0;
 	int status;
 
 	xs_pktdump("packet data:",
@@ -634,17 +638,15 @@ static int xs_udp_send_request(struct rpc_task *task)
 
 	if (!xprt_bound(xprt))
 		return -ENOTCONN;
-	status = xs_sendpages(transport->sock,
-			      xs_addr(xprt),
-			      xprt->addrlen, xdr,
-			      req->rq_bytes_sent, true);
+	status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
+			      xdr, req->rq_bytes_sent, true, &sent);
 
 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
 			xdr->len - req->rq_bytes_sent, status);
 
-	if (status >= 0) {
-		req->rq_xmit_bytes_sent += status;
-		if (status >= req->rq_slen)
+	if (sent > 0 || status == 0) {
+		req->rq_xmit_bytes_sent += sent;
+		if (sent >= req->rq_slen)
 			return 0;
 		/* Still some bytes left; set up for a retry later. */
 		status = -EAGAIN;
@@ -713,6 +715,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	bool zerocopy = true;
 	int status;
+	int sent;
 
 	xs_encode_stream_record_marker(&req->rq_snd_buf);
 
@@ -730,26 +733,26 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	 * to cope with writespace callbacks arriving _after_ we have
 	 * called sendmsg(). */
 	while (1) {
-		status = xs_sendpages(transport->sock,
-					NULL, 0, xdr, req->rq_bytes_sent,
-					zerocopy);
+		sent = 0;
+		status = xs_sendpages(transport->sock, NULL, 0, xdr,
+				      req->rq_bytes_sent, zerocopy, &sent);
 
 		dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
 				xdr->len - req->rq_bytes_sent, status);
 
-		if (unlikely(status < 0))
+		if (unlikely(sent == 0 && status < 0))
 			break;
 
 		/* If we've sent the entire packet, immediately
 		 * reset the count of bytes sent. */
-		req->rq_bytes_sent += status;
-		req->rq_xmit_bytes_sent += status;
+		req->rq_bytes_sent += sent;
+		req->rq_xmit_bytes_sent += sent;
 		if (likely(req->rq_bytes_sent >= req->rq_slen)) {
 			req->rq_bytes_sent = 0;
 			return 0;
 		}
 
-		if (status != 0)
+		if (sent != 0)
 			continue;
 		status = -EAGAIN;
 		break;
-- 
cgit v1.2.3


From 3dedbb5ca10ef13f25055776d2f6d9499d9ca1ba Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 24 Sep 2014 18:08:04 +0000
Subject: rpc: Add -EPERM processing for xs_udp_send_request()

If an iptables drop rule is added for an nfs server, the client can end up in
a softlockup. Because of the way that xs_sendpages() is structured, the -EPERM
is ignored since the prior bits of the packet may have been successfully queued
and thus xs_sendpages() returns a non-zero value. Then, xs_udp_send_request()
thinks that because some bits were queued it should return -EAGAIN. We then try
the request again and again, resulting in cpu spinning. Reproducer:

1) open a file on the nfs server '/nfs/foo' (mounted using udp)
2) iptables -A OUTPUT -d <nfs server ip> -j DROP
3) write to /nfs/foo
4) close /nfs/foo
5) iptables -D OUTPUT -d <nfs server ip> -j DROP

The softlockup occurs in step 4 above.

The previous patch, allows xs_sendpages() to return both a sent count and
any error values that may have occurred. Thus, if we get an -EPERM, return
that to the higher level code.

With this patch in place we can successfully abort the above sequence and
avoid the softlockup.

I also tried the above test case on an nfs mount on tcp and although the system
does not softlockup, I still ended up with the 'hung_task' firing after 120
seconds, due to the i/o being stuck. The tcp case appears a bit harder to fix,
since -EPERM appears to get ignored much lower down in the stack and does not
propogate up to xs_sendpages(). This case is not quite as insidious as the
softlockup and it is not addressed here.

Reported-by: Yigong Lou <ylou@akamai.com>
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/clnt.c     | 2 ++
 net/sunrpc/xprtsock.c | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 488ddeed9363..c4c60697589a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1913,6 +1913,7 @@ call_transmit_status(struct rpc_task *task)
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
+	case -EPERM:
 		if (RPC_IS_SOFTCONN(task)) {
 			xprt_end_transmit(task);
 			rpc_exit(task, task->tk_status);
@@ -2018,6 +2019,7 @@ call_status(struct rpc_task *task)
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
+	case -EPERM:
 		if (RPC_IS_SOFTCONN(task)) {
 			rpc_exit(task, status);
 			break;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7d9ea6b7113d..02603ec2460a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -644,6 +644,10 @@ static int xs_udp_send_request(struct rpc_task *task)
 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
 			xdr->len - req->rq_bytes_sent, status);
 
+	/* firewall is blocking us, don't return -EAGAIN or we end up looping */
+	if (status == -EPERM)
+		goto process_status;
+
 	if (sent > 0 || status == 0) {
 		req->rq_xmit_bytes_sent += sent;
 		if (sent >= req->rq_slen)
@@ -652,6 +656,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 		status = -EAGAIN;
 	}
 
+process_status:
 	switch (status) {
 	case -ENOTSOCK:
 		status = -ENOTCONN;
@@ -667,6 +672,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 	case -ENOBUFS:
 	case -EPIPE:
 	case -ECONNREFUSED:
+	case -EPERM:
 		/* When the server has died, an ICMP port unreachable message
 		 * prompts ECONNREFUSED. */
 		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
-- 
cgit v1.2.3


From 8478eaa16e701ecfe054b62ec764bc1291b79e19 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Sep 2014 16:09:27 +1000
Subject: NFSv4: use exponential retry on NFS4ERR_DELAY for async requests.

Currently asynchronous NFSv4 request will be retried with
exponential timeout (from 1/10 to 15 seconds), but async
requests will always use a 15second retry.

Some "async" requests are really synchronous though.  The
async mechanism is used to allow the request to continue if
the requesting process is killed.
In those cases, an exponential retry is appropriate.

For example, if two different clients both open a file and
get a READ delegation, and one client then unlinks the file
(while still holding an open file descriptor), that unlink
will used the "silly-rename" handling which is async.
The first rename will result in NFS4ERR_DELAY while the
delegation is reclaimed from the other client.  The rename
will not be retried for 15 seconds, causing an unlink to take
15 seconds rather than 100msec.

This patch only added exponential timeout for async unlink and
async rename.  Other async calls, such as 'close' are sometimes
waited for so they might benefit from exponential timeout too.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 65 ++++++++++++++++++++++++++++++++-----------------
 include/linux/nfs_xdr.h |  2 ++
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 288be08bda95..45bce9ed6791 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start);
 }
 
+static long nfs4_update_delay(long *timeout)
+{
+	long ret;
+	if (!timeout)
+		return NFS4_POLL_RETRY_MAX;
+	if (*timeout <= 0)
+		*timeout = NFS4_POLL_RETRY_MIN;
+	if (*timeout > NFS4_POLL_RETRY_MAX)
+		*timeout = NFS4_POLL_RETRY_MAX;
+	ret = *timeout;
+	*timeout <<= 1;
+	return ret;
+}
+
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
 	int res = 0;
 
 	might_sleep();
 
-	if (*timeout <= 0)
-		*timeout = NFS4_POLL_RETRY_MIN;
-	if (*timeout > NFS4_POLL_RETRY_MAX)
-		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable_unsafe(*timeout);
+	freezable_schedule_timeout_killable_unsafe(
+		nfs4_update_delay(timeout));
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
-	*timeout <<= 1;
 	return res;
 }
 
@@ -2583,7 +2593,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			if (calldata->arg.fmode == 0)
 				break;
 		default:
-			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+			if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
 				rpc_restart_call_prepare(task);
 				goto out_release;
 			}
@@ -3572,7 +3582,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 
 	if (!nfs4_sequence_done(task, &res->seq_res))
 		return 0;
-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL,
+				    &data->timeout) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
 	return 1;
@@ -3605,7 +3616,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 
 	if (!nfs4_sequence_done(task, &res->seq_res))
 		return 0;
-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
 		return 0;
 
 	update_changeattr(old_dir, &res->old_cinfo);
@@ -4109,7 +4120,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 
 	trace_nfs4_read(hdr, task->tk_status);
 	if (nfs4_async_handle_error(task, server,
-				    hdr->args.context->state) == -EAGAIN) {
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4177,10 +4189,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
 			      struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
-	
+
 	trace_nfs4_write(hdr, task->tk_status);
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-				    hdr->args.context->state) == -EAGAIN) {
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4260,7 +4273,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
 	struct inode *inode = data->inode;
 
 	trace_nfs4_commit(data, task->tk_status);
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+				    NULL, NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4813,7 +4827,8 @@ out:
 
 
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+			struct nfs4_state *state, long *timeout)
 {
 	struct nfs_client *clp = server->nfs_client;
 
@@ -4863,6 +4878,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
+			rpc_delay(task, nfs4_update_delay(timeout));
+			goto restart_call;
 		case -NFS4ERR_GRACE:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -5103,8 +5120,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 			pnfs_roc_set_barrier(data->inode, data->roc_barrier);
 		break;
 	default:
-		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
-				-EAGAIN) {
+		if (nfs4_async_handle_error(task, data->res.server,
+					    NULL, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -5368,7 +5385,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		case -NFS4ERR_EXPIRED:
 			break;
 		default:
-			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+			if (nfs4_async_handle_error(task, calldata->server,
+						    NULL, NULL) == -EAGAIN)
 				rpc_restart_call_prepare(task);
 	}
 	nfs_release_seqid(calldata->arg.seqid);
@@ -5974,7 +5992,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 		break;
 	case -NFS4ERR_LEASE_MOVED:
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, server,
+					    NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 }
@@ -7591,7 +7610,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 			rpc_restart_call_prepare(task);
 		}
 	}
-	if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+	if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
 		rpc_restart_call_prepare(task);
 out:
 	dprintk("<-- %s\n", __func__);
@@ -7751,7 +7770,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+		if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
 			break;
 		rpc_restart_call_prepare(task);
 		return;
@@ -7882,7 +7901,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	default:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -8178,7 +8197,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
 
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7ae249ccb78d..6951c7d9097d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1339,6 +1339,7 @@ struct nfs_unlinkdata {
 	struct inode *dir;
 	struct rpc_cred	*cred;
 	struct nfs_fattr dir_attr;
+	long timeout;
 };
 
 struct nfs_renamedata {
@@ -1352,6 +1353,7 @@ struct nfs_renamedata {
 	struct dentry		*new_dentry;
 	struct nfs_fattr	new_fattr;
 	void (*complete)(struct rpc_task *, struct nfs_renamedata *);
+	long timeout;
 };
 
 struct nfs_access_entry;
-- 
cgit v1.2.3


From e87b4c7a7ac6d895846570dec637744cf7050df3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Sep 2014 16:09:27 +1000
Subject: NFS: don't use STABLE writes during writeback.

commit b31268ac793fd300da66b9c28bbf0a200339ab96
  FS: Use stable writes when not doing a bulk flush

was a bit heavy handed.
The particular problem that lead to this patch was that
small writes to an O_SYNC file we being written as UNSTABLE writes
followed by a commit.
This is appropriate for large writes (which require multiple NFS
requests) but for small writes (single NFS request), using
NFS_FILE_SYNC is more efficient.

So that patch causes the code to select between the two methods
depending on how many nfs requests get generated.

Unfortunately this ends up applying to non O_SYNC writes as well.
In particular if you memory-map a file and update random pages, then
when they are eventually written out by writeback they will go as
NFS_FILE_SYNC.  This is inefficient and slows down the application.

So: only set FLUSH_COND_STABLE when wbc->sync_mode is WB_SYNC_ALL.
With this patch:
 O_SYNC writes are NFS_FILE_SYNC for single requests, and NFS_UNSTABLE
    followed by COMMIT for multiple requests
 Writing immediately before close of fsync follow the same pattern.
 Non-O_SYNC writes without an fsync of close eventually get flushed
 out as UNSTABLE and a commit follows eventually as appropriate.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6786873a2901..a623b00530c3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -242,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
 
 static int wb_priority(struct writeback_control *wbc)
 {
+	int ret = 0;
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ret = FLUSH_COND_STABLE;
 	if (wbc->for_kupdate || wbc->for_background)
-		return FLUSH_LOWPRI | FLUSH_COND_STABLE;
-	return FLUSH_COND_STABLE;
+		ret |= FLUSH_LOWPRI;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From cbbce82209490df8b68da9aec0d642451fe0a668 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Sep 2014 13:55:19 +1000
Subject: SCHED: add some "wait..on_bit...timeout()" interfaces.

In commit c1221321b7c25b53204447cff9949a6d5a7ddddc
   sched: Allow wait_on_bit_action() functions to support a timeout

I suggested that a "wait_on_bit_timeout()" interface would not meet my
need.  This isn't true - I was just over-engineering.

Including a 'private' field in wait_bit_key instead of a focused
"timeout" field was just premature generalization.  If some other
use is ever found, it can be generalized or added later.

So this patch renames "private" to "timeout" with a meaning "stop
waiting when "jiffies" reaches or passes "timeout",
and adds two of the many possible wait..bit..timeout() interfaces:

wait_on_page_bit_killable_timeout(), which is the one I want to use,
and out_of_line_wait_on_bit_timeout() which is a reasonably general
example.  Others can be added as needed.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/pagemap.h |  2 ++
 include/linux/wait.h    |  5 ++++-
 kernel/sched/wait.c     | 36 ++++++++++++++++++++++++++++++++++++
 mm/filemap.c            | 13 +++++++++++++
 4 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3df8c7db7a4e..87f9e4230d3a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -502,6 +502,8 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 extern void wait_on_page_bit(struct page *page, int bit_nr);
 
 extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
+extern int wait_on_page_bit_killable_timeout(struct page *page,
+					     int bit_nr, unsigned long timeout);
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6fb1ba5f9b2f..80115bf88671 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -25,7 +25,7 @@ struct wait_bit_key {
 	void			*flags;
 	int			bit_nr;
 #define WAIT_ATOMIC_T_BIT_NR	-1
-	unsigned long		private;
+	unsigned long		timeout;
 };
 
 struct wait_bit_queue {
@@ -154,6 +154,7 @@ int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_ac
 void wake_up_bit(void *, int);
 void wake_up_atomic_t(atomic_t *);
 int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
+int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long);
 int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned);
 int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
 wait_queue_head_t *bit_waitqueue(void *, int);
@@ -859,6 +860,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 extern int bit_wait(struct wait_bit_key *);
 extern int bit_wait_io(struct wait_bit_key *);
+extern int bit_wait_timeout(struct wait_bit_key *);
+extern int bit_wait_io_timeout(struct wait_bit_key *);
 
 /**
  * wait_on_bit - wait for a bit to be cleared
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..5a62915f47a8 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit);
 
+int __sched out_of_line_wait_on_bit_timeout(
+	void *word, int bit, wait_bit_action_f *action,
+	unsigned mode, unsigned long timeout)
+{
+	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wait, word, bit);
+
+	wait.key.timeout = jiffies + timeout;
+	return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
 int __sched
 __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
 			wait_bit_action_f *action, unsigned mode)
@@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
 	return 0;
 }
 EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word)
+{
+	unsigned long now = ACCESS_ONCE(jiffies);
+	if (signal_pending_state(current->state, current))
+		return 1;
+	if (time_after_eq(now, word->timeout))
+		return -EAGAIN;
+	schedule_timeout(word->timeout - now);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word)
+{
+	unsigned long now = ACCESS_ONCE(jiffies);
+	if (signal_pending_state(current->state, current))
+		return 1;
+	if (time_after_eq(now, word->timeout))
+		return -EAGAIN;
+	io_schedule_timeout(word->timeout - now);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/mm/filemap.c b/mm/filemap.c
index 90effcdf948d..cbe5a9013f70 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -703,6 +703,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
 			     bit_wait_io, TASK_KILLABLE);
 }
 
+int wait_on_page_bit_killable_timeout(struct page *page,
+				       int bit_nr, unsigned long timeout)
+{
+	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+	wait.key.timeout = jiffies + timeout;
+	if (!test_bit(bit_nr, &page->flags))
+		return 0;
+	return __wait_on_bit(page_waitqueue(page), &wait,
+			     bit_wait_io_timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
+
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
  * @page: Page defining the wait queue of interest
-- 
cgit v1.2.3


From a4796e37c12e177572b80864cbab9c907ea250b0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 24 Sep 2014 11:28:32 +1000
Subject: MM: export page_wakeup functions

This will allow NFS to wait for PG_private to be cleared and,
particularly, to send a wake-up when it is.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/pagemap.h | 10 ++++++++--
 mm/filemap.c            |  8 ++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 87f9e4230d3a..2dca0cef3506 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -496,8 +496,8 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 }
 
 /*
- * This is exported only for wait_on_page_locked/wait_on_page_writeback.
- * Never use this directly!
+ * This is exported only for wait_on_page_locked/wait_on_page_writeback,
+ * and for filesystems which need to wait on PG_private.
  */
 extern void wait_on_page_bit(struct page *page, int bit_nr);
 
@@ -512,6 +512,12 @@ static inline int wait_on_page_locked_killable(struct page *page)
 	return 0;
 }
 
+extern wait_queue_head_t *page_waitqueue(struct page *page);
+static inline void wake_up_page(struct page *page, int bit)
+{
+	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
+}
+
 /* 
  * Wait for a page to be unlocked.
  *
diff --git a/mm/filemap.c b/mm/filemap.c
index cbe5a9013f70..b9b1413080be 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-static wait_queue_head_t *page_waitqueue(struct page *page)
+wait_queue_head_t *page_waitqueue(struct page *page)
 {
 	const struct zone *zone = page_zone(page);
 
 	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 }
-
-static inline void wake_up_page(struct page *page, int bit)
-{
-	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
-}
+EXPORT_SYMBOL(page_waitqueue);
 
 void wait_on_page_bit(struct page *page, int bit_nr)
 {
-- 
cgit v1.2.3


From 9590544694becc64f4874963dbfc4b4d391024b7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 24 Sep 2014 11:28:32 +1000
Subject: NFS: avoid deadlocks with loop-back mounted NFS filesystems.

Support for loop-back mounted NFS filesystems is useful when NFS is
used to access shared storage in a high-availability cluster.

If the node running the NFS server fails, some other node can mount the
filesystem and start providing NFS service.  If that node already had
the filesystem NFS mounted, it will now have it loop-back mounted.

nfsd can suffer a deadlock when allocating memory and entering direct
reclaim.
While direct reclaim does not write to the NFS filesystem it can send
and wait for a COMMIT through nfs_release_page().

This patch modifies nfs_release_page() to wait a limited time for the
commit to complete - one second.  If the commit doesn't complete
in this time, nfs_release_page() will fail.  This means it might now
fail in some cases where it wouldn't before.  These cases are only
when 'gfp' includes '__GFP_WAIT'.

nfs_release_page() is only called by try_to_release_page(), and that
can only be called on an NFS page with required 'gfp' flags from
 - page_cache_pipe_buf_steal() in splice.c
 - shrink_page_list() in vmscan.c
 - invalidate_inode_pages2_range() in truncate.c

The first two handle failure quite safely.  The last is only called
after ->launder_page() has been called, and that will have waited
for the commit to finish already.

So aborting if the commit takes longer than 1 second is perfectly safe.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c  | 26 ++++++++++++++++----------
 fs/nfs/write.c |  2 ++
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 23e5f0ea5c83..325df0aeab05 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -475,17 +475,23 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
-	/* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
-	 * doing this memory reclaim for a fs-related allocation.
+	/* Always try to initiate a 'commit' if relevant, but only
+	 * wait for it if __GFP_WAIT is set and the calling process is
+	 * allowed to block.  Even then, only wait 1 second.
+	 * Waiting indefinitely can cause deadlocks when the NFS
+	 * server is on this machine, and there is no particular need
+	 * to wait extensively here.  A short wait has the benefit
+	 * that someone else can worry about the freezer.
 	 */
-	if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
-	    !(current->flags & PF_FSTRANS)) {
-		int how = FLUSH_SYNC;
-
-		/* Don't let kswapd deadlock waiting for OOM RPC calls */
-		if (current_is_kswapd())
-			how = 0;
-		nfs_commit_inode(mapping->host, how);
+	if (mapping) {
+		struct nfs_server *nfss = NFS_SERVER(mapping->host);
+		nfs_commit_inode(mapping->host, 0);
+		if ((gfp & __GFP_WAIT) &&
+		    !current_is_kswapd() &&
+		    !(current->flags & PF_FSTRANS)) {
+			wait_on_page_bit_killable_timeout(page, PG_private,
+							  HZ);
+		}
 	}
 	/* If PagePrivate() is set, then the page is not freeable */
 	if (PagePrivate(page))
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a623b00530c3..c063a4e70354 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -705,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		if (likely(!PageSwapCache(head->wb_page))) {
 			set_page_private(head->wb_page, 0);
 			ClearPagePrivate(head->wb_page);
+			smp_mb__after_atomic();
+			wake_up_page(head->wb_page, PG_private);
 			clear_bit(PG_MAPPED, &head->wb_flags);
 		}
 		nfsi->npages--;
-- 
cgit v1.2.3


From 353db7966288a2f18da22438aeec2b4862c0b241 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 24 Sep 2014 11:28:32 +1000
Subject: NFS: avoid waiting at all in nfs_release_page when congested.

If nfs_release_page() is called on a sequence of pages which are all
in the same file which is blocked on COMMIT, each page could
contribute a 1 second delay which could be come excessive.  I have
seen delays of as much as 208 seconds.

To keep the delay to one second, mark the bdi as write-congested
if the commit didn't finished.  Once it does finish, the
write-congested flag will be cleared by nfs_commit_release_pages().

With this, the longest total delay in try_to_free_pages that I have
seen is under 3 seconds.  With no waiting in nfs_release_page at all
I have seen delays of nearly 1.5 seconds.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c  | 9 +++++++--
 fs/nfs/write.c | 5 +++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 325df0aeab05..24832d040eb8 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -477,7 +477,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 
 	/* Always try to initiate a 'commit' if relevant, but only
 	 * wait for it if __GFP_WAIT is set and the calling process is
-	 * allowed to block.  Even then, only wait 1 second.
+	 * allowed to block.  Even then, only wait 1 second and only
+	 * if the 'bdi' is not congested.
 	 * Waiting indefinitely can cause deadlocks when the NFS
 	 * server is on this machine, and there is no particular need
 	 * to wait extensively here.  A short wait has the benefit
@@ -488,9 +489,13 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 		nfs_commit_inode(mapping->host, 0);
 		if ((gfp & __GFP_WAIT) &&
 		    !current_is_kswapd() &&
-		    !(current->flags & PF_FSTRANS)) {
+		    !(current->flags & PF_FSTRANS) &&
+		    !bdi_write_congested(&nfss->backing_dev_info)) {
 			wait_on_page_bit_killable_timeout(page, PG_private,
 							  HZ);
+			if (PagePrivate(page))
+				set_bdi_congested(&nfss->backing_dev_info,
+						  BLK_RW_ASYNC);
 		}
 	}
 	/* If PagePrivate() is set, then the page is not freeable */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c063a4e70354..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1611,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 	struct nfs_page	*req;
 	int status = data->task.tk_status;
 	struct nfs_commit_info cinfo;
+	struct nfs_server *nfss;
 
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
@@ -1644,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 	next:
 		nfs_unlock_and_release_request(req);
 	}
+	nfss = NFS_SERVER(data->inode);
+	if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
 	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
 	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
 		nfs_commit_clear_lock(NFS_I(data->inode));
-- 
cgit v1.2.3


From 1aff52562939485e503936e17934be077ffaea53 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 24 Sep 2014 11:28:32 +1000
Subject: NFS/SUNRPC: Remove other deadlock-avoidance mechanisms in
 nfs_release_page()

Now that nfs_release_page() doesn't block indefinitely, other deadlock
avoidance mechanisms aren't needed.
 - it doesn't hurt for kswapd to block occasionally.  If it doesn't
   want to block it would clear __GFP_WAIT.  The current_is_kswapd()
   was only added to avoid deadlocks and we have a new approach for
   that.
 - memory allocation in the SUNRPC layer can very rarely try to
   ->releasepage() a page it is trying to handle.  The deadlock
   is removed as nfs_release_page() doesn't block indefinitely.

So we don't need to set PF_FSTRANS for sunrpc network operations any
more.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c                   | 14 ++++++--------
 net/sunrpc/sched.c              |  2 --
 net/sunrpc/xprtrdma/transport.c |  2 --
 net/sunrpc/xprtsock.c           | 10 ----------
 4 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 24832d040eb8..6920127c5eb7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -476,20 +476,18 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
 	/* Always try to initiate a 'commit' if relevant, but only
-	 * wait for it if __GFP_WAIT is set and the calling process is
-	 * allowed to block.  Even then, only wait 1 second and only
-	 * if the 'bdi' is not congested.
+	 * wait for it if __GFP_WAIT is set.  Even then, only wait 1
+	 * second and only if the 'bdi' is not congested.
 	 * Waiting indefinitely can cause deadlocks when the NFS
-	 * server is on this machine, and there is no particular need
-	 * to wait extensively here.  A short wait has the benefit
-	 * that someone else can worry about the freezer.
+	 * server is on this machine, when a new TCP connection is
+	 * needed and in other rare cases.  There is no particular
+	 * need to wait extensively here.  A short wait has the
+	 * benefit that someone else can worry about the freezer.
 	 */
 	if (mapping) {
 		struct nfs_server *nfss = NFS_SERVER(mapping->host);
 		nfs_commit_inode(mapping->host, 0);
 		if ((gfp & __GFP_WAIT) &&
-		    !current_is_kswapd() &&
-		    !(current->flags & PF_FSTRANS) &&
 		    !bdi_write_congested(&nfss->backing_dev_info)) {
 			wait_on_page_bit_killable_timeout(page, PG_private,
 							  HZ);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9358c79fd589..fe3441abdbe5 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -821,9 +821,7 @@ void rpc_execute(struct rpc_task *task)
 
 static void rpc_async_schedule(struct work_struct *work)
 {
-	current->flags |= PF_FSTRANS;
 	__rpc_execute(container_of(work, struct rpc_task, u.tk_work));
-	current->flags &= ~PF_FSTRANS;
 }
 
 /**
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2faac4940563..6a4615dd0261 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -205,7 +205,6 @@ xprt_rdma_connect_worker(struct work_struct *work)
 	struct rpc_xprt *xprt = &r_xprt->xprt;
 	int rc = 0;
 
-	current->flags |= PF_FSTRANS;
 	xprt_clear_connected(xprt);
 
 	dprintk("RPC:       %s: %sconnect\n", __func__,
@@ -216,7 +215,6 @@ xprt_rdma_connect_worker(struct work_struct *work)
 
 	dprintk("RPC:       %s: exit\n", __func__);
 	xprt_clear_connecting(xprt);
-	current->flags &= ~PF_FSTRANS;
 }
 
 /*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 02603ec2460a..3b305ab17afe 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1954,8 +1954,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
 	struct socket *sock;
 	int status = -EIO;
 
-	current->flags |= PF_FSTRANS;
-
 	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
 	status = __sock_create(xprt->xprt_net, AF_LOCAL,
 					SOCK_STREAM, 0, &sock, 1);
@@ -1995,7 +1993,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
 out:
 	xprt_clear_connecting(xprt);
 	xprt_wake_pending_tasks(xprt, status);
-	current->flags &= ~PF_FSTRANS;
 	return status;
 }
 
@@ -2098,8 +2095,6 @@ static void xs_udp_setup_socket(struct work_struct *work)
 	struct socket *sock = transport->sock;
 	int status = -EIO;
 
-	current->flags |= PF_FSTRANS;
-
 	/* Start by resetting any existing state */
 	xs_reset_transport(transport);
 	sock = xs_create_sock(xprt, transport,
@@ -2119,7 +2114,6 @@ static void xs_udp_setup_socket(struct work_struct *work)
 out:
 	xprt_clear_connecting(xprt);
 	xprt_wake_pending_tasks(xprt, status);
-	current->flags &= ~PF_FSTRANS;
 }
 
 /*
@@ -2256,8 +2250,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 	struct rpc_xprt *xprt = &transport->xprt;
 	int status = -EIO;
 
-	current->flags |= PF_FSTRANS;
-
 	if (!sock) {
 		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
 		sock = xs_create_sock(xprt, transport,
@@ -2305,7 +2297,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 	case -EINPROGRESS:
 	case -EALREADY:
 		xprt_clear_connecting(xprt);
-		current->flags &= ~PF_FSTRANS;
 		return;
 	case -EINVAL:
 		/* Happens, for instance, if the user specified a link
@@ -2323,7 +2314,6 @@ out_eagain:
 out:
 	xprt_clear_connecting(xprt);
 	xprt_wake_pending_tasks(xprt, status);
-	current->flags &= ~PF_FSTRANS;
 }
 
 /**
-- 
cgit v1.2.3


From 3fc3edf141fd78f624194eb89d7b37ff86138422 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 25 Sep 2014 16:28:53 -0400
Subject: NFSv3: Fix missing includes of nfs3_fs.h

Silence a few warnings about missing symbols that are due to missing
includes of nfs3_fs.h.

Fixes: 00a36a1090350 (NFS: Move v3 declarations out of internal.h)
Fixes: cb8c20fa53ec2 (NFS: Move NFS v3 acl functions to nfs3_fs.h)
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs3acl.c    | 1 +
 fs/nfs/nfs3client.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 24c6898159cc..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
 #include <linux/nfsacl.h>
 
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #ifdef CONFIG_NFS_V3_ACL
 static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
-- 
cgit v1.2.3


From 2aca5b869ace67a63aab895659e5dc14c33a4d6e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 24 Sep 2014 22:35:58 -0400
Subject: SUNRPC: Add missing support for RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT

The flag RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT was intended introduced in
order to allow NFSv4 clients to disable resend timeouts. Since those
cause the RPC layer to break the connection, they mess up the duplicate
reply caches that remain indexed on the port number in NFSv4..

This patch includes the code that was missing in the original to
set the appropriate flag in struct rpc_clnt, when the caller of
rpc_create() sets RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT.

Fixes: 8a19a0b6cb2e (SUNRPC: Add RPC task and client level options to...)
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/clnt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 488ddeed9363..e0b94ce4c4e6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -461,6 +461,8 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 
 	if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
 		clnt->cl_autobind = 1;
+	if (args->flags & RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT)
+		clnt->cl_noretranstimeo = 1;
 	if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
 		clnt->cl_discrtry = 1;
 	if (!(args->flags & RPC_CLNT_CREATE_QUIET))
@@ -579,6 +581,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
 	/* Turn off autobind on clones */
 	new->cl_autobind = 0;
 	new->cl_softrtry = clnt->cl_softrtry;
+	new->cl_noretranstimeo = clnt->cl_noretranstimeo;
 	new->cl_discrtry = clnt->cl_discrtry;
 	new->cl_chatty = clnt->cl_chatty;
 	return new;
-- 
cgit v1.2.3


From f3f760314afcb8522d2349b970b065589c5c8e48 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 25 Sep 2014 14:34:41 +0100
Subject: NFS: Fabricate fscache server index key correctly

When fabricating a server index key for fscache, we should clear the index key
buffer before starting to fill it in, not in the middle.

Reported-by: James Pearson <james-p@moving-picture.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/fscache-index.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
 	struct nfs_server_key *key = buffer;
 	uint16_t len = sizeof(struct nfs_server_key);
 
+	memset(key, 0, len);
 	key->nfsversion = clp->rpc_ops->version;
 	key->family = clp->cl_addr.ss_family;
 
-	memset(key, 0, len);
-
 	switch (clp->cl_addr.ss_family) {
 	case AF_INET:
 		key->port = sin->sin_port;
-- 
cgit v1.2.3


From a4339b7b686b4acc8b6de2b07d7bacbe3ae44b83 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 27 Sep 2014 17:02:26 -0400
Subject: NFSv4: Fix lock recovery when CREATE_SESSION/SETCLIENTID_CONFIRM
 fails

If a NFSv4.x server returns NFS4ERR_STALE_CLIENTID in response to a
CREATE_SESSION or SETCLIENTID_CONFIRM in order to tell us that it rebooted
a second time, then the client will currently take this to mean that it must
declare all locks to be stale, and hence ineligible for reboot recovery.

RFC3530 and RFC5661 both suggest that the client should instead rely on the
server to respond to inelegible open share, lock and delegation reclaim
requests with NFS4ERR_NO_GRACE in this situation.

Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4state.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 22fe35104c0c..26d510d11efd 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1761,7 +1761,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 		break;
 	case -NFS4ERR_STALE_CLIENTID:
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-		nfs4_state_clear_reclaim_reboot(clp);
 		nfs4_state_start_reclaim_reboot(clp);
 		break;
 	case -NFS4ERR_CLID_INUSE:
-- 
cgit v1.2.3


From df817ba35736db2d62b07de6f050a4db53492ad8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 27 Sep 2014 17:41:51 -0400
Subject: NFSv4: fix open/lock state recovery error handling

The current open/lock state recovery unfortunately does not handle errors
such as NFS4ERR_CONN_NOT_BOUND_TO_SESSION correctly. Instead of looping,
just proceeds as if the state manager is finished recovering.
This patch ensures that we loop back, handle higher priority errors
and complete the open/lock state recovery.

Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4state.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 26d510d11efd..87b2d0e79797 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1705,7 +1705,8 @@ restart:
 			if (status < 0) {
 				set_bit(ops->owner_flag_bit, &sp->so_flags);
 				nfs4_put_state_owner(sp);
-				return nfs4_recovery_handle_error(clp, status);
+				status = nfs4_recovery_handle_error(clp, status);
+				return (status != 0) ? status : -EAGAIN;
 			}
 
 			nfs4_put_state_owner(sp);
@@ -1714,7 +1715,7 @@ restart:
 		spin_unlock(&clp->cl_lock);
 	}
 	rcu_read_unlock();
-	return status;
+	return 0;
 }
 
 static int nfs4_check_lease(struct nfs_client *clp)
@@ -2365,14 +2366,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			section = "reclaim reboot";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->reboot_recovery_ops);
-			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-				continue;
-			nfs4_state_end_reclaim_reboot(clp);
-			if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+			if (status == -EAGAIN)
 				continue;
 			if (status < 0)
 				goto out_error;
+			nfs4_state_end_reclaim_reboot(clp);
 		}
 
 		/* Now recover expired state... */
@@ -2380,9 +2378,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			section = "reclaim nograce";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->nograce_recovery_ops);
-			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+			if (status == -EAGAIN)
 				continue;
 			if (status < 0)
 				goto out_error;
-- 
cgit v1.2.3


From d1f456b0b9545f1606a54cd17c20775f159bd2ce Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Mon, 29 Sep 2014 12:31:57 -0400
Subject: NFSv4.1: Fix an NFSv4.1 state renewal regression

Commit 2f60ea6b8ced ("NFSv4: The NFSv4.0 client must send RENEW calls if it holds a delegation") set the NFS4_RENEW_TIMEOUT flag in nfs4_renew_state, and does
not put an nfs41_proc_async_sequence call, the NFSv4.1 lease renewal heartbeat
call, on the wire to renew the NFSv4.1 state if the flag was not set.

The NFS4_RENEW_TIMEOUT flag is set when "now" is after the last renewal
(cl_last_renewal) plus the lease time divided by 3. This is arbitrary and
sometimes does the following:

In normal operation, the only way a future state renewal call is put on the
wire is via a call to nfs4_schedule_state_renewal, which schedules a
nfs4_renew_state workqueue task. nfs4_renew_state determines if the
NFS4_RENEW_TIMEOUT should be set, and the calls nfs41_proc_async_sequence,
which only gets sent if the NFS4_RENEW_TIMEOUT flag is set.
Then the nfs41_proc_async_sequence rpc_release function schedules
another state remewal via nfs4_schedule_state_renewal.

Without this change we can get into a state where an application stops
accessing the NFSv4.1 share, state renewal calls stop due to the
NFS4_RENEW_TIMEOUT flag _not_ being set. The only way to recover
from this situation is with a clientid re-establishment, once the application
resumes and the server has timed out the lease and so returns
NFS4ERR_BAD_SESSION on the subsequent SEQUENCE operation.

An example application:
open, lock, write a file.

sleep for 6 * lease (could be less)

ulock, close.

In the above example with NFSv4.1 delegations enabled, without this change,
there are no OP_SEQUENCE state renewal calls during the sleep, and the
clientid is recovered due to lease expiration on the close.

This issue does not occur with NFSv4.1 delegations disabled, nor with
NFSv4.0, with or without delegations enabled.

Signed-off-by: Andy Adamson <andros@netapp.com>
Link: http://lkml.kernel.org/r/1411486536-23401-1-git-send-email-andros@netapp.com
Fixes: 2f60ea6b8ced (NFSv4: The NFSv4.0 client must send RENEW calls...)
Cc: stable@vger.kernel.org # 3.2.x
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c   |  2 +-
 fs/nfs/nfs4renewd.c | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6ca0c8e7a945..0422d77b73c7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7353,7 +7353,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
 	int ret = 0;
 
 	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
-		return 0;
+		return -EAGAIN;
 	task = _nfs41_proc_sequence(clp, cred, false);
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
 			}
 			nfs_expire_all_delegations(clp);
 		} else {
+			int ret;
+
 			/* Queue an asynchronous RENEW. */
-			ops->sched_state_renewal(clp, cred, renew_flags);
+			ret = ops->sched_state_renewal(clp, cred, renew_flags);
 			put_rpccred(cred);
-			goto out_exp;
+			switch (ret) {
+			default:
+				goto out_exp;
+			case -EAGAIN:
+			case -ENOMEM:
+				break;
+			}
 		}
 	} else {
 		dprintk("%s: failed to call renewd. Reason: lease not expired \n",
-- 
cgit v1.2.3