NFSD: Don't hand out delegations for 30 seconds after recalling them.

If nfsd needs to recall a delegation for some reason it implies that there is contention on the file, so further delegations should not be handed out. The current code fails to do so, and the result is effectively a live-lock under some workloads: a client attempting a conflicting operation on a read-delegated file receives NFS4ERR_DELAY and retries the operation, but by the time it retries the server may already have given out another delegation. We could simply avoid delegations for (say) 30 seconds after any recall, but this is probably too heavy handed. We could keep a list of inodes (or inode numbers or filehandles) for recalled delegations, but that requires memory allocation and searching. The approach taken here is to use a bloom filter to record the filehandles which are currently blocked from delegation, and to accept the cost of a few false positives. We have 2 bloom filters, each of which is valid for 30 seconds. When a delegation is recalled the filehandle is added to one filter and will remain disabled for between 30 and 60 seconds. We keep a count of the number of filehandles that have been added, so when that count is zero we can bypass all other tests. The bloom filters have 256 bits and 3 hash functions. This should allow a couple of dozen blocked filehandles with minimal false positives. If many more filehandles are all blocked at once, behaviour will degrade towards rejecting all delegations for between 30 and 60 seconds, then resetting and allowing new delegations. Signed-off-by: NeilBrown <neilb@suse.de> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
author: NeilBrown <neilb@suse.de> 2014-06-04 17:39:26 +1000
committer: J. Bruce Fields <bfields@redhat.com> 2014-06-17 16:42:47 -0400
commit: 6282cd56555347c0ec2addc97bd96b40df0a38b7 (patch)
tree: 53b6573954b79471cc35ad36640320eb7c9eaa88 /fs/nfsd
parent: 7171511eaec5bf23fb06078f59784a3a0626b38f (diff)
1 files changed, 78 insertions, 0 deletions
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c0d45cec9958..2204e1fe5725 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
 #include <linux/ratelimit.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/addr.h>
+#include <linux/hash.h>
 #include "xdr4.h"
 #include "xdr4cb.h"
 #include "vfs.h"
@@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
 	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
 }
 
+/*
+ * When we recall a delegation, we should be careful not to hand it
+ * out again straight away.
+ * To ensure this we keep a pair of bloom filters ('new' and 'old')
+ * in which the filehandles of recalled delegations are "stored".
+ * If a filehandle appear in either filter, a delegation is blocked.
+ * When a delegation is recalled, the filehandle is stored in the "new"
+ * filter.
+ * Every 30 seconds we swap the filters and clear the "new" one,
+ * unless both are empty of course.
+ *
+ * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
+ * low 3 bytes as hash-table indices.
+ *
+ * 'state_lock', which is always held when block_delegations() is called,
+ * is used to manage concurrent access.  Testing does not need the lock
+ * except when swapping the two filters.
+ */
+static struct bloom_pair {
+	int	entries, old_entries;
+	time_t	swap_time;
+	int	new; /* index into 'set' */
+	DECLARE_BITMAP(set[2], 256);
+} blocked_delegations;
+
+static int delegation_blocked(struct knfsd_fh *fh)
+{
+	u32 hash;
+	struct bloom_pair *bd = &blocked_delegations;
+
+	if (bd->entries == 0)
+		return 0;
+	if (seconds_since_boot() - bd->swap_time > 30) {
+		spin_lock(&state_lock);
+		if (seconds_since_boot() - bd->swap_time > 30) {
+			bd->entries -= bd->old_entries;
+			bd->old_entries = bd->entries;
+			memset(bd->set[bd->new], 0,
+			       sizeof(bd->set[0]));
+			bd->new = 1-bd->new;
+			bd->swap_time = seconds_since_boot();
+		}
+		spin_unlock(&state_lock);
+	}
+	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+	if (test_bit(hash&255, bd->set[0]) &&
+	    test_bit((hash>>8)&255, bd->set[0]) &&
+	    test_bit((hash>>16)&255, bd->set[0]))
+		return 1;
+
+	if (test_bit(hash&255, bd->set[1]) &&
+	    test_bit((hash>>8)&255, bd->set[1]) &&
+	    test_bit((hash>>16)&255, bd->set[1]))
+		return 1;
+
+	return 0;
+}
+
+static void block_delegations(struct knfsd_fh *fh)
+{
+	u32 hash;
+	struct bloom_pair *bd = &blocked_delegations;
+
+	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+
+	__set_bit(hash&255, bd->set[bd->new]);
+	__set_bit((hash>>8)&255, bd->set[bd->new]);
+	__set_bit((hash>>16)&255, bd->set[bd->new]);
+	if (bd->entries == 0)
+		bd->swap_time = seconds_since_boot();
+	bd->entries += 1;
+}
+
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
 {
@@ -372,6 +446,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
 	dprintk("NFSD alloc_init_deleg\n");
 	if (num_delegations > max_delegations)
 		return NULL;
+	if (delegation_blocked(&current_fh->fh_handle))
+		return NULL;
 	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
 	if (dp == NULL)
 		return dp;
@@ -2770,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	/* Only place dl_time is set; protected by i_lock: */
 	dp->dl_time = get_seconds();
 
+	block_delegations(&dp->dl_fh);
+
 	nfsd4_cb_recall(dp);
 }
author	NeilBrown <neilb@suse.de>	2014-06-04 17:39:26 +1000
committer	J. Bruce Fields <bfields@redhat.com>	2014-06-17 16:42:47 -0400
commit	6282cd56555347c0ec2addc97bd96b40df0a38b7 (patch)
tree	53b6573954b79471cc35ad36640320eb7c9eaa88 /fs/nfsd
parent	7171511eaec5bf23fb06078f59784a3a0626b38f (diff)