summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2008-10-14 12:31:14 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2008-10-14 12:31:14 -0700
commit 8acd3a60bcca17c6d89c73cee3ad6057eb83ba1e (patch)
tree d610c8d39246c33c499ee9d92d302d3ca9e89ae3
parent c269bc00fcb876ae3b85f178f1e34601185c8ccc (diff)
parent 107e0008dfb8bd6366bc8827f5bbbc0c1f795d2d (diff)
Merge branch 'for-2.6.28' of git://linux-nfs.org/~bfields/linux
* 'for-2.6.28' of git://linux-nfs.org/~bfields/linux: (59 commits)
  svcrdma: Fix IRD/ORD polarity
  svcrdma: Update svc_rdma_send_error to use DMA LKEY
  svcrdma: Modify the RPC reply path to use FRMR when available
  svcrdma: Modify the RPC recv path to use FRMR when available
  svcrdma: Add support to svc_rdma_send to handle chained WR
  svcrdma: Modify post recv path to use local dma key
  svcrdma: Add a service to register a Fast Reg MR with the device
  svcrdma: Query device for Fast Reg support during connection setup
  svcrdma: Add FRMR get/put services
  NLM: Remove unused argument from svc_addsock() function
  NLM: Remove "proto" argument from lockd_up()
  NLM: Always start both UDP and TCP listeners
  lockd: Remove unused fields in the nlm_reboot structure
  lockd: Add helper to sanity check incoming NOTIFY requests
  lockd: change nlmclnt_grant() to take a "struct sockaddr *"
  lockd: Adjust nlmsvc_lookup_host() to accomodate AF_INET6 addresses
  lockd: Adjust nlmclnt_lookup_host() signature to accomodate non-AF_INET
  lockd: Support non-AF_INET addresses in nlm_lookup_host()
  NLM: Convert nlm_lookup_host() to use a single argument
  svcrdma: Add Fast Reg MR Data Types
  ...
-rw-r--r-- fs/Kconfig 30
-rw-r--r-- fs/Makefile 3
-rw-r--r-- fs/lockd/Makefile 2
-rw-r--r-- fs/lockd/clntlock.c 13
-rw-r--r-- fs/lockd/grace.c 59
-rw-r--r-- fs/lockd/host.c 350
-rw-r--r-- fs/lockd/mon.c 2
-rw-r--r-- fs/lockd/svc.c 88
-rw-r--r-- fs/lockd/svc4proc.c 31
-rw-r--r-- fs/lockd/svclock.c 18
-rw-r--r-- fs/lockd/svcproc.c 31
-rw-r--r-- fs/lockd/svcsubs.c 2
-rw-r--r-- fs/lockd/xdr.c 2
-rw-r--r-- fs/lockd/xdr4.c 2
-rw-r--r-- fs/nfs/callback.c 3
-rw-r--r-- fs/nfsd/lockd.c 1
-rw-r--r-- fs/nfsd/nfs3proc.c 8
-rw-r--r-- fs/nfsd/nfs4callback.c 7
-rw-r--r-- fs/nfsd/nfs4proc.c 8
-rw-r--r-- fs/nfsd/nfs4state.c 34
-rw-r--r-- fs/nfsd/nfs4xdr.c 171
-rw-r--r-- fs/nfsd/nfsctl.c 5
-rw-r--r-- fs/nfsd/nfsfh.c 30
-rw-r--r-- fs/nfsd/nfsproc.c 6
-rw-r--r-- fs/nfsd/nfssvc.c 20
-rw-r--r-- fs/nfsd/vfs.c 63
-rw-r--r-- fs/proc/proc_misc.c 4
-rw-r--r-- include/linux/fs.h 65
-rw-r--r-- include/linux/lockd/bind.h 11
-rw-r--r-- include/linux/lockd/lockd.h 137
-rw-r--r-- include/linux/lockd/xdr.h 2
-rw-r--r-- include/linux/nfsd/nfsd.h 3
-rw-r--r-- include/linux/sunrpc/clnt.h 5
-rw-r--r-- include/linux/sunrpc/svc.h 19
-rw-r--r-- include/linux/sunrpc/svc_rdma.h 27
-rw-r--r-- include/linux/sunrpc/svcsock.h 5
-rw-r--r-- kernel/sys_ni.c 1
-rw-r--r-- kernel/sysctl.c 6
-rw-r--r-- net/sunrpc/clnt.c 2
-rw-r--r-- net/sunrpc/rpcb_clnt.c 81
-rw-r--r-- net/sunrpc/svc.c 251
-rw-r--r-- net/sunrpc/svc_xprt.c 39
-rw-r--r-- net/sunrpc/svcsock.c 17
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 187
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_sendto.c 255
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_transport.c 364
46 files changed, 1842 insertions, 628 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f54a157a0296..501f012e0c6f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -433,6 +433,14 @@ config FS_POSIX_ACL
bool
default n
+config FILE_LOCKING
+ bool "Enable POSIX file locking API" if EMBEDDED
+ default y
+ help
+ This option enables standard file locking support, required
+ for filesystems like NFS and for the flock() system
+ call. Disabling this option saves about 11k.
+
source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
@@ -1779,6 +1787,28 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
+config SUNRPC_REGISTER_V4
+ bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
+ depends on SUNRPC && EXPERIMENTAL
+ default n
+ help
+ Sun added support for registering RPC services at an IPv6
+ address by creating two new versions of the rpcbind protocol
+ (RFC 1833).
+
+ This option enables support in the kernel RPC server for
+ registering kernel RPC services via version 4 of the rpcbind
+ protocol. If you enable this option, you must run a portmapper
+ daemon that supports rpcbind protocol version 4.
+
+ Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+ requires that you enable this option and use a portmapper that
+ supports rpcbind version 4.
+
+ If unsure, say N to get traditional behavior (register kernel
+ RPC services using only rpcbind version 2). Distributions
+ using the legacy Linux portmapper daemon must say N here.
+
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index de404b00eb0c..b6f27dc26b72 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,7 +7,7 @@
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
- ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
+ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
nfsd-$(CONFIG_NFSD) := nfsctl.o
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a555..97f6073ab339 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
- svcproc.o svcsubs.o mon.o xdr.o
+ svcproc.o svcsubs.o mon.o xdr.o grace.o
lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bfd..8307dd64bf46 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
int status;
- status = lockd_up(nlm_init->protocol);
+ status = lockd_up();
if (status < 0)
return ERR_PTR(status);
- host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+ host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
nlm_init->protocol, nlm_version,
- nlm_init->hostname,
- strlen(nlm_init->hostname));
+ nlm_init->hostname);
if (host == NULL) {
lockd_down();
return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
/*
* The server lockd has called us back to tell us the lock was granted
*/
-__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
{
const struct file_lock *fl = &lock->fl;
const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
+ if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
/* This one ensures that our parent doesn't terminate while the
* reclaim is in progress */
lock_kernel();
- lockd_up(0); /* note: this cannot fail as lockd is already running */
+ lockd_up(); /* note: this cannot fail as lockd is already running */
dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 000000000000..183cc1f0af1c
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ */
+
+#include <linux/module.h>
+#include <linux/lockd/bind.h>
+
+static LIST_HEAD(grace_list);
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out. Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void locks_start_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_add(&lm->list, &grace_list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking. The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void locks_end_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_del_init(&lm->list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int locks_in_grace(void)
+{
+ return !list_empty(&grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eacc..9fd8889097b7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/sm_inter.h>
#include <linux/mutex.h>
+#include <net/ipv6.h>
#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
#define NLM_HOST_NRHASH 32
-#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
#define NLM_HOST_REBIND (60 * HZ)
#define NLM_HOST_EXPIRE (300 * HZ)
#define NLM_HOST_COLLECT (120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long next_gc;
static int nrhosts;
static DEFINE_MUTEX(nlm_host_mutex);
-
static void nlm_gc_hosts(void);
-static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
- const char *, unsigned int, int);
-static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
- const char *hostname,
- unsigned int hostname_len);
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len,
+ const int create);
+
+struct nlm_lookup_host_info {
+ const int server; /* search for server|client */
+ const struct sockaddr *sap; /* address to search for */
+ const size_t salen; /* its length */
+ const unsigned short protocol; /* transport to search for*/
+ const u32 version; /* NLM version to search for */
+ const char *hostname; /* remote's hostname */
+ const size_t hostname_len; /* its length */
+ const struct sockaddr *src_sap; /* our address (optional) */
+ const size_t src_len; /* its length */
+};
+
+/*
+ * Hash function must work well on big- and little-endian platforms
+ */
+static unsigned int __nlm_hash32(const __be32 n)
+{
+ unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
+ return hash ^ (hash >> 8);
+}
+
+static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ return __nlm_hash32(sin->sin_addr.s_addr);
+}
+
+static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr addr = sin6->sin6_addr;
+ return __nlm_hash32(addr.s6_addr32[0]) ^
+ __nlm_hash32(addr.s6_addr32[1]) ^
+ __nlm_hash32(addr.s6_addr32[2]) ^
+ __nlm_hash32(addr.s6_addr32[3]);
+}
+
+static unsigned int nlm_hash_address(const struct sockaddr *sap)
+{
+ unsigned int hash;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ hash = __nlm_hash_addr4(sap);
+ break;
+ case AF_INET6:
+ hash = __nlm_hash_addr6(sap);
+ break;
+ default:
+ hash = 0;
+ }
+ return hash & (NLM_HOST_NRHASH - 1);
+}
+
+static void nlm_clear_port(struct sockaddr *sap)
+{
+ switch (sap->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)sap)->sin_port = 0;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)sap)->sin6_port = 0;
+ break;
+ }
+}
+
+static void nlm_display_address(const struct sockaddr *sap,
+ char *buf, const size_t len)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+ switch (sap->sa_family) {
+ case AF_UNSPEC:
+ snprintf(buf, len, "unspecified");
+ break;
+ case AF_INET:
+ snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
+ break;
+ case AF_INET6:
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ snprintf(buf, len, NIPQUAD_FMT,
+ NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
+ else
+ snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
+ break;
+ default:
+ snprintf(buf, len, "unsupported address family");
+ break;
+ }
+}
/*
* Common host lookup routine for server & client
*/
-static struct nlm_host *nlm_lookup_host(int server,
- const struct sockaddr_in *sin,
- int proto, u32 version,
- const char *hostname,
- unsigned int hostname_len,
- const struct sockaddr_in *ssin)
+static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
{
struct hlist_head *chain;
struct hlist_node *pos;
struct nlm_host *host;
struct nsm_handle *nsm = NULL;
- int hash;
-
- dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
- ", p=%d, v=%u, my role=%s, name=%.*s)\n",
- NIPQUAD(ssin->sin_addr.s_addr),
- NIPQUAD(sin->sin_addr.s_addr), proto, version,
- server? "server" : "client",
- hostname_len,
- hostname? hostname : "<none>");
-
- hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
-
- /* Lock hash table */
mutex_lock(&nlm_host_mutex);
if (time_after_eq(jiffies, next_gc))
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
* different NLM rpc_clients into one single nlm_host object.
* This would allow us to have one nlm_host per address.
*/
- chain = &nlm_hosts[hash];
+ chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
- if (!nlm_cmp_addr(&host->h_addr, sin))
+ if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
continue;
/* See if we have an NSM handle for this client */
if (!nsm)
nsm = host->h_nsmhandle;
- if (host->h_proto != proto)
+ if (host->h_proto != ni->protocol)
continue;
- if (host->h_version != version)
+ if (host->h_version != ni->version)
continue;
- if (host->h_server != server)
+ if (host->h_server != ni->server)
continue;
- if (!nlm_cmp_addr(&host->h_saddr, ssin))
+ if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
/* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
hlist_add_head(&host->h_hash, chain);
nlm_get_host(host);
+ dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
+ host->h_name, host->h_addrbuf);
goto out;
}
- if (nsm)
- atomic_inc(&nsm->sm_count);
-
- host = NULL;
- /* Sadly, the host isn't in our hash table yet. See if
- * we have an NSM handle for it. If not, create one.
+ /*
+ * The host wasn't in our hash table. If we don't
+ * have an NSM handle for it yet, create one.
*/
- if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
- goto out;
+ if (nsm)
+ atomic_inc(&nsm->sm_count);
+ else {
+ host = NULL;
+ nsm = nsm_find(ni->sap, ni->salen,
+ ni->hostname, ni->hostname_len, 1);
+ if (!nsm) {
+ dprintk("lockd: nlm_lookup_host failed; "
+ "no nsm handle\n");
+ goto out;
+ }
+ }
host = kzalloc(sizeof(*host), GFP_KERNEL);
if (!host) {
nsm_release(nsm);
+ dprintk("lockd: nlm_lookup_host failed; no memory\n");
goto out;
}
host->h_name = nsm->sm_name;
- host->h_addr = *sin;
- host->h_addr.sin_port = 0; /* ouch! */
- host->h_saddr = *ssin;
- host->h_version = version;
- host->h_proto = proto;
+ memcpy(nlm_addr(host), ni->sap, ni->salen);
+ host->h_addrlen = ni->salen;
+ nlm_clear_port(nlm_addr(host));
+ memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+ host->h_version = ni->version;
+ host->h_proto = ni->protocol;
host->h_rpcclnt = NULL;
mutex_init(&host->h_mutex);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
host->h_state = 0; /* pseudo NSM state */
host->h_nsmstate = 0; /* real NSM state */
host->h_nsmhandle = nsm;
- host->h_server = server;
+ host->h_server = ni->server;
hlist_add_head(&host->h_hash, chain);
INIT_LIST_HEAD(&host->h_lockowners);
spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
INIT_LIST_HEAD(&host->h_reclaim);
nrhosts++;
+
+ nlm_display_address((struct sockaddr *)&host->h_addr,
+ host->h_addrbuf, sizeof(host->h_addrbuf));
+ nlm_display_address((struct sockaddr *)&host->h_srcaddr,
+ host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
+
+ dprintk("lockd: nlm_lookup_host created host %s\n",
+ host->h_name);
+
out:
mutex_unlock(&nlm_host_mutex);
return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
kfree(host);
}
-/*
- * Find an NLM server handle in the cache. If there is none, create it.
+/**
+ * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
+ * @sap: network address of server
+ * @salen: length of server address
+ * @protocol: transport protocol to use
+ * @version: NLM protocol version
+ * @hostname: '\0'-terminated hostname of server
+ *
+ * Returns an nlm_host structure that matches the passed-in
+ * [server address, transport protocol, NLM version, server hostname].
+ * If one doesn't already exist in the host cache, a new handle is
+ * created and returned.
*/
-struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
- int proto, u32 version,
- const char *hostname,
- unsigned int hostname_len)
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+ const size_t salen,
+ const unsigned short protocol,
+ const u32 version, const char *hostname)
{
- struct sockaddr_in ssin = {0};
-
- return nlm_lookup_host(0, sin, proto, version,
- hostname, hostname_len, &ssin);
+ const struct sockaddr source = {
+ .sa_family = AF_UNSPEC,
+ };
+ struct nlm_lookup_host_info ni = {
+ .server = 0,
+ .sap = sap,
+ .salen = salen,
+ .protocol = protocol,
+ .version = version,
+ .hostname = hostname,
+ .hostname_len = strlen(hostname),
+ .src_sap = &source,
+ .src_len = sizeof(source),
+ };
+
+ dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
+ (hostname ? hostname : "<none>"), version,
+ (protocol == IPPROTO_UDP ? "udp" : "tcp"));
+
+ return nlm_lookup_host(&ni);
}
-/*
- * Find an NLM client handle in the cache. If there is none, create it.
+/**
+ * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
+ * @rqstp: incoming NLM request
+ * @hostname: name of client host
+ * @hostname_len: length of client hostname
+ *
+ * Returns an nlm_host structure that matches the [client address,
+ * transport protocol, NLM version, client hostname] of the passed-in
+ * NLM request. If one doesn't already exist in the host cache, a
+ * new handle is created and returned.
+ *
+ * Before possibly creating a new nlm_host, construct a sockaddr
+ * for a specific source address in case the local system has
+ * multiple network addresses. The family of the address in
+ * rq_daddr is guaranteed to be the same as the family of the
+ * address in rq_addr, so it's safe to use the same family for
+ * the source address.
*/
-struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp,
- const char *hostname, unsigned int hostname_len)
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
+ const char *hostname,
+ const size_t hostname_len)
{
- struct sockaddr_in ssin = {0};
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ };
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ };
+ struct nlm_lookup_host_info ni = {
+ .server = 1,
+ .sap = svc_addr(rqstp),
+ .salen = rqstp->rq_addrlen,
+ .protocol = rqstp->rq_prot,
+ .version = rqstp->rq_vers,
+ .hostname = hostname,
+ .hostname_len = hostname_len,
+ .src_len = rqstp->rq_addrlen,
+ };
+
+ dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+ (int)hostname_len, hostname, rqstp->rq_vers,
+ (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+
+ switch (ni.sap->sa_family) {
+ case AF_INET:
+ sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
+ ni.src_sap = (struct sockaddr *)&sin;
+ break;
+ case AF_INET6:
+ ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
+ ni.src_sap = (struct sockaddr *)&sin6;
+ break;
+ default:
+ return NULL;
+ }
- ssin.sin_addr = rqstp->rq_daddr.addr;
- return nlm_lookup_host(1, svc_addr_in(rqstp),
- rqstp->rq_prot, rqstp->rq_vers,
- hostname, hostname_len, &ssin);
+ return nlm_lookup_host(&ni);
}
/*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
{
struct rpc_clnt *clnt;
- dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
- NIPQUAD(host->h_saddr.sin_addr),
- NIPQUAD(host->h_addr.sin_addr));
+ dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
+ host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
/* Lock host handle */
mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
if (time_after_eq(jiffies, host->h_nextrebind)) {
rpc_force_rebind(clnt);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
- dprintk("lockd: next rebind in %ld jiffies\n",
+ dprintk("lockd: next rebind in %lu jiffies\n",
host->h_nextrebind - jiffies);
}
} else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
};
struct rpc_create_args args = {
.protocol = host->h_proto,
- .address = (struct sockaddr *)&host->h_addr,
- .addrsize = sizeof(host->h_addr),
- .saddress = (struct sockaddr *)&host->h_saddr,
+ .address = nlm_addr(host),
+ .addrsize = host->h_addrlen,
+ .saddress = nlm_srcaddr(host),
.timeout = &timeparms,
.servername = host->h_name,
.program = &nlm_program,
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
struct nsm_handle *nsm;
struct nlm_host *host;
- dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
- hostname, NIPQUAD(sin->sin_addr));
-
- /* Find the NSM handle for this peer */
- if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+ nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
+ hostname, hostname_len, 0);
+ if (nsm == NULL) {
+ dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+ hostname_len, hostname);
return;
+ }
+
+ dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
+ hostname_len, hostname, nsm->sm_addrbuf);
/* When reclaiming locks on this peer, make sure that
* we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
-static struct nsm_handle *
-__nsm_find(const struct sockaddr_in *sin,
- const char *hostname, unsigned int hostname_len,
- int create)
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len,
+ const int create)
{
struct nsm_handle *nsm = NULL;
struct nsm_handle *pos;
- if (!sin)
+ if (!sap)
return NULL;
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
printk(KERN_WARNING "Invalid hostname \"%.*s\" "
"in NFS lock request\n",
- hostname_len, hostname);
+ (int)hostname_len, hostname);
}
return NULL;
}
@@ -489,7 +657,7 @@ retry:
if (strlen(pos->sm_name) != hostname_len
|| memcmp(pos->sm_name, hostname, hostname_len))
continue;
- } else if (!nlm_cmp_addr(&pos->sm_addr, sin))
+ } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
continue;
atomic_inc(&pos->sm_count);
kfree(nsm);
@@ -509,10 +677,13 @@ retry:
if (nsm == NULL)
return NULL;
- nsm->sm_addr = *sin;
+ memcpy(nsm_addr(nsm), sap, salen);
+ nsm->sm_addrlen = salen;
nsm->sm_name = (char *) (nsm + 1);
memcpy(nsm->sm_name, hostname, hostname_len);
nsm->sm_name[hostname_len] = '\0';
+ nlm_display_address((struct sockaddr *)&nsm->sm_addr,
+ nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
atomic_set(&nsm->sm_count, 1);
goto retry;
@@ -521,13 +692,6 @@ found:
return nsm;
}
-static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname,
- unsigned int hostname_len)
-{
- return __nsm_find(sin, hostname, hostname_len, 1);
-}
-
/*
* Release an NSM handle
*/
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b11..4e7e958e8f67 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
memset(&args, 0, sizeof(args));
args.mon_name = nsm->sm_name;
- args.addr = nsm->sm_addr.sin_addr.s_addr;
+ args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
args.prog = NLM_PROGRAM;
args.vers = 3;
args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5bd9bf0fa9df..c631a83931ce 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
-int nlmsvc_grace_period;
unsigned long nlmsvc_timeout;
/*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
return nlm_timeout * 5 * HZ;
}
-unsigned long get_nfs_grace_period(void)
-{
- unsigned long lockdgrace = get_lockd_grace_period();
- unsigned long nfsdgrace = 0;
-
- if (nlmsvc_ops)
- nfsdgrace = nlmsvc_ops->get_grace_period();
-
- return max(lockdgrace, nfsdgrace);
-}
-EXPORT_SYMBOL(get_nfs_grace_period);
+static struct lock_manager lockd_manager = {
+};
-static unsigned long set_grace_period(void)
+static void grace_ender(struct work_struct *not_used)
{
- nlmsvc_grace_period = 1;
- return get_nfs_grace_period() + jiffies;
+ locks_end_grace(&lockd_manager);
}
-static inline void clear_grace_period(void)
+static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
+
+static void set_grace_period(void)
{
- nlmsvc_grace_period = 0;
+ unsigned long grace_period = get_lockd_grace_period();
+
+ locks_start_grace(&lockd_manager);
+ cancel_delayed_work_sync(&grace_period_end);
+ schedule_delayed_work(&grace_period_end, grace_period);
}
/*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
{
int err = 0, preverr = 0;
struct svc_rqst *rqstp = vrqstp;
- unsigned long grace_period_expire;
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- grace_period_expire = set_grace_period();
+ set_grace_period();
/*
* The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
flush_signals(current);
if (nlmsvc_ops) {
nlmsvc_invalidate_all();
- grace_period_expire = set_grace_period();
+ set_grace_period();
}
continue;
}
- /*
- * Retry any blocked locks that have been notified by
- * the VFS. Don't do this during grace period.
- * (Theoretically, there shouldn't even be blocked locks
- * during grace period).
- */
- if (!nlmsvc_grace_period) {
- timeout = nlmsvc_retry_blocked();
- } else if (time_before(grace_period_expire, jiffies))
- clear_grace_period();
+ timeout = nlmsvc_retry_blocked();
/*
* Find a socket with data available and call its
@@ -195,6 +180,7 @@ lockd(void *vrqstp)
svc_process(rqstp);
}
flush_signals(current);
+ cancel_delayed_work_sync(&grace_period_end);
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
@@ -203,25 +189,28 @@ lockd(void *vrqstp)
}
/*
- * Make any sockets that are needed but not present.
- * If nlm_udpport or nlm_tcpport were set as module
- * options, make those sockets unconditionally
+ * Ensure there are active UDP and TCP listeners for lockd.
+ *
+ * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
+ * local services (such as rpc.statd) still require UDP, and
+ * some NFS servers do not yet support NLM over TCP.
+ *
+ * Returns zero if all listeners are available; otherwise a
+ * negative errno value is returned.
*/
-static int make_socks(struct svc_serv *serv, int proto)
+static int make_socks(struct svc_serv *serv)
{
static int warned;
struct svc_xprt *xprt;
int err = 0;
- if (proto == IPPROTO_UDP || nlm_udpport) {
- xprt = svc_find_xprt(serv, "udp", 0, 0);
- if (!xprt)
- err = svc_create_xprt(serv, "udp", nlm_udpport,
- SVC_SOCK_DEFAULTS);
- else
- svc_xprt_put(xprt);
- }
- if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+ xprt = svc_find_xprt(serv, "udp", 0, 0);
+ if (!xprt)
+ err = svc_create_xprt(serv, "udp", nlm_udpport,
+ SVC_SOCK_DEFAULTS);
+ else
+ svc_xprt_put(xprt);
+ if (err >= 0) {
xprt = svc_find_xprt(serv, "tcp", 0, 0);
if (!xprt)
err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
/*
* Bring up the lockd process if it's not already up.
*/
-int
-lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
+int lockd_up(void)
{
struct svc_serv *serv;
int error = 0;
@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
/*
* Check whether we're already up and running.
*/
- if (nlmsvc_rqst) {
- if (proto)
- error = make_socks(nlmsvc_rqst->rq_server, proto);
+ if (nlmsvc_rqst)
goto out;
- }
/*
* Sanity check: if there's no pid,
@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
}
- if ((error = make_socks(serv, proto)) < 0)
+ error = make_socks(serv);
+ if (error < 0)
goto destroy_and_out;
/*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a714f64515b..014f6ce48172 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
dprintk("lockd: TEST4 called\n");
resp->cookie = argp->cookie;
- /* Don't accept test requests during grace period */
- if (nlmsvc_grace_period) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
- /* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Now try to lock the file */
resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
- argp->block, &argp->cookie);
+ argp->block, &argp->cookie,
+ argp->reclaim);
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
- resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+ resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
+ if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
- memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
dprintk("lockd: SM_NOTIFY called\n");
- if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
- || ntohs(saddr.sin_port) >= 1024) {
+
+ if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf0d5c2c318d..6063a8e4b9f3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
__be32
nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
- struct nlm_cookie *cookie)
+ struct nlm_cookie *cookie, int reclaim)
{
struct nlm_block *block = NULL;
int error;
@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ if (locks_in_grace() && !reclaim) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
+ if (reclaim && !locks_in_grace()) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ if (locks_in_grace()) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
error = vfs_test_lock(file->f_file, &lock->fl);
if (error == FILE_LOCK_DEFERRED) {
ret = nlmsvc_defer_lock_rqst(rqstp, block);
@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
+ if (locks_in_grace())
+ return nlm_lck_denied_grace_period;
+
mutex_lock(&file->f_mutex);
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 76262c1986f2..548b0bb2b84d 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
dprintk("lockd: TEST called\n");
resp->cookie = argp->cookie;
- /* Don't accept test requests during grace period */
- if (nlmsvc_grace_period) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
- /* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Now try to lock the file */
resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
- argp->block, &argp->cookie));
+ argp->block, &argp->cookie,
+ argp->reclaim));
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
- resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+ resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
+ if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
- memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
dprintk("lockd: SM_NOTIFY called\n");
- if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
- || ntohs(saddr.sin_port) >= 1024) {
+
+ if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 198b4e55b373..34c2766e27c7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
static int
nlmsvc_match_ip(void *datap, struct nlm_host *host)
{
- return nlm_cmp_addr(&host->h_saddr, datap);
+ return nlm_cmp_addr(nlm_srcaddr(host), datap);
}
/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc31..1f226290c67c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
- argp->vers = *p++;
- argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c6..50c493a8ad8e 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
- argp->vers = *p++;
- argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476c..6a09760c5960 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -105,7 +105,8 @@ int nfs_callback_up(void)
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ AF_INET, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 15c6faeec77c..b2786a5f9afe 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
static struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
- .get_grace_period = get_nfs4_grace_period,
};
void
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 4d617ea28cfc..9dbd2eb91281 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
if (nfserr)
RETURN_STATUS(nfserr);
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: FSSTAT(3) %s\n",
SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+ nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
RETURN_STATUS(nfserr);
}
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
- nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &argp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
/* Check special features of the file system. May request
* different read/write sizes for file systems known to have
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6e..094747a1227c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
WRITE32(OP_CB_RECALL);
- WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
+ WRITE32(cb_rec->cbr_stateid.si_generation);
+ WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
WRITE32(cb_rec->cbr_trunc);
WRITE32(len);
WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
.addrsize = sizeof(addr),
.timeout = &timeparms,
.program = &cb_program,
+ .prognumber = cb->cb_prog,
.version = nfs_cb_version[1]->number,
.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
addr.sin_port = htons(cb->cb_port);
addr.sin_addr.s_addr = htonl(cb->cb_addr);
- /* Initialize rpc_stat */
- memset(args.program->stats, 0, sizeof(struct rpc_stat));
-
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e5b51ffafc6c..669461e291ae 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- if (nfs4_in_grace())
+ if (locks_in_grace())
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!cstate->save_fh.fh_dentry)
return status;
- if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags
+ if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
& NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1578d7a2667e..0cc7ff5d5ab5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
static time_t lease_time = 90; /* default lease time */
static time_t user_lease_time = 90;
static time_t boot_time;
-static int in_grace = 1;
static u32 current_ownerid = 1;
static u32 current_fileid = 1;
static u32 current_delegid = 1;
@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
case NFS4_OPEN_CLAIM_NULL:
/* Let's not give out any delegations till everyone's
* had the chance to reclaim theirs.... */
- if (nfs4_in_grace())
+ if (locks_in_grace())
goto out;
if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
goto out;
@@ -1816,12 +1815,15 @@ out:
return status;
}
+struct lock_manager nfsd4_manager = {
+};
+
static void
-end_grace(void)
+nfsd4_end_grace(void)
{
dprintk("NFSD: end of grace period\n");
nfsd4_recdir_purge_old();
- in_grace = 0;
+ locks_end_grace(&nfsd4_manager);
}
static time_t
@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
- if (in_grace)
- end_grace();
+ if (locks_in_grace())
+ nfsd4_end_grace();
list_for_each_safe(pos, next, &client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
return nfserr_bad_stateid;
else if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
- else if (nfs4_in_grace()) {
+ else if (locks_in_grace()) {
/* Answer in remaining cases depends on existance of
* conflicting state; so we must wait out the grace period. */
return nfserr_grace;
@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
static inline int
io_during_grace_disallowed(struct inode *inode, int flags)
{
- return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
+ return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
&& mandatory_lock(inode);
}
@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
filp = lock_stp->st_vfs_file;
status = nfserr_grace;
- if (nfs4_in_grace() && !lock->lk_reclaim)
+ if (locks_in_grace() && !lock->lk_reclaim)
goto out;
status = nfserr_no_grace;
- if (!nfs4_in_grace() && lock->lk_reclaim)
+ if (!locks_in_grace() && lock->lk_reclaim)
goto out;
locks_init_lock(&file_lock);
@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int error;
__be32 status;
- if (nfs4_in_grace())
+ if (locks_in_grace())
return nfserr_grace;
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
unsigned long grace_time;
boot_time = get_seconds();
- grace_time = get_nfs_grace_period();
+ grace_time = get_nfs4_grace_period();
lease_time = user_lease_time;
- in_grace = 1;
+ locks_start_grace(&nfsd4_manager);
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
grace_time/HZ);
laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
return;
}
-int
-nfs4_in_grace(void)
-{
- return in_grace;
-}
-
time_t
nfs4_lease_time(void)
{
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 14ba4d9b2859..afcdf4b76843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
}
static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+ DECODE_HEAD;
+
+ READ_BUF(sizeof(stateid_t));
+ READ32(sid->si_generation);
+ COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
{
DECODE_HEAD;
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
DECODE_HEAD;
close->cl_stateowner = NULL;
- READ_BUF(4 + sizeof(stateid_t));
+ READ_BUF(4);
READ32(close->cl_seqid);
- READ32(close->cl_stateid.si_generation);
- COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
+ return nfsd4_decode_stateid(argp, &close->cl_stateid);
DECODE_TAIL;
}
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
static inline __be32
nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- READ32(dr->dr_stateid.si_generation);
- COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
-
- DECODE_TAIL;
+ return nfsd4_decode_stateid(argp, &dr->dr_stateid);
}
static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
READ32(lock->lk_is_new);
if (lock->lk_is_new) {
- READ_BUF(36);
+ READ_BUF(4);
READ32(lock->lk_new_open_seqid);
- READ32(lock->lk_new_open_stateid.si_generation);
-
- COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+ if (status)
+ return status;
+ READ_BUF(8 + sizeof(clientid_t));
READ32(lock->lk_new_lock_seqid);
COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
READ32(lock->lk_new_owner.len);
READ_BUF(lock->lk_new_owner.len);
READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
} else {
- READ_BUF(20);
- READ32(lock->lk_old_lock_stateid.si_generation);
- COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(lock->lk_old_lock_seqid);
}
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_HEAD;
locku->lu_stateowner = NULL;
- READ_BUF(24 + sizeof(stateid_t));
+ READ_BUF(8);
READ32(locku->lu_type);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
goto xdr_error;
READ32(locku->lu_seqid);
- READ32(locku->lu_stateid.si_generation);
- COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
READ64(locku->lu_offset);
READ64(locku->lu_length);
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_delegate_type);
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- READ_BUF(sizeof(stateid_t) + 4);
- COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
DECODE_HEAD;
open_conf->oc_stateowner = NULL;
- READ_BUF(4 + sizeof(stateid_t));
- READ32(open_conf->oc_req_stateid.si_generation);
- COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(open_conf->oc_seqid);
DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
DECODE_HEAD;
open_down->od_stateowner = NULL;
- READ_BUF(12 + sizeof(stateid_t));
- READ32(open_down->od_stateid.si_generation);
- COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+ if (status)
+ return status;
+ READ_BUF(12);
READ32(open_down->od_seqid);
READ32(open_down->od_share_access);
READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
DECODE_HEAD;
- READ_BUF(sizeof(stateid_t) + 12);
- READ32(read->rd_stateid.si_generation);
- COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+ if (status)
+ return status;
+ READ_BUF(12);
READ64(read->rd_offset);
READ32(read->rd_length);
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- READ32(setattr->sa_stateid.si_generation);
- COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
- if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
- goto out;
+ __be32 status;
- DECODE_TAIL;
+ status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+ if (status)
+ return status;
+ return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+ &setattr->sa_iattr, &setattr->sa_acl);
}
static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
int len;
DECODE_HEAD;
- READ_BUF(sizeof(stateid_opaque_t) + 20);
- READ32(write->wr_stateid.si_generation);
- COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
READ64(write->wr_offset);
READ32(write->wr_stable_how);
if (write->wr_stable_how > 2)
@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* Header routine to setup seqid operation replay cache
*/
#define ENCODE_SEQID_OP_HEAD \
- __be32 *p; \
__be32 *save; \
\
save = resp->p;
@@ -1950,6 +1962,17 @@ fail:
return -EINVAL;
}
+static void
+nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+{
+ ENCODE_HEAD;
+
+ RESERVE_SPACE(sizeof(stateid_t));
+ WRITE32(sid->si_generation);
+ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+ ADJUST_ARGS();
+}
+
static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(close->cl_stateid.si_generation);
- WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &close->cl_stateid);
+
ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
return nfserr;
}
@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(4 + sizeof(stateid_t));
- WRITE32(lock->lk_resp_stateid.si_generation);
- WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- } else if (nfserr == nfserr_denied)
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+ else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(locku->lu_stateid.si_generation);
- WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
-
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &locku->lu_stateid);
+
ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
return nfserr;
}
@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
+ ENCODE_HEAD;
ENCODE_SEQID_OP_HEAD;
if (nfserr)
goto out;
- RESERVE_SPACE(36 + sizeof(stateid_t));
- WRITE32(open->op_stateid.si_generation);
- WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
+ nfsd4_encode_stateid(resp, &open->op_stateid);
+ RESERVE_SPACE(40);
WRITECINFO(open->op_cinfo);
WRITE32(open->op_rflags);
WRITE32(2);
@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
case NFS4_OPEN_DELEGATE_NONE:
break;
case NFS4_OPEN_DELEGATE_READ:
- RESERVE_SPACE(20 + sizeof(stateid_t));
- WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+ RESERVE_SPACE(20);
WRITE32(open->op_recall);
/*
@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
ADJUST_ARGS();
break;
case NFS4_OPEN_DELEGATE_WRITE:
- RESERVE_SPACE(32 + sizeof(stateid_t));
- WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+ RESERVE_SPACE(32);
WRITE32(0);
/*
@@ -2195,13 +2208,9 @@ static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
ENCODE_SEQID_OP_HEAD;
-
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(oc->oc_resp_stateid.si_generation);
- WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
return nfserr;
@@ -2211,13 +2220,9 @@ static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
ENCODE_SEQID_OP_HEAD;
-
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(od->od_stateid.si_generation);
- WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &od->od_stateid);
ENCODE_SEQID_OP_TAIL(od->od_stateowner);
return nfserr;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c53e65f8f3a2..97543df58242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
return -EINVAL;
err = nfsd_create_serv();
if (!err) {
- int proto = 0;
- err = svc_addsock(nfsd_serv, fd, buf, &proto);
+ err = svc_addsock(nfsd_serv, fd, buf);
if (err >= 0) {
- err = lockd_up(proto);
+ err = lockd_up();
if (err < 0)
svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ea37c96f0445..cd25d91895a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
if (error)
goto out;
- if (!(access & NFSD_MAY_LOCK)) {
- /*
- * pseudoflavor restrictions are not enforced on NLM,
- * which clients virtually always use auth_sys for,
- * even while using RPCSEC_GSS for NFS.
- */
- error = check_nfsd_access(exp, rqstp);
- if (error)
- goto out;
- }
+ /*
+ * pseudoflavor restrictions are not enforced on NLM,
+ * which clients virtually always use auth_sys for,
+ * even while using RPCSEC_GSS for NFS.
+ */
+ if (access & NFSD_MAY_LOCK)
+ goto skip_pseudoflavor_check;
+ /*
+ * Clients may expect to be able to use auth_sys during mount,
+ * even if they use gss for everything else; see section 2.3.2
+ * of rfc 2623.
+ */
+ if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+ && exp->ex_path.dentry == dentry)
+ goto skip_pseudoflavor_check;
+
+ error = check_nfsd_access(exp, rqstp);
+ if (error)
+ goto out;
+skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0766f95d236a..5cffeca7acef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
return nfsd_return_attrs(nfserr, resp);
}
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+ nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+ NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
return nfserr;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80292ff5e924..59eeb46f82c5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
atomic_set(&nfsd_busy, 0);
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+ AF_INET,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
if (!list_empty(&nfsd_serv->sv_permsocks))
return 0;
- error = lockd_up(IPPROTO_UDP);
- if (error >= 0) {
- error = svc_create_xprt(nfsd_serv, "udp", port,
+ error = svc_create_xprt(nfsd_serv, "udp", port,
SVC_SOCK_DEFAULTS);
- if (error < 0)
- lockd_down();
- }
if (error < 0)
return error;
- error = lockd_up(IPPROTO_TCP);
- if (error >= 0) {
- error = svc_create_xprt(nfsd_serv, "tcp", port,
+ error = svc_create_xprt(nfsd_serv, "tcp", port,
SVC_SOCK_DEFAULTS);
- if (error < 0)
- lockd_down();
- }
if (error < 0)
return error;
+
+ error = lockd_up();
+ if (error < 0)
+ return error;
+
return 0;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18060bed5267..aa1d0d6489a1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
spinlock_t pb_lock;
} ____cacheline_aligned_in_smp;
-static struct raparms * raparml;
#define RAPARM_HASH_BITS 4
#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
@@ -1866,9 +1865,9 @@ out:
* N.B. After this call fhp needs an fh_put
*/
__be32
-nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
{
- __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+ __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
if (!err && vfs_statfs(fhp->fh_dentry,stat))
err = nfserr_io;
return err;
@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
void
nfsd_racache_shutdown(void)
{
- if (!raparml)
- return;
+ struct raparms *raparm, *last_raparm;
+ unsigned int i;
+
dprintk("nfsd: freeing readahead buffers.\n");
- kfree(raparml);
- raparml = NULL;
+
+ for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+ raparm = raparm_hash[i].pb_head;
+ while(raparm) {
+ last_raparm = raparm;
+ raparm = raparm->p_next;
+ kfree(last_raparm);
+ }
+ raparm_hash[i].pb_head = NULL;
+ }
}
/*
* Initialize readahead param cache
@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
int i;
int j = 0;
int nperbucket;
+ struct raparms **raparm = NULL;
- if (raparml)
+ if (raparm_hash[0].pb_head)
return 0;
- if (cache_size < 2*RAPARM_HASH_SIZE)
- cache_size = 2*RAPARM_HASH_SIZE;
- raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
-
- if (!raparml) {
- printk(KERN_WARNING
- "nfsd: Could not allocate memory read-ahead cache.\n");
- return -ENOMEM;
- }
+ nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+ if (nperbucket < 2)
+ nperbucket = 2;
+ cache_size = nperbucket * RAPARM_HASH_SIZE;
dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
- for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
- raparm_hash[i].pb_head = NULL;
+
+ for (i = 0; i < RAPARM_HASH_SIZE; i++) {
spin_lock_init(&raparm_hash[i].pb_lock);
- }
- nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
- for (i = 0; i < cache_size - 1; i++) {
- if (i % nperbucket == 0)
- raparm_hash[j++].pb_head = raparml + i;
- if (i % nperbucket < nperbucket-1)
- raparml[i].p_next = raparml + i + 1;
+
+ raparm = &raparm_hash[i].pb_head;
+ for (j = 0; j < nperbucket; j++) {
+ *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+ if (!*raparm)
+ goto out_nomem;
+ raparm = &(*raparm)->p_next;
+ }
+ *raparm = NULL;
}
nfsdstats.ra_size = cache_size;
return 0;
+
+out_nomem:
+ dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
+ nfsd_racache_shutdown();
+ return -ENOMEM;
}
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 66c1ab87656c..b675a49c1823 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -683,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+#ifdef CONFIG_FILE_LOCKING
static int locks_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &locks_seq_operations);
@@ -694,6 +695,7 @@ static const struct file_operations proc_locks_operations = {
.llseek = seq_lseek,
.release = seq_release,
};
+#endif /* CONFIG_FILE_LOCKING */
static int execdomains_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
@@ -887,7 +889,9 @@ void __init proc_misc_init(void)
#ifdef CONFIG_PRINTK
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
#endif
+#ifdef CONFIG_FILE_LOCKING
proc_create("locks", 0, NULL, &proc_locks_operations);
+#endif
proc_create("devices", 0, NULL, &proc_devinfo_operations);
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
#ifdef CONFIG_BLOCK
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44e3cb2f1966..a6a625be13fc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -947,6 +947,14 @@ struct lock_manager_operations {
int (*fl_change)(struct file_lock **, int);
};
+struct lock_manager {
+ struct list_head list;
+};
+
+void locks_start_grace(struct lock_manager *);
+void locks_end_grace(struct lock_manager *);
+int locks_in_grace(void);
+
/* that will die - we need it for nfs_lock_info */
#include <linux/nfs_fs_i.h>
@@ -988,6 +996,13 @@ struct file_lock {
#include <linux/fcntl.h>
+extern void send_sigio(struct fown_struct *fown, int fd, int band);
+
+/* fs/sync.c */
+extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+ loff_t endbyte, unsigned int flags);
+
+#ifdef CONFIG_FILE_LOCKING
extern int fcntl_getlk(struct file *, struct flock __user *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
struct flock __user *);
@@ -998,14 +1013,9 @@ extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
struct flock64 __user *);
#endif
-extern void send_sigio(struct fown_struct *fown, int fd, int band);
extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);
-/* fs/sync.c */
-extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
- loff_t endbyte, unsigned int flags);
-
/* fs/locks.c */
extern void locks_init_lock(struct file_lock *);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
@@ -1028,6 +1038,37 @@ extern int lease_modify(struct file_lock **, int);
extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
extern struct seq_operations locks_seq_operations;
+#else /* !CONFIG_FILE_LOCKING */
+#define fcntl_getlk(a, b) ({ -EINVAL; })
+#define fcntl_setlk(a, b, c, d) ({ -EACCES; })
+#if BITS_PER_LONG == 32
+#define fcntl_getlk64(a, b) ({ -EINVAL; })
+#define fcntl_setlk64(a, b, c, d) ({ -EACCES; })
+#endif
+#define fcntl_setlease(a, b, c) ({ 0; })
+#define fcntl_getlease(a) ({ 0; })
+#define locks_init_lock(a) ({ })
+#define __locks_copy_lock(a, b) ({ })
+#define locks_copy_lock(a, b) ({ })
+#define locks_remove_posix(a, b) ({ })
+#define locks_remove_flock(a) ({ })
+#define posix_test_lock(a, b) ({ 0; })
+#define posix_lock_file(a, b, c) ({ -ENOLCK; })
+#define posix_lock_file_wait(a, b) ({ -ENOLCK; })
+#define posix_unblock_lock(a, b) (-ENOENT)
+#define vfs_test_lock(a, b) ({ 0; })
+#define vfs_lock_file(a, b, c, d) (-ENOLCK)
+#define vfs_cancel_lock(a, b) ({ 0; })
+#define flock_lock_file_wait(a, b) ({ -ENOLCK; })
+#define __break_lease(a, b) ({ 0; })
+#define lease_get_mtime(a, b) ({ })
+#define generic_setlease(a, b, c) ({ -EINVAL; })
+#define vfs_setlease(a, b, c) ({ -EINVAL; })
+#define lease_modify(a, b) ({ -EINVAL; })
+#define lock_may_read(a, b, c) ({ 1; })
+#define lock_may_write(a, b, c) ({ 1; })
+#endif /* !CONFIG_FILE_LOCKING */
+
struct fasync_struct {
int magic;
@@ -1575,9 +1616,12 @@ extern int vfs_statfs(struct dentry *, struct kstatfs *);
/* /sys/fs */
extern struct kobject *fs_kobj;
+extern int rw_verify_area(int, struct file *, loff_t *, size_t);
+
#define FLOCK_VERIFY_READ 1
#define FLOCK_VERIFY_WRITE 2
+#ifdef CONFIG_FILE_LOCKING
extern int locks_mandatory_locked(struct inode *);
extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
@@ -1608,8 +1652,6 @@ static inline int locks_verify_locked(struct inode *inode)
return 0;
}
-extern int rw_verify_area(int, struct file *, loff_t *, size_t);
-
static inline int locks_verify_truncate(struct inode *inode,
struct file *filp,
loff_t size)
@@ -1630,6 +1672,15 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
return __break_lease(inode, mode);
return 0;
}
+#else /* !CONFIG_FILE_LOCKING */
+#define locks_mandatory_locked(a) ({ 0; })
+#define locks_mandatory_area(a, b, c, d, e) ({ 0; })
+#define __mandatory_lock(a) ({ 0; })
+#define mandatory_lock(a) ({ 0; })
+#define locks_verify_locked(a) ({ 0; })
+#define locks_verify_truncate(a, b, c) ({ 0; })
+#define break_lease(a, b) ({ 0; })
+#endif /* CONFIG_FILE_LOCKING */
/* fs/open.c */
diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h
index 3d25bcd139d1..e5872dc994c0 100644
--- a/include/linux/lockd/bind.h
+++ b/include/linux/lockd/bind.h
@@ -27,7 +27,6 @@ struct nlmsvc_binding {
struct nfs_fh *,
struct file **);
void (*fclose)(struct file *);
- unsigned long (*get_grace_period)(void);
};
extern struct nlmsvc_binding * nlmsvc_ops;
@@ -53,15 +52,7 @@ extern void nlmclnt_done(struct nlm_host *host);
extern int nlmclnt_proc(struct nlm_host *host, int cmd,
struct file_lock *fl);
-extern int lockd_up(int proto);
+extern int lockd_up(void);
extern void lockd_down(void);
-unsigned long get_nfs_grace_period(void);
-
-#ifdef CONFIG_NFSD_V4
-unsigned long get_nfs4_grace_period(void);
-#else
-static inline unsigned long get_nfs4_grace_period(void) {return 0;}
-#endif
-
#endif /* LINUX_LOCKD_BIND_H */
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index dbb87ab282e8..b56d5aa9b194 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -12,6 +12,8 @@
#ifdef __KERNEL__
#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
#include <linux/fs.h>
#include <linux/kref.h>
#include <linux/utsname.h>
@@ -38,8 +40,9 @@
*/
struct nlm_host {
struct hlist_node h_hash; /* doubly linked list */
- struct sockaddr_in h_addr; /* peer address */
- struct sockaddr_in h_saddr; /* our address (optional) */
+ struct sockaddr_storage h_addr; /* peer address */
+ size_t h_addrlen;
+ struct sockaddr_storage h_srcaddr; /* our address (optional) */
struct rpc_clnt * h_rpcclnt; /* RPC client to talk to peer */
char * h_name; /* remote hostname */
u32 h_version; /* interface version */
@@ -61,18 +64,56 @@ struct nlm_host {
struct list_head h_granted; /* Locks in GRANTED state */
struct list_head h_reclaim; /* Locks in RECLAIM state */
struct nsm_handle * h_nsmhandle; /* NSM status handle */
+
+ char h_addrbuf[48], /* address eyecatchers */
+ h_srcaddrbuf[48];
};
struct nsm_handle {
struct list_head sm_link;
atomic_t sm_count;
char * sm_name;
- struct sockaddr_in sm_addr;
+ struct sockaddr_storage sm_addr;
+ size_t sm_addrlen;
unsigned int sm_monitored : 1,
sm_sticky : 1; /* don't unmonitor */
+ char sm_addrbuf[48]; /* address eyecatcher */
};
/*
+ * Rigorous type checking on sockaddr type conversions
+ */
+static inline struct sockaddr_in *nlm_addr_in(const struct nlm_host *host)
+{
+ return (struct sockaddr_in *)&host->h_addr;
+}
+
+static inline struct sockaddr *nlm_addr(const struct nlm_host *host)
+{
+ return (struct sockaddr *)&host->h_addr;
+}
+
+static inline struct sockaddr_in *nlm_srcaddr_in(const struct nlm_host *host)
+{
+ return (struct sockaddr_in *)&host->h_srcaddr;
+}
+
+static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host)
+{
+ return (struct sockaddr *)&host->h_srcaddr;
+}
+
+static inline struct sockaddr_in *nsm_addr_in(const struct nsm_handle *handle)
+{
+ return (struct sockaddr_in *)&handle->sm_addr;
+}
+
+static inline struct sockaddr *nsm_addr(const struct nsm_handle *handle)
+{
+ return (struct sockaddr *)&handle->sm_addr;
+}
+
+/*
* Map an fl_owner_t into a unique 32-bit "pid"
*/
struct nlm_lockowner {
@@ -166,7 +207,8 @@ int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *);
struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl);
void nlmclnt_finish_block(struct nlm_wait *block);
int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
-__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *);
+__be32 nlmclnt_grant(const struct sockaddr *addr,
+ const struct nlm_lock *lock);
void nlmclnt_recovery(struct nlm_host *);
int nlmclnt_reclaim(struct nlm_host *, struct file_lock *);
void nlmclnt_next_cookie(struct nlm_cookie *);
@@ -174,12 +216,14 @@ void nlmclnt_next_cookie(struct nlm_cookie *);
/*
* Host cache
*/
-struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
- int proto, u32 version,
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+ const size_t salen,
+ const unsigned short protocol,
+ const u32 version,
+ const char *hostname);
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
const char *hostname,
- unsigned int hostname_len);
-struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *,
- unsigned int);
+ const size_t hostname_len);
struct rpc_clnt * nlm_bind_host(struct nlm_host *);
void nlm_rebind_host(struct nlm_host *);
struct nlm_host * nlm_get_host(struct nlm_host *);
@@ -201,7 +245,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref);
*/
__be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
struct nlm_host *, struct nlm_lock *, int,
- struct nlm_cookie *);
+ struct nlm_cookie *, int);
__be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *);
__be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
struct nlm_host *, struct nlm_lock *,
@@ -233,15 +277,82 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
return file->f_file->f_path.dentry->d_inode;
}
+static inline int __nlm_privileged_request4(const struct sockaddr *sap)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ return (sin->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) &&
+ (ntohs(sin->sin_port) < 1024);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline int __nlm_privileged_request6(const struct sockaddr *sap)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ return (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK) &&
+ (ntohs(sin6->sin6_port) < 1024);
+}
+#else /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+static inline int __nlm_privileged_request6(const struct sockaddr *sap)
+{
+ return 0;
+}
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+
/*
- * Compare two host addresses (needs modifying for ipv6)
+ * Ensure incoming requests are from local privileged callers.
+ *
+ * Return TRUE if sender is local and is connecting via a privileged port;
+ * otherwise return FALSE.
*/
-static inline int nlm_cmp_addr(const struct sockaddr_in *sin1,
- const struct sockaddr_in *sin2)
+static inline int nlm_privileged_requester(const struct svc_rqst *rqstp)
{
+ const struct sockaddr *sap = svc_addr(rqstp);
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ return __nlm_privileged_request4(sap);
+ case AF_INET6:
+ return __nlm_privileged_request6(sap);
+ default:
+ return 0;
+ }
+}
+
+static inline int __nlm_cmp_addr4(const struct sockaddr *sap1,
+ const struct sockaddr *sap2)
+{
+ const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
+ const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
}
+static inline int __nlm_cmp_addr6(const struct sockaddr *sap1,
+ const struct sockaddr *sap2)
+{
+ const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
+ const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
+ return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
+}
+
+/*
+ * Compare two host addresses
+ *
+ * Return TRUE if the addresses are the same; otherwise FALSE.
+ */
+static inline int nlm_cmp_addr(const struct sockaddr *sap1,
+ const struct sockaddr *sap2)
+{
+ if (sap1->sa_family == sap2->sa_family) {
+ switch (sap1->sa_family) {
+ case AF_INET:
+ return __nlm_cmp_addr4(sap1, sap2);
+ case AF_INET6:
+ return __nlm_cmp_addr6(sap1, sap2);
+ }
+ }
+ return 0;
+}
+
/*
* Compare two NLM locks.
* When the second lock is of type F_UNLCK, this acts like a wildcard.
diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index df18fa053bcd..d6b3a802c046 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -81,8 +81,6 @@ struct nlm_reboot {
unsigned int len;
u32 state;
__be32 addr;
- __be32 vers;
- __be32 proto;
};
/*
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 108f47e5fd95..21269405ffe2 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -38,6 +38,7 @@
#define NFSD_MAY_LOCK 32
#define NFSD_MAY_OWNER_OVERRIDE 64
#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -125,7 +126,7 @@ int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
loff_t *, struct readdir_cd *, filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
- struct kstatfs *);
+ struct kstatfs *, int access);
int nfsd_notify_change(struct inode *, struct iattr *);
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index e5bfe01ee305..6f0ee1b84a4f 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -104,6 +104,7 @@ struct rpc_create_args {
const struct rpc_timeout *timeout;
char *servername;
struct rpc_program *program;
+ u32 prognumber; /* overrides program->number */
u32 version;
rpc_authflavor_t authflavor;
unsigned long flags;
@@ -124,10 +125,10 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
void rpc_shutdown_client(struct rpc_clnt *);
void rpc_release_client(struct rpc_clnt *);
-int rpcb_register(u32, u32, int, unsigned short, int *);
+int rpcb_register(u32, u32, int, unsigned short);
int rpcb_v4_register(const u32 program, const u32 version,
const struct sockaddr *address,
- const char *netid, int *result);
+ const char *netid);
int rpcb_getport_sync(struct sockaddr_in *, u32, u32, int);
void rpcb_getport_async(struct rpc_task *);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dc69068d94c7..3afe7fb403b2 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -66,6 +66,7 @@ struct svc_serv {
struct list_head sv_tempsocks; /* all temporary sockets */
int sv_tmpcnt; /* count of temporary sockets */
struct timer_list sv_temptimer; /* timer for aging temporary sockets */
+ sa_family_t sv_family; /* listener's address family */
char * sv_name; /* service name */
@@ -265,17 +266,17 @@ struct svc_rqst {
/*
* Rigorous type checking on sockaddr type conversions
*/
-static inline struct sockaddr_in *svc_addr_in(struct svc_rqst *rqst)
+static inline struct sockaddr_in *svc_addr_in(const struct svc_rqst *rqst)
{
return (struct sockaddr_in *) &rqst->rq_addr;
}
-static inline struct sockaddr_in6 *svc_addr_in6(struct svc_rqst *rqst)
+static inline struct sockaddr_in6 *svc_addr_in6(const struct svc_rqst *rqst)
{
return (struct sockaddr_in6 *) &rqst->rq_addr;
}
-static inline struct sockaddr *svc_addr(struct svc_rqst *rqst)
+static inline struct sockaddr *svc_addr(const struct svc_rqst *rqst)
{
return (struct sockaddr *) &rqst->rq_addr;
}
@@ -381,18 +382,20 @@ struct svc_procedure {
/*
* Function prototypes.
*/
-struct svc_serv * svc_create(struct svc_program *, unsigned int,
- void (*shutdown)(struct svc_serv*));
+struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t,
+ void (*shutdown)(struct svc_serv *));
struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
struct svc_pool *pool);
void svc_exit_thread(struct svc_rqst *);
struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
- void (*shutdown)(struct svc_serv*), svc_thread_fn,
- struct module *);
+ sa_family_t, void (*shutdown)(struct svc_serv *),
+ svc_thread_fn, struct module *);
int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
void svc_destroy(struct svc_serv *);
int svc_process(struct svc_rqst *);
-int svc_register(struct svc_serv *, int, unsigned short);
+int svc_register(const struct svc_serv *, const unsigned short,
+ const unsigned short);
+
void svc_wake_up(struct svc_serv *);
void svc_reserve(struct svc_rqst *rqstp, int space);
struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu);
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index dc05b54bd3a3..c14fe86dac59 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -72,6 +72,7 @@ extern atomic_t rdma_stat_sq_prod;
*/
struct svc_rdma_op_ctxt {
struct svc_rdma_op_ctxt *read_hdr;
+ struct svc_rdma_fastreg_mr *frmr;
int hdr_count;
struct xdr_buf arg;
struct list_head dto_q;
@@ -103,16 +104,30 @@ struct svc_rdma_chunk_sge {
int start; /* sge no for this chunk */
int count; /* sge count for this chunk */
};
+struct svc_rdma_fastreg_mr {
+ struct ib_mr *mr;
+ void *kva;
+ struct ib_fast_reg_page_list *page_list;
+ int page_list_len;
+ unsigned long access_flags;
+ unsigned long map_len;
+ enum dma_data_direction direction;
+ struct list_head frmr_list;
+};
struct svc_rdma_req_map {
+ struct svc_rdma_fastreg_mr *frmr;
unsigned long count;
union {
struct kvec sge[RPCSVC_MAXPAGES];
struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
};
};
-
+#define RDMACTXT_F_FAST_UNREG 1
#define RDMACTXT_F_LAST_CTXT 2
+#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */
+#define SVCRDMA_DEVCAP_READ_W_INV 2 /* read w/ invalidate */
+
struct svcxprt_rdma {
struct svc_xprt sc_xprt; /* SVC transport structure */
struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
@@ -136,6 +151,11 @@ struct svcxprt_rdma {
struct ib_cq *sc_rq_cq;
struct ib_cq *sc_sq_cq;
struct ib_mr *sc_phys_mr; /* MR for server memory */
+ u32 sc_dev_caps; /* distilled device caps */
+ u32 sc_dma_lkey; /* local dma key */
+ unsigned int sc_frmr_pg_list_len;
+ struct list_head sc_frmr_q;
+ spinlock_t sc_frmr_q_lock;
spinlock_t sc_lock; /* transport lock */
@@ -192,8 +212,13 @@ extern int svc_rdma_post_recv(struct svcxprt_rdma *);
extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
+extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
+extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *);
+extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
+extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
+ struct svc_rdma_fastreg_mr *);
extern void svc_sq_reap(struct svcxprt_rdma *);
extern void svc_rq_reap(struct svcxprt_rdma *);
extern struct svc_xprt_class svc_rdma_class;
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 8cff696dedf5..483e10380aae 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -39,10 +39,7 @@ int svc_send(struct svc_rqst *);
void svc_drop(struct svc_rqst *);
void svc_sock_update_bufs(struct svc_serv *serv);
int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose);
-int svc_addsock(struct svc_serv *serv,
- int fd,
- char *name_return,
- int *proto);
+int svc_addsock(struct svc_serv *serv, int fd, char *name_return);
void svc_init_xprt_sock(void);
void svc_cleanup_xprt_sock(void);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 08d6e1bb99ac..503d8d4eb80a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -125,6 +125,7 @@ cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
+cond_syscall(sys_flock);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c468c3c6dfc5..cfc5295f1e82 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,7 +96,7 @@ static int sixty = 60;
static int neg_one = -1;
#endif
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
static int two = 2;
#endif
@@ -1248,6 +1248,7 @@ static struct ctl_table fs_table[] = {
.extra1 = &minolduid,
.extra2 = &maxolduid,
},
+#ifdef CONFIG_FILE_LOCKING
{
.ctl_name = FS_LEASES,
.procname = "leases-enable",
@@ -1256,6 +1257,7 @@ static struct ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
#ifdef CONFIG_DNOTIFY
{
.ctl_name = FS_DIR_NOTIFY,
@@ -1267,6 +1269,7 @@ static struct ctl_table fs_table[] = {
},
#endif
#ifdef CONFIG_MMU
+#ifdef CONFIG_FILE_LOCKING
{
.ctl_name = FS_LEASE_TIME,
.procname = "lease-break-time",
@@ -1278,6 +1281,7 @@ static struct ctl_table fs_table[] = {
.extra1 = &zero,
.extra2 = &two,
},
+#endif
{
.procname = "aio-nr",
.data = &aio_nr,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 76739e928d0d..da0789fa1b88 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -174,7 +174,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
clnt->cl_procinfo = version->procs;
clnt->cl_maxproc = version->nrprocs;
clnt->cl_protname = program->name;
- clnt->cl_prog = program->number;
+ clnt->cl_prog = args->prognumber ? : program->number;
clnt->cl_vers = version->number;
clnt->cl_stats = program->stats;
clnt->cl_metrics = rpc_alloc_iostats(clnt);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 24db2b4d12d3..34abc91058d8 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -20,6 +20,7 @@
#include <linux/in6.h>
#include <linux/kernel.h>
#include <linux/errno.h>
+#include <net/ipv6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
@@ -176,13 +177,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
}
static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
- u32 version, struct rpc_message *msg,
- int *result)
+ u32 version, struct rpc_message *msg)
{
struct rpc_clnt *rpcb_clnt;
- int error = 0;
+ int result, error = 0;
- *result = 0;
+ msg->rpc_resp = &result;
rpcb_clnt = rpcb_create_local(addr, addrlen, version);
if (!IS_ERR(rpcb_clnt)) {
@@ -191,12 +191,15 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
} else
error = PTR_ERR(rpcb_clnt);
- if (error < 0)
+ if (error < 0) {
printk(KERN_WARNING "RPC: failed to contact local rpcbind "
"server (errno %d).\n", -error);
- dprintk("RPC: registration status %d/%d\n", error, *result);
+ return error;
+ }
- return error;
+ if (!result)
+ return -EACCES;
+ return 0;
}
/**
@@ -205,7 +208,11 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* @vers: RPC version number to bind
* @prot: transport protocol to register
* @port: port value to register
- * @okay: OUT: result code
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success. Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
*
* RPC services invoke this function to advertise their contact
* information via the system's rpcbind daemon. RPC services
@@ -217,15 +224,6 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* all registered transports for [program, version] from the local
* rpcbind database.
*
- * Returns zero if the registration request was dispatched
- * successfully and a reply was received. The rpcbind daemon's
- * boolean result code is stored in *okay.
- *
- * Returns an errno value and sets *result to zero if there was
- * some problem that prevented the rpcbind request from being
- * dispatched, or if the rpcbind daemon did not respond within
- * the timeout.
- *
* This function uses rpcbind protocol version 2 to contact the
* local rpcbind daemon.
*
@@ -236,7 +234,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
* IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
* addresses).
*/
-int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
+int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
{
struct rpcbind_args map = {
.r_prog = prog,
@@ -246,7 +244,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
};
struct rpc_message msg = {
.rpc_argp = &map,
- .rpc_resp = okay,
};
dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
@@ -259,7 +256,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
sizeof(rpcb_inaddr_loopback),
- RPCBVERS_2, &msg, okay);
+ RPCBVERS_2, &msg);
}
/*
@@ -290,7 +287,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
sizeof(rpcb_inaddr_loopback),
- RPCBVERS_4, msg, msg->rpc_resp);
+ RPCBVERS_4, msg);
}
/*
@@ -304,10 +301,13 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
char buf[64];
/* Construct AF_INET6 universal address */
- snprintf(buf, sizeof(buf),
- NIP6_FMT".%u.%u",
- NIP6(address_to_register->sin6_addr),
- port >> 8, port & 0xff);
+ if (ipv6_addr_any(&address_to_register->sin6_addr))
+ snprintf(buf, sizeof(buf), "::.%u.%u",
+ port >> 8, port & 0xff);
+ else
+ snprintf(buf, sizeof(buf), NIP6_FMT".%u.%u",
+ NIP6(address_to_register->sin6_addr),
+ port >> 8, port & 0xff);
map->r_addr = buf;
dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -321,7 +321,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
sizeof(rpcb_in6addr_loopback),
- RPCBVERS_4, msg, msg->rpc_resp);
+ RPCBVERS_4, msg);
}
/**
@@ -330,7 +330,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* @version: RPC version number of service to (un)register
* @address: address family, IP address, and port to (un)register
* @netid: netid of transport protocol to (un)register
- * @result: result code from rpcbind RPC call
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success. Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
*
* RPC services invoke this function to advertise their contact
* information via the system's rpcbind daemon. RPC services
@@ -342,15 +346,6 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* to zero. Callers pass a netid of "" to unregister all
* transport netids associated with [program, version, address].
*
- * Returns zero if the registration request was dispatched
- * successfully and a reply was received. The rpcbind daemon's
- * result code is stored in *result.
- *
- * Returns an errno value and sets *result to zero if there was
- * some problem that prevented the rpcbind request from being
- * dispatched, or if the rpcbind daemon did not respond within
- * the timeout.
- *
* This function uses rpcbind protocol version 4 to contact the
* local rpcbind daemon. The local rpcbind daemon must support
* version 4 of the rpcbind protocol in order for these functions
@@ -372,8 +367,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* advertises the service on all IPv4 and IPv6 addresses.
*/
int rpcb_v4_register(const u32 program, const u32 version,
- const struct sockaddr *address, const char *netid,
- int *result)
+ const struct sockaddr *address, const char *netid)
{
struct rpcbind_args map = {
.r_prog = program,
@@ -383,11 +377,8 @@ int rpcb_v4_register(const u32 program, const u32 version,
};
struct rpc_message msg = {
.rpc_argp = &map,
- .rpc_resp = result,
};
- *result = 0;
-
switch (address->sa_family) {
case AF_INET:
return rpcb_register_netid4((struct sockaddr_in *)address,
@@ -633,7 +624,7 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p,
struct rpcbind_args *rpcb)
{
- dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n",
+ dprintk("RPC: encoding rpcb request (%u, %u, %d, %u)\n",
rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
*p++ = htonl(rpcb->r_prog);
*p++ = htonl(rpcb->r_vers);
@@ -648,7 +639,7 @@ static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p,
unsigned short *portp)
{
*portp = (unsigned short) ntohl(*p++);
- dprintk("RPC: rpcb_decode_getport result %u\n",
+ dprintk("RPC: rpcb getport result: %u\n",
*portp);
return 0;
}
@@ -657,7 +648,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
unsigned int *boolp)
{
*boolp = (unsigned int) ntohl(*p++);
- dprintk("RPC: rpcb_decode_set: call %s\n",
+ dprintk("RPC: rpcb set/unset call %s\n",
(*boolp ? "succeeded" : "failed"));
return 0;
}
@@ -665,7 +656,7 @@ static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p,
static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p,
struct rpcbind_args *rpcb)
{
- dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n",
+ dprintk("RPC: encoding rpcb request (%u, %u, %s)\n",
rpcb->r_prog, rpcb->r_vers, rpcb->r_addr);
*p++ = htonl(rpcb->r_prog);
*p++ = htonl(rpcb->r_vers);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5a32cb7c4bb4..54c98d876847 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -28,6 +28,8 @@
#define RPCDBG_FACILITY RPCDBG_SVCDSP
+static void svc_unregister(const struct svc_serv *serv);
+
#define svc_serv_is_pooled(serv) ((serv)->sv_function)
/*
@@ -357,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- void (*shutdown)(struct svc_serv *serv))
+ sa_family_t family, void (*shutdown)(struct svc_serv *serv))
{
struct svc_serv *serv;
unsigned int vers;
@@ -366,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
return NULL;
+ serv->sv_family = family;
serv->sv_name = prog->pg_name;
serv->sv_program = prog;
serv->sv_nrthreads = 1;
@@ -416,30 +419,29 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
spin_lock_init(&pool->sp_lock);
}
-
/* Remove any stale portmap registrations */
- svc_register(serv, 0, 0);
+ svc_unregister(serv);
return serv;
}
struct svc_serv *
svc_create(struct svc_program *prog, unsigned int bufsize,
- void (*shutdown)(struct svc_serv *serv))
+ sa_family_t family, void (*shutdown)(struct svc_serv *serv))
{
- return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+ return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
}
EXPORT_SYMBOL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
- void (*shutdown)(struct svc_serv *serv),
+ sa_family_t family, void (*shutdown)(struct svc_serv *serv),
svc_thread_fn func, struct module *mod)
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
- serv = __svc_create(prog, bufsize, npools, shutdown);
+ serv = __svc_create(prog, bufsize, npools, family, shutdown);
if (serv != NULL) {
serv->sv_function = func;
@@ -486,8 +488,7 @@ svc_destroy(struct svc_serv *serv)
if (svc_serv_is_pooled(serv))
svc_pool_map_put();
- /* Unregister service with the portmapper */
- svc_register(serv, 0, 0);
+ svc_unregister(serv);
kfree(serv->sv_pools);
kfree(serv);
}
@@ -718,55 +719,245 @@ svc_exit_thread(struct svc_rqst *rqstp)
}
EXPORT_SYMBOL(svc_exit_thread);
+#ifdef CONFIG_SUNRPC_REGISTER_V4
+
/*
- * Register an RPC service with the local portmapper.
- * To unregister a service, call this routine with
- * proto and port == 0.
+ * Register an "inet" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
*/
-int
-svc_register(struct svc_serv *serv, int proto, unsigned short port)
+static int __svc_rpcb_register4(const u32 program, const u32 version,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ .sin_port = htons(port),
+ };
+ char *netid;
+
+ switch (protocol) {
+ case IPPROTO_UDP:
+ netid = RPCBIND_NETID_UDP;
+ break;
+ case IPPROTO_TCP:
+ netid = RPCBIND_NETID_TCP;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ return rpcb_v4_register(program, version,
+ (struct sockaddr *)&sin, netid);
+}
+
+/*
+ * Register an "inet6" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register6(const u32 program, const u32 version,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+ char *netid;
+
+ switch (protocol) {
+ case IPPROTO_UDP:
+ netid = RPCBIND_NETID_UDP6;
+ break;
+ case IPPROTO_TCP:
+ netid = RPCBIND_NETID_TCP6;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ return rpcb_v4_register(program, version,
+ (struct sockaddr *)&sin6, netid);
+}
+
+/*
+ * Register a kernel RPC service via rpcbind version 4.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(const u32 program, const u32 version,
+ const sa_family_t family,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ int error;
+
+ switch (family) {
+ case AF_INET:
+ return __svc_rpcb_register4(program, version,
+ protocol, port);
+ case AF_INET6:
+ error = __svc_rpcb_register6(program, version,
+ protocol, port);
+ if (error < 0)
+ return error;
+
+ /*
+ * Work around bug in some versions of Linux rpcbind
+ * which don't allow registration of both inet and
+ * inet6 netids.
+ *
+ * Error return ignored for now.
+ */
+ __svc_rpcb_register4(program, version,
+ protocol, port);
+ return 0;
+ }
+
+ return -EAFNOSUPPORT;
+}
+
+#else /* CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
+ * Register a kernel RPC service via rpcbind version 2.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(const u32 program, const u32 version,
+ sa_family_t family,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ if (family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ return rpcb_register(program, version, protocol, port);
+}
+
+#endif /* CONFIG_SUNRPC_REGISTER_V4 */
+
+/**
+ * svc_register - register an RPC service with the local portmapper
+ * @serv: svc_serv struct for the service to register
+ * @proto: transport protocol number to advertise
+ * @port: port to advertise
+ *
+ * Service is registered for any address in serv's address family
+ */
+int svc_register(const struct svc_serv *serv, const unsigned short proto,
+ const unsigned short port)
{
struct svc_program *progp;
- unsigned long flags;
unsigned int i;
- int error = 0, dummy;
+ int error = 0;
- if (!port)
- clear_thread_flag(TIF_SIGPENDING);
+ BUG_ON(proto == 0 && port == 0);
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
- dprintk("svc: svc_register(%s, %s, %d, %d)%s\n",
+ dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
progp->pg_name,
+ i,
proto == IPPROTO_UDP? "udp" : "tcp",
port,
- i,
+ serv->sv_family,
progp->pg_vers[i]->vs_hidden?
" (but not telling portmap)" : "");
if (progp->pg_vers[i]->vs_hidden)
continue;
- error = rpcb_register(progp->pg_prog, i, proto, port, &dummy);
+ error = __svc_register(progp->pg_prog, i,
+ serv->sv_family, proto, port);
if (error < 0)
break;
- if (port && !dummy) {
- error = -EACCES;
- break;
- }
}
}
- if (!port) {
- spin_lock_irqsave(&current->sighand->siglock, flags);
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ return error;
+}
+
+#ifdef CONFIG_SUNRPC_REGISTER_V4
+
+static void __svc_unregister(const u32 program, const u32 version,
+ const char *progname)
+{
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = 0,
+ };
+ int error;
+
+ error = rpcb_v4_register(program, version,
+ (struct sockaddr *)&sin6, "");
+ dprintk("svc: %s(%sv%u), error %d\n",
+ __func__, progname, version, error);
+}
+
+#else /* CONFIG_SUNRPC_REGISTER_V4 */
+
+static void __svc_unregister(const u32 program, const u32 version,
+ const char *progname)
+{
+ int error;
+
+ error = rpcb_register(program, version, 0, 0);
+ dprintk("svc: %s(%sv%u), error %d\n",
+ __func__, progname, version, error);
+}
+
+#endif /* CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
+ * All netids, bind addresses and ports registered for [program, version]
+ * are removed from the local rpcbind database (if the service is not
+ * hidden) to make way for a new instance of the service.
+ *
+ * The result of unregistration is reported via dprintk for those who want
+ * verification of the result, but is otherwise not important.
+ */
+static void svc_unregister(const struct svc_serv *serv)
+{
+ struct svc_program *progp;
+ unsigned long flags;
+ unsigned int i;
+
+ clear_thread_flag(TIF_SIGPENDING);
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (i = 0; i < progp->pg_nvers; i++) {
+ if (progp->pg_vers[i] == NULL)
+ continue;
+ if (progp->pg_vers[i]->vs_hidden)
+ continue;
+
+ __svc_unregister(progp->pg_prog, i, progp->pg_name);
+ }
}
- return error;
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
/*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e46c825f4954..bf5b5cdafebf 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -159,15 +159,44 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
-int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
- int flags)
+static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
+ struct svc_serv *serv,
+ unsigned short port, int flags)
{
- struct svc_xprt_class *xcl;
struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+ struct sockaddr *sap;
+ size_t len;
+
+ switch (serv->sv_family) {
+ case AF_INET:
+ sap = (struct sockaddr *)&sin;
+ len = sizeof(sin);
+ break;
+ case AF_INET6:
+ sap = (struct sockaddr *)&sin6;
+ len = sizeof(sin6);
+ break;
+ default:
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
+}
+
+int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+ int flags)
+{
+ struct svc_xprt_class *xcl;
+
dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
spin_lock(&svc_xprt_class_lock);
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
@@ -180,9 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
goto err;
spin_unlock(&svc_xprt_class_lock);
- newxprt = xcl->xcl_ops->
- xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
- flags);
+ newxprt = __svc_xpo_create(xcl, serv, port, flags);
if (IS_ERR(newxprt)) {
module_put(xcl->xcl_owner);
return PTR_ERR(newxprt);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 3e65719f1ef6..95293f549e9c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1114,6 +1114,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+ int val;
dprintk("svc: svc_setup_socket %p\n", sock);
if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1146,6 +1147,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
else
svc_tcp_init(svsk, serv);
+ /*
+ * We start one listener per sv_serv. We want AF_INET
+ * requests to be automatically shunted to our AF_INET6
+ * listener using a mapped IPv4 address. Make sure
+ * no-one starts an equivalent IPv4 listener, which
+ * would steal our incoming connections.
+ */
+ val = 0;
+ if (serv->sv_family == AF_INET6)
+ kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+ (char *)&val, sizeof(val));
+
dprintk("svc: svc_setup_socket created %p (inet %p)\n",
svsk, svsk->sk_sk);
@@ -1154,8 +1167,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
int svc_addsock(struct svc_serv *serv,
int fd,
- char *name_return,
- int *proto)
+ char *name_return)
{
int err = 0;
struct socket *so = sockfd_lookup(fd, &err);
@@ -1190,7 +1202,6 @@ int svc_addsock(struct svc_serv *serv,
sockfd_put(so);
return err;
}
- if (proto) *proto = so->sk->sk_protocol;
return one_sock_name(name_return, svsk);
}
EXPORT_SYMBOL_GPL(svc_addsock);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 74de31a06616..a4756576d687 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -116,7 +116,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
*
* Assumptions:
* - chunk[0]->position points to pages[0] at an offset of 0
- * - pages[] is not physically or virtually contigous and consists of
+ * - pages[] is not physically or virtually contiguous and consists of
* PAGE_SIZE elements.
*
* Output:
@@ -125,7 +125,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
* chunk in the read list
*
*/
-static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
+static int map_read_chunks(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head,
struct rpcrdma_msg *rmsgp,
@@ -211,26 +211,128 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
return sge_no;
}
-static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
- struct svc_rdma_op_ctxt *ctxt,
- struct kvec *vec,
- u64 *sgl_offset,
- int count)
+/* Map a read-chunk-list to an XDR and fast register the page-list.
+ *
+ * Assumptions:
+ * - chunk[0] position points to pages[0] at an offset of 0
+ * - pages[] will be made physically contiguous by creating a one-off memory
+ * region using the fastreg verb.
+ * - byte_count is # of bytes in read-chunk-list
+ * - ch_count is # of chunks in read-chunk-list
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ * chunk in the read list
+ */
+static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ struct rpcrdma_msg *rmsgp,
+ struct svc_rdma_req_map *rpl_map,
+ struct svc_rdma_req_map *chl_map,
+ int ch_count,
+ int byte_count)
+{
+ int page_no;
+ int ch_no;
+ u32 offset;
+ struct rpcrdma_read_chunk *ch;
+ struct svc_rdma_fastreg_mr *frmr;
+ int ret = 0;
+
+ frmr = svc_rdma_get_frmr(xprt);
+ if (IS_ERR(frmr))
+ return -ENOMEM;
+
+ head->frmr = frmr;
+ head->arg.head[0] = rqstp->rq_arg.head[0];
+ head->arg.tail[0] = rqstp->rq_arg.tail[0];
+ head->arg.pages = &head->pages[head->count];
+ head->hdr_count = head->count; /* save count of hdr pages */
+ head->arg.page_base = 0;
+ head->arg.page_len = byte_count;
+ head->arg.len = rqstp->rq_arg.len + byte_count;
+ head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
+
+ /* Fast register the page list */
+ frmr->kva = page_address(rqstp->rq_arg.pages[0]);
+ frmr->direction = DMA_FROM_DEVICE;
+ frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+ frmr->map_len = byte_count;
+ frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+ for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ page_address(rqstp->rq_arg.pages[page_no]),
+ PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+ atomic_inc(&xprt->sc_dma_used);
+ head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+ }
+ head->count += page_no;
+
+ /* rq_respages points one past arg pages */
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+
+ /* Create the reply and chunk maps */
+ offset = 0;
+ ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ for (ch_no = 0; ch_no < ch_count; ch_no++) {
+ rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+ rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
+ chl_map->ch[ch_no].count = 1;
+ chl_map->ch[ch_no].start = ch_no;
+ offset += ch->rc_target.rs_length;
+ ch++;
+ }
+
+ ret = svc_rdma_fastreg(xprt, frmr);
+ if (ret)
+ goto fatal_err;
+
+ return ch_no;
+
+ fatal_err:
+ printk("svcrdma: error fast registering xdr for xprt %p", xprt);
+ svc_rdma_put_frmr(xprt, frmr);
+ return -EIO;
+}
+
+static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct svc_rdma_fastreg_mr *frmr,
+ struct kvec *vec,
+ u64 *sgl_offset,
+ int count)
{
int i;
ctxt->count = count;
ctxt->direction = DMA_FROM_DEVICE;
for (i = 0; i < count; i++) {
- atomic_inc(&xprt->sc_dma_used);
- ctxt->sge[i].addr =
- ib_dma_map_single(xprt->sc_cm_id->device,
- vec[i].iov_base, vec[i].iov_len,
- DMA_FROM_DEVICE);
+ ctxt->sge[i].length = 0; /* in case map fails */
+ if (!frmr) {
+ ctxt->sge[i].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ vec[i].iov_base,
+ vec[i].iov_len,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr))
+ return -EINVAL;
+ ctxt->sge[i].lkey = xprt->sc_dma_lkey;
+ atomic_inc(&xprt->sc_dma_used);
+ } else {
+ ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
+ ctxt->sge[i].lkey = frmr->mr->lkey;
+ }
ctxt->sge[i].length = vec[i].iov_len;
- ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
*sgl_offset = *sgl_offset + vec[i].iov_len;
}
+ return 0;
}
static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
@@ -278,6 +380,7 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *hdr_ctxt)
{
struct ib_send_wr read_wr;
+ struct ib_send_wr inv_wr;
int err = 0;
int ch_no;
int ch_count;
@@ -301,9 +404,20 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
if (ch_count > RPCSVC_MAXPAGES)
return -EINVAL;
- sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
- rpl_map, chl_map,
- ch_count, byte_count);
+
+ if (!xprt->sc_frmr_pg_list_len)
+ sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+ rpl_map, chl_map, ch_count,
+ byte_count);
+ else
+ sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+ rpl_map, chl_map, ch_count,
+ byte_count);
+ if (sge_count < 0) {
+ err = -EIO;
+ goto out;
+ }
+
sgl_offset = 0;
ch_no = 0;
@@ -312,13 +426,16 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
next_sge:
ctxt = svc_rdma_get_context(xprt);
ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->frmr = hdr_ctxt->frmr;
+ ctxt->read_hdr = NULL;
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare READ WR */
memset(&read_wr, 0, sizeof read_wr);
- ctxt->wr_op = IB_WR_RDMA_READ;
read_wr.wr_id = (unsigned long)ctxt;
read_wr.opcode = IB_WR_RDMA_READ;
+ ctxt->wr_op = read_wr.opcode;
read_wr.send_flags = IB_SEND_SIGNALED;
read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
read_wr.wr.rdma.remote_addr =
@@ -327,10 +444,15 @@ next_sge:
read_wr.sg_list = ctxt->sge;
read_wr.num_sge =
rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
- rdma_set_ctxt_sge(xprt, ctxt,
- &rpl_map->sge[chl_map->ch[ch_no].start],
- &sgl_offset,
- read_wr.num_sge);
+ err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+ &rpl_map->sge[chl_map->ch[ch_no].start],
+ &sgl_offset,
+ read_wr.num_sge);
+ if (err) {
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ goto out;
+ }
if (((ch+1)->rc_discrim == 0) &&
(read_wr.num_sge == chl_map->ch[ch_no].count)) {
/*
@@ -339,6 +461,29 @@ next_sge:
* the client and the RPC needs to be enqueued.
*/
set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ if (hdr_ctxt->frmr) {
+ set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+ /*
+ * Invalidate the local MR used to map the data
+ * sink.
+ */
+ if (xprt->sc_dev_caps &
+ SVCRDMA_DEVCAP_READ_W_INV) {
+ read_wr.opcode =
+ IB_WR_RDMA_READ_WITH_INV;
+ ctxt->wr_op = read_wr.opcode;
+ read_wr.ex.invalidate_rkey =
+ ctxt->frmr->mr->lkey;
+ } else {
+ /* Prepare INVALIDATE WR */
+ memset(&inv_wr, 0, sizeof inv_wr);
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+ inv_wr.send_flags = IB_SEND_SIGNALED;
+ inv_wr.ex.invalidate_rkey =
+ hdr_ctxt->frmr->mr->lkey;
+ read_wr.next = &inv_wr;
+ }
+ }
ctxt->read_hdr = hdr_ctxt;
}
/* Post the read */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 84d328329d98..9a7a8e7ae038 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -69,9 +69,127 @@
* array is only concerned with the reply we are assured that we have
* on extra page for the RPCRMDA header.
*/
-static void xdr_to_sge(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec)
+int fast_reg_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
+{
+ int sge_no;
+ u32 sge_bytes;
+ u32 page_bytes;
+ u32 page_off;
+ int page_no = 0;
+ u8 *frva;
+ struct svc_rdma_fastreg_mr *frmr;
+
+ frmr = svc_rdma_get_frmr(xprt);
+ if (IS_ERR(frmr))
+ return -ENOMEM;
+ vec->frmr = frmr;
+
+ /* Skip the RPCRDMA header */
+ sge_no = 1;
+
+ /* Map the head. */
+ frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
+ vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+ vec->count = 2;
+ sge_no++;
+
+ /* Build the FRMR */
+ frmr->kva = frva;
+ frmr->direction = DMA_TO_DEVICE;
+ frmr->access_flags = 0;
+ frmr->map_len = PAGE_SIZE;
+ frmr->page_list_len = 1;
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ (void *)xdr->head[0].iov_base,
+ PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+ atomic_inc(&xprt->sc_dma_used);
+
+ page_off = xdr->page_base;
+ page_bytes = xdr->page_len + page_off;
+ if (!page_bytes)
+ goto encode_tail;
+
+ /* Map the pages */
+ vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+ vec->sge[sge_no].iov_len = page_bytes;
+ sge_no++;
+ while (page_bytes) {
+ struct page *page;
+
+ page = xdr->pages[page_no++];
+ sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+ page_bytes -= sge_bytes;
+
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
+ PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+
+ atomic_inc(&xprt->sc_dma_used);
+ page_off = 0; /* reset for next time through loop */
+ frmr->map_len += PAGE_SIZE;
+ frmr->page_list_len++;
+ }
+ vec->count++;
+
+ encode_tail:
+ /* Map tail */
+ if (0 == xdr->tail[0].iov_len)
+ goto done;
+
+ vec->count++;
+ vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+
+ if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
+ ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
+ /*
+ * If head and tail use the same page, we don't need
+ * to map it again.
+ */
+ vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+ } else {
+ void *va;
+
+ /* Map another page for the tail */
+ page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+ va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
+ vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+
+ frmr->page_list->page_list[page_no] =
+ ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[page_no]))
+ goto fatal_err;
+ atomic_inc(&xprt->sc_dma_used);
+ frmr->map_len += PAGE_SIZE;
+ frmr->page_list_len++;
+ }
+
+ done:
+ if (svc_rdma_fastreg(xprt, frmr))
+ goto fatal_err;
+
+ return 0;
+
+ fatal_err:
+ printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
+ svc_rdma_put_frmr(xprt, frmr);
+ return -EIO;
+}
+
+static int map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
{
int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
int sge_no;
@@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
BUG_ON(xdr->len !=
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+ if (xprt->sc_frmr_pg_list_len)
+ return fast_reg_xdr(xprt, xdr, vec);
+
/* Skip the first sge, this is for the RPCRDMA header */
sge_no = 1;
@@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
BUG_ON(sge_no > sge_max);
vec->count = sge_no;
+ return 0;
}
/* Assumptions:
+ * - We are using FRMR
+ * - or -
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
@@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge_no = 0;
/* Copy the remaining SGE */
- while (bc != 0 && xdr_sge_no < vec->count) {
- sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
- sge_bytes = min((size_t)bc,
- (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
+ while (bc != 0) {
+ sge_bytes = min_t(size_t,
+ bc, vec->sge[xdr_sge_no].iov_len-sge_off);
sge[sge_no].length = sge_bytes;
- atomic_inc(&xprt->sc_dma_used);
- sge[sge_no].addr =
- ib_dma_map_single(xprt->sc_cm_id->device,
- (void *)
- vec->sge[xdr_sge_no].iov_base + sge_off,
- sge_bytes, DMA_TO_DEVICE);
- if (dma_mapping_error(xprt->sc_cm_id->device->dma_device,
- sge[sge_no].addr))
- goto err;
+ if (!vec->frmr) {
+ sge[sge_no].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ (void *)
+ vec->sge[xdr_sge_no].iov_base + sge_off,
+ sge_bytes, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ sge[sge_no].addr))
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+ sge[sge_no].lkey = xprt->sc_dma_lkey;
+ } else {
+ sge[sge_no].addr = (unsigned long)
+ vec->sge[xdr_sge_no].iov_base + sge_off;
+ sge[sge_no].lkey = vec->frmr->mr->lkey;
+ }
+ ctxt->count++;
+ ctxt->frmr = vec->frmr;
sge_off = 0;
sge_no++;
- ctxt->count++;
xdr_sge_no++;
+ BUG_ON(xdr_sge_no > vec->count);
bc -= sge_bytes;
}
- BUG_ON(bc != 0);
- BUG_ON(xdr_sge_no > vec->count);
-
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
ctxt->wr_op = IB_WR_RDMA_WRITE;
@@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[1];
- max_write = xprt->sc_max_sge * PAGE_SIZE;
+ if (vec->frmr)
+ max_write = vec->frmr->map_len;
+ else
+ max_write = xprt->sc_max_sge * PAGE_SIZE;
/* Write chunks start at the pagelist */
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
@@ -297,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[2];
- max_write = xprt->sc_max_sge * PAGE_SIZE;
+ if (vec->frmr)
+ max_write = vec->frmr->map_len;
+ else
+ max_write = xprt->sc_max_sge * PAGE_SIZE;
/* xdr offset starts at RPC message */
for (xdr_off = 0, chunk_no = 0;
@@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
ch = &arg_ary->wc_array[chunk_no].wc_target;
write_len = min(xfer_len, ch->rs_length);
-
/* Prepare the reply chunk given the length actually
* written */
rs_offset = get_unaligned(&(ch->rs_offset));
@@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
int byte_count)
{
struct ib_send_wr send_wr;
+ struct ib_send_wr inv_wr;
int sge_no;
int sge_bytes;
int page_no;
@@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma,
/* Prepare the context */
ctxt->pages[0] = page;
ctxt->count = 1;
+ ctxt->frmr = vec->frmr;
+ if (vec->frmr)
+ set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+ else
+ clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare the SGE for the RPCRDMA Header */
- atomic_inc(&rdma->sc_dma_used);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device,
page, 0, PAGE_SIZE, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+
ctxt->direction = DMA_TO_DEVICE;
+
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
- ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
+ ctxt->sge[0].lkey = rdma->sc_dma_lkey;
/* Determine how many of our SGE are to be transmitted */
for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
- atomic_inc(&rdma->sc_dma_used);
- ctxt->sge[sge_no].addr =
- ib_dma_map_single(rdma->sc_cm_id->device,
- vec->sge[sge_no].iov_base,
- sge_bytes, DMA_TO_DEVICE);
+ if (!vec->frmr) {
+ ctxt->sge[sge_no].addr =
+ ib_dma_map_single(rdma->sc_cm_id->device,
+ vec->sge[sge_no].iov_base,
+ sge_bytes, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+ ctxt->sge[sge_no].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+ ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ } else {
+ ctxt->sge[sge_no].addr = (unsigned long)
+ vec->sge[sge_no].iov_base;
+ ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
+ }
ctxt->sge[sge_no].length = sge_bytes;
- ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
}
BUG_ON(byte_count != 0);
@@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
ctxt->count++;
rqstp->rq_respages[page_no] = NULL;
- /* If there are more pages than SGE, terminate SGE list */
+ /*
+ * If there are more pages than SGE, terminate SGE
+ * list so that svc_rdma_unmap_dma doesn't attempt to
+ * unmap garbage.
+ */
if (page_no+1 >= sge_no)
ctxt->sge[page_no+1].length = 0;
}
BUG_ON(sge_no > rdma->sc_max_sge);
+ BUG_ON(sge_no > ctxt->count);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
send_wr.wr_id = (unsigned long)ctxt;
@@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma,
send_wr.num_sge = sge_no;
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
+ if (vec->frmr) {
+ /* Prepare INVALIDATE WR */
+ memset(&inv_wr, 0, sizeof inv_wr);
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+ inv_wr.send_flags = IB_SEND_SIGNALED;
+ inv_wr.ex.invalidate_rkey =
+ vec->frmr->mr->lkey;
+ send_wr.next = &inv_wr;
+ }
ret = svc_rdma_send(rdma, &send_wr);
if (ret)
- svc_rdma_put_context(ctxt, 1);
+ goto err;
- return ret;
+ return 0;
+
+ err:
+ svc_rdma_put_frmr(rdma, vec->frmr);
+ svc_rdma_put_context(ctxt, 1);
+ return -EIO;
}
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
vec = svc_rdma_get_req_map();
- xdr_to_sge(rdma, &rqstp->rq_res, vec);
-
+ ret = map_xdr(rdma, &rqstp->rq_res, vec);
+ if (ret)
+ goto err0;
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
@@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
ret);
- goto error;
+ goto err1;
}
inline_bytes -= ret;
@@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (ret < 0) {
printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
ret);
- goto error;
+ goto err1;
}
inline_bytes -= ret;
@@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
svc_rdma_put_req_map(vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
- error:
+
+ err1:
+ put_page(res_page);
+ err0:
svc_rdma_put_req_map(vec);
svc_rdma_put_context(ctxt, 0);
- put_page(res_page);
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 900cb69728c6..6fb493cbd29f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -100,20 +100,29 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
ctxt->xprt = xprt;
INIT_LIST_HEAD(&ctxt->dto_q);
ctxt->count = 0;
+ ctxt->frmr = NULL;
atomic_inc(&xprt->sc_ctxt_used);
return ctxt;
}
-static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
{
struct svcxprt_rdma *xprt = ctxt->xprt;
int i;
for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
- atomic_dec(&xprt->sc_dma_used);
- ib_dma_unmap_single(xprt->sc_cm_id->device,
- ctxt->sge[i].addr,
- ctxt->sge[i].length,
- ctxt->direction);
+ /*
+ * Unmap the DMA addr in the SGE if the lkey matches
+ * the sc_dma_lkey, otherwise, ignore it since it is
+ * an FRMR lkey and will be unmapped later when the
+ * last WR that uses it completes.
+ */
+ if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+ atomic_dec(&xprt->sc_dma_used);
+ ib_dma_unmap_single(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr,
+ ctxt->sge[i].length,
+ ctxt->direction);
+ }
}
}
@@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
schedule_timeout_uninterruptible(msecs_to_jiffies(500));
}
map->count = 0;
+ map->frmr = NULL;
return map;
}
@@ -316,6 +326,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
}
/*
+ * Processs a completion context
+ */
+static void process_context(struct svcxprt_rdma *xprt,
+ struct svc_rdma_op_ctxt *ctxt)
+{
+ svc_rdma_unmap_dma(ctxt);
+
+ switch (ctxt->wr_op) {
+ case IB_WR_SEND:
+ if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
+ svc_rdma_put_context(ctxt, 1);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+ BUG_ON(!read_hdr);
+ if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ }
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ default:
+ printk(KERN_ERR "svcrdma: unexpected completion type, "
+ "opcode=%d\n",
+ ctxt->wr_op);
+ break;
+ }
+}
+
+/*
* Send Queue Completion Handler - potentially called on interrupt context.
*
* Note that caller must hold a transport reference.
@@ -327,17 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
struct ib_cq *cq = xprt->sc_sq_cq;
int ret;
-
if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
return;
ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
atomic_inc(&rdma_stat_sq_poll);
while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
- ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
- xprt = ctxt->xprt;
-
- svc_rdma_unmap_dma(ctxt);
if (wc.status != IB_WC_SUCCESS)
/* Close the transport */
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -346,35 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
atomic_dec(&xprt->sc_sq_count);
wake_up(&xprt->sc_send_wait);
- switch (ctxt->wr_op) {
- case IB_WR_SEND:
- svc_rdma_put_context(ctxt, 1);
- break;
-
- case IB_WR_RDMA_WRITE:
- svc_rdma_put_context(ctxt, 0);
- break;
-
- case IB_WR_RDMA_READ:
- if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
- struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
- BUG_ON(!read_hdr);
- spin_lock_bh(&xprt->sc_rq_dto_lock);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- list_add_tail(&read_hdr->dto_q,
- &xprt->sc_read_complete_q);
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
- svc_xprt_enqueue(&xprt->sc_xprt);
- }
- svc_rdma_put_context(ctxt, 0);
- break;
+ ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+ if (ctxt)
+ process_context(xprt, ctxt);
- default:
- printk(KERN_ERR "svcrdma: unexpected completion type, "
- "opcode=%d, status=%d\n",
- wc.opcode, wc.status);
- break;
- }
svc_xprt_put(&xprt->sc_xprt);
}
@@ -425,10 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+ spin_lock_init(&cma_xprt->sc_frmr_q_lock);
cma_xprt->sc_ord = svcrdma_ord;
@@ -462,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct svc_rdma_op_ctxt *ctxt;
struct page *page;
- unsigned long pa;
+ dma_addr_t pa;
int sge_no;
int buflen;
int ret;
@@ -474,13 +500,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
BUG_ON(sge_no >= xprt->sc_max_sge);
page = svc_rdma_get_page();
ctxt->pages[sge_no] = page;
- atomic_inc(&xprt->sc_dma_used);
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
+ goto err_put_ctxt;
+ atomic_inc(&xprt->sc_dma_used);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
- ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
buflen += PAGE_SIZE;
}
ctxt->count = sge_no;
@@ -496,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
svc_rdma_put_context(ctxt, 1);
}
return ret;
+
+ err_put_ctxt:
+ svc_rdma_put_context(ctxt, 1);
+ return -ENOMEM;
}
/*
@@ -566,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
"event=%d\n", cma_id, cma_id->context, event->event);
handle_connect_req(cma_id,
- event->param.conn.responder_resources);
+ event->param.conn.initiator_depth);
break;
case RDMA_CM_EVENT_ESTABLISHED:
@@ -686,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
return ERR_PTR(ret);
}
+/*
+ * Allocate a fast-register MR descriptor together with its device MR
+ * and page list, each sized for RPCSVC_MAXPAGES pages.
+ *
+ * @xprt: transport whose protection domain and device are used
+ *
+ * Returns the new descriptor, or ERR_PTR(-ENOMEM) if any of the three
+ * allocations fails. Nothing is left allocated on failure.
+ */
+static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
+{
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *pl;
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
+	if (!frmr)
+		goto err;
+
+	/*
+	 * The verbs allocators report failure with ERR_PTR(), not NULL,
+	 * so a NULL test would let an error pointer through.
+	 */
+	mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+	if (IS_ERR(mr))
+		goto err_free_frmr;
+
+	pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
+					 RPCSVC_MAXPAGES);
+	if (IS_ERR(pl))
+		goto err_free_mr;
+
+	frmr->mr = mr;
+	frmr->page_list = pl;
+	INIT_LIST_HEAD(&frmr->frmr_list);
+	return frmr;
+
+ err_free_mr:
+	ib_dereg_mr(mr);
+ err_free_frmr:
+	kfree(frmr);
+ err:
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Drain the transport's queue of fast-register MRs, releasing the
+ * device MR, the page list and the descriptor of every entry.
+ *
+ * @xprt: transport whose sc_frmr_q is emptied
+ *
+ * NOTE(review): sc_frmr_q_lock is not taken here; presumably the
+ * caller guarantees exclusive access - confirm against callers.
+ */
+static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
+{
+	struct list_head *queue = &xprt->sc_frmr_q;
+
+	while (!list_empty(queue)) {
+		struct svc_rdma_fastreg_mr *victim;
+
+		victim = list_entry(queue->next,
+				    struct svc_rdma_fastreg_mr, frmr_list);
+		list_del_init(&victim->frmr_list);
+		ib_dereg_mr(victim->mr);
+		ib_free_fast_reg_page_list(victim->page_list);
+		kfree(victim);
+	}
+}
+
+/*
+ * Obtain a fast-register MR for the transport.
+ *
+ * A descriptor is taken from the head of sc_frmr_q when one is
+ * available, with its map_len and page_list_len reset to zero.
+ * Otherwise a new one is allocated with rdma_alloc_frmr().
+ *
+ * @rdma: transport to take the descriptor from
+ *
+ * Returns the descriptor, or the ERR_PTR() value produced by
+ * rdma_alloc_frmr() when the queue is empty and allocation fails.
+ */
+struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_fastreg_mr *recycled;
+
+	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	if (list_empty(&rdma->sc_frmr_q)) {
+		/* Nothing to reuse: allocate outside the lock. */
+		spin_unlock_bh(&rdma->sc_frmr_q_lock);
+		return rdma_alloc_frmr(rdma);
+	}
+
+	recycled = list_entry(rdma->sc_frmr_q.next,
+			      struct svc_rdma_fastreg_mr, frmr_list);
+	list_del_init(&recycled->frmr_list);
+	recycled->map_len = 0;
+	recycled->page_list_len = 0;
+	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+
+	return recycled;
+}
+
+/*
+ * Unmap every valid DMA address recorded in the frmr's page list and
+ * decrement the transport's sc_dma_used count once per unmapped page.
+ * Entries that ib_dma_mapping_error() reports as errors are skipped.
+ *
+ * @xprt: transport whose DMA usage counter is adjusted
+ * @frmr: fast-register MR whose first page_list_len pages are unmapped
+ */
+static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
+			   struct svc_rdma_fastreg_mr *frmr)
+{
+	int idx = 0;
+
+	while (idx < frmr->page_list_len) {
+		dma_addr_t dma = frmr->page_list->page_list[idx++];
+
+		if (!ib_dma_mapping_error(frmr->mr->device, dma)) {
+			atomic_dec(&xprt->sc_dma_used);
+			ib_dma_unmap_single(frmr->mr->device, dma, PAGE_SIZE,
+					    frmr->direction);
+		}
+	}
+}
+
+/*
+ * Return a fast-register MR to the transport's free queue after
+ * unmapping the DMA addresses in its page list.
+ *
+ * @rdma: transport that owns the queue
+ * @frmr: descriptor to release; NULL is accepted and ignored
+ *
+ * The descriptor must not already be on a list (BUG otherwise).
+ */
+void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
+		       struct svc_rdma_fastreg_mr *frmr)
+{
+	if (!frmr)
+		return;
+
+	frmr_unmap_dma(rdma, frmr);
+
+	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	BUG_ON(!list_empty(&frmr->frmr_list));
+	list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
+	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+}
+
/*
* This is the xpo_recvfrom function for listening endpoints. Its
* purpose is to accept incoming connections. The CMA callback handler
@@ -704,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct ib_device_attr devattr;
+ int dma_mr_acc;
+ int need_dma_mr;
int ret;
int i;
@@ -819,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
- /* Register all of physical memory */
- newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
- IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE);
- if (IS_ERR(newxprt->sc_phys_mr)) {
- dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
+ /*
+ * Use the most secure set of MR resources based on the
+ * transport type and available memory management features in
+ * the device. Here's the table implemented below:
+ *
+ * Fast Global DMA Remote WR
+ * Reg LKEY MR Access
+ * Sup'd Sup'd Needed Needed
+ *
+ * IWARP N N Y Y
+ * N Y Y Y
+ * Y N Y N
+ * Y Y N -
+ *
+ * IB N N Y N
+ * N Y N -
+ * Y N Y N
+ * Y Y N -
+ *
+ * NB: iWARP requires remote write access for the data sink
+ * of an RDMA_READ. IB does not.
+ */
+ if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ newxprt->sc_frmr_pg_list_len =
+ devattr.max_fast_reg_page_list_len;
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+ }
+
+ /*
+ * Determine if a DMA MR is required and if so, what privs are required
+ */
+ switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
+ case RDMA_TRANSPORT_IWARP:
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+ need_dma_mr = 1;
+ dma_mr_acc =
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+ } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+ need_dma_mr = 0;
+ break;
+ case RDMA_TRANSPORT_IB:
+ if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+ need_dma_mr = 0;
+ break;
+ default:
goto errout;
}
+ /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
+ if (need_dma_mr) {
+ /* Register all of physical memory */
+ newxprt->sc_phys_mr =
+ ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
+ if (IS_ERR(newxprt->sc_phys_mr)) {
+ dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
+ ret);
+ goto errout;
+ }
+ newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
+ } else
+ newxprt->sc_dma_lkey =
+ newxprt->sc_cm_id->device->local_dma_lkey;
+
/* Post receive buffers */
for (i = 0; i < newxprt->sc_max_requests; i++) {
ret = svc_rdma_post_recv(newxprt);
@@ -961,6 +1148,9 @@ static void __svc_rdma_free(struct work_struct *work)
WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+ /* De-allocate fastreg mr */
+ rdma_dealloc_frmr_q(rdma);
+
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_destroy_qp(rdma->sc_qp);
@@ -1014,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
return 1;
}
+/*
+ * Post a FAST_REG_MR work request that registers the memory described
+ * by the frmr (kva, page list, map length, access flags) with the
+ * device. The low byte of the MR key is incremented before posting.
+ *
+ * @xprt: transport to post the work request on
+ * @frmr: fast-register MR describing the memory to register
+ *
+ * Returns the value returned by svc_rdma_send() for the request.
+ */
+int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
+		     struct svc_rdma_fastreg_mr *frmr)
+{
+	struct ib_send_wr fastreg_wr;
+	u8 key;
+
+	/* Advance the low byte of the key (wraps around as a u8). */
+	key = (u8)(frmr->mr->lkey & 0x000000FF);
+	key++;
+	ib_update_fast_reg_key(frmr->mr, key);
+
+	/* Build the work request; unset fields stay zero. */
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+
+	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+	fastreg_wr.wr.fast_reg.length = frmr->map_len;
+	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+	/* Read the lkey only after the key update above. */
+	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+
+	return svc_rdma_send(xprt, &fastreg_wr);
+}
+
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
- struct ib_send_wr *bad_wr;
+ struct ib_send_wr *bad_wr, *n_wr;
+ int wr_count;
+ int i;
int ret;
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
- BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
- wr->opcode);
+ wr_count = 1;
+ for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
+ wr_count++;
+
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
spin_lock_bh(&xprt->sc_lock);
- if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
+ if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
spin_unlock_bh(&xprt->sc_lock);
atomic_inc(&rdma_stat_sq_starve);
@@ -1043,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
return 0;
continue;
}
- /* Bumped used SQ WR count and post */
- svc_xprt_get(&xprt->sc_xprt);
+ /* Take a transport ref for each WR posted */
+ for (i = 0; i < wr_count; i++)
+ svc_xprt_get(&xprt->sc_xprt);
+
+ /* Bump used SQ WR count and post */
+ atomic_add(wr_count, &xprt->sc_sq_count);
ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
- if (!ret)
- atomic_inc(&xprt->sc_sq_count);
- else {
- svc_xprt_put(&xprt->sc_xprt);
+ if (ret) {
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ atomic_sub(wr_count, &xprt->sc_sq_count);
+ for (i = 0; i < wr_count; i ++)
+ svc_xprt_put(&xprt->sc_xprt);
dprintk("svcrdma: failed to post SQ WR rc=%d, "
"sc_sq_count=%d, sc_sq_depth=%d\n",
ret, atomic_read(&xprt->sc_sq_count),
xprt->sc_sq_depth);
}
spin_unlock_bh(&xprt->sc_lock);
+ if (ret)
+ wake_up(&xprt->sc_send_wait);
break;
}
return ret;
@@ -1079,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
/* Prepare SGE for local address */
- atomic_inc(&xprt->sc_dma_used);
sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
- sge.lkey = xprt->sc_phys_mr->lkey;
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
+ put_page(p);
+ return;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+ sge.lkey = xprt->sc_dma_lkey;
sge.length = length;
ctxt = svc_rdma_get_context(xprt);
@@ -1103,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
if (ret) {
dprintk("svcrdma: Error %d posting send for protocol error\n",
ret);
+ ib_dma_unmap_page(xprt->sc_cm_id->device,
+ sge.addr, PAGE_SIZE,
+ DMA_FROM_DEVICE);
svc_rdma_put_context(ctxt, 1);
}
}