diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-26 09:52:09 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-26 09:52:09 -0700 |
commit | a4dd8dce14014665862ce7911b38cb2c69e366dd (patch) | |
tree | 00e79b2845d5d49abcd83cf253db83a52d482265 /fs/nfs | |
parent | b18cae4224bde7e5a332c19bc99247b2098ea232 (diff) | |
parent | 411b5e05617593efebc06241dbc56f42150f2abe (diff) |
Merge branch 'nfs-for-2.6.37' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
* 'nfs-for-2.6.37' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6:
net/sunrpc: Use static const char arrays
nfs4: fix channel attribute sanity-checks
NFSv4.1: Use more sensible names for 'initialize_mountpoint'
NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO infrastructure
NFSv4.1: pnfs: add LAYOUTGET and GETDEVICEINFO infrastructure
NFS: client needs to maintain list of inodes with active layouts
NFS: create and destroy inode's layout cache
NFSv4.1: pnfs: filelayout: introduce minimal file layout driver
NFSv4.1: pnfs: full mount/umount infrastructure
NFS: set layout driver
NFS: ask for layouttypes during v4 fsinfo call
NFS: change stateid to be a union
NFSv4.1: pnfsd, pnfs: protocol level pnfs constants
SUNRPC: define xdr_decode_opaque_fixed
NFSD: remove duplicate NFS4_STATEID_SIZE
Diffstat (limited to 'fs/nfs')
-rw-r--r-- | fs/nfs/Kconfig | 8 | ||||
-rw-r--r-- | fs/nfs/Makefile | 4 | ||||
-rw-r--r-- | fs/nfs/callback_proc.c | 8 | ||||
-rw-r--r-- | fs/nfs/client.c | 10 | ||||
-rw-r--r-- | fs/nfs/file.c | 5 | ||||
-rw-r--r-- | fs/nfs/inode.c | 3 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayout.c | 280 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayout.h | 94 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayoutdev.c | 448 | ||||
-rw-r--r-- | fs/nfs/nfs4proc.c | 218 | ||||
-rw-r--r-- | fs/nfs/nfs4state.c | 2 | ||||
-rw-r--r-- | fs/nfs/nfs4xdr.c | 360 | ||||
-rw-r--r-- | fs/nfs/pnfs.c | 783 | ||||
-rw-r--r-- | fs/nfs/pnfs.h | 189 | ||||
-rw-r--r-- | fs/nfs/read.c | 3 |
15 files changed, 2374 insertions, 41 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 5c55c26af165..fd667652c502 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -77,13 +77,17 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_V4 && EXPERIMENTAL + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol - (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. + (RFC 5661) in the kernel's NFS client. If unsure, say N. +config PNFS_FILE_LAYOUT + tristate + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index da7fda639eac..4776ff9e3814 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 930d10fecdaf..2950fca0c61b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n if (delegation == NULL) return 0; - /* seqid is 4-bytes long */ - if (((u32 *) &stateid->data)[0] != 0) + if (stateid->stateid.seqid != 0) return 0; - if (memcmp(&delegation->stateid.data[4], &stateid->data[4], - sizeof(stateid->data)-4)) + if (memcmp(&delegation->stateid.stateid.other, + &stateid->stateid.other, + NFS4_STATEID_OTHER_SIZE)) return 0; return 1; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a882785eba41..fd6f0a70021b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -48,6 +48,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; - +#if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); +#endif nfs_fscache_get_client_cookie(clp); return clp; @@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp) nfs_free_client(clp); } } +EXPORT_SYMBOL_GPL(nfs_put_client); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* @@ -900,6 +904,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + set_pnfs_layoutdriver(server, fsinfo->layouttype); + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); @@ -939,6 +945,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } fsinfo.fattr = fattr; + fsinfo.layouttype = 0; error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1021,6 +1028,7 @@ void nfs_free_server(struct nfs_server *server) { dprintk("--> nfs_free_server()\n"); + unset_pnfs_layoutdriver(server); spin_lock(&nfs_client_lock); list_del(&server->client_link); list_del(&server->master_link); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e18c31e08a28..e756075637b0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + pnfs_update_layout(mapping->host, + nfs_file_open_context(file), + IOMODE_RW); + start: /* * Prevent starvation issues if someone is doing a consistency diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6eec28656415..314f57164602 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" #include "dns_resolve.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -1410,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); + pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); /* First call standard NFS clear_inode() code */ @@ -1447,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation = NULL; nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; #endif } diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c new file mode 100644 index 000000000000..2e92f0d8d654 --- /dev/null +++ b/fs/nfs/nfs4filelayout.c @@ -0,0 +1,280 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +static int +filelayout_set_layoutdriver(struct nfs_server *nfss) +{ + int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, + nfs4_fl_free_deviceid_callback); + if (status) { + printk(KERN_WARNING "%s: deviceid cache could not be " + "initialized\n", __func__); + return status; + } + dprintk("%s: deviceid cache has been initialized successfully\n", + __func__); + return 0; +} + +/* Clear out the layout by destroying its device list */ +static int +filelayout_clear_layoutdriver(struct nfs_server *nfss) +{ + dprintk("--> %s\n", __func__); + + if (nfss->nfs_client->cl_devid_cache) + pnfs_put_deviceid_cache(nfss->nfs_client); + return 0; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. + * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + struct nfs_server *nfss = NFS_SERVER(lo->inode); + + dprintk("--> %s\n", __func__); + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld to large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Stripe unit (%u) not page aligned\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + if (dsaddr == NULL) { + dsaddr = get_device_info(lo->inode, id); + if (dsaddr == NULL) + goto out; + } + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index < 0 || + fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %d\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { + dprintk("%s Stripe unit (%u) not aligned with rsize %u " + "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, + nfss->wsize); + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); + goto out; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + int i; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id) +{ + uint32_t *p = (uint32_t *)lgr->layout.buf; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + GFP_KERNEL); + if (!fl->fh_array) + return -ENOMEM; + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); + if (!fl->fh_array[i]) { + filelayout_free_fh_array(fl); + return -ENOMEM; + } + fl->fh_array[i]->size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + printk(KERN_ERR "Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + filelayout_free_fh_array(fl); + return -EIO; + } + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + p += XDR_QUADLEN(fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) { + _filelayout_free_lseg(fl); + return NULL; + } + return &fl->generic_hdr; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, + &fl->dsaddr->deviceid); + _filelayout_free_lseg(fl); +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .set_layoutdriver = filelayout_set_layoutdriver, + .clear_layoutdriver = filelayout_clear_layoutdriver, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h new file mode 100644 index 000000000000..bbf60dd2ab9d --- /dev/null +++ b/fs/nfs/nfs4filelayout.h @@ -0,0 +1,94 @@ +/* + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "pnfs.h" + +/* + * Field testing shows we need to support upto 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. + */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + u32 ds_ip_addr; + u32 ds_port; + struct nfs_client *ds_clp; + atomic_t ds_count; +}; + +struct nfs4_file_layout_dsaddr { + struct pnfs_deviceid_node deviceid; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; +}; + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); +extern void print_ds(struct nfs4_pnfs_ds *ds); +extern void print_deviceid(struct nfs4_deviceid *dev_id); +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c new file mode 100644 index 000000000000..51fe64ace55a --- /dev/null +++ b/fs/nfs/nfs4filelayoutdev.c @@ -0,0 +1,448 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * Garth Goodson <Garth.Goodson@netapp.com> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include <linux/vmalloc.h> + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk("%s NULL device\n", __func__); + return; + } + printk(" ip_addr %x port %hu\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +void +print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) +{ + int i; + + ifdebug(FACILITY) { + printk("%s dsaddr->ds_num %d\n", __func__, + dsaddr->ds_num); + for (i = 0; i < dsaddr->ds_num; i++) + print_ds(dsaddr->ds_list[i]); + } +} + +void print_deviceid(struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} + +/* nfs4_ds_cache_lock is held */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *ds; + + dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", + ntohl(ip_addr), ntohs(port)); + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + if (ds->ds_ip_addr == ip_addr && + ds->ds_port == port) { + return ds; + } + } + return NULL; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + kfree(ds); +} + +static void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + print_deviceid(&dsaddr->deviceid.de_id); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +void +nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) +{ + struct nfs4_file_layout_dsaddr *dsaddr = + container_of(device, struct nfs4_file_layout_dsaddr, deviceid); + + nfs4_fl_free_deviceid(dsaddr); +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds; + + ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); + if (!ds) + goto out; + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(ip_addr, port); + if (tmp_ds == NULL) { + ds->ds_ip_addr = ip_addr; + ds->ds_port = port; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server ip 0x%x\n", __func__, + ds->ds_ip_addr); + } else { + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_ip_addr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} + +/* + * Currently only support ipv4, and one multi-path address. + */ +static struct nfs4_pnfs_ds * +decode_and_add_ds(__be32 **pp, struct inode *inode) +{ + struct nfs4_pnfs_ds *ds = NULL; + char *buf; + const char *ipend, *pstr; + u32 ip_addr, port; + int nlen, rlen, i; + int tmp[2]; + __be32 *r_netid, *r_addr, *p = *pp; + + /* r_netid */ + nlen = be32_to_cpup(p++); + r_netid = p; + p += XDR_QUADLEN(nlen); + + /* r_addr */ + rlen = be32_to_cpup(p++); + r_addr = p; + p += XDR_QUADLEN(rlen); + *pp = p; + + /* Check that netid is "tcp" */ + if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { + dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + goto out_err; + } + + /* ipv6 length plus port is legal */ + if (rlen > INET6_ADDRSTRLEN + 8) { + dprintk("%s Invalid address, length %d\n", __func__, + rlen); + goto out_err; + } + buf = kmalloc(rlen + 1, GFP_KERNEL); + buf[rlen] = '\0'; + memcpy(buf, r_addr, rlen); + + /* replace the port dots with dashes for the in4_pton() delimiter*/ + for (i = 0; i < 2; i++) { + char *res = strrchr(buf, '.'); + *res = '-'; + } + + /* Currently only support ipv4 address */ + if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { + dprintk("%s: Only ipv4 addresses supported\n", __func__); + goto out_free; + } + + /* port */ + pstr = ipend; + sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + ds = nfs4_pnfs_ds_add(inode, ip_addr, port); + dprintk("%s Decoded address and port %s\n", __func__, buf); +out_free: + kfree(buf); +out_err: + return ds; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev) +{ + int i, dummy; + u32 cnt, num; + u8 *indexp; + __be32 *p = (__be32 *)pdev->area, *indicesp; + struct nfs4_file_layout_dsaddr *dsaddr; + + /* Get the stripe count (number of stripe index) */ + cnt = be32_to_cpup(p++); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "%s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err; + } + + /* Check the multipath list count */ + indicesp = p; + p += XDR_QUADLEN(cnt << 2); + num = be32_to_cpup(p++); + dprintk("%s ds_num %u\n", __func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "%s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err; + } + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + GFP_KERNEL); + if (!dsaddr) + goto out_err; + + dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); + if (!dsaddr->stripe_indices) + goto out_err_free; + + dsaddr->stripe_count = cnt; + dsaddr->ds_num = num; + + memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); + + /* Go back an read stripe indices */ + p = indicesp; + indexp = &dsaddr->stripe_indices[0]; + for (i = 0; i < dsaddr->stripe_count; i++) { + *indexp = be32_to_cpup(p++); + if (*indexp >= num) + goto out_err_free; + indexp++; + } + /* Skip already read multipath list count */ + p++; + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + + dummy = be32_to_cpup(p++); /* multipath count */ + if (dummy > 1) { + printk(KERN_WARNING + "%s: Multipath count %d not supported, " + "skipping all greater than 1\n", __func__, + dummy); + } + for (j = 0; j < dummy; j++) { + if (j == 0) { + dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + if (dsaddr->ds_list[i] == NULL) + goto out_err_free; + } else { + u32 len; + /* skip extra multipath */ + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + continue; + } + } + } + return dsaddr; + +out_err_free: + nfs4_fl_free_deviceid(dsaddr); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' + * and add it to the list of available devices. + * If the deviceid is already cached, nfs4_add_deviceid will return + * a pointer to the cached struct and throw away the new. + */ +static struct nfs4_file_layout_dsaddr* +decode_and_add_device(struct inode *inode, struct pnfs_device *dev) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + struct pnfs_deviceid_node *d; + + dsaddr = decode_device(inode, dev); + if (!dsaddr) { + printk(KERN_WARNING "%s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, + &dsaddr->deviceid); + + return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. + */ +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + + /* set pdev->area */ + pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); + if (!pdev->area) + goto out_free; + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = PAGE_SIZE * max_pages; + pdev->mincount = 0; + + rc = nfs4_proc_getdeviceinfo(server, pdev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + dsaddr = decode_and_add_device(inode, pdev); +out_free: + if (pdev->area != NULL) + vunmap(pdev->area); + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) +{ + struct pnfs_deviceid_node *d; + + d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); + return (d == NULL) ? NULL : + container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e87fe612ca18..32c8758c99fd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -55,6 +55,7 @@ #include "internal.h" #include "iostat.h" #include "callback.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -130,6 +131,7 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES }; const u32 nfs4_fs_locations_bitmap[2] = { @@ -4840,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->bc_attrs.max_reqs); } -static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) { - if (rcvd <= sent) - return 0; - printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " - "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); - return -EINVAL; + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + + if (rcvd->headerpadsz > sent->headerpadsz) + return -EINVAL; + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. + * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; } -#define _verify_fore_channel_attr(_name_) \ - _verify_channel_attr("fore", #_name_, \ - args->fc_attrs._name_, \ - session->fc_attrs._name_) +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &session->bc_attrs; -#define _verify_back_channel_attr(_name_) \ - _verify_channel_attr("back", #_name_, \ - args->bc_attrs._name_, \ - session->bc_attrs._name_) + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops == 0) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} -/* - * The server is not allowed to increase the fore channel header pad size, - * maximum response size, or maximum number of operations. - * - * The back channel attributes are only negotiatied down: We send what the - * (back channel) server insists upon. - */ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) { - int ret = 0; - - ret |= _verify_fore_channel_attr(headerpadsz); - ret |= _verify_fore_channel_attr(max_resp_sz); - ret |= _verify_fore_channel_attr(max_ops); - - ret |= _verify_back_channel_attr(headerpadsz); - ret |= _verify_back_channel_attr(max_rqst_sz); - ret |= _verify_back_channel_attr(max_resp_sz); - ret |= _verify_back_channel_attr(max_resp_sz_cached); - ret |= _verify_back_channel_attr(max_ops); - ret |= _verify_back_channel_attr(max_reqs); + int ret; - return ret; + ret = nfs4_verify_fore_channel_attrs(args, session); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, session); } static int _nfs4_proc_create_session(struct nfs_client *clp) @@ -5255,6 +5264,147 @@ out: dprintk("<-- %s status=%d\n", __func__, status); return status; } + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *ino = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(ino); + + dprintk("--> %s\n", __func__); + if (nfs4_setup_sequence(server, &lgp->args.seq_args, + &lgp->res.seq_res, 0, task)) + return; + rpc_call_start(task); +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + lgp->status = task->tk_status; + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + put_layout_hdr(lgp->args.inode); + if (lgp->res.layout.buf != NULL) + free_page((unsigned long) lgp->res.layout.buf); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); + if (lgp->res.layout.buf == NULL) { + nfs4_layoutget_release(lgp); + return -ENOMEM; + } + + lgp->res.seq_res.sr_slot = NULL; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = lgp->status; + if (status != 0) + goto out; + status = pnfs_layout_process(lgp); +out: + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server, &msg, &args, &res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index aa0b02a610c4..f575a3126737 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -54,6 +54,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #define OPENOWNER_POOL_SIZE 8 @@ -1475,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp) } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + pnfs_destroy_all_layouts(clp); } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index bd2101d918c8..f313c4cce7e4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include <linux/nfs_idmap.h> #include "nfs4_fs.h" #include "internal.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1737,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr, #endif /* CONFIG_NFS_V4_1 */ } +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(OP_GETDEVICEINFO); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(0); /* bitmap length 0 */ + hdr->nops++; + hdr->replen += decode_getdeviceinfo_maxsz; +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTGET); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, + args->ctx->state); + p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" ENCODE ROUTINES. */ @@ -2554,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, return 0; } +/* + * Encode GETDEVICEINFO request + */ +static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, + struct nfs4_getdeviceinfo_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(&xdr, args, &hdr); + + /* set up reply kvec. Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); + return 0; +} + +/* + * Encode LAYOUTGET request + */ +static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, + struct nfs4_layoutget_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -3978,6 +4103,61 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); } +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + uint32_t *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " + "per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. + */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { __be32 *savep; @@ -4006,6 +4186,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: @@ -4772,6 +4955,134 @@ out_overflow: #endif /* CONFIG_NFS_V4_1 */ } +#if defined(CONFIG_NFS_V4_1) + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + + /* Parse notification bitmap, verifying that it is zero. */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + int i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < len; i++, p++) { + if (be32_to_cpup(p)) { + dprintk("%s: notifications not supported\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p++); + p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 24); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + res->type = be32_to_cpup(p++); + + status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); + if (unlikely(status)) + return status; + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layout.len); + + /* nfs4_proc_layoutget allocated a single page */ + if (res->layout.len > PAGE_SIZE) + return -ENOMEM; + memcpy(res->layout.buf, p, res->layout.len); + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" DECODE ROUTINES. */ @@ -5799,6 +6110,53 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, status = decode_reclaim_complete(&xdr, (void *)NULL); return status; } + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_getdeviceinfo_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(&xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_layoutget_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_layoutget(&xdr, rqstp, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, @@ -5990,6 +6348,8 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c new file mode 100644 index 000000000000..db773428f95f --- /dev/null +++ b/fs/nfs/pnfs.c @@ -0,0 +1,783 @@ +/* + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. + */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + spin_unlock(&pnfs_spinlock); + return local; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (nfss->pnfs_curr_ld) { + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + module_put(nfss->pnfs_curr_ld->owner); + } + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, + id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + if (!try_module_get(ld_type->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; + if (ld_type->set_layoutdriver(server)) { + printk(KERN_ERR + "%s: Error initializing mount point for layout driver %u.\n", + __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "%s id 0 is reserved\n", __func__); + return status; + } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "%s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +/* + * pNFS client layout cache + */ + +static void +get_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + assert_spin_locked(&lo->inode->i_lock); + lo->refcount++; +} + +static void +put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + assert_spin_locked(&lo->inode->i_lock); + BUG_ON(lo->refcount == 0); + + lo->refcount--; + if (!lo->refcount) { + dprintk("%s: freeing layout cache %p\n", __func__, lo); + BUG_ON(!list_empty(&lo->layouts)); + NFS_I(lo->inode)->layout = NULL; + kfree(lo); + } +} + +void +put_layout_hdr(struct inode *inode) +{ + spin_lock(&inode->i_lock); + put_layout_hdr_locked(NFS_I(inode)->layout); + spin_unlock(&inode->i_lock); +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->fi_list); + kref_init(&lseg->kref); + lseg->layout = lo; +} + +/* Called without i_lock held, as the free_lseg call may sleep */ +static void +destroy_lseg(struct kref *kref) +{ + struct pnfs_layout_segment *lseg = + container_of(kref, struct pnfs_layout_segment, kref); + struct inode *ino = lseg->layout->inode; + + dprintk("--> %s\n", __func__); + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ + put_layout_hdr(ino); +} + +static void +put_lseg(struct pnfs_layout_segment *lseg) +{ + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->kref.refcount)); + kref_put(&lseg->kref, destroy_lseg); +} + +static void +pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) +{ + struct pnfs_layout_segment *lseg, *next; + struct nfs_client *clp; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + assert_spin_locked(&lo->inode->i_lock); + list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { + dprintk("%s: freeing lseg %p\n", __func__, lseg); + list_move(&lseg->fi_list, tmp_list); + } + clp = NFS_SERVER(lo->inode)->nfs_client; + spin_lock(&clp->cl_lock); + /* List does not take a reference, so no need for put here */ + list_del_init(&lo->layouts); + spin_unlock(&clp->cl_lock); + write_seqlock(&lo->seqlock); + clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); + write_sequnlock(&lo->seqlock); + + dprintk("%s:Return\n", __func__); +} + +static void +pnfs_free_lseg_list(struct list_head *tmp_list) +{ + struct pnfs_layout_segment *lseg; + + while (!list_empty(tmp_list)) { + lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, + fi_list); + dprintk("%s calling put_lseg on %p\n", __func__, lseg); + list_del(&lseg->fi_list); + put_lseg(lseg); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + pnfs_clear_lseg_list(lo, &tmp_list); + /* Matched by refcount set to 1 in alloc_init_layout_hdr */ + put_layout_hdr_locked(lo); + } + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&clp->cl_lock); + list_splice_init(&clp->cl_layouts, &tmp_list); + spin_unlock(&clp->cl_lock); + + while (!list_empty(&tmp_list)) { + lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, + layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->inode->i_ino); + pnfs_destroy_layout(NFS_I(lo->inode)); + } +} + +/* update lo->stateid with new if is more recent + * + * lo->stateid could be the open stateid, in which case we just use what given. + */ +static void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + nfs4_stateid *old = &lo->stateid; + bool overwrite = false; + + write_seqlock(&lo->seqlock); + if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || + memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) + overwrite = true; + else { + u32 oldseq, newseq; + + oldseq = be32_to_cpu(old->stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) + overwrite = true; + } + if (overwrite) + memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); + write_sequnlock(&lo->seqlock); +} + +static void +pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, + struct nfs4_state *state) +{ + int seq; + + dprintk("--> %s\n", __func__); + write_seqlock(&lo->seqlock); + do { + seq = read_seqbegin(&state->seqlock); + memcpy(lo->stateid.data, state->stateid.data, + sizeof(state->stateid.data)); + } while (read_seqretry(&state->seqlock, seq)); + set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); + write_sequnlock(&lo->seqlock); + dprintk("<-- %s\n", __func__); +} + +void +pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +{ + int seq; + + dprintk("--> %s\n", __func__); + do { + seq = read_seqbegin(&lo->seqlock); + if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { + /* This will trigger retry of the read */ + pnfs_layout_from_open_stateid(lo, open_state); + } else + memcpy(dst->data, lo->stateid.data, + sizeof(lo->stateid.data)); + } while (read_seqretry(&lo->seqlock, seq)); + dprintk("<-- %s\n", __func__); +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. +* arg->offset: 0 +* arg->length: all ones +*/ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + u32 iomode) +{ + struct inode *ino = lo->inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg = NULL; + + dprintk("--> %s\n", __func__); + + BUG_ON(ctx == NULL); + lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); + if (lgp == NULL) { + put_layout_hdr(lo->inode); + return NULL; + } + lgp->args.minlength = NFS4_MAX_UINT64; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range.iomode = iomode; + lgp->args.range.offset = 0; + lgp->args.range.length = NFS4_MAX_UINT64; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->lsegpp = &lseg; + + /* Synchronously retrieve layout information from server and + * store in lseg. + */ + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(iomode), &lo->state); + } + return lseg; +} + +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. + */ +static s64 +cmp_layout(u32 iomode1, u32 iomode2) +{ + /* read > read/write */ + return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); +} + +static void +pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + int found = 0; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->inode->i_lock); + if (list_empty(&lo->segs)) { + struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; + + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->layouts)); + list_add_tail(&lo->layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + list_for_each_entry(lp, &lo->segs, fi_list) { + if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0) + continue; + list_add_tail(&lseg->fi_list, &lp->fi_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length, + lp, lp->range.iomode, lp->range.offset, + lp->range.length); + found = 1; + break; + } + if (!found) { + list_add_tail(&lseg->fi_list, &lo->segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length); + } + get_layout_hdr_locked(lo); + + dprintk("%s:Return\n", __func__); +} + +static struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); + if (!lo) + return NULL; + lo->refcount = 1; + INIT_LIST_HEAD(&lo->layouts); + INIT_LIST_HEAD(&lo->segs); + seqlock_init(&lo->seqlock); + lo->inode = ino; + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + assert_spin_locked(&ino->i_lock); + if (nfsi->layout) + return nfsi->layout; + + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) /* Won the race? */ + nfsi->layout = new; + else + kfree(new); + return nfsi->layout; +} + +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static int +is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) +{ + return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->inode->i_lock); + list_for_each_entry(lseg, &lo->segs, fi_list) { + if (is_matching_lseg(lseg, iomode)) { + ret = lseg; + break; + } + if (cmp_layout(iomode, lseg->range.iomode) > 0) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); + return ret; +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. + */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + enum pnfs_iomode iomode) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + return NULL; + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino); + if (lo == NULL) { + dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + goto out_unlock; + } + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_has_layout(lo, iomode); + if (lseg) { + dprintk("%s: Using cached lseg %p for iomode %d)\n", + __func__, lseg, iomode); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) + goto out_unlock; + + get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ + spin_unlock(&ino->i_lock); + + lseg = send_layoutget(lo, ctx, iomode); +out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout->state, lseg); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out; +} + +int +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->inode; + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + init_lseg(lo, lseg); + lseg->range = res->range; + *lgp->lsegpp = lseg; + pnfs_insert_layout(lo, lseg); + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid); + spin_unlock(&ino->i_lock); +out: + return status; +} + +/* + * Device ID cache. Currently supports one layout type per struct nfs_client. + * Add layout type to the lookup key to expand to support multiple types. + */ +int +pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, + void (*free_callback)(struct pnfs_deviceid_node *)) +{ + struct pnfs_deviceid_cache *c; + + c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); + if (!c) + return -ENOMEM; + spin_lock(&clp->cl_lock); + if (clp->cl_devid_cache != NULL) { + atomic_inc(&clp->cl_devid_cache->dc_ref); + dprintk("%s [kref [%d]]\n", __func__, + atomic_read(&clp->cl_devid_cache->dc_ref)); + kfree(c); + } else { + /* kzalloc initializes hlists */ + spin_lock_init(&c->dc_lock); + atomic_set(&c->dc_ref, 1); + c->dc_free_callback = free_callback; + clp->cl_devid_cache = c; + dprintk("%s [new]\n", __func__); + } + spin_unlock(&clp->cl_lock); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); + +/* + * Called from pnfs_layoutdriver_type->free_lseg + * last layout segment reference frees deviceid + */ +void +pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid) +{ + struct nfs4_deviceid *id = &devid->de_id; + struct pnfs_deviceid_node *d; + struct hlist_node *n; + long h = nfs4_deviceid_hash(id); + + dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); + if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) + return; + + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) + if (!memcmp(&d->de_id, id, sizeof(*id))) { + hlist_del_rcu(&d->de_node); + spin_unlock(&c->dc_lock); + synchronize_rcu(); + c->dc_free_callback(devid); + return; + } + spin_unlock(&c->dc_lock); + /* Why wasn't it found in the list? */ + BUG(); +} +EXPORT_SYMBOL_GPL(pnfs_put_deviceid); + +/* Find and reference a deviceid */ +struct pnfs_deviceid_node * +pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) +{ + struct pnfs_deviceid_node *d; + struct hlist_node *n; + long hash = nfs4_deviceid_hash(id); + + dprintk("--> %s hash %ld\n", __func__, hash); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { + if (!memcmp(&d->de_id, id, sizeof(*id))) { + if (!atomic_inc_not_zero(&d->de_ref)) { + goto fail; + } else { + rcu_read_unlock(); + return d; + } + } + } +fail: + rcu_read_unlock(); + return NULL; +} +EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); + +/* + * Add a deviceid to the cache. + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new + */ +struct pnfs_deviceid_node * +pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) +{ + struct pnfs_deviceid_node *d; + long hash = nfs4_deviceid_hash(&new->de_id); + + dprintk("--> %s hash %ld\n", __func__, hash); + spin_lock(&c->dc_lock); + d = pnfs_find_get_deviceid(c, &new->de_id); + if (d) { + spin_unlock(&c->dc_lock); + dprintk("%s [discard]\n", __func__); + c->dc_free_callback(new); + return d; + } + INIT_HLIST_NODE(&new->de_node); + atomic_set(&new->de_ref, 1); + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); + spin_unlock(&c->dc_lock); + dprintk("%s [new]\n", __func__); + return new; +} +EXPORT_SYMBOL_GPL(pnfs_add_deviceid); + +void +pnfs_put_deviceid_cache(struct nfs_client *clp) +{ + struct pnfs_deviceid_cache *local = clp->cl_devid_cache; + + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); + if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { + int i; + /* Verify cache is empty */ + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&local->dc_deviceids[i])); + clp->cl_devid_cache = NULL; + spin_unlock(&clp->cl_lock); + kfree(local); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h new file mode 100644 index 000000000000..e12367d50489 --- /dev/null +++ b/fs/nfs/pnfs.h @@ -0,0 +1,189 @@ +/* + * pNFS client data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +struct pnfs_layout_segment { + struct list_head fi_list; + struct pnfs_layout_range range; + struct kref kref; + struct pnfs_layout_hdr *layout; +}; + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ +}; + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + int (*set_layoutdriver) (struct nfs_server *); + int (*clear_layoutdriver) (struct nfs_server *); + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + void (*free_lseg) (struct pnfs_layout_segment *lseg); +}; + +struct pnfs_layout_hdr { + unsigned long refcount; + struct list_head layouts; /* other client layouts */ + struct list_head segs; /* layout segments list */ + seqlock_t seqlock; /* Protects the stateid */ + nfs4_stateid stateid; + unsigned long state; + struct inode *inode; +}; + +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + struct page **pages; + void *area; + unsigned int pgbase; + unsigned int pglen; +}; + +/* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_deviceid_hash(struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +struct pnfs_deviceid_node { + struct hlist_node de_node; + struct nfs4_deviceid de_id; + atomic_t de_ref; +}; + +struct pnfs_deviceid_cache { + spinlock_t dc_lock; + atomic_t dc_ref; + void (*dc_free_callback)(struct pnfs_deviceid_node *); + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; +}; + +extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, + void (*free_callback)(struct pnfs_deviceid_node *)); +extern void pnfs_put_deviceid_cache(struct nfs_client *); +extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( + struct pnfs_deviceid_cache *, + struct nfs4_deviceid *); +extern struct pnfs_deviceid_node *pnfs_add_deviceid( + struct pnfs_deviceid_cache *, + struct pnfs_deviceid_node *); +extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid); + +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +/* nfs4proc.c */ +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); + +/* pnfs.c */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + enum pnfs_iomode access_type); +void set_pnfs_layoutdriver(struct nfs_server *, u32 id); +void unset_pnfs_layoutdriver(struct nfs_server *); +int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +void put_layout_hdr(struct inode *inode); +void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); + + +static inline int lo_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + enum pnfs_iomode access_type) +{ + return NULL; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 79859c81a943..e4b62c6f5a6e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -25,6 +25,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -120,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); + pnfs_update_layout(inode, ctx, IOMODE_READ); new = nfs_create_request(ctx, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); @@ -624,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ + pnfs_update_layout(inode, desc.ctx, IOMODE_READ); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else |