summaryrefslogtreecommitdiff
path: root/fs/nfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/blocklayout/blocklayout.h19
-rw-r--r--fs/nfs/blocklayout/dev.c9
-rw-r--r--fs/nfs/blocklayout/extent_tree.c19
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_proc.c9
-rw-r--r--fs/nfs/client.c113
-rw-r--r--fs/nfs/delegation.c29
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c20
-rw-r--r--fs/nfs/file.c21
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c424
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c82
-rw-r--r--fs/nfs/inode.c61
-rw-r--r--fs/nfs/internal.h20
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs42.h2
-rw-r--r--fs/nfs/nfs42xdr.c5
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c32
-rw-r--r--fs/nfs/nfs4idmap.c14
-rw-r--r--fs/nfs/nfs4proc.c136
-rw-r--r--fs/nfs/nfs4state.c12
-rw-r--r--fs/nfs/nfs4trace.h61
-rw-r--r--fs/nfs/nfs4xdr.c75
-rw-r--r--fs/nfs/pagelist.c4
-rw-r--r--fs/nfs/pnfs.c227
-rw-r--r--fs/nfs/pnfs.h48
-rw-r--r--fs/nfs/pnfs_nfs.c88
-rw-r--r--fs/nfs/super.c7
-rw-r--r--fs/nfs/write.c36
32 files changed, 966 insertions, 629 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e90d8d..c556640dcf3b 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
struct pnfs_block_dev;
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
-
#define PNFS_BLOCK_MAX_UUIDS 4
#define PNFS_BLOCK_MAX_DEVICES 64
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
struct pnfs_block_dev_map *map);
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
-};
-
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
- enum exstate4 be_state; /* the state of this extent */
+ enum pnfs_block_extent_state be_state; /* the state of this extent */
#define EXTENT_WRITTEN 1
#define EXTENT_COMMITTING 2
unsigned int be_tag;
};
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct rb_root bl_ext_rw;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e535599a0719..a861bbdfe577 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
kfree(dev->children);
} else {
if (dev->bdev)
- blkdev_put(dev->bdev, FMODE_READ);
+ blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
}
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
return -EIO;
p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+ if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+ pr_info("signature too long: %d\n",
+ b->simple.sigs[i].sig_len);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
if (!p)
@@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
if (IS_ERR(d->bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 31d0b5e53dfd..c59a59c37f3d 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,6 +462,12 @@ out:
return err;
}
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+ return sizeof(__be32) /* number of entries */ +
+ PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
size_t buffer_size)
{
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
(*count)++;
- if (*count * BL_EXTENT_SIZE > buffer_size) {
+ if (ext_tree_layoutupdate_size(*count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
@@ -530,7 +536,7 @@ retry:
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ buffer_size = ext_tree_layoutupdate_size(count);
count = 0;
arg->layoutupdate_pages =
@@ -549,17 +555,14 @@ retry:
}
*start_p = cpu_to_be32(count);
- arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
- __be32 *p = start_p;
+ void *p = start_p, *end = p + arg->layoutupdate_len;
int i = 0;
- for (p = start_p;
- p < start_p + arg->layoutupdate_len;
- p += PAGE_SIZE) {
+ for ( ; p < end; p += PAGE_SIZE)
arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
- }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 2c4a0b565d28..75f7c0a7538a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv)
spin_lock_init(&serv->sv_cb_lock);
init_waitqueue_head(&serv->sv_cb_waitq);
rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
- if (IS_ERR(rqstp)) {
- svc_xprt_put(serv->sv_bc_xprt);
- serv->sv_bc_xprt = NULL;
- }
dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
return rqstp;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 29e3c1b011b7..b85cf7a30232 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+ -ntohl(res->status));
goto out;
+ }
nfsi = NFS_I(inode);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
res->status = 0;
out_iput:
rcu_read_unlock();
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
@@ -194,6 +198,7 @@ unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
+ trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
iput(ino);
out:
return rv;
@@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
status = htonl(NFS4_OK);
nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
- nfs41_server_notify_target_slotid_update(cps->clp);
+ nfs41_notify_server(cps->clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a90c9bb3135..57c5a02f6213 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -20,6 +20,7 @@
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs_put_client);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
- return 0;
- else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
- return 1;
-}
-#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
- (sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
- (sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_match_ipaddr4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_match_ipaddr6(sa1, sa2);
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_cmp_ip4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_cmp_ip6(sa1, sa2);
- }
- return 0;
-}
-
/*
* Find an nfs_client on the list that matches the initialisation data
* that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
if (clp->cl_minorversion != data->minorversion)
continue;
/* Match the full socket address */
- if (!nfs_sockaddr_cmp(sap, clap))
+ if (!rpc_cmp_addr_port(sap, clap))
continue;
atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 029d688a969f..2714ef835bdd 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
if (delegation->inode != NULL) {
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
- delegation->maxsize = res->maxsize;
+ delegation->pagemod_limit = res->pagemod_limit;
oldcred = delegation->cred;
delegation->cred = get_rpccred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
- delegation->maxsize = res->maxsize;
+ delegation->pagemod_limit = res->pagemod_limit;
delegation->change_attr = inode->i_version;
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
rcu_read_unlock();
return ret;
}
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ bool ret = true;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+ goto out;
+ if (nfsi->nrequests < delegation->pagemod_limit)
+ ret = false;
+out:
+ rcu_read_unlock();
+ return ret;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e3c20a3ccc93..a44829173e57 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -18,7 +18,7 @@ struct nfs_delegation {
struct inode *inode;
nfs4_stateid stateid;
fmode_t type;
- loff_t maxsize;
+ unsigned long pagemod_limit;
__u64 change_attr;
unsigned long flags;
spinlock_t lock;
@@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 547308a5ec6f..3d8e4ffa0a33 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -583,26 +583,19 @@ out_nopages:
}
static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++)
put_page(pages[i]);
}
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
- unsigned int npages)
-{
- nfs_readdir_free_pagearray(pages, npages);
-}
-
/*
* nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pagearray
*/
static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
@@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
return 0;
out_freepages:
- nfs_readdir_free_pagearray(pages, i);
+ nfs_readdir_free_pages(pages, i);
return -ENOMEM;
}
@@ -623,7 +616,6 @@ static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
struct page *pages[NFS_MAX_READDIR_PAGES];
- void *pages_ptr = NULL;
struct nfs_entry entry;
struct file *file = desc->file;
struct nfs_cache_array *array;
@@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
- status = nfs_readdir_large_page(pages, array_size);
+ status = nfs_readdir_alloc_pages(pages, array_size);
if (status < 0)
goto out_release_array;
do {
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
}
} while (array->eof_index < 0);
- nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+ nfs_readdir_free_pages(pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
out_label_free:
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1ed61fc..c0f9b1ed12b9 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
dprintk("NFS: release(%pD2)\n", filp);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return nfs_release(inode, filp);
+ nfs_file_clear_open_context(filp);
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_file_release);
@@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
/*
* Flush all dirty pages, and check for write errors.
*/
-int
+static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
@@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
- /*
- * If we're holding a write delegation, then just start the i/o
- * but don't wait for completion (or send a commit).
- */
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
- return filemap_fdatawrite(file->f_mapping);
-
/* Flush writes to the server and return any errors */
return vfs_fsync(file, 0);
}
-EXPORT_SYMBOL_GPL(nfs_file_flush);
ssize_t
nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
@@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode)
{
struct nfs_open_context *ctx;
- if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
- return 1;
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
nfs_ctx_key_to_expire(ctx))
@@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (result > 0)
written = result;
- /* Return error values for O_DSYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(file, inode)) {
+ /* Return error values */
+ if (result >= 0 && nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b3289d701eea..fbc5a56de875 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
ffl = kzalloc(sizeof(*ffl), gfp_flags);
if (ffl) {
INIT_LIST_HEAD(&ffl->error_list);
+ INIT_LIST_HEAD(&ffl->mirrors);
return &ffl->generic_hdr;
} else
return NULL;
@@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
return 0;
}
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ int i, j;
+
+ if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ return false;
+ for (i = 0; i < m1->fh_versions_cnt; i++) {
+ bool found_fh = false;
+ for (j = 0; j < m2->fh_versions_cnt; i++) {
+ if (nfs_compare_fh(&m1->fh_versions[i],
+ &m2->fh_versions[j]) == 0) {
+ found_fh = true;
+ break;
+ }
+ }
+ if (!found_fh)
+ return false;
+ }
+ return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *pos;
+ struct inode *inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+ if (mirror->mirror_ds != pos->mirror_ds)
+ continue;
+ if (!ff_mirror_match_fh(mirror, pos))
+ continue;
+ if (atomic_inc_not_zero(&pos->ref)) {
+ spin_unlock(&inode->i_lock);
+ return pos;
+ }
+ }
+ list_add(&mirror->mirrors, &ff_layout->mirrors);
+ mirror->layout = lo;
+ spin_unlock(&inode->i_lock);
+ return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ struct inode *inode;
+ if (mirror->layout == NULL)
+ return;
+ inode = mirror->layout->plh_inode;
+ spin_lock(&inode->i_lock);
+ list_del(&mirror->mirrors);
+ spin_unlock(&inode->i_lock);
+ mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+
+ mirror = kzalloc(sizeof(*mirror), gfp_flags);
+ if (mirror != NULL) {
+ spin_lock_init(&mirror->lock);
+ atomic_set(&mirror->ref, 1);
+ INIT_LIST_HEAD(&mirror->mirrors);
+ }
+ return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ ff_layout_remove_mirror(mirror);
+ kfree(mirror->fh_versions);
+ if (mirror->cred)
+ put_rpccred(mirror->cred);
+ nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+ kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+ ff_layout_free_mirror(mirror);
+}
+
static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
int i;
@@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
/* normally mirror_ds is freed in
* .free_deviceid_node but we still do it here
* for .alloc_lseg error path */
- if (fls->mirror_array[i]) {
- kfree(fls->mirror_array[i]->fh_versions);
- nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
- kfree(fls->mirror_array[i]);
- }
+ ff_layout_put_mirror(fls->mirror_array[i]);
}
kfree(fls->mirror_array);
fls->mirror_array = NULL;
@@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
}
}
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1, end2;
+
+ if (l1->iomode != l2->iomode)
+ return l1->iomode != IOMODE_READ;
+ end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+ end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+ if (end1 < l2->offset)
+ return false;
+ if (end2 < l1->offset)
+ return true;
+ return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+ struct pnfs_layout_segment *old)
+{
+ u64 new_end, old_end;
+
+ if (new->pls_range.iomode != old->pls_range.iomode)
+ return false;
+ old_end = pnfs_calc_offset_end(old->pls_range.offset,
+ old->pls_range.length);
+ if (old_end < new->pls_range.offset)
+ return false;
+ new_end = pnfs_calc_offset_end(new->pls_range.offset,
+ new->pls_range.length);
+ if (new_end < old->pls_range.offset)
+ return false;
+
+ /* Mergeable: copy info from 'old' to 'new' */
+ if (new_end < old_end)
+ new_end = old_end;
+ if (new->pls_range.offset < old->pls_range.offset)
+ new->pls_range.offset = old->pls_range.offset;
+ new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+ new_end);
+ if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+ set_bit(NFS_LSEG_ROC, &new->pls_flags);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+ return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ ff_lseg_range_is_after,
+ ff_lseg_merge,
+ free_me);
+}
+
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
int i, j;
@@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
struct nfs4_deviceid_node *idnode;
u32 ds_count;
@@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (ds_count != 1)
goto out_err_free;
- fls->mirror_array[i] =
- kzalloc(sizeof(struct nfs4_ff_layout_mirror),
- gfp_flags);
+ fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
if (fls->mirror_array[i] == NULL) {
rc = -ENOMEM;
goto out_err_free;
}
- spin_lock_init(&fls->mirror_array[i]->lock);
fls->mirror_array[i]->ds_count = ds_count;
- fls->mirror_array[i]->lseg = &fls->generic_hdr;
/* deviceid */
rc = decode_deviceid(&stream, &devid);
@@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (rc)
goto out_err_free;
+ mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+ if (mirror != fls->mirror_array[i]) {
+ ff_layout_free_mirror(fls->mirror_array[i]);
+ fls->mirror_array[i] = mirror;
+ }
+
dprintk("%s: uid %d gid %d\n", __func__,
fls->mirror_array[i]->uid,
fls->mirror_array[i]->gid);
@@ -379,21 +527,9 @@ static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
- int i;
dprintk("--> %s\n", __func__);
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- if (fls->mirror_array[i]) {
- nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
- fls->mirror_array[i]->mirror_ds = NULL;
- if (fls->mirror_array[i]->cred) {
- put_rpccred(fls->mirror_array[i]->cred);
- fls->mirror_array[i]->cred = NULL;
- }
- }
- }
-
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_flexfile_layout *ffl;
struct inode *inode;
@@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
}
static void
-nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
/* first IO request? */
if (atomic_inc_return(&timer->n_ops) == 1) {
- timer->start_time = ktime_get();
+ timer->start_time = now;
}
}
static ktime_t
-nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
- ktime_t start, now;
+ ktime_t start;
if (atomic_dec_return(&timer->n_ops) < 0)
WARN_ON_ONCE(1);
- now = ktime_get();
start = timer->start_time;
timer->start_time = now;
return ktime_sub(now, start);
}
-static ktime_t
-nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
-{
- return ktime_sub(ktime_get(), task->tk_start);
-}
-
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
- struct nfs4_ff_layoutstat *layoutstat)
+ struct nfs4_ff_layoutstat *layoutstat,
+ ktime_t now)
{
static const ktime_t notime = {0};
- ktime_t now = ktime_get();
+ s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
- nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+ nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
if (ktime_equal(mirror->start_time, notime))
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
+ if (layoutstats_timer != 0)
+ report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
- FF_LAYOUTSTATS_REPORT_INTERVAL) {
+ report_interval) {
mirror->last_report_time = now;
return true;
}
@@ -482,35 +614,39 @@ static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
__u64 requested,
__u64 completed,
- ktime_t time_completed)
+ ktime_t time_completed,
+ ktime_t time_started)
{
struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+ ktime_t completion_time = ktime_sub(time_completed, time_started);
ktime_t timer;
iostat->ops_completed++;
iostat->bytes_completed += completed;
iostat->bytes_not_delivered += requested - completed;
- timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+ timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
iostat->total_busy_time =
ktime_add(iostat->total_busy_time, timer);
iostat->aggregate_completion_time =
- ktime_add(iostat->aggregate_completion_time, time_completed);
+ ktime_add(iostat->aggregate_completion_time,
+ completion_time);
}
static void
-nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
- __u64 requested)
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+ report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+ pnfs_report_layoutstat(inode, GFP_KERNEL);
}
static void
@@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
requested, completed,
- nfs4_ff_layout_calc_completion_time(task));
+ ktime_get(), task->tk_start);
spin_unlock(&mirror->lock);
}
static void
-nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
- __u64 requested)
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+ report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+ pnfs_report_layoutstat(inode, GFP_NOIO);
}
static void
@@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
- requested, completed,
- nfs4_ff_layout_calc_completion_time(task));
+ requested, completed, ktime_get(), task->tk_start);
spin_unlock(&mirror->lock);
}
@@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
return 1;
}
@@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
if (task->tk_status >= 0)
return 0;
- if (task->tk_status != -EJUKEBOX) {
+ switch (task->tk_status) {
+ /* File access problems. Don't mark the device as unavailable */
+ case -EACCES:
+ case -ESTALE:
+ case -EISDIR:
+ case -EBADHANDLE:
+ case -ELOOP:
+ case -ENOSPC:
+ break;
+ case -EJUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
- if (ff_layout_has_available_ds(lseg))
- return -NFS4ERR_RESET_TO_PNFS;
- else
- return -NFS4ERR_RESET_TO_MDS;
}
-
- if (task->tk_status == -EJUKEBOX)
- nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ /* FIXME: Need to prevent infinite looping here. */
+ return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
task->tk_status = 0;
rpc_restart_call(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
int idx, u64 offset, u64 length,
- u32 status, int opnum)
+ u32 status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
int err;
+ if (status == 0) {
+ switch (error) {
+ case -ETIMEDOUT:
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ case -EOPNOTSUPP:
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ case -EPIPE:
+ case -EPERM:
+ status = NFS4ERR_NXIO;
+ break;
+ case -EACCES:
+ status = NFS4ERR_ACCESS;
+ break;
+ default:
+ return;
+ }
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
GFP_NOIO);
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_read(hdr, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
- hdr->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && hdr->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
hdr->args.offset, hdr->args.count,
- hdr->res.op_status, OP_READ);
+ hdr->res.op_status, OP_READ,
+ task->tk_status);
err = ff_layout_async_handle_error(task, hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- inode = hdr->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, hdr->lseg);
ff_layout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
@@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
static int ff_layout_read_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- nfs4_ff_layout_stat_io_start_read(
+ nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count);
+ hdr->args.count,
+ task->tk_start);
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_write(hdr, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
- hdr->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && hdr->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
hdr->args.offset, hdr->args.count,
- hdr->res.op_status, OP_WRITE);
+ hdr->res.op_status, OP_WRITE,
+ task->tk_status);
err = ff_layout_async_handle_error(task, hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, true);
+ return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- inode = hdr->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, hdr->lseg);
- if (err == -NFS4ERR_RESET_TO_PNFS) {
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
- ff_layout_reset_write(hdr, true);
- } else {
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
- ff_layout_reset_write(hdr, false);
- }
+ pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
rpc_restart_call_prepare(task);
@@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
hdr->res.verf->committed == NFS_DATA_SYNC)
ff_layout_set_layoutcommit(hdr);
+ /* zero out fattr since we don't care DS attr at all */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
return 0;
}
static int ff_layout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_commit_ds(data, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
- data->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && data->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
data->args.offset, data->args.count,
- data->res.op_status, OP_COMMIT);
+ data->res.op_status, OP_COMMIT,
+ task->tk_status);
err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
data->lseg, data->ds_commit_index);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ pnfs_set_retry_layoutget(data->lseg->pls_layout);
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- inode = data->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, data->lseg);
- if (err == -NFS4ERR_RESET_TO_PNFS)
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+ pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
static int ff_layout_write_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- nfs4_ff_layout_stat_io_start_write(
+ nfs4_ff_layout_stat_io_start_write(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count);
+ hdr->args.count,
+ task->tk_start);
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
static void ff_layout_commit_prepare_common(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
- nfs4_ff_layout_stat_io_start_write(
+ nfs4_ff_layout_stat_io_start_write(cdata->inode,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- 0);
+ 0, task->tk_start);
}
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
@@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
*start = cpu_to_be32((xdr->p - start - 1) * 4);
}
-static bool
+static int
ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
- struct pnfs_layout_segment *pls,
- int *dev_count, int dev_limit)
+ struct pnfs_layout_hdr *lo,
+ int dev_limit)
{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *dev;
struct nfs42_layoutstat_devinfo *devinfo;
- int i;
+ int i = 0;
- for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
- if (*dev_count >= dev_limit)
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (i >= dev_limit)
break;
- mirror = FF_LAYOUT_COMP(pls, i);
- if (!mirror || !mirror->mirror_ds)
+ if (!mirror->mirror_ds)
+ continue;
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!atomic_inc_not_zero(&mirror->ref))
continue;
- dev = FF_LAYOUT_DEVID_NODE(pls, i);
- devinfo = &args->devinfo[*dev_count];
+ dev = &mirror->mirror_ds->id_node;
+ devinfo = &args->devinfo[i];
memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
- devinfo->offset = pls->pls_range.offset;
- devinfo->length = pls->pls_range.length;
- /* well, we don't really know if IO is continuous or not! */
- devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
- devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+ devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
devinfo->layout_type = LAYOUT_FLEX_FILES;
devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
devinfo->layout_private = mirror;
- /* lseg refcount put in cleanup_layoutstats */
- pnfs_get_lseg(pls);
- ++(*dev_count);
+ i++;
}
-
- return *dev_count < dev_limit;
+ return i;
}
static int
ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
- struct pnfs_layout_segment *pls;
+ struct nfs4_flexfile_layout *ff_layout;
+ struct nfs4_ff_layout_mirror *mirror;
int dev_count = 0;
spin_lock(&args->inode->i_lock);
- list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
- dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+ ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (atomic_read(&mirror->ref) != 0)
+ dev_count ++;
}
spin_unlock(&args->inode->i_lock);
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
dev_count = PNFS_LAYOUTSTATS_MAXDEV;
}
- args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+ args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
if (!args->devinfo)
return -ENOMEM;
- dev_count = 0;
spin_lock(&args->inode->i_lock);
- list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
- if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
- PNFS_LAYOUTSTATS_MAXDEV)) {
- break;
- }
- }
+ args->num_dev = ff_layout_mirror_prepare_stats(args,
+ &ff_layout->generic_hdr, dev_count);
spin_unlock(&args->inode->i_lock);
- args->num_dev = dev_count;
return 0;
}
@@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
for (i = 0; i < data->args.num_dev; i++) {
mirror = data->args.devinfo[i].layout_private;
data->args.devinfo[i].layout_private = NULL;
- pnfs_put_lseg(mirror->lseg);
+ ff_layout_put_mirror(mirror);
}
}
@@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.free_layout_hdr = ff_layout_free_layout_hdr,
.alloc_lseg = ff_layout_alloc_lseg,
.free_lseg = ff_layout_free_lseg,
+ .add_lseg = ff_layout_add_lseg,
.pg_read_ops = &ff_layout_pg_read_ops,
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index f92f9a0a856b..68cc0d9828f9 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat {
};
struct nfs4_ff_layout_mirror {
- struct pnfs_layout_segment *lseg; /* back pointer */
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
u32 ds_count;
u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
@@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror {
u32 uid;
u32 gid;
struct rpc_cred *cred;
+ atomic_t ref;
spinlock_t lock;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
@@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment {
struct nfs4_flexfile_layout {
struct pnfs_layout_hdr generic_hdr;
struct pnfs_ds_commit_info commit_info;
+ struct list_head mirrors;
struct list_head error_list; /* nfs4_ff_layout_ds_err */
};
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f13e1969eedd..e125e55de86d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -172,6 +172,32 @@ out_err:
return NULL;
}
+static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
+ struct nfs4_deviceid_node *devid)
+{
+ nfs4_mark_deviceid_unavailable(devid);
+ if (!ff_layout_has_available_ds(lseg))
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+}
+
+static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL || mirror->mirror_ds == NULL) {
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+ return false;
+ }
+ if (mirror->mirror_ds->ds == NULL) {
+ struct nfs4_deviceid_node *devid;
+ devid = &mirror->mirror_ds->id_node;
+ ff_layout_mark_devid_invalid(lseg, devid);
+ return false;
+ }
+ return true;
+}
+
static u64
end_offset(u64 start, u64 len)
{
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
struct nfs_fh *fh = NULL;
- struct nfs4_deviceid_node *devid;
- if (mirror == NULL || mirror->mirror_ds == NULL ||
- mirror->mirror_ds->ds == NULL) {
- printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+ if (!ff_layout_mirror_valid(lseg, mirror)) {
+ pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
__func__, mirror_idx);
- if (mirror && mirror->mirror_ds) {
- devid = &mirror->mirror_ds->id_node;
- pnfs_generic_mark_devid_invalid(devid);
- }
goto out;
}
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
unsigned int max_payload;
rpc_authflavor_t flavor;
- if (mirror == NULL || mirror->mirror_ds == NULL ||
- mirror->mirror_ds->ds == NULL) {
- printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+ if (!ff_layout_mirror_valid(lseg, mirror)) {
+ pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
- if (mirror && mirror->mirror_ds) {
- devid = &mirror->mirror_ds->id_node;
- pnfs_generic_mark_devid_invalid(devid);
- }
goto out;
}
@@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
range->offset, range->length))
continue;
/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
- * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+ * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+ * + status(4) + opnum(4)
*/
p = xdr_reserve_space(xdr,
- 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+ 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
if (unlikely(!p))
return -ENOBUFS;
p = xdr_encode_hyper(p, err->offset);
p = xdr_encode_hyper(p, err->length);
p = xdr_encode_opaque_fixed(p, &err->stateid,
NFS4_STATEID_SIZE);
+ /* Encode 1 error */
+ *p++ = cpu_to_be32(1);
p = xdr_encode_opaque_fixed(p, &err->deviceid,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(err->status);
@@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
return 0;
}
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- int idx;
+ u32 idx;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return false;
}
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *devid;
+ u32 idx;
+
+ for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ if (!mirror || !mirror->mirror_ds)
+ return false;
+ devid = &mirror->mirror_ds->id_node;
+ if (ff_layout_test_devid_unavailable(devid))
+ return false;
+ }
+
+ return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_READ)
+ return ff_read_layout_has_available_ds(lseg);
+ /* Note: RW layout needs all mirrors available */
+ return ff_rw_layout_has_available_ds(lseg);
+}
+
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
"retries a request before it attempts further "
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d245b3d..326d9e10d833 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
- int error = -ENOMEM;
+ int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
- loff_t i_size;
-
BUG_ON(!S_ISREG(inode->i_mode));
- i_size = i_size_read(inode);
- if (attr->ia_size == i_size)
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (error)
+ return error;
+
+ if (attr->ia_size == i_size_read(inode))
attr->ia_valid &= ~ATTR_SIZE;
- else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
- return -ETXTBSY;
}
/* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
nfs_sync_inode(inode);
fattr = nfs_alloc_fattr();
- if (fattr == NULL)
+ if (fattr == NULL) {
+ error = -ENOMEM;
goto out;
+ }
+
/*
* Return any delegations if we're going to change ACLs
*/
@@ -759,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
* @ctx: pointer to context
* @is_sync: is this a synchronous close
*
- * always ensure that the attributes are up to date if we're mounted
- * with close-to-open semantics
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
*/
void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
{
+ struct nfs_inode *nfsi;
struct inode *inode;
struct nfs_server *server;
@@ -772,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
if (!is_sync)
return;
inode = d_inode(ctx->dentry);
- if (!list_empty(&NFS_I(inode)->open_files))
+ nfsi = NFS_I(inode);
+ if (inode->i_mapping->nrpages == 0)
+ return;
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return;
+ if (!list_empty(&nfsi->open_files))
return;
server = NFS_SERVER(inode);
if (server->flags & NFS_MOUNT_NOCTO)
@@ -844,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
}
EXPORT_SYMBOL_GPL(put_nfs_open_context);
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 1);
+}
+
/*
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
@@ -888,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
return ctx;
}
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
{
struct nfs_open_context *ctx = nfs_file_open_context(filp);
@@ -899,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp)
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
spin_unlock(&inode->i_lock);
- __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+ put_nfs_open_context_sync(ctx);
}
}
@@ -919,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp)
return 0;
}
-int nfs_release(struct inode *inode, struct file *filp)
-{
- nfs_file_clear_open_context(filp);
- return 0;
-}
-
/*
* This function is called whenever some part of NFS notices that
* the cached attributes have to be refreshed.
@@ -1273,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return 0;
}
-static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
- if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
- return 0;
- return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
-}
-
static atomic_long_t nfs_attr_generation_counter;
static unsigned long nfs_read_attr_generation_counter(void)
@@ -1428,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
const struct nfs_inode *nfsi = NFS_I(inode);
return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
- nfs_ctime_need_update(inode, fattr) ||
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
@@ -1491,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
{
unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ /*
+ * Don't revalidate the pagecache if we hold a delegation, but do
+ * force an attribute update
+ */
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
+
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
nfs_set_cache_invalid(inode, invalid);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9b372b845f6a..56cfde26fb9c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
}
#endif
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
@@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
/* file.c */
int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
loff_t nfs_file_llseek(struct file *, loff_t, int);
-int nfs_file_flush(struct file *, fl_owner_t);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
@@ -490,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list,
void nfs_commitdata_release(struct nfs_commit_data *data);
void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+ struct list_head *dst,
+ struct nfs_commit_info *cinfo);
void nfs_request_remove_commit_list(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
* Record the page as unstable and mark its inode as dirty.
*/
static inline
-void nfs_mark_page_unstable(struct page *page)
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
{
- struct inode *inode = page_file_mapping(page)->host;
+ if (!cinfo->dreq) {
+ struct inode *inode = page_file_mapping(page)->host;
- inc_zone_page_state(page, NR_UNSTABLE_NFS);
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ }
}
/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e6fffc..267126d32ec0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
{
encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
encode_symlinkdata3(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
}
/*
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index ff66ae700b89..814c1255f1d2 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index a6bd27da6286..0eb29e14070d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -238,8 +238,7 @@ out_overflow:
return -EIO;
}
-static int decode_layoutstats(struct xdr_stream *xdr,
- struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
{
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
goto out;
WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
for (i = 0; i < res->num_dev; i++) {
- status = decode_layoutstats(xdr, res);
+ status = decode_layoutstats(xdr);
if (status)
goto out;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea3bee919a76..50cfc4ca7a02 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
#else
static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
{
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3aa6a9ba5113..223bedda64ae 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
return false;
/* Match only the IP address, not the port number */
- if (!nfs_sockaddr_match_ipaddr(addr, clap))
- return false;
-
- return true;
+ return rpc_cmp_addr(addr, clap);
}
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dcd39d4e2efe..b0dbe0abed53 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -6,7 +6,9 @@
#include <linux/fs.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
+#include "delegation.h"
#include "internal.h"
+#include "iostat.h"
#include "fscache.h"
#include "pnfs.h"
@@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct inode *dir;
unsigned openflags = filp->f_flags;
struct iattr attr;
- int opened = 0;
int err;
/*
@@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_sync_inode(inode);
}
- inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
switch (err) {
@@ -100,6 +101,31 @@ out_drop:
goto out_put_ctx;
}
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+
+ dprintk("NFS: flush(%pD2)\n", file);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+ if ((file->f_mode & FMODE_WRITE) == 0)
+ return 0;
+
+ /*
+ * If we're holding a write delegation, then check if we're required
+ * to flush the i/o on close. If not, then just start the i/o now.
+ */
+ if (!nfs4_delegation_flush_on_close(inode))
+ return filemap_fdatawrite(file->f_mapping);
+
+ /* Flush writes to the server and return any errors */
+ return vfs_fsync(file, 0);
+}
+
static int
nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
@@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = {
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
- .flush = nfs_file_flush,
+ .flush = nfs4_file_flush,
.release = nfs_file_release,
.fsync = nfs4_file_fsync,
.lock = nfs_lock,
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 535dfc69c628..2e4902203c35 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
.read = user_read,
};
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
{
struct cred *cred;
struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
return ret;
}
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
{
key_revoke(id_resolver_cache->thread_keyring);
unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
kfree(idmap);
}
-int nfs_idmap_init(void)
-{
- return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
- nfs_idmap_quit_keyring();
-}
-
static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
struct idmap_msg *im,
struct rpc_pipe_msg *msg)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3acb1eb72930..693b903b48bd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -586,7 +586,7 @@ out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
if (send_new_highest_used_slotid)
- nfs41_server_notify_highest_slotid_update(session->clp);
+ nfs41_notify_server(session->clp);
}
int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -1150,7 +1150,8 @@ out:
return ret;
}
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+ enum open_claim_type4 claim)
{
if (delegation == NULL)
return 0;
@@ -1158,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
return 0;
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
return 0;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+ break;
+ default:
+ return 0;
+ }
nfs_mark_delegation_referenced(delegation);
return 1;
}
@@ -1220,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
}
static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
nfs4_stateid *stateid, fmode_t fmode)
{
clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1238,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
if (stateid == NULL)
return;
/* Handle races with OPEN */
- if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
- !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
+ (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
nfs_resync_open_stateid_locked(state);
return;
}
@@ -1248,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
nfs4_stateid_copy(&state->open_stateid, stateid);
}
-static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
+ nfs4_stateid *stateid, fmode_t fmode)
{
write_seqlock(&state->seqlock);
- nfs_clear_open_stateid_locked(state, stateid, fmode);
+ nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
write_sequnlock(&state->seqlock);
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -1376,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
+ enum open_claim_type4 claim = opendata->o_arg.claim;
nfs4_stateid stateid;
int ret = -EAGAIN;
@@ -1389,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
- if (!can_open_delegated(delegation, fmode)) {
+ if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
}
@@ -1852,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
struct nfs4_opendata *data = calldata;
struct nfs4_state_owner *sp = data->owner;
struct nfs_client *clp = sp->so_server->nfs_client;
+ enum open_claim_type4 claim = data->o_arg.claim;
if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
goto out_wait;
@@ -1866,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
goto out_no_action;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
- if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
- data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
- can_open_delegated(delegation, data->o_arg.fmode))
+ if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
}
/* Update client id. */
data->o_arg.clientid = clp->cl_clientid;
- switch (data->o_arg.claim) {
+ switch (claim) {
+ default:
+ break;
case NFS4_OPEN_CLAIM_PREVIOUS:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
@@ -2294,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
* fields corresponding to attributes that were used to store the verifier.
* Make sure we clobber those fields in the later setattr call
*/
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+ struct iattr *sattr, struct nfs4_label **label)
{
- if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+ const u32 *attrset = opendata->o_res.attrset;
+
+ if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
!(sattr->ia_valid & ATTR_ATIME_SET))
sattr->ia_valid |= ATTR_ATIME;
- if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+ if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
!(sattr->ia_valid & ATTR_MTIME_SET))
sattr->ia_valid |= ATTR_MTIME;
+
+ /* Except MODE, it seems harmless of setting twice. */
+ if ((attrset[1] & FATTR4_WORD1_MODE))
+ sattr->ia_valid &= ~ATTR_MODE;
+
+ if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+ *label = NULL;
}
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2425,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir,
goto err_free_label;
state = ctx->state;
- if ((opendata->o_arg.open_flags & O_EXCL) &&
+ if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
- nfs4_exclusive_attrset(opendata, sattr);
+ nfs4_exclusive_attrset(opendata, sattr, &label);
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
@@ -2439,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir,
nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
}
}
- if (opendata->file_created)
+ if (opened && opendata->file_created)
*opened |= FILE_CREATED;
if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -2661,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
switch (task->tk_status) {
case 0:
res_stateid = &calldata->res.stateid;
- if (calldata->arg.fmode == 0 && calldata->roc)
+ if (calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
renew_lease(server, calldata->timestamp);
@@ -2684,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
goto out_release;
}
}
- nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, &calldata->arg.stateid,
+ res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2735,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_no_action;
}
- if (calldata->arg.fmode == 0) {
+ if (calldata->arg.fmode == 0)
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
- if (calldata->roc &&
- pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
- nfs_release_seqid(calldata->arg.seqid);
- goto out_wait;
- }
- }
+ if (calldata->roc)
+ pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
calldata->arg.share_access =
nfs4_map_atomic_open_share(NFS_SERVER(inode),
calldata->arg.fmode, 0);
@@ -2883,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
+ u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
struct nfs4_server_caps_arg args = {
.fhandle = fhandle,
+ .bitmask = bitmask,
};
struct nfs4_server_caps_res res = {};
struct rpc_message msg = {
@@ -2894,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
};
int status;
+ bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+ FATTR4_WORD0_FH_EXPIRE_TYPE |
+ FATTR4_WORD0_LINK_SUPPORT |
+ FATTR4_WORD0_SYMLINK_SUPPORT |
+ FATTR4_WORD0_ACLSUPPORT;
+ if (minorversion)
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
/* Sanity check the server answers */
- switch (server->nfs_client->cl_minorversion) {
+ switch (minorversion) {
case 0:
res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
res.attr_bitmask[2] = 0;
@@ -2950,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->cache_consistency_bitmask[2] = 0;
+ memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+ sizeof(server->exclcreat_bitmask));
server->acl_bitmask = res.acl_bitmask;
server->fh_expire_type = res.fh_expire_type;
}
@@ -3552,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
- int opened = 0;
int status = 0;
ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3562,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
if (IS_ERR(state)) {
status = PTR_ERR(state);
goto out;
@@ -4978,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
int result;
size_t len;
char *str;
- bool retried = false;
if (clp->cl_owner_id != NULL)
return 0;
-retry:
+
rcu_read_lock();
- len = 10 + strlen(clp->cl_ipaddr) + 1 +
+ len = 14 + strlen(clp->cl_ipaddr) + 1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
@@ -5010,14 +5045,6 @@ retry:
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
rcu_read_unlock();
- /* Did something change? */
- if (result >= len) {
- kfree(str);
- if (retried)
- return -EINVAL;
- retried = true;
- goto retry;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5049,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
- if (result >= len) {
- kfree(str);
- return -EINVAL;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5088,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
result = scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
- if (result >= len) {
- kfree(str);
- return -EINVAL;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5289,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
- if (d_data->roc &&
- pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
- return;
+ if (d_data->roc)
+ pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
@@ -7746,10 +7764,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
case 0:
goto out;
/*
+ * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+ * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+ */
+ case -NFS4ERR_BADLAYOUT:
+ goto out_overflow;
+ /*
* NFS4ERR_LAYOUTTRYLATER is a conflict with another client
- * (or clients) writing to the same RAID stripe
+ * (or clients) writing to the same RAID stripe except when
+ * the minlength argument is 0 (see RFC5661 section 18.43.3).
*/
case -NFS4ERR_LAYOUTTRYLATER:
+ if (lgp->args.minlength == 0)
+ goto out_overflow;
/*
* NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
* existing layout before getting a new one).
@@ -7805,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
rpc_restart_call_prepare(task);
out:
dprintk("<-- %s\n", __func__);
+ return;
+out_overflow:
+ task->tk_status = -EOVERFLOW;
+ goto out;
}
static size_t max_response_pages(struct nfs_server *server)
@@ -8661,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
};
#endif
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f2e2ad894461..da73bc443238 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
{
/* Use CHECK_LEASE to ping the server with a SEQUENCE */
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
nfs4_schedule_state_manager(clp);
}
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
static void nfs4_reset_all_state(struct nfs_client *clp)
{
if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 470af1a78bec..28df12e525ba 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+
+
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
const char *name,
@@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget,
DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65dbdb7..788adf3897c7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
- 1 /* FIXME: opaque lrf_body always empty at the moment */)
+ 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
1 + decode_stateid_maxsz)
#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
@@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
- const struct nfs_server *server)
+ const struct nfs_server *server,
+ bool excl_check)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 4;
}
+
+ if (excl_check) {
+ const u32 *excl_bmval = server->exclcreat_bitmask;
+ bmval[0] &= excl_bmval[0];
+ bmval[1] &= excl_bmval[1];
+ bmval[2] &= excl_bmval[2];
+
+ if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+ label = NULL;
+ }
+
if (label) {
len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
case NF4LNK:
p = reserve_space(xdr, 4);
*p = cpu_to_be32(create->u.symlink.len);
- xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+ xdr_write_pages(xdr, create->u.symlink.pages, 0,
+ create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
break;
case NF4BLK: case NF4CHR:
@@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server);
+ encode_attrs(xdr, create->attrs, create->label, create->server, false);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
- struct iattr dummy;
__be32 *p;
p = reserve_space(xdr, 4);
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- dummy.ia_valid = 0;
- encode_attrs(xdr, &dummy, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
}
}
@@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server);
+ encode_attrs(xdr, arg->iap, arg->label, server, false);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
struct xdr_stream *xdr,
struct nfs4_server_caps_arg *args)
{
+ const u32 *bitmask = args->bitmask;
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
@@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
- encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
- FATTR4_WORD0_FH_EXPIRE_TYPE|
- FATTR4_WORD0_LINK_SUPPORT|
- FATTR4_WORD0_SYMLINK_SUPPORT|
- FATTR4_WORD0_ACLSUPPORT, &hdr);
+ encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
encode_nops(&hdr);
}
@@ -3368,6 +3378,22 @@ out_overflow:
return -EIO;
}
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+ uint32_t *bitmap, uint32_t *bitmask)
+{
+ if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+ int ret;
+ ret = decode_attr_bitmap(xdr, bitmask);
+ if (unlikely(ret < 0))
+ return ret;
+ bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ } else
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
+ return 0;
+}
+
static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
{
__be32 *p;
@@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
goto xdr_error;
+ if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+ res->exclcreat_bitmask)) != 0)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
}
/* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+ unsigned long *pagemod_limit)
{
__be32 *p;
uint32_t limit_type, nblocks, blocksize;
+ u64 maxsize = 0;
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
goto out_overflow;
limit_type = be32_to_cpup(p++);
switch (limit_type) {
- case 1:
- xdr_decode_hyper(p, maxsize);
+ case NFS4_LIMIT_SIZE:
+ xdr_decode_hyper(p, &maxsize);
break;
- case 2:
+ case NFS4_LIMIT_BLOCKS:
nblocks = be32_to_cpup(p++);
blocksize = be32_to_cpup(p);
- *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+ maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
+ maxsize >>= PAGE_CACHE_SHIFT;
+ *pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
break;
case NFS4_OPEN_DELEGATE_WRITE:
res->delegation_type = FMODE_WRITE|FMODE_READ;
- if (decode_space_limit(xdr, &res->maxsize) < 0)
+ if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
return -EIO;
}
return decode_ace(xdr, NULL, res->server->nfs_client);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4984bbe55ff1..7c5718ba625e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
{
spin_lock(&hdr->lock);
- if (pos < hdr->io_start + hdr->good_bytes) {
- set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+ if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+ || pos < hdr->io_start + hdr->good_bytes) {
clear_bit(NFS_IOHDR_EOF, &hdr->flags);
hdr->good_bytes = pos - hdr->io_start;
hdr->error = error;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70bf706b1090..ba1246433794 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
lo->plh_return_iomode = 0;
- lo->plh_block_lgets++;
pnfs_get_layout_hdr(lo);
clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
return true;
@@ -817,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}
-static bool
-pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range)
-{
- return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
- (lo->plh_return_iomode == IOMODE_ANY ||
- lo->plh_return_iomode == range->iomode);
-}
-
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range, int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
return lo->plh_block_lgets ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- (list_empty(&lo->plh_segs) &&
- (atomic_read(&lo->plh_outstanding) > lget)) ||
- pnfs_layout_returning(lo, range);
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
int
@@ -847,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo, range, 1)) {
+ if (pnfs_layoutgets_blocked(lo)) {
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
@@ -882,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
struct pnfs_layout_segment *lseg;
+ loff_t i_size;
dprintk("--> %s\n", __func__);
@@ -889,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo,
if (lgp == NULL)
return NULL;
+ i_size = i_size_read(ino);
+
lgp->args.minlength = PAGE_CACHE_SIZE;
if (lgp->args.minlength > range->length)
lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
+ }
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
lgp->args.range = *range;
lgp->args.type = server->pnfs_curr_ld->id;
@@ -956,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
- lo->plh_block_lgets--;
pnfs_clear_layoutreturn_waitbit(lo);
- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
spin_unlock(&ino->i_lock);
pnfs_put_layout_hdr(lo);
goto out;
@@ -1080,15 +1073,14 @@ bool pnfs_roc(struct inode *ino)
struct pnfs_layout_segment *lseg, *tmp;
nfs4_stateid stateid;
LIST_HEAD(tmp_list);
- bool found = false, layoutreturn = false;
+ bool found = false, layoutreturn = false, roc = false;
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
goto out_noroc;
- /* Don't return layout if we hold a delegation */
+ /* no roc if we hold a delegation */
if (nfs4_check_delegation(ino, FMODE_READ))
goto out_noroc;
@@ -1099,34 +1091,41 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
+ stateid = lo->plh_stateid;
+ /* always send layoutreturn if being marked so */
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo);
+
pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
- if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ /* If we are sending layoutreturn, invalidate all valid lsegs */
+ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
mark_lseg_invalid(lseg, &tmp_list);
found = true;
}
- if (!found)
- goto out_noroc;
- lo->plh_block_lgets++;
- pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
- spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&tmp_list);
- pnfs_layoutcommit_inode(ino, true);
- return true;
+ /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
+ * in pnfs_roc_release(). We don't really send a layoutreturn but
+ * still want others to view us like we are sending one!
+ *
+ * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
+ * LAYOUTRETURN, so we proceed like there are no layouts to return.
+ *
+ * ROC in three conditions:
+ * 1. there are ROC lsegs
+ * 2. we don't send layoutreturn
+ * 3. no others are sending layoutreturn
+ */
+ if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
+ roc = true;
out_noroc:
- if (lo) {
- stateid = lo->plh_stateid;
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
- }
spin_unlock(&ino->i_lock);
- if (layoutreturn) {
- pnfs_layoutcommit_inode(ino, true);
+ pnfs_free_lseg_list(&tmp_list);
+ pnfs_layoutcommit_inode(ino, true);
+ if (layoutreturn)
pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
- }
- return false;
+ return roc;
}
void pnfs_roc_release(struct inode *ino)
@@ -1135,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
- lo->plh_block_lgets--;
+ pnfs_clear_layoutreturn_waitbit(lo);
if (atomic_dec_and_test(&lo->plh_refcount)) {
pnfs_detach_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -1153,27 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
+ trace_nfs4_layoutreturn_on_close(ino, 0);
}
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct pnfs_layout_hdr *lo;
- struct pnfs_layout_segment *lseg;
- nfs4_stateid stateid;
u32 current_seqid;
- bool layoutreturn = false;
spin_lock(&ino->i_lock);
- list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
- if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
- continue;
- if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
- continue;
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
- spin_unlock(&ino->i_lock);
- return true;
- }
lo = nfsi->layout;
current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
@@ -1181,19 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
* a barrier, we choose the worst-case barrier.
*/
*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
- stateid = lo->plh_stateid;
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
- if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
spin_unlock(&ino->i_lock);
- if (layoutreturn) {
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
- return true;
- }
- return false;
}
/*
@@ -1221,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
-static void
-pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_segment *lseg)
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
- struct pnfs_layout_segment *lp;
+ return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old)
+{
+ return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *,
+ const struct pnfs_layout_range *),
+ bool (*do_merge)(struct pnfs_layout_segment *,
+ struct pnfs_layout_segment *),
+ struct list_head *free_me)
+{
+ struct pnfs_layout_segment *lp, *tmp;
dprintk("%s:Begin\n", __func__);
- list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+ list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+ continue;
+ if (do_merge(lseg, lp)) {
+ mark_lseg_invalid(lp, free_me);
+ continue;
+ }
+ if (is_after(&lseg->pls_range, &lp->pls_range))
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -1252,6 +1253,24 @@ out:
dprintk("%s:Return\n", __func__);
}
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ struct inode *inode = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld->add_lseg != NULL)
+ ld->add_lseg(lo, lseg, free_me);
+ else
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ pnfs_lseg_range_is_after,
+ pnfs_lseg_no_merge,
+ free_me);
+}
static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
@@ -1344,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
ret = pnfs_get_lseg(lseg);
break;
}
- if (lseg->pls_range.offset > range->offset)
- break;
}
dprintk("%s:Return lseg %p ref %d\n",
@@ -1438,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
+ if (!pnfs_should_retry_layoutget(lo))
+ return false;
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
@@ -1484,6 +1503,9 @@ pnfs_update_layout(struct inode *ino,
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
goto out;
+ if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+ goto out;
+
if (pnfs_within_mdsthreshold(ctx, ino, iomode))
goto out;
@@ -1533,8 +1555,7 @@ lookup_again:
* Because we free lsegs before sending LAYOUTRETURN, we need to wait
* for LAYOUTRETURN even if first is true.
*/
- if (!lseg && pnfs_should_retry_layoutget(lo) &&
- test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
dprintk("%s wait for layoutreturn\n", __func__);
if (pnfs_prepare_to_retry_layoutget(lo)) {
@@ -1547,7 +1568,7 @@ lookup_again:
goto out_put_layout_hdr;
}
- if (pnfs_layoutgets_blocked(lo, &arg, 0))
+ if (pnfs_layoutgets_blocked(lo))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
spin_unlock(&ino->i_lock);
@@ -1593,6 +1614,26 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+ switch (range->iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ break;
+ default:
+ return false;
+ }
+ if (range->offset == NFS4_MAX_UINT64)
+ return false;
+ if (range->length == 0)
+ return false;
+ if (range->length != NFS4_MAX_UINT64 &&
+ range->length > NFS4_MAX_UINT64 - range->offset)
+ return false;
+ return true;
+}
+
struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
@@ -1601,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
LIST_HEAD(free_me);
- int status = 0;
+ int status = -EINVAL;
+
+ if (!pnfs_sanity_check_layout_range(&res->range))
+ goto out;
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
@@ -1619,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
lseg->pls_range = res->range;
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
- dprintk("%s forget reply due to recall\n", __func__);
- goto out_forget_reply;
- }
-
- if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+ if (pnfs_layoutgets_blocked(lo)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
@@ -1651,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
pnfs_get_lseg(lseg);
- pnfs_layout_insert_lseg(lo, lseg);
+ pnfs_layout_insert_lseg(lo, lseg, &free_me);
- if (res->return_on_close) {
+ if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
- set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
- }
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me);
@@ -1692,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg->pls_range.length);
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
mark_lseg_invalid(lseg, tmp_list);
+ set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags);
}
}
@@ -2267,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
#if IS_ENABLED(CONFIG_NFS_V4_2)
int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode)
pnfs_get_layout_hdr(hdr);
spin_unlock(&inode->i_lock);
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), gfp_flags);
if (!data) {
status = -ENOMEM;
goto out_put;
@@ -2324,3 +2363,7 @@ out_put:
}
EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
#endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3e6ab7bfbabd..78c9351ff117 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,7 +94,6 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
- NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
@@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me);
void (*return_range) (struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range);
@@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
+ atomic_t plh_outstanding; /* number of RPCs out */
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_destroy;
struct list_head plh_segs; /* layout segments list */
- nfs4_stateid plh_stateid;
- atomic_t plh_outstanding; /* number of RPCs out */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
- u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_retry_timestamp;
unsigned long plh_flags;
+ nfs4_stateid plh_stateid;
+ u32 plh_barrier; /* ignore lower seqids */
enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
gfp_t gfp_flags);
void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *old),
+ bool (*do_merge)(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old),
+ struct list_head *free_me);
+
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
@@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
nfss->pnfs_curr_ld->id == src->l_type);
}
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+ if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+ return NFS4_MAX_UINT64;
+ return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+ if (end == NFS4_MAX_UINT64 || end <= offset)
+ return NFS4_MAX_UINT64;
+ return 1 + end - offset;
+}
+
+extern unsigned int layoutstats_timer;
+
#ifdef NFS_DEBUG
void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
#else
static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
{
}
+
#endif /* NFS_DEBUG */
#else /* CONFIG_NFS_V4_1 */
@@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
}
-static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+static inline void
+pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
- return false;
}
static inline void set_pnfs_layoutdriver(struct nfs_server *s,
@@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
#endif /* CONFIG_NFS_V4_1 */
#if IS_ENABLED(CONFIG_NFS_V4_2)
-int pnfs_report_layoutstat(struct inode *inode);
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
#else
static inline int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
return 0;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f37e25b6311c..24655b807d44 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
- bucket->clseg = bucket->wlseg;
- if (list_empty(src))
+ if (bucket->clseg == NULL)
+ bucket->clseg = pnfs_get_lseg(bucket->wlseg);
+ if (list_empty(src)) {
+ pnfs_put_lseg_locked(bucket->wlseg);
bucket->wlseg = NULL;
- else
- pnfs_get_lseg(bucket->clseg);
+ }
}
return ret;
}
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
+ LIST_HEAD(pages);
int i;
+ spin_lock(cinfo->lock);
for (i = idx; i < fl_cinfo->nbuckets; i++) {
bucket = &fl_cinfo->buckets[i];
if (list_empty(&bucket->committing))
continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
- spin_lock(cinfo->lock);
freeme = bucket->clseg;
bucket->clseg = NULL;
+ list_splice_init(&bucket->committing, &pages);
spin_unlock(cinfo->lock);
+ nfs_retry_commit(&pages, freeme, cinfo, i);
pnfs_put_lseg(freeme);
+ spin_lock(cinfo->lock);
}
+ spin_unlock(cinfo->lock);
}
static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
if (!data)
break;
data->ds_commit_index = i;
- spin_lock(cinfo->lock);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
list_add(&data->pages, list);
nreq++;
}
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
return nreq;
}
+static inline
+void pnfs_fetch_commit_bucket_list(struct list_head *pages,
+ struct nfs_commit_data *data,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket;
+
+ bucket = &cinfo->ds->buckets[data->ds_commit_index];
+ spin_lock(cinfo->lock);
+ list_splice_init(&bucket->committing, pages);
+ data->lseg = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+
+}
+
/* This follows nfs_commit_list pretty closely */
int
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc();
if (data != NULL) {
- data->lseg = NULL;
+ data->ds_commit_index = -1;
list_add(&data->pages, &list);
nreq++;
} else {
@@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
list_for_each_entry_safe(data, tmp, &list, pages) {
list_del_init(&data->pages);
- if (!data->lseg) {
+ if (data->ds_commit_index < 0) {
nfs_init_commit(data, mds_pages, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how, 0);
} else {
- struct pnfs_commit_bucket *buckets;
+ LIST_HEAD(pages);
- buckets = cinfo->ds->buckets;
- nfs_init_commit(data,
- &buckets[data->ds_commit_index].committing,
- data->lseg,
- cinfo);
+ pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+ nfs_init_commit(data, &pages, data->lseg, cinfo);
initiate_commit(data, how);
}
}
@@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
return false;
}
+/*
+ * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
+ * declare a match.
+ */
static bool
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
const struct list_head *dsaddrs2)
{
struct nfs4_pnfs_ds_addr *da1, *da2;
-
- /* step through both lists, comparing as we go */
- for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
- da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
- da1 != NULL && da2 != NULL;
- da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
- da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
- if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr))
- return false;
+ struct sockaddr *sa1, *sa2;
+ bool match = false;
+
+ list_for_each_entry(da1, dsaddrs1, da_node) {
+ sa1 = (struct sockaddr *)&da1->da_addr;
+ match = false;
+ list_for_each_entry(da2, dsaddrs2, da_node) {
+ sa2 = (struct sockaddr *)&da2->da_addr;
+ match = same_sockaddr(sa1, sa2);
+ if (match)
+ break;
+ }
+ if (!match)
+ break;
}
- if (da1 == NULL && da2 == NULL)
- return true;
-
- return false;
+ return match;
}
/*
@@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
- spin_unlock(cinfo->lock);
- nfs_request_add_commit_list(req, list, cinfo);
+ nfs_request_add_commit_list_locked(req, list, cinfo);
+ spin_unlock(cinfo->lock);
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aa62004f1706..383a027de452 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- register_shrinker(&acl_shrinker);
+ ret = register_shrinker(&acl_shrinker);
+ if (ret < 0)
+ goto error_3;
return 0;
-
+error_3:
+ nfs_unregister_sysctl();
error_2:
unregister_nfs4_fs();
error_1:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75a35a1afa79..388f48079c43 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -768,6 +768,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
}
/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold the cinfo->lock, and the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ set_bit(PG_CLEAN, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ cinfo->mds->ncommit++;
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
+/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
* @dst: commit list head
@@ -784,13 +806,10 @@ void
nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo)
{
- set_bit(PG_CLEAN, &(req)->wb_flags);
spin_lock(cinfo->lock);
- nfs_list_add_request(req, dst);
- cinfo->mds->ncommit++;
+ nfs_request_add_commit_list_locked(req, dst, cinfo);
spin_unlock(cinfo->lock);
- if (!cinfo->dreq)
- nfs_mark_page_unstable(req->wb_page);
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -1793,7 +1812,7 @@ out_mark_dirty:
return res;
}
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct nfs_inode *nfsi = NFS_I(inode);
int flags = FLUSH_SYNC;
@@ -1828,11 +1847,6 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return nfs_commit_unstable_pages(inode, wbc);
-}
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*