summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/direct-io.c6
-rw-r--r--fs/eventpoll.c12
-rw-r--r--fs/ext3/balloc.c21
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/super.c7
-rw-r--r--fs/ext4/extents.c4
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/client.c131
-rw-r--r--fs/nfs/direct.c8
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/idmap.c90
-rw-r--r--fs/nfs/internal.h22
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4_fs.h28
-rw-r--r--fs/nfs/nfs4filelayout.c361
-rw-r--r--fs/nfs/nfs4filelayout.h19
-rw-r--r--fs/nfs/nfs4filelayoutdev.c252
-rw-r--r--fs/nfs/nfs4proc.c123
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c38
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/pnfs.c330
-rw-r--r--fs/nfs/pnfs.h118
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c127
-rw-r--r--fs/nfs/super.c284
-rw-r--r--fs/nfs/write.c153
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/omfs/dir.c66
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/ubifs/Kconfig23
-rw-r--r--fs/ubifs/commit.c58
-rw-r--r--fs/ubifs/debug.c34
-rw-r--r--fs/ubifs/debug.h30
-rw-r--r--fs/ubifs/io.c201
-rw-r--r--fs/ubifs/journal.c28
-rw-r--r--fs/ubifs/lprops.c26
-rw-r--r--fs/ubifs/lpt_commit.c56
-rw-r--r--fs/ubifs/orphan.c10
-rw-r--r--fs/ubifs/recovery.c44
-rw-r--r--fs/ubifs/scan.c2
-rw-r--r--fs/ubifs/super.c54
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/ubifs.h45
-rw-r--r--fs/udf/balloc.c4
-rw-r--r--fs/udf/file.c7
-rw-r--r--fs/udf/inode.c239
-rw-r--r--fs/udf/truncate.c146
-rw-r--r--fs/udf/udfdecl.h12
57 files changed, 2178 insertions, 1115 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1aa8d607bc7..100b07f021b4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2493,7 +2493,7 @@ int close_ctree(struct btrfs_root *root)
* ERROR state on disk.
*
* 2. when btrfs flips readonly just in btrfs_commit_super,
- * and in such case, btrfs cannnot write sb via btrfs_commit_super,
+ * and in such case, btrfs cannot write sb via btrfs_commit_super,
* and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
* btrfs will cleanup all FS resources first and write sb then.
*/
diff --git a/fs/dcache.c b/fs/dcache.c
index 1baddc1cec48..ad25c4cec7d5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1811,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
* false-negative result. d_lookup() protects against concurrent
* renames using rename_lock seqlock.
*
- * See Documentation/vfs/dcache-locking.txt for more details.
+ * See Documentation/filesystems/path-lookup.txt for more details.
*/
hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
struct inode *i;
@@ -1931,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
* false-negative result. d_lookup() protects against concurrent
* renames using rename_lock seqlock.
*
- * See Documentation/vfs/dcache-locking.txt for more details.
+ * See Documentation/filesystems/path-lookup.txt for more details.
*/
rcu_read_lock();
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd4..dcb5577cde1d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio)
/*
* See whether this new request is contiguous with the old.
*
- * Btrfs cannot handl having logically non-contiguous requests
- * submitted. For exmple if you have
+ * Btrfs cannot handle having logically non-contiguous requests
+ * submitted. For example if you have
*
* Logical: [0-4095][HOLE][8192-12287]
- * Phyiscal: [0-4095] [4096-8181]
+ * Physical: [0-4095] [4096-8191]
*
* We cannot submit those pages together as one BIO. So if our
* current logical offset in the file does not equal what would
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a09af9e9a63..ff12f7ac73ef 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,7 @@
* This mutex is acquired by ep_free() during the epoll file
* cleanup path and it is also acquired by eventpoll_release_file()
* if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
* It is also acquired when inserting an epoll fd onto another epoll
* fd. We do this so that we walk the epoll tree and ensure that this
* insertion does not create a cycle of epoll file descriptors, which
@@ -152,11 +152,11 @@ struct epitem {
/*
* This structure is stored inside the "private_data" member of the file
- * structure and rapresent the main data sructure for the eventpoll
+ * structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
- /* Protect the this structure access */
+ /* Protect the access to this structure */
spinlock_t lock;
/*
@@ -793,7 +793,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
/*
* This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
+ * mechanism. It is called by the stored file descriptors when they
* have events to report.
*/
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -824,9 +824,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
goto out_unlock;
/*
- * If we are trasfering events to userspace, we can hold no locks
+ * If we are transferring events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
- * semantics). All the events that happens during that period of time are
+ * semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5a..153242187fce 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
spin_unlock(sb_bgl_lock(sbi, group));
percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+ free_blocks -= next - start;
/* Do not issue a TRIM on extents smaller than minblocks */
if ((next - start) < minblocks)
goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
cond_resched();
/* No more suitable extents */
- if ((free_blocks - count) < minblocks)
+ if (free_blocks < minblocks)
break;
}
@@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
int ret = 0;
- start = range->start >> sb->s_blocksize_bits;
+ start = (range->start >> sb->s_blocksize_bits) +
+ le32_to_cpu(es->s_first_data_block);
len = range->len >> sb->s_blocksize_bits;
minlen = range->minlen >> sb->s_blocksize_bits;
trimmed = 0;
@@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
return -EINVAL;
if (start >= max_blks)
goto out;
- if (start < le32_to_cpu(es->s_first_data_block)) {
- len -= le32_to_cpu(es->s_first_data_block) - start;
- start = le32_to_cpu(es->s_first_data_block);
- }
if (start + len > max_blks)
len = max_blks - start;
@@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (free_blocks < minlen)
continue;
- if (len >= EXT3_BLOCKS_PER_GROUP(sb))
- len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
- else
+ /*
+ * For all the groups except the last one, last block will
+ * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
+ * change it for the last group in which case first_block +
+ * len < EXT3_BLOCKS_PER_GROUP(sb).
+ */
+ if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
last_block = first_block + len;
+ len -= last_block - first_block;
ret = ext3_trim_all_free(sb, group, first_block,
last_block, minlen);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 0521a007ae6d..32f3b8695859 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext3_journal_get_write_access(handle, frame->bh);
if (err)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9cc19a1dea8e..071689f86e18 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
return;
}
+ /* Check if feature set allows readwrite operations */
+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
+ ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
if (es->s_last_orphan)
jbd_debug(1, "Errors on filesystem, "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ccce8a7e94ed..7516fb9c0bd5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -131,7 +131,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
* fragmenting the file system's free space. Maybe we
* should have some hueristics or some way to allow
* userspace to pass a hint to file system,
- * especiially if the latter case turns out to be
+ * especially if the latter case turns out to be
* common.
*/
ex = path[depth].p_ext;
@@ -2844,7 +2844,7 @@ fix_extent_len:
* ext4_get_blocks_dio_write() when DIO to write
* to an uninitialized extent.
*
- * Writing to an uninitized extent may result in splitting the uninitialized
+ * Writing to an uninitialized extent may result in splitting the uninitialized
* extent into multiple /initialized uninitialized extents (up to three)
* There are three possibilities:
* a> There is no split required: Entire extent should be uninitialized
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837d..7c39b885f969 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -458,7 +458,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
* @file: file struct being opened
*
* Userland CUSE server can create a CUSE device by opening /dev/cuse
- * and replying to the initilaization request kernel sends. This
+ * and replying to the initialization request kernel sends. This
* function is responsible for handling CUSE device initialization.
* Because the fd opened by this function is used during
* initialization, this function only creates cuse_conn and sends
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce1..eb11601f2e00 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
err = journal_bmap(journal, 0, &blocknr);
/* If that failed, give up */
if (err) {
- printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
+ printk(KERN_ERR "%s: Cannot locate journal superblock\n",
__func__);
goto out_err;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 97e73469b2c4..90407b8fece7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -991,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
err = jbd2_journal_bmap(journal, 0, &blocknr);
/* If that failed, give up */
if (err) {
- printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
+ printk(KERN_ERR "%s: Cannot locate journal superblock\n",
__func__);
goto out_err;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 89587573fe50..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
put_layout_hdr(lo);
iput(ino);
}
- pnfs_free_lseg_list(&free_me_list);
return rv;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32879e7..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
#endif /* CONFIG_NFS_V4 */
/*
+ * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
+ */
+static int nfs4_disable_idmapping = 0;
+
+/*
* RPC cruft for NFS
*/
static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
* Look up a client by IP address and protocol version
* - creates a new record if one doesn't yet exist
*/
-static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
+static struct nfs_client *
+nfs_get_client(const struct nfs_client_initdata *cl_init,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport)
{
struct nfs_client *clp, *new = NULL;
int error;
@@ -512,6 +522,13 @@ install_client:
clp = new;
list_add(&clp->cl_share_link, &nfs_client_list);
spin_unlock(&nfs_client_lock);
+
+ error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
+ authflavour, noresvport);
+ if (error < 0) {
+ nfs_put_client(clp);
+ return ERR_PTR(error);
+ }
dprintk("--> nfs_get_client() = %p [new]\n", clp);
return clp;
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
/*
* Initialise an NFS2 or NFS3 client
*/
-static int nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const struct nfs_parsed_mount_data *data)
+int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
+ const char *ip_addr, rpc_authflavor_t authflavour,
+ int noresvport)
{
int error;
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
* - RFC 2623, sec 2.3.2
*/
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
- 0, data->flags & NFS_MOUNT_NORESVPORT);
+ 0, noresvport);
if (error < 0)
goto error;
nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
cl_init.rpc_ops = &nfs_v3_clientops;
#endif
+ nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+ data->timeo, data->retrans);
+
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init);
+ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
+ data->flags & NFS_MOUNT_NORESVPORT);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
}
- nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
- data->timeo, data->retrans);
- error = nfs_init_client(clp, &timeparms, data);
- if (error < 0)
- goto error;
-
server->nfs_client = clp;
/* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
spin_lock(&nfs_client_lock);
list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
+ clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
spin_unlock(&nfs_client_lock);
}
static void nfs_server_remove_lists(struct nfs_server *server)
{
+ struct nfs_client *clp = server->nfs_client;
+
spin_lock(&nfs_client_lock);
list_del_rcu(&server->client_link);
+ if (clp && list_empty(&clp->cl_superblocks))
+ set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
list_del(&server->master_link);
spin_unlock(&nfs_client_lock);
@@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
/*
* Initialise an NFS4 client record
*/
-static int nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr,
- rpc_authflavor_t authflavour,
- int flags)
+int nfs4_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport)
{
int error;
@@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
clp->rpc_ops = &nfs_v4_clientops;
error = nfs_create_rpc_client(clp, timeparms, authflavour,
- 1, flags & NFS_MOUNT_NORESVPORT);
+ 1, noresvport);
if (error < 0)
goto error;
strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
dprintk("--> nfs4_set_client()\n");
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init);
+ clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
+ server->flags & NFS_MOUNT_NORESVPORT);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
}
- error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
- server->flags);
- if (error < 0)
- goto error_put;
+
+ /*
+ * Query for the lease time on clientid setup or renewal
+ *
+ * Note that this will be set on nfs_clients that were created
+ * only for the DS role and did not set this bit, but now will
+ * serve a dual role.
+ */
+ set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
server->nfs_client = clp;
dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
return 0;
-
-error_put:
- nfs_put_client(clp);
error:
dprintk("<-- nfs4_set_client() = xerror %d\n", error);
return error;
}
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto)
+{
+ struct nfs_client_initdata cl_init = {
+ .addr = ds_addr,
+ .addrlen = ds_addrlen,
+ .rpc_ops = &nfs_v4_clientops,
+ .proto = ds_proto,
+ .minorversion = mds_clp->cl_minorversion,
+ };
+ struct rpc_timeout ds_timeout = {
+ .to_initval = 15 * HZ,
+ .to_maxval = 15 * HZ,
+ .to_retries = 1,
+ .to_exponential = 1,
+ };
+ struct nfs_client *clp;
+
+ /*
+ * Set an authflavor equual to the MDS value. Use the MDS nfs_client
+ * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+ * (section 13.1 RFC 5661).
+ */
+ clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+ mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
+
+ dprintk("<-- %s %p\n", __func__, clp);
+ return clp;
+}
+EXPORT_SYMBOL(nfs4_set_ds_client);
/*
* Session has been established, and the client marked ready.
@@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
BUG_ON(!server->nfs_client->rpc_ops);
BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+ /* data servers support only a subset of NFSv4.1 */
+ if (is_ds_only_client(server->nfs_client))
+ return -EPROTONOSUPPORT;
+
fattr = nfs_alloc_fattr();
if (fattr == NULL)
return -ENOMEM;
@@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
if (error < 0)
goto error;
+ /*
+ * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+ * authentication.
+ */
+ if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
+ server->caps |= NFS_CAP_UIDGID_NOMAP;
+
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
if (data->wsize)
@@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+ "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9943a75bb6d1..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
@@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
- if (nfs_writeback_done(task, data) != 0)
- return;
+ nfs_writeback_done(task, data);
}
/*
@@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
if (retval)
goto out;
+ task_io_account_read(count);
+
retval = nfs_direct_read(iocb, iov, nr_segs, pos);
if (retval > 0)
iocb->ki_pos = pos + retval;
@@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (retval)
goto out;
+ task_io_account_write(count);
+
retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
if (retval > 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..d85a534b15cd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
- pnfs_update_layout(mapping->host,
- nfs_file_open_context(file),
- IOMODE_RW);
-
start:
/*
* Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+ unsigned long val;
+ char buf[16];
+
+ if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+ return 0;
+ memcpy(buf, name, namelen);
+ buf[namelen] = '\0';
+ if (strict_strtoul(buf, 0, &val) != 0)
+ return 0;
+ *res = val;
+ return 1;
+}
+
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+ return snprintf(buf, buflen, "%u", id);
+}
#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
#include <linux/slab.h>
#include <linux/cred.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs_sb.h>
#include <linux/nfs_idmap.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <linux/rcupdate.h>
-#include <linux/kernel.h>
#include <linux/err.h>
#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
return ret;
}
-int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
{
+ if (nfs_map_string_to_numeric(name, namelen, uid))
+ return 0;
return nfs_idmap_lookup_id(name, namelen, "uid", uid);
}
-int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
{
+ if (nfs_map_string_to_numeric(name, namelen, gid))
+ return 0;
return nfs_idmap_lookup_id(name, namelen, "gid", gid);
}
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
{
- return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+ int ret = -EINVAL;
+
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+ ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(uid, buf, buflen);
+ return ret;
}
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
{
- return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+ int ret = -EINVAL;
+
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+ ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(gid, buf, buflen);
+ return ret;
}
#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
-#include <linux/types.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
return hash;
}
-int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
{
- struct idmap *idmap = clp->cl_idmap;
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ if (nfs_map_string_to_numeric(name, namelen, uid))
+ return 0;
return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
}
-int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
{
- struct idmap *idmap = clp->cl_idmap;
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ if (nfs_map_string_to_numeric(name, namelen, uid))
+ return 0;
return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
}
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
{
- struct idmap *idmap = clp->cl_idmap;
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
- return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+ ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(uid, buf, buflen);
+ return ret;
}
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
{
- struct idmap *idmap = clp->cl_idmap;
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
- return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+ ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(uid, buf, buflen);
+ return ret;
}
#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e94ad22da5d2..72e0bddf7a2f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
struct nfs_fattr *);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern int nfs4_check_client_ready(struct nfs_client *clp);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -213,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead;
extern struct rpc_procinfo nfs4_procedures[];
#endif
+extern int nfs4_init_ds_session(struct nfs_client *clp);
+
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern int nfs_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr, rpc_authflavor_t authflavour,
+ int noresvport);
/* dir.c */
extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -262,9 +271,15 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
#endif
/* read.c */
+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
/* write.c */
+extern int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
@@ -274,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *,
#endif
/* nfs4proc.c */
+extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
+extern int nfs4_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport);
+extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
extern int _nfs4_call_sync(struct nfs_server *server,
struct rpc_message *msg,
struct nfs4_sequence_args *args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..d0c80d8b3f96 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
.close_context = nfs_close_context,
+ .init_client = nfs_init_client,
};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1be36cf65bfc..c64be1cff080 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
extern int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
int cache_reply, struct rpc_task *task);
+extern int nfs41_setup_sequence(struct nfs4_session *session,
+ struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+ int cache_reply, struct rpc_task *task);
extern void nfs4_destroy_session(struct nfs4_session *session);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
extern int nfs4_init_session(struct nfs_server *server);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
+ EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
+}
#else /* CONFIG_NFS_v4_1 */
static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
{
@@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
{
return 0;
}
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return false;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return false;
+}
#endif /* CONFIG_NFS_V4_1 */
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..428558464817 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,309 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
MODULE_DESCRIPTION("The NFSv4 file layout driver");
-static int
-filelayout_set_layoutdriver(struct nfs_server *nfss)
+#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+ loff_t offset)
{
- int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
- nfs4_fl_free_deviceid_callback);
- if (status) {
- printk(KERN_WARNING "%s: deviceid cache could not be "
- "initialized\n", __func__);
- return status;
+ u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
+ u64 tmp;
+
+ offset -= flseg->pattern_offset;
+ tmp = offset;
+ do_div(tmp, stripe_width);
+
+ return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
+}
+
+/* This function is used by the layout driver to calculate the
+ * offset of the file on the dserver based on whether the
+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+ switch (flseg->stripe_type) {
+ case STRIPE_SPARSE:
+ return offset;
+
+ case STRIPE_DENSE:
+ return filelayout_get_dense_offset(flseg, offset);
}
- dprintk("%s: deviceid cache has been initialized successfully\n",
- __func__);
+
+ BUG();
+}
+
+/* For data server errors we don't recover from */
+static void
+filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ } else {
+ dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ }
+}
+
+static int filelayout_async_handle_error(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ int *reset)
+{
+ if (task->tk_status >= 0)
+ return 0;
+
+ *reset = 0;
+
+ switch (task->tk_status) {
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_session_recovery(clp->cl_session);
+ break;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ case -EKEYEXPIRED:
+ rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+ break;
+ default:
+ dprintk("%s DS error. Retry through MDS %d\n", __func__,
+ task->tk_status);
+ *reset = 1;
+ break;
+ }
+ task->tk_status = 0;
+ return -EAGAIN;
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int filelayout_read_done_cb(struct rpc_task *task,
+ struct nfs_read_data *data)
+{
+ struct nfs_client *clp = data->ds_clp;
+ int reset = 0;
+
+ dprintk("%s DS read\n", __func__);
+
+ if (filelayout_async_handle_error(task, data->args.context->state,
+ data->ds_clp, &reset) == -EAGAIN) {
+ dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+ __func__, data->ds_clp, data->ds_clp->cl_session);
+ if (reset) {
+ filelayout_set_lo_fail(data->lseg);
+ nfs4_reset_read(task, data);
+ clp = NFS_SERVER(data->inode)->nfs_client;
+ }
+ nfs_restart_rpc(task, clp);
+ return -EAGAIN;
+ }
+
return 0;
}
-/* Clear out the layout by destroying its device list */
-static int
-filelayout_clear_layoutdriver(struct nfs_server *nfss)
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
- dprintk("--> %s\n", __func__);
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ rdata->read_done_cb = filelayout_read_done_cb;
+
+ if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
+ &rdata->args.seq_args, &rdata->res.seq_res,
+ 0, task))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+ /* Note this may cause RPC to be resent */
+ rdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_release(void *data)
+{
+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+ rdata->mds_ops->rpc_release(data);
+}
+
+static int filelayout_write_done_cb(struct rpc_task *task,
+ struct nfs_write_data *data)
+{
+ int reset = 0;
+
+ if (filelayout_async_handle_error(task, data->args.context->state,
+ data->ds_clp, &reset) == -EAGAIN) {
+ struct nfs_client *clp;
+
+ dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+ __func__, data->ds_clp, data->ds_clp->cl_session);
+ if (reset) {
+ filelayout_set_lo_fail(data->lseg);
+ nfs4_reset_write(task, data);
+ clp = NFS_SERVER(data->inode)->nfs_client;
+ } else
+ clp = data->ds_clp;
+ nfs_restart_rpc(task, clp);
+ return -EAGAIN;
+ }
- if (nfss->nfs_client->cl_devid_cache)
- pnfs_put_deviceid_cache(nfss->nfs_client);
return 0;
}
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
+ &wdata->args.seq_args, &wdata->res.seq_res,
+ 0, task))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ /* Note this may cause RPC to be resent */
+ wdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_release(void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ wdata->mds_ops->rpc_release(data);
+}
+
+struct rpc_call_ops filelayout_read_call_ops = {
+ .rpc_call_prepare = filelayout_read_prepare,
+ .rpc_call_done = filelayout_read_call_done,
+ .rpc_release = filelayout_read_release,
+};
+
+struct rpc_call_ops filelayout_write_call_ops = {
+ .rpc_call_prepare = filelayout_write_prepare,
+ .rpc_call_done = filelayout_write_call_done,
+ .rpc_release = filelayout_write_release,
+};
+
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_read_data *data)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ loff_t offset = data->args.offset;
+ u32 j, idx;
+ struct nfs_fh *fh;
+ int status;
+
+ dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+ __func__, data->inode->i_ino,
+ data->args.pgbase, (size_t)data->args.count, offset);
+
+ /* Retrieve the correct rpc_client for the byte range */
+ j = nfs4_fl_calc_j_index(lseg, offset);
+ idx = nfs4_fl_calc_ds_index(lseg, j);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds) {
+ /* Either layout fh index faulty, or ds connect failed */
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ return PNFS_NOT_ATTEMPTED;
+ }
+ dprintk("%s USE DS:ip %x %hu\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+ /* No multipath support. Use first DS */
+ data->ds_clp = ds->ds_clp;
+ fh = nfs4_fl_select_ds_fh(lseg, j);
+ if (fh)
+ data->args.fh = fh;
+
+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ data->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
+ &filelayout_read_call_ops);
+ BUG_ON(status != 0);
+ return PNFS_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_write_data *data, int sync)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ loff_t offset = data->args.offset;
+ u32 j, idx;
+ struct nfs_fh *fh;
+ int status;
+
+ /* Retrieve the correct rpc_client for the byte range */
+ j = nfs4_fl_calc_j_index(lseg, offset);
+ idx = nfs4_fl_calc_ds_index(lseg, j);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds) {
+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ return PNFS_NOT_ATTEMPTED;
+ }
+ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+ data->inode->i_ino, sync, (size_t) data->args.count, offset,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+ /* We can't handle commit to ds yet */
+ if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
+ data->args.stable = NFS_FILE_SYNC;
+
+ data->write_done_cb = filelayout_write_done_cb;
+ data->ds_clp = ds->ds_clp;
+ fh = nfs4_fl_select_ds_fh(lseg, j);
+ if (fh)
+ data->args.fh = fh;
+ /*
+ * Get the file offset on the dserver. Set the write offset to
+ * this offset and save the original offset.
+ */
+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ data->mds_offset = offset;
+
+ /* Perform an asynchronous write */
+ status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
+ &filelayout_write_call_ops, sync);
+ BUG_ON(status != 0);
+ return PNFS_ATTEMPTED;
+}
+
/*
* filelayout_check_layout()
*
@@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out;
}
- if (fl->stripe_unit % PAGE_SIZE) {
- dprintk("%s Stripe unit (%u) not page aligned\n",
+ if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+ dprintk("%s Invalid stripe unit (%u)\n",
__func__, fl->stripe_unit);
goto out;
}
/* find and reference the deviceid */
- dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+ dsaddr = nfs4_fl_find_get_deviceid(id);
if (dsaddr == NULL) {
dsaddr = get_device_info(lo->plh_inode, id);
if (dsaddr == NULL)
@@ -134,7 +411,7 @@ out:
dprintk("--> %s returns %d\n", __func__, status);
return status;
out_put:
- pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+ nfs4_fl_put_deviceid(dsaddr);
goto out;
}
@@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
static void
filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
- pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
- &fl->dsaddr->deviceid);
+ nfs4_fl_put_deviceid(fl->dsaddr);
_filelayout_free_lseg(fl);
}
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * return 1 : coalesce page
+ * return 0 : don't coalesce page
+ */
+int
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ u64 p_stripe, r_stripe;
+ u32 stripe_unit;
+
+ if (!pgio->pg_lseg)
+ return 1;
+ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
+ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+ stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ return (p_stripe == r_stripe);
+}
+
static struct pnfs_layoutdriver_type filelayout_type = {
- .id = LAYOUT_NFSV4_1_FILES,
- .name = "LAYOUT_NFSV4_1_FILES",
- .owner = THIS_MODULE,
- .set_layoutdriver = filelayout_set_layoutdriver,
- .clear_layoutdriver = filelayout_clear_layoutdriver,
- .alloc_lseg = filelayout_alloc_lseg,
- .free_lseg = filelayout_free_lseg,
+ .id = LAYOUT_NFSV4_1_FILES,
+ .name = "LAYOUT_NFSV4_1_FILES",
+ .owner = THIS_MODULE,
+ .alloc_lseg = filelayout_alloc_lseg,
+ .free_lseg = filelayout_free_lseg,
+ .pg_test = filelayout_pg_test,
+ .read_pagelist = filelayout_read_pagelist,
+ .write_pagelist = filelayout_write_pagelist,
};
static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..ee0c907742b5 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
atomic_t ds_count;
};
+/* nfs4_file_layout_dsaddr flags */
+#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
+
struct nfs4_file_layout_dsaddr {
- struct pnfs_deviceid_node deviceid;
+ struct hlist_node node;
+ struct nfs4_deviceid deviceid;
+ atomic_t ref;
+ unsigned long flags;
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
@@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
generic_hdr);
}
-extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
extern void print_ds(struct nfs4_pnfs_ds *ds);
extern void print_deviceid(struct nfs4_deviceid *dev_id);
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+ u32 ds_idx);
extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index b73c34375f60..68143c162e3b 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_FL_DEVICE_ID_HASH_BITS 5
+#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
+#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
+{
+ unsigned char *cptr = (unsigned char *)id->data;
+ unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+ u32 x = 0;
+
+ while (nbytes--) {
+ x *= 37;
+ x += *cptr++;
+ }
+ return x & NFS4_FL_DEVICE_ID_HASH_MASK;
+}
+
+static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(filelayout_deviceid_lock);
+
+/*
* Data server cache
*
* Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
return NULL;
}
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server
+ * Currently only support IPv4
+ */
+static int
+nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
+{
+ struct nfs_client *clp;
+ struct sockaddr_in sin;
+ int status = 0;
+
+ dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+ mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ds->ds_ip_addr;
+ sin.sin_port = ds->ds_port;
+
+ clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
+ sizeof(sin), IPPROTO_TCP);
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
+ if (!is_ds_client(clp)) {
+ status = -ENODEV;
+ goto out_put;
+ }
+ ds->ds_clp = clp;
+ dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+ goto out;
+ }
+
+ /*
+ * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
+ * be equal to the MDS lease. Renewal is scheduled in create_session.
+ */
+ spin_lock(&mds_srv->nfs_client->cl_lock);
+ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
+ spin_unlock(&mds_srv->nfs_client->cl_lock);
+ clp->cl_last_renewal = jiffies;
+
+ /* New nfs_client */
+ status = nfs4_init_ds_session(clp);
+ if (status)
+ goto out_put;
+
+ ds->ds_clp = clp;
+ dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
+ ntohs(ds->ds_port));
+out:
+ return status;
+out_put:
+ nfs_put_client(clp);
+ goto out;
+}
+
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
struct nfs4_pnfs_ds *ds;
int i;
- print_deviceid(&dsaddr->deviceid.de_id);
+ print_deviceid(&dsaddr->deviceid);
for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
kfree(dsaddr);
}
-void
-nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
-{
- struct nfs4_file_layout_dsaddr *dsaddr =
- container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
-
- nfs4_fl_free_deviceid(dsaddr);
-}
-
static struct nfs4_pnfs_ds *
nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
{
@@ -300,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
dsaddr->stripe_count = cnt;
dsaddr->ds_num = num;
- memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+ memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
/* Go back an read stripe indices */
p = indicesp;
@@ -350,28 +426,37 @@ out_err:
}
/*
- * Decode the opaque device specified in 'dev'
- * and add it to the list of available devices.
- * If the deviceid is already cached, nfs4_add_deviceid will return
- * a pointer to the cached struct and throw away the new.
+ * Decode the opaque device specified in 'dev' and add it to the cache of
+ * available devices.
*/
-static struct nfs4_file_layout_dsaddr*
+static struct nfs4_file_layout_dsaddr *
decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
{
- struct nfs4_file_layout_dsaddr *dsaddr;
- struct pnfs_deviceid_node *d;
+ struct nfs4_file_layout_dsaddr *d, *new;
+ long hash;
- dsaddr = decode_device(inode, dev);
- if (!dsaddr) {
+ new = decode_device(inode, dev);
+ if (!new) {
printk(KERN_WARNING "%s: Could not decode or add device\n",
__func__);
return NULL;
}
- d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
- &dsaddr->deviceid);
+ spin_lock(&filelayout_deviceid_lock);
+ d = nfs4_fl_find_get_deviceid(&new->deviceid);
+ if (d) {
+ spin_unlock(&filelayout_deviceid_lock);
+ nfs4_fl_free_deviceid(new);
+ return d;
+ }
+
+ INIT_HLIST_NODE(&new->node);
+ atomic_set(&new->ref, 1);
+ hash = nfs4_fl_deviceid_hash(&new->deviceid);
+ hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
+ spin_unlock(&filelayout_deviceid_lock);
- return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+ return new;
}
/*
@@ -446,12 +531,123 @@ out_free:
return dsaddr;
}
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+ if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
+ hlist_del_rcu(&dsaddr->node);
+ spin_unlock(&filelayout_deviceid_lock);
+
+ synchronize_rcu();
+ nfs4_fl_free_deviceid(dsaddr);
+ }
+}
+
struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
+{
+ struct nfs4_file_layout_dsaddr *d;
+ struct hlist_node *n;
+ long hash = nfs4_fl_deviceid_hash(id);
+
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
+ if (!memcmp(&d->deviceid, id, sizeof(*id))) {
+ if (!atomic_inc_not_zero(&d->ref))
+ goto fail;
+ rcu_read_unlock();
+ return d;
+ }
+ }
+fail:
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+ * Then: ((res + fsi) % dsaddr->stripe_count)
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u64 tmp;
+
+ tmp = offset - flseg->pattern_offset;
+ do_div(tmp, flseg->stripe_unit);
+ tmp += flseg->first_stripe_index;
+ return do_div(tmp, flseg->dsaddr->stripe_count);
+}
+
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u32 i;
+
+ if (flseg->stripe_type == STRIPE_SPARSE) {
+ if (flseg->num_fh == 1)
+ i = 0;
+ else if (flseg->num_fh == 0)
+ /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+ return NULL;
+ else
+ i = nfs4_fl_calc_ds_index(lseg, j);
+ } else
+ i = j;
+ return flseg->fh_array[i];
+}
+
+static void
+filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
+ int err, u32 ds_addr)
+{
+ u32 *p = (u32 *)&dsaddr->deviceid;
+
+ printk(KERN_ERR "NFS: data server %x connection error %d."
+ " Deviceid [%x%x%x%x] marked out of use.\n",
+ ds_addr, err, p[0], p[1], p[2], p[3]);
+
+ spin_lock(&filelayout_deviceid_lock);
+ dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
+ spin_unlock(&filelayout_deviceid_lock);
+}
+
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
- struct pnfs_deviceid_node *d;
+ struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+ struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
- d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
- return (d == NULL) ? NULL :
- container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+ if (ds == NULL) {
+ printk(KERN_ERR "%s: No data server for offset index %d\n",
+ __func__, ds_idx);
+ return NULL;
+ }
+
+ if (!ds->ds_clp) {
+ struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
+ int err;
+
+ if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
+ /* Already tried to connect, don't try again */
+ dprintk("%s Deviceid marked out of use\n", __func__);
+ return NULL;
+ }
+ err = nfs4_ds_connect(s, ds);
+ if (err) {
+ filelayout_mark_devid_negative(dsaddr, err,
+ ntohl(ds->ds_ip_addr));
+ return NULL;
+ }
+ }
+ return ds;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0a07e353a961..1d84e7088af9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -85,6 +85,9 @@ static int nfs4_map_errors(int err)
switch (err) {
case -NFS4ERR_RESOURCE:
return -EREMOTEIO;
+ case -NFS4ERR_BADOWNER:
+ case -NFS4ERR_BADNAME:
+ return -EINVAL;
default:
dprintk("%s could not handle NFSv4 error %d\n",
__func__, -err);
@@ -241,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
@@ -293,6 +296,19 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
break;
case -NFS4ERR_OLD_STATEID:
exception->retry = 1;
+ break;
+ case -NFS4ERR_BADOWNER:
+ /* The following works around a Linux server bug! */
+ case -NFS4ERR_BADNAME:
+ if (server->caps & NFS_CAP_UIDGID_NOMAP) {
+ server->caps &= ~NFS_CAP_UIDGID_NOMAP;
+ exception->retry = 1;
+ printk(KERN_WARNING "NFS: v4 server %s "
+ "does not accept raw "
+ "uid/gids. "
+ "Reenabling the idmapper.\n",
+ server->nfs_client->cl_hostname);
+ }
}
/* We failed to handle the error */
return nfs4_map_errors(ret);
@@ -505,7 +521,7 @@ out:
return ret_id;
}
-static int nfs41_setup_sequence(struct nfs4_session *session,
+int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
int cache_reply,
@@ -571,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
res->sr_status = 1;
return 0;
}
+EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_args *args,
@@ -1573,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return 0;
}
-static int nfs4_recover_expired_lease(struct nfs_server *server)
+static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
unsigned int loop;
int ret;
@@ -1592,6 +1608,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
return ret;
}
+static int nfs4_recover_expired_lease(struct nfs_server *server)
+{
+ return nfs4_client_recover_expired_lease(server->nfs_client);
+}
+
/*
* OPEN_EXPIRED:
* reclaim state on the server after a network partition.
@@ -3069,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
- dprintk("--> %s\n", __func__);
-
- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
nfs_restart_rpc(task, server->nfs_client);
return -EAGAIN;
@@ -3089,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
+static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+{
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+
+ return data->read_done_cb(task, data);
+}
+
static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
{
data->timestamp = jiffies;
+ data->read_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
}
-static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+/* Reset the the nfs_read_data to send the read to the MDS. */
+void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
+{
+ dprintk("%s Reset task for i/o through\n", __func__);
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ /* offsets will differ in the dense stripe case */
+ data->args.offset = data->mds_offset;
+ data->ds_clp = NULL;
+ data->args.fh = NFS_FH(data->inode);
+ data->read_done_cb = nfs4_read_done_cb;
+ task->tk_ops = data->mds_ops;
+ rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_read);
+
+static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
return -EAGAIN;
@@ -3113,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
return 0;
}
+static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+{
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+ return data->write_done_cb(task, data);
+}
+
+/* Reset the the nfs_write_data to send the write to the MDS. */
+void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
+{
+ dprintk("%s Reset task for i/o through\n", __func__);
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ data->ds_clp = NULL;
+ data->write_done_cb = nfs4_write_done_cb;
+ data->args.fh = NFS_FH(data->inode);
+ data->args.bitmask = data->res.server->cache_consistency_bitmask;
+ data->args.offset = data->mds_offset;
+ data->res.fattr = &data->fattr;
+ task->tk_ops = data->mds_ops;
+ rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_write);
+
static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
{
struct nfs_server *server = NFS_SERVER(data->inode);
- data->args.bitmask = server->cache_consistency_bitmask;
+ if (data->lseg) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ } else
+ data->args.bitmask = server->cache_consistency_bitmask;
+ if (!data->write_done_cb)
+ data->write_done_cb = nfs4_write_done_cb;
data->res.server = server;
data->timestamp = jiffies;
@@ -5118,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server)
return ret;
}
+int nfs4_init_ds_session(struct nfs_client *clp)
+{
+ struct nfs4_session *session = clp->cl_session;
+ int ret;
+
+ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+ return 0;
+
+ ret = nfs4_client_recover_expired_lease(clp);
+ if (!ret)
+ /* Test for the DS role */
+ if (!is_ds_client(clp))
+ ret = -ENODEV;
+ if (!ret)
+ ret = nfs4_check_client_ready(clp);
+ return ret;
+
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+
/*
* Renew the cl_session lease.
*/
@@ -5648,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.clear_acl_cache = nfs4_zap_acl_attr,
.close_context = nfs4_close_context,
.open_context = nfs4_atomic_open,
+ .init_client = nfs4_init_client,
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
ops = clp->cl_mvops->state_renewal_ops;
dprintk("%s: start\n", __func__);
- rcu_read_lock();
- if (list_empty(&clp->cl_superblocks)) {
- rcu_read_unlock();
+ if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
goto out;
- }
- rcu_read_unlock();
spin_lock(&clp->cl_lock);
lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0592288f9f06..ab1bf5bb021f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
int status;
struct nfs_fsinfo fsinfo;
+ if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+ nfs4_schedule_state_renewal(clp);
+ return 0;
+ }
+
status = nfs4_proc_get_lease_time(clp, &fsinfo);
if (status == 0) {
/* Update lease time and schedule renewal */
@@ -1448,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session)
{
nfs4_schedule_lease_recovery(session->clp);
}
+EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
void nfs41_handle_recall_slot(struct nfs_client *clp)
{
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 94d50e86a124..0cf560f77884 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
if (iap->ia_valid & ATTR_MODE)
len += 4;
if (iap->ia_valid & ATTR_UID) {
- owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
+ owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
if (owner_namelen < 0) {
dprintk("nfs: couldn't resolve uid %d to string\n",
iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
}
if (iap->ia_valid & ATTR_GID) {
- owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
+ owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
if (owner_grouplen < 0) {
dprintk("nfs: couldn't resolve gid %d to string\n",
iap->ia_gid);
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
hdr->replen += decode_putrootfh_maxsz;
}
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
{
nfs4_stateid stateid;
__be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
p = reserve_space(xdr, NFS4_STATEID_SIZE);
if (ctx->state != NULL) {
nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
+ if (zero_seqid)
+ stateid.stateid.seqid = 0;
xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
} else
xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
p = reserve_space(xdr, 4);
*p = cpu_to_be32(OP_READ);
- encode_stateid(xdr, args->context, args->lock_context);
+ encode_stateid(xdr, args->context, args->lock_context,
+ hdr->minorversion);
p = reserve_space(xdr, 12);
p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
p = reserve_space(xdr, 4);
*p = cpu_to_be32(OP_WRITE);
- encode_stateid(xdr, args->context, args->lock_context);
+ encode_stateid(xdr, args->context, args->lock_context,
+ hdr->minorversion);
p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, args->offset);
@@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_write(xdr, args, &hdr);
req->rq_snd_buf.flags |= XDRBUF_WRITE;
- encode_getfattr(xdr, args->bitmask, &hdr);
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
@@ -3382,7 +3387,7 @@ out_overflow:
}
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
- struct nfs_client *clp, uint32_t *uid, int may_sleep)
+ const struct nfs_server *server, uint32_t *uid, int may_sleep)
{
uint32_t len;
__be32 *p;
@@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
if (!may_sleep) {
/* do nothing */
} else if (len < XDR_MAX_NETOBJ) {
- if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+ if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
ret = NFS_ATTR_FATTR_OWNER;
else
dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3425,7 @@ out_overflow:
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
- struct nfs_client *clp, uint32_t *gid, int may_sleep)
+ const struct nfs_server *server, uint32_t *gid, int may_sleep)
{
uint32_t len;
__be32 *p;
@@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
if (!may_sleep) {
/* do nothing */
} else if (len < XDR_MAX_NETOBJ) {
- if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+ if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
ret = NFS_ATTR_FATTR_GROUP;
else
dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_owner(xdr, bitmap, server->nfs_client,
- &fattr->uid, may_sleep);
+ status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_group(xdr, bitmap, server->nfs_client,
- &fattr->gid, may_sleep);
+ status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_write(xdr, res);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ if (res->fattr)
+ decode_getfattr(xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
if (!status)
status = res->count;
out:
@@ -6167,8 +6171,6 @@ static struct {
{ NFS4ERR_DQUOT, -EDQUOT },
{ NFS4ERR_STALE, -ESTALE },
{ NFS4ERR_BADHANDLE, -EBADHANDLE },
- { NFS4ERR_BADOWNER, -EINVAL },
- { NFS4ERR_BADNAME, -EINVAL },
{ NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
{ NFS4ERR_NOTSUPP, -ENOTSUPP },
{ NFS4ERR_TOOSMALL, -ETOOSMALL },
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..23e794410669 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
#include <linux/nfs_mount.h>
#include "internal.h"
+#include "pnfs.h"
static struct kmem_cache *nfs_page_cachep;
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
*/
void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
struct inode *inode,
- int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+ int (*doio)(struct nfs_pageio_descriptor *),
size_t bsize,
int io_flags)
{
@@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_doio = doio;
desc->pg_ioflags = io_flags;
desc->pg_error = 0;
+ desc->pg_lseg = NULL;
}
/**
@@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
* Return 'true' if this is the case, else return 'false'.
*/
static int nfs_can_coalesce_requests(struct nfs_page *prev,
- struct nfs_page *req)
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
{
if (req->wb_context->cred != prev->wb_context->cred)
return 0;
@@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
return 0;
if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
return 0;
+ /*
+ * Non-whole file layouts need to check that req is inside of
+ * pgio->pg_lseg.
+ */
+ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
+ return 0;
return 1;
}
@@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
if (newlen > desc->pg_bsize)
return 0;
prev = nfs_list_entry(desc->pg_list.prev);
- if (!nfs_can_coalesce_requests(prev, req))
+ if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
} else
desc->pg_base = req->wb_pgbase;
@@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
{
if (!list_empty(&desc->pg_list)) {
- int error = desc->pg_doio(desc->pg_inode,
- &desc->pg_list,
- nfs_page_array_len(desc->pg_base,
- desc->pg_count),
- desc->pg_count,
- desc->pg_ioflags);
+ int error = desc->pg_doio(desc);
if (error < 0)
desc->pg_error = error;
else
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a0fb0a..f38813a0a295 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
#include "pnfs.h"
+#include "iostat.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
- if (nfss->pnfs_curr_ld) {
- nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+ if (nfss->pnfs_curr_ld)
module_put(nfss->pnfs_curr_ld->owner);
- }
nfss->pnfs_curr_ld = NULL;
}
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
goto out_no_driver;
}
server->pnfs_curr_ld = ld_type;
- if (ld_type->set_layoutdriver(server)) {
- printk(KERN_ERR
- "%s: Error initializing mount point for layout driver %u.\n",
- __func__, id);
- module_put(ld_type->owner);
- goto out_no_driver;
- }
+
dprintk("%s: pNFS module for %u set\n", __func__, id);
return;
@@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
put_layout_hdr(NFS_I(ino)->layout);
}
-/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
- * could sleep, so must be called outside of the lock.
- * Returns 1 if object was removed, otherwise return 0.
- */
-static int
-put_lseg_locked(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
+static void
+put_lseg_common(struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del_init(&lseg->pls_list);
+ if (list_empty(&lseg->pls_layout->plh_segs)) {
+ set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+ /* Matched by initial refcount set in alloc_init_layout_hdr */
+ put_layout_hdr_locked(lseg->pls_layout);
+ }
+ rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+void
+put_lseg(struct pnfs_layout_segment *lseg)
{
+ struct inode *inode;
+
+ if (!lseg)
+ return;
+
dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- if (atomic_dec_and_test(&lseg->pls_refcount)) {
- struct inode *ino = lseg->pls_layout->plh_inode;
+ inode = lseg->pls_layout->plh_inode;
+ if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+ LIST_HEAD(free_me);
- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- list_del(&lseg->pls_list);
- if (list_empty(&lseg->pls_layout->plh_segs)) {
- struct nfs_client *clp;
-
- clp = NFS_SERVER(ino)->nfs_client;
- spin_lock(&clp->cl_lock);
- /* List does not take a reference, so no need for put here */
- list_del_init(&lseg->pls_layout->plh_layouts);
- spin_unlock(&clp->cl_lock);
- clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
- }
- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
- list_add(&lseg->pls_list, tmp_list);
- return 1;
+ put_lseg_common(lseg);
+ list_add(&lseg->pls_list, &free_me);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&free_me);
}
- return 0;
}
static bool
@@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
* list. It will now be removed when all
* outstanding io is finished.
*/
- rv = put_lseg_locked(lseg, tmp_list);
+ dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ put_lseg_common(lseg);
+ list_add(&lseg->pls_list, tmp_list);
+ rv = 1;
+ }
}
return rv;
}
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin lo %p\n", __func__, lo);
+ if (list_empty(&lo->plh_segs)) {
+ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+ put_layout_hdr_locked(lo);
+ return 0;
+ }
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return invalid - removed;
}
+/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
struct pnfs_layout_segment *lseg, *tmp;
+ struct pnfs_layout_hdr *lo;
+
+ if (list_empty(free_me))
+ return;
+ lo = list_first_entry(free_me, struct pnfs_layout_segment,
+ pls_list)->pls_layout;
+
+ if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
+ struct nfs_client *clp;
+
+ clp = NFS_SERVER(lo->plh_inode)->nfs_client;
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
list_del(&lseg->pls_list);
free_lseg(lseg);
@@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+ lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
- /* Matched by refcount set to 1 in alloc_init_layout_hdr */
- put_layout_hdr_locked(lo);
}
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
(int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
return true;
return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
(list_empty(&lo->plh_segs) &&
(atomic_read(&lo->plh_outstanding) > lget));
@@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
is_matching_lseg(lseg, iomode)) {
- ret = lseg;
+ ret = get_lseg(lseg);
break;
}
if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino,
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
+ bool first = false;
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
return NULL;
@@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino,
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
- /* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, iomode);
- if (lseg)
- goto out_unlock;
/* if LAYOUTGET already failed once we don't try again */
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
+ /* Check to see if the layout for the given range already exists */
+ lseg = pnfs_find_lseg(lo, iomode);
+ if (lseg)
+ goto out_unlock;
+
if (pnfs_layoutgets_blocked(lo, NULL, 0))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
get_layout_hdr(lo);
- if (list_empty(&lo->plh_segs)) {
+ if (list_empty(&lo->plh_segs))
+ first = true;
+ spin_unlock(&ino->i_lock);
+ if (first) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
@@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino,
list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- spin_unlock(&ino->i_lock);
lseg = send_layoutget(lo, ctx, iomode);
- if (!lseg) {
- spin_lock(&ino->i_lock);
- if (list_empty(&lo->plh_segs)) {
- spin_lock(&clp->cl_lock);
- list_del_init(&lo->plh_layouts);
- spin_unlock(&clp->cl_lock);
- clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- }
- spin_unlock(&ino->i_lock);
+ if (!lseg && first) {
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
}
atomic_dec(&lo->plh_outstanding);
put_layout_hdr(lo);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
- nfsi->layout->plh_flags, lseg);
+ nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
init_lseg(lo, lseg);
lseg->pls_range = res->range;
- *lgp->lsegpp = lseg;
+ *lgp->lsegpp = get_lseg(lseg);
pnfs_insert_layout(lo, lseg);
if (res->return_on_close) {
@@ -829,137 +851,97 @@ out_forget_reply:
goto out;
}
-/*
- * Device ID cache. Currently supports one layout type per struct nfs_client.
- * Add layout type to the lookup key to expand to support multiple types.
- */
-int
-pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
- void (*free_callback)(struct pnfs_deviceid_node *))
+static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev,
+ struct nfs_page *req)
{
- struct pnfs_deviceid_cache *c;
-
- c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
- spin_lock(&clp->cl_lock);
- if (clp->cl_devid_cache != NULL) {
- atomic_inc(&clp->cl_devid_cache->dc_ref);
- dprintk("%s [kref [%d]]\n", __func__,
- atomic_read(&clp->cl_devid_cache->dc_ref));
- kfree(c);
- } else {
- /* kzalloc initializes hlists */
- spin_lock_init(&c->dc_lock);
- atomic_set(&c->dc_ref, 1);
- c->dc_free_callback = free_callback;
- clp->cl_devid_cache = c;
- dprintk("%s [new]\n", __func__);
+ if (pgio->pg_count == prev->wb_bytes) {
+ /* This is first coelesce call for a series of nfs_pages */
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ prev->wb_context,
+ IOMODE_READ);
}
- spin_unlock(&clp->cl_lock);
- return 0;
+ return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
}
-EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
-/*
- * Called from pnfs_layoutdriver_type->free_lseg
- * last layout segment reference frees deviceid
- */
void
-pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
- struct pnfs_deviceid_node *devid)
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
{
- struct nfs4_deviceid *id = &devid->de_id;
- struct pnfs_deviceid_node *d;
- struct hlist_node *n;
- long h = nfs4_deviceid_hash(id);
+ struct pnfs_layoutdriver_type *ld;
- dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
- if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
- return;
+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
+}
- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
- if (!memcmp(&d->de_id, id, sizeof(*id))) {
- hlist_del_rcu(&d->de_node);
- spin_unlock(&c->dc_lock);
- synchronize_rcu();
- c->dc_free_callback(devid);
- return;
- }
- spin_unlock(&c->dc_lock);
- /* Why wasn't it found in the list? */
- BUG();
-}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
-
-/* Find and reference a deviceid */
-struct pnfs_deviceid_node *
-pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
-{
- struct pnfs_deviceid_node *d;
- struct hlist_node *n;
- long hash = nfs4_deviceid_hash(id);
-
- dprintk("--> %s hash %ld\n", __func__, hash);
- rcu_read_lock();
- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
- if (!memcmp(&d->de_id, id, sizeof(*id))) {
- if (!atomic_inc_not_zero(&d->de_ref)) {
- goto fail;
- } else {
- rcu_read_unlock();
- return d;
- }
- }
+static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ if (pgio->pg_count == prev->wb_bytes) {
+ /* This is first coelesce call for a series of nfs_pages */
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ prev->wb_context,
+ IOMODE_RW);
}
-fail:
- rcu_read_unlock();
- return NULL;
+ return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
+}
+
+void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+{
+ struct pnfs_layoutdriver_type *ld;
+
+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+}
+
+enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *wdata,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ struct inode *inode = wdata->inode;
+ enum pnfs_try_status trypnfs;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ wdata->mds_ops = call_ops;
+
+ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+ inode->i_ino, wdata->args.count, wdata->args.offset, how);
+
+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
+ put_lseg(wdata->lseg);
+ wdata->lseg = NULL;
+ } else
+ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
}
-EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
/*
- * Add a deviceid to the cache.
- * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ * Call the appropriate parallel I/O subsystem read function.
*/
-struct pnfs_deviceid_node *
-pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
-{
- struct pnfs_deviceid_node *d;
- long hash = nfs4_deviceid_hash(&new->de_id);
-
- dprintk("--> %s hash %ld\n", __func__, hash);
- spin_lock(&c->dc_lock);
- d = pnfs_find_get_deviceid(c, &new->de_id);
- if (d) {
- spin_unlock(&c->dc_lock);
- dprintk("%s [discard]\n", __func__);
- c->dc_free_callback(new);
- return d;
- }
- INIT_HLIST_NODE(&new->de_node);
- atomic_set(&new->de_ref, 1);
- hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
- spin_unlock(&c->dc_lock);
- dprintk("%s [new]\n", __func__);
- return new;
-}
-EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
-
-void
-pnfs_put_deviceid_cache(struct nfs_client *clp)
+enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *rdata,
+ const struct rpc_call_ops *call_ops)
{
- struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+ struct inode *inode = rdata->inode;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ enum pnfs_try_status trypnfs;
- dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
- if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
- int i;
- /* Verify cache is empty */
- for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
- BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
- clp->cl_devid_cache = NULL;
- spin_unlock(&clp->cl_lock);
- kfree(local);
+ rdata->mds_ops = call_ops;
+
+ dprintk("%s: Reading ino:%lu %u@%llu\n",
+ __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+
+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
+ put_lseg(rdata->lseg);
+ rdata->lseg = NULL;
+ } else {
+ nfs_inc_stats(inode, NFSIOS_PNFS_READ);
}
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..6380b9405bcd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+#include <linux/nfs_page.h>
+
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
NFS_LSEG_ROC, /* roc bit received from server */
@@ -43,6 +45,11 @@ struct pnfs_layout_segment {
struct pnfs_layout_hdr *pls_layout;
};
+enum pnfs_try_status {
+ PNFS_ATTEMPTED = 0,
+ PNFS_NOT_ATTEMPTED = 1,
+};
+
#ifdef CONFIG_NFS_V4_1
#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type {
const u32 id;
const char *name;
struct module *owner;
- int (*set_layoutdriver) (struct nfs_server *);
- int (*clear_layoutdriver) (struct nfs_server *);
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+ /* test for nfs page cache coalescing */
+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+
+ /*
+ * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+ * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+ */
+ enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
+ enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
};
struct pnfs_layout_hdr {
@@ -90,52 +105,6 @@ struct pnfs_device {
unsigned int pglen;
};
-/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_DEVICE_ID_HASH_BITS 5
-#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
-#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_deviceid_hash(struct nfs4_deviceid *id)
-{
- unsigned char *cptr = (unsigned char *)id->data;
- unsigned int nbytes = NFS4_DEVICEID4_SIZE;
- u32 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_DEVICE_ID_HASH_MASK;
-}
-
-struct pnfs_deviceid_node {
- struct hlist_node de_node;
- struct nfs4_deviceid de_id;
- atomic_t de_ref;
-};
-
-struct pnfs_deviceid_cache {
- spinlock_t dc_lock;
- atomic_t dc_ref;
- void (*dc_free_callback)(struct pnfs_deviceid_node *);
- struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
-};
-
-extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
- void (*free_callback)(struct pnfs_deviceid_node *));
-extern void pnfs_put_deviceid_cache(struct nfs_client *);
-extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
- struct pnfs_deviceid_cache *,
- struct nfs4_deviceid *);
-extern struct pnfs_deviceid_node *pnfs_add_deviceid(
- struct pnfs_deviceid_cache *,
- struct pnfs_deviceid_node *);
-extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
- struct pnfs_deviceid_node *devid);
-
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
@@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
+void put_lseg(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
+ const struct rpc_call_ops *, int);
+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
+ const struct rpc_call_ops *);
+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode)
NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+ if (lseg) {
+ atomic_inc(&lseg->pls_refcount);
+ smp_mb__after_atomic_inc();
+ }
+ return lseg;
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
@@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
}
static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+ return NULL;
+}
+
+static inline void put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type)
{
return NULL;
}
+static inline enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *data,
+ const struct rpc_call_ops *call_ops)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static inline enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
@@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{
}
+static inline void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+ pgio->pg_test = NULL;
+}
+
+static inline void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+ pgio->pg_test = NULL;
+}
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..b8ec170f2a0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.lock = nfs_proc_lock,
.lock_check_bounds = nfs_lock_check_bounds,
.close_context = nfs_close_context,
+ .init_client = nfs_init_client,
};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/module.h>
#include <asm/system.h>
+#include "pnfs.h"
#include "nfs4_fs.h"
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
-#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
-static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
-static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
static const struct rpc_call_ops nfs_read_partial_ops;
static const struct rpc_call_ops nfs_read_full_ops;
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
static void nfs_readdata_release(struct nfs_read_data *rdata)
{
+ put_lseg(rdata->lseg);
put_nfs_open_context(rdata->args.context);
nfs_readdata_free(rdata);
}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
- LIST_HEAD(one_request);
struct nfs_page *new;
unsigned int len;
+ struct nfs_pageio_descriptor pgio;
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- pnfs_update_layout(inode, ctx, IOMODE_READ);
new = nfs_create_request(ctx, inode, page, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
- nfs_list_add_request(new, &one_request);
+ nfs_pageio_init(&pgio, inode, NULL, 0, 0);
+ nfs_list_add_request(new, &pgio.pg_list);
+ pgio.pg_count = len;
+
if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
- nfs_pagein_multi(inode, &one_request, 1, len, 0);
+ nfs_pagein_multi(&pgio);
else
- nfs_pagein_one(inode, &one_request, 1, len, 0);
+ nfs_pagein_one(&pgio);
return 0;
}
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
nfs_release_request(req);
}
-/*
- * Set up the NFS read request struct
- */
-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset)
+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = req->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
.task = &data->task,
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.rpc_message = &msg,
.callback_ops = call_ops,
.callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
.flags = RPC_TASK_ASYNC | swap_flags,
};
+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->read_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
+ "offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_read);
+
+/*
+ * Set up the NFS read request struct
+ */
+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+ const struct rpc_call_ops *call_ops,
+ unsigned int count, unsigned int offset,
+ struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
data->req = req;
data->inode = inode;
- data->cred = msg.rpc_cred;
+ data->cred = req->wb_context->cred;
+ data->lseg = get_lseg(lseg);
data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
data->res.eof = 0;
nfs_fattr_init(&data->fattr);
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- count,
- (unsigned long long)data->args.offset);
+ if (data->lseg &&
+ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
+ return 0;
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
- rpc_put_task(task);
- return 0;
+ return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
}
static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
* won't see the new data until our attribute cache is updated. This is more
* or less conventional NFS client behavior.
*/
-static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
{
- struct nfs_page *req = nfs_list_entry(head->next);
+ struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
struct page *page = req->wb_page;
struct nfs_read_data *data;
- size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
+ size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
unsigned int offset;
int requests = 0;
int ret = 0;
+ struct pnfs_layout_segment *lseg;
LIST_HEAD(list);
nfs_list_remove_request(req);
- nbytes = count;
+ nbytes = desc->pg_count;
do {
size_t len = min(nbytes,rsize);
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
} while(nbytes != 0);
atomic_set(&req->wb_complete, requests);
+ BUG_ON(desc->pg_lseg != NULL);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
ClearPageError(page);
offset = 0;
- nbytes = count;
+ nbytes = desc->pg_count;
do {
int ret2;
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
if (nbytes < rsize)
rsize = nbytes;
ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
- rsize, offset);
+ rsize, offset, lseg);
if (ret == 0)
ret = ret2;
offset += rsize;
nbytes -= rsize;
} while (nbytes != 0);
+ put_lseg(lseg);
+ desc->pg_lseg = NULL;
return ret;
@@ -300,16 +325,21 @@ out_bad:
return -ENOMEM;
}
-static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
{
struct nfs_page *req;
struct page **pages;
struct nfs_read_data *data;
+ struct list_head *head = &desc->pg_list;
+ struct pnfs_layout_segment *lseg = desc->pg_lseg;
int ret = -ENOMEM;
- data = nfs_readdata_alloc(npages);
- if (!data)
- goto out_bad;
+ data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
+ desc->pg_count));
+ if (!data) {
+ nfs_async_read_error(head);
+ goto out;
+ }
pages = data->pagevec;
while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
*pages++ = req->wb_page;
}
req = nfs_list_entry(data->pages.next);
+ if ((!lseg) && list_is_singular(&data->pages))
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
- return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-out_bad:
- nfs_async_read_error(head);
+ ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
+ 0, lseg);
+out:
+ put_lseg(lseg);
+ desc->pg_lseg = NULL;
return ret;
}
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
return;
/* Yes, so retry the read at the end of the data */
+ data->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
+ pnfs_pageio_init_read(&pgio, inode);
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d3286583009a..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1008,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
return 1;
}
+static int nfs_get_option_str(substring_t args[], char **option)
+{
+ kfree(*option);
+ *option = match_strdup(args);
+ return !option;
+}
+
+static int nfs_get_option_ul(substring_t args[], unsigned long *option)
+{
+ int rc;
+ char *string;
+
+ string = match_strdup(args);
+ if (string == NULL)
+ return -ENOMEM;
+ rc = strict_strtoul(string, 10, option);
+ kfree(string);
+
+ return rc;
+}
+
/*
* Error-check and convert a string of mount options from user space into
* a data structure. The whole mount string is processed; bad options are
@@ -1156,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
* options that take numeric values
*/
case Opt_port:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0 || option > USHRT_MAX)
+ if (nfs_get_option_ul(args, &option) ||
+ option > USHRT_MAX)
goto out_invalid_value;
mnt->nfs_server.port = option;
break;
case Opt_rsize:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->rsize = option;
break;
case Opt_wsize:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->wsize = option;
break;
case Opt_bsize:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->bsize = option;
break;
case Opt_timeo:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0 || option == 0)
+ if (nfs_get_option_ul(args, &option) || option == 0)
goto out_invalid_value;
mnt->timeo = option;
break;
case Opt_retrans:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0 || option == 0)
+ if (nfs_get_option_ul(args, &option) || option == 0)
goto out_invalid_value;
mnt->retrans = option;
break;
case Opt_acregmin:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->acregmin = option;
break;
case Opt_acregmax:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->acregmax = option;
break;
case Opt_acdirmin:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->acdirmin = option;
break;
case Opt_acdirmax:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->acdirmax = option;
break;
case Opt_actimeo:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->acregmin = mnt->acregmax =
mnt->acdirmin = mnt->acdirmax = option;
break;
case Opt_namelen:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
mnt->namlen = option;
break;
case Opt_mountport:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0 || option > USHRT_MAX)
+ if (nfs_get_option_ul(args, &option) ||
+ option > USHRT_MAX)
goto out_invalid_value;
mnt->mount_server.port = option;
break;
case Opt_mountvers:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0 ||
+ if (nfs_get_option_ul(args, &option) ||
option < NFS_MNT_VERSION ||
option > NFS_MNT3_VERSION)
goto out_invalid_value;
mnt->mount_server.version = option;
break;
case Opt_nfsvers:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
switch (option) {
case NFS2_VERSION:
@@ -1324,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
}
break;
case Opt_minorversion:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
- rc = strict_strtoul(string, 10, &option);
- kfree(string);
- if (rc != 0)
+ if (nfs_get_option_ul(args, &option))
goto out_invalid_value;
if (option > NFS4_MAX_MINOR_VERSION)
goto out_invalid_value;
@@ -1365,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
case Opt_xprt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- kfree(string);
break;
case Opt_xprt_tcp6:
protofamily = AF_INET6;
case Opt_xprt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- kfree(string);
break;
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
xprt_load_transport(string);
- kfree(string);
break;
default:
dfprintk(MOUNT, "NFS: unrecognized "
@@ -1387,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
kfree(string);
return 0;
}
+ kfree(string);
break;
case Opt_mountproto:
string = match_strdup(args);
@@ -1429,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
goto out_invalid_address;
break;
case Opt_clientaddr:
- string = match_strdup(args);
- if (string == NULL)
+ if (nfs_get_option_str(args, &mnt->client_address))
goto out_nomem;
- kfree(mnt->client_address);
- mnt->client_address = string;
break;
case Opt_mounthost:
- string = match_strdup(args);
- if (string == NULL)
+ if (nfs_get_option_str(args,
+ &mnt->mount_server.hostname))
goto out_nomem;
- kfree(mnt->mount_server.hostname);
- mnt->mount_server.hostname = string;
break;
case Opt_mountaddr:
string = match_strdup(args);
@@ -1480,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
};
break;
case Opt_fscache_uniq:
- string = match_strdup(args);
- if (string == NULL)
+ if (nfs_get_option_str(args, &mnt->fscache_uniq))
goto out_nomem;
- kfree(mnt->fscache_uniq);
- mnt->fscache_uniq = string;
mnt->options |= NFS_OPTION_FSCACHE;
break;
case Opt_local_lock:
@@ -1694,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
return nfs_walk_authlist(args, &request);
}
-static int nfs_parse_simple_hostname(const char *dev_name,
- char **hostname, size_t maxnamlen,
- char **export_path, size_t maxpathlen)
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path. If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
{
size_t len;
- char *colon, *comma;
+ char *end;
- colon = strchr(dev_name, ':');
- if (colon == NULL)
- goto out_bad_devname;
-
- len = colon - dev_name;
- if (len > maxnamlen)
- goto out_hostname;
-
- /* N.B. caller will free nfs_server.hostname in all cases */
- *hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!*hostname)
- goto out_nomem;
-
- /* kill possible hostname list: not supported */
- comma = strchr(*hostname, ',');
- if (comma != NULL) {
- if (comma == *hostname)
+ /* Is the host name protected with square brakcets? */
+ if (*dev_name == '[') {
+ end = strchr(++dev_name, ']');
+ if (end == NULL || end[1] != ':')
goto out_bad_devname;
- *comma = '\0';
- }
-
- colon++;
- len = strlen(colon);
- if (len > maxpathlen)
- goto out_path;
- *export_path = kstrndup(colon, len, GFP_KERNEL);
- if (!*export_path)
- goto out_nomem;
-
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
- return 0;
-
-out_bad_devname:
- dfprintk(MOUNT, "NFS: device name not in host:path format\n");
- return -EINVAL;
-out_nomem:
- dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
- return -ENOMEM;
-
-out_hostname:
- dfprintk(MOUNT, "NFS: server hostname too long\n");
- return -ENAMETOOLONG;
-
-out_path:
- dfprintk(MOUNT, "NFS: export pathname too long\n");
- return -ENAMETOOLONG;
-}
-
-/*
- * Hostname has square brackets around it because it contains one or
- * more colons. We look for the first closing square bracket, and a
- * colon must follow it.
- */
-static int nfs_parse_protected_hostname(const char *dev_name,
- char **hostname, size_t maxnamlen,
- char **export_path, size_t maxpathlen)
-{
- size_t len;
- char *start, *end;
+ len = end - dev_name;
+ end++;
+ } else {
+ char *comma;
- start = (char *)(dev_name + 1);
+ end = strchr(dev_name, ':');
+ if (end == NULL)
+ goto out_bad_devname;
+ len = end - dev_name;
- end = strchr(start, ']');
- if (end == NULL)
- goto out_bad_devname;
- if (*(end + 1) != ':')
- goto out_bad_devname;
+ /* kill possible hostname list: not supported */
+ comma = strchr(dev_name, ',');
+ if (comma != NULL && comma < end)
+ *comma = 0;
+ }
- len = end - start;
if (len > maxnamlen)
goto out_hostname;
/* N.B. caller will free nfs_server.hostname in all cases */
- *hostname = kstrndup(start, len, GFP_KERNEL);
+ *hostname = kstrndup(dev_name, len, GFP_KERNEL);
if (*hostname == NULL)
goto out_nomem;
-
- end += 2;
- len = strlen(end);
+ len = strlen(++end);
if (len > maxpathlen)
goto out_path;
*export_path = kstrndup(end, len, GFP_KERNEL);
if (!*export_path)
goto out_nomem;
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
return 0;
out_bad_devname:
@@ -1807,29 +1700,6 @@ out_path:
}
/*
- * Split "dev_name" into "hostname:export_path".
- *
- * The leftmost colon demarks the split between the server's hostname
- * and the export path. If the hostname starts with a left square
- * bracket, then it may contain colons.
- *
- * Note: caller frees hostname and export path, even on error.
- */
-static int nfs_parse_devname(const char *dev_name,
- char **hostname, size_t maxnamlen,
- char **export_path, size_t maxpathlen)
-{
- if (*dev_name == '[')
- return nfs_parse_protected_hostname(dev_name,
- hostname, maxnamlen,
- export_path, maxpathlen);
-
- return nfs_parse_simple_hostname(dev_name,
- hostname, maxnamlen,
- export_path, maxpathlen);
-}
-
-/*
* Validate the NFS2/NFS3 mount data
* - fills in the mount root filehandle
*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 42b92d7a9cc4..47a3ad63e0d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
#include "iostat.h"
#include "nfs4_fs.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
static void nfs_writedata_release(struct nfs_write_data *wdata)
{
+ put_lseg(wdata->lseg);
put_nfs_open_context(wdata->args.context);
nfs_writedata_free(wdata);
}
@@ -781,25 +783,21 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_write_rpcsetup(struct nfs_page *req,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset,
- int how)
+int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int priority = flush_task_priority(how);
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = req->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.task = &data->task,
.rpc_message = &msg,
.callback_ops = call_ops,
@@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
};
int ret = 0;
+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->write_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated write call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+ if (how & FLUSH_SYNC) {
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0)
+ ret = task->tk_status;
+ }
+ rpc_put_task(task);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_write);
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int nfs_write_rpcsetup(struct nfs_page *req,
+ struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops,
+ unsigned int count, unsigned int offset,
+ struct pnfs_layout_segment *lseg,
+ int how)
+{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
data->req = req;
data->inode = inode = req->wb_context->path.dentry->d_inode;
- data->cred = msg.rpc_cred;
+ data->cred = req->wb_context->cred;
+ data->lseg = get_lseg(lseg);
data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- dprintk("NFS: %5u initiated write call "
- "(req %s/%lld, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- count,
- (unsigned long long)data->args.offset);
+ if (data->lseg &&
+ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
+ return 0;
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
- rpc_put_task(task);
-out:
- return ret;
+ return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
}
/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
* Generate multiple small requests to write out a single
* contiguous dirty area on one page.
*/
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
{
- struct nfs_page *req = nfs_list_entry(head->next);
+ struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
struct page *page = req->wb_page;
struct nfs_write_data *data;
- size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
+ size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
unsigned int offset;
int requests = 0;
int ret = 0;
+ struct pnfs_layout_segment *lseg;
LIST_HEAD(list);
nfs_list_remove_request(req);
- nbytes = count;
+ nbytes = desc->pg_count;
do {
size_t len = min(nbytes, wsize);
@@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
} while (nbytes != 0);
atomic_set(&req->wb_complete, requests);
+ BUG_ON(desc->pg_lseg);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
ClearPageError(page);
offset = 0;
- nbytes = count;
+ nbytes = desc->pg_count;
do {
int ret2;
@@ -919,13 +941,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
if (nbytes < wsize)
wsize = nbytes;
ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
- wsize, offset, how);
+ wsize, offset, lseg, desc->pg_ioflags);
if (ret == 0)
ret = ret2;
offset += wsize;
nbytes -= wsize;
} while (nbytes != 0);
+ put_lseg(lseg);
+ desc->pg_lseg = NULL;
return ret;
out_bad:
@@ -946,16 +970,26 @@ out_bad:
* This is the case if nfs_updatepage detects a conflicting request
* that has been written but not committed.
*/
-static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
{
struct nfs_page *req;
struct page **pages;
struct nfs_write_data *data;
+ struct list_head *head = &desc->pg_list;
+ struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ int ret;
- data = nfs_writedata_alloc(npages);
- if (!data)
- goto out_bad;
-
+ data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
+ desc->pg_count));
+ if (!data) {
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_redirty_request(req);
+ }
+ ret = -ENOMEM;
+ goto out;
+ }
pages = data->pagevec;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
@@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
*pages++ = req->wb_page;
}
req = nfs_list_entry(data->pages.next);
+ if ((!lseg) && list_is_singular(&data->pages))
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
/* Set up the argument struct */
- return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
- out_bad:
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_redirty_request(req);
- }
- return -ENOMEM;
+ ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
+out:
+ put_lseg(lseg); /* Cleans any gotten in ->pg_test */
+ desc->pg_lseg = NULL;
+ return ret;
}
static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
{
size_t wsize = NFS_SERVER(inode)->wsize;
+ pnfs_pageio_init_write(pgio, inode);
+
if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else
@@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
/*
* This function is called when the WRITE call is complete.
*/
-int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct nfs_writeargs *argp = &data->args;
struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
*/
status = NFS_PROTO(data->inode)->write_done(task, data);
if (status != 0)
- return status;
+ return;
nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
*/
static unsigned long complain;
+ /* Note this will print the MDS for a DS write */
if (time_before(complain, jiffies)) {
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
+ data->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
@@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
argp->stable = NFS_FILE_SYNC;
}
nfs_restart_rpc(task, server->nfs_client);
- return -EAGAIN;
+ return;
}
if (time_before(complain, jiffies)) {
printk(KERN_WARNING
@@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
/* Can't do anything about it except throw an error. */
task->tk_status = -EIO;
}
- return 0;
+ return;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8b61220cffc5..6b1305dc26c0 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
#endif
/*
- * fanotify_user_setup - Our initialization function. Note that we cannnot return
+ * fanotify_user_setup - Our initialization function. Note that we cannot return
* error because we have compiled-in VFS hooks. So an (unlikely) failure here
* must result in panic().
*/
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4cd5d5d78f9f..bd46e7c8a0ef 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -841,7 +841,7 @@ out:
}
/*
- * inotify_user_setup - Our initialization function. Note that we cannnot return
+ * inotify_user_setup - Our initialization function. Note that we cannot return
* error because we have compiled-in VFS hooks. So an (unlikely) failure here
* must result in panic().
*/
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c7..f97b6f1c61dd 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -354,7 +354,7 @@ static inline int ocfs2_match(int len,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
struct inode *dir,
const char *name, int namelen,
unsigned long offset,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 393f3f659da7..de4ff29f1e05 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode)
return *ptr != ~0;
}
-static int omfs_unlink(struct inode *dir, struct dentry *dentry)
+static int omfs_remove(struct inode *dir, struct dentry *dentry)
{
- int ret;
struct inode *inode = dentry->d_inode;
+ int ret;
+
+ if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
+ return -ENOTEMPTY;
ret = omfs_delete_entry(dentry);
if (ret)
- goto end_unlink;
-
- inode_dec_link_count(inode);
+ return ret;
+
+ clear_nlink(inode);
+ mark_inode_dirty(inode);
mark_inode_dirty(dir);
-
-end_unlink:
- return ret;
-}
-
-static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
- int err = -ENOTEMPTY;
- struct inode *inode = dentry->d_inode;
-
- if (omfs_dir_is_empty(inode)) {
- err = omfs_unlink(dir, dentry);
- if (!err)
- inode_dec_link_count(inode);
- }
- return err;
+ return 0;
}
static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
@@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
OMFS_NAMELEN), filp->f_pos, self, d_type);
- if (res == 0)
- filp->f_pos++;
brelse(bh);
+ if (res < 0)
+ break;
+ filp->f_pos++;
}
out:
return res;
@@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
- struct buffer_head *bh;
- int is_dir;
int err;
- is_dir = S_ISDIR(old_inode->i_mode);
-
if (new_inode) {
/* overwriting existing file/dir */
- err = -ENOTEMPTY;
- if (is_dir && !omfs_dir_is_empty(new_inode))
- goto out;
-
- err = -ENOENT;
- bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
- new_dentry->d_name.len);
- if (IS_ERR(bh))
- goto out;
- brelse(bh);
-
- err = omfs_unlink(new_dir, new_dentry);
+ err = omfs_remove(new_dir, new_dentry);
if (err)
goto out;
}
/* since omfs locates files by name, we need to unlink _before_
* adding the new link or we won't find the old one */
- inode_inc_link_count(old_inode);
- err = omfs_unlink(old_dir, old_dentry);
- if (err) {
- inode_dec_link_count(old_inode);
+ err = omfs_delete_entry(old_dentry);
+ if (err)
goto out;
- }
+ mark_inode_dirty(old_dir);
err = omfs_add_link(new_dentry, old_inode);
if (err)
goto out;
old_inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(old_inode);
out:
return err;
}
@@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = {
.mkdir = omfs_mkdir,
.rename = omfs_rename,
.create = omfs_create,
- .unlink = omfs_unlink,
- .rmdir = omfs_rmdir,
+ .unlink = omfs_remove,
+ .rmdir = omfs_remove,
};
const struct file_operations omfs_dir_operations = {
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406b..f1ab3604db5a 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
if (!info->dqi_priv) {
printk(KERN_WARNING
"Not enough memory for quota information structure.\n");
- return -1;
+ return -ENOMEM;
}
qinfo = info->dqi_priv;
if (version == 0) {
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 830e3f76f442..1d1859dc3de5 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -44,23 +44,20 @@ config UBIFS_FS_ZLIB
# Debugging-related stuff
config UBIFS_FS_DEBUG
- bool "Enable debugging"
+ bool "Enable debugging support"
depends on UBIFS_FS
select DEBUG_FS
select KALLSYMS_ALL
help
- This option enables UBIFS debugging.
-
-config UBIFS_FS_DEBUG_MSG_LVL
- int "Default message level (0 = no extra messages, 3 = lots)"
- depends on UBIFS_FS_DEBUG
- default "0"
- help
- This controls the amount of debugging messages produced by UBIFS.
- If reporting bugs, please try to have available a full dump of the
- messages at level 1 while the misbehaviour was occurring. Level 2
- may become necessary if level 1 messages were not enough to find the
- bug. Generally Level 3 should be avoided.
+ This option enables UBIFS debugging support. It makes sure various
+ assertions, self-checks, debugging messages and test modes are compiled
+ in (this all is compiled out otherwise). Assertions are light-weight
+ and this option also enables them. Self-checks, debugging messages and
+ test modes are switched off by default. Thus, it is safe and actually
+ recommended to have debugging support enabled, and it should not slow
+ down UBIFS. You can then further enable / disable individual debugging
+ features using UBIFS module parameters and the corresponding sysfs
+ interfaces.
config UBIFS_FS_DEBUG_CHKS
bool "Enable extra checks"
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 02429d81ca33..b148fbc80f8d 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -48,6 +48,56 @@
#include <linux/slab.h>
#include "ubifs.h"
+/*
+ * nothing_to_commit - check if there is nothing to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which checks if there is anything to commit. It is
+ * used as an optimization to avoid starting the commit if it is not really
+ * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
+ * writing the commit start node to the log), and it is better to avoid doing
+ * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
+ * nothing to commit, it is more optimal to avoid any flash I/O.
+ *
+ * This function has to be called with @c->commit_sem locked for writing -
+ * this function does not take LPT/TNC locks because the @c->commit_sem
+ * guarantees that we have exclusive access to the TNC and LPT data structures.
+ *
+ * This function returns %1 if there is nothing to commit and %0 otherwise.
+ */
+static int nothing_to_commit(struct ubifs_info *c)
+{
+ /*
+ * During mounting or remounting from R/O mode to R/W mode we may
+ * commit for various recovery-related reasons.
+ */
+ if (c->mounting || c->remounting_rw)
+ return 0;
+
+ /*
+ * If the root TNC node is dirty, we definitely have something to
+ * commit.
+ */
+ if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
+ return 0;
+
+ /*
+ * Even though the TNC is clean, the LPT tree may have dirty nodes. For
+ * example, this may happen if the budgeting subsystem invoked GC to
+ * make some free space, and the GC found an LEB with only dirty and
+ * free space. In this case GC would just change the lprops of this
+ * LEB (by turning all space into free space) and unmap it.
+ */
+ if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
+ return 0;
+
+ ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
+ ubifs_assert(c->dirty_pn_cnt == 0);
+ ubifs_assert(c->dirty_nn_cnt == 0);
+
+ return 1;
+}
+
/**
* do_commit - commit the journal.
* @c: UBIFS file-system description object
@@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c)
goto out_up;
}
+ if (nothing_to_commit(c)) {
+ up_write(&c->commit_sem);
+ err = 0;
+ goto out_cancel;
+ }
+
/* Sync all write buffers (necessary for recovery) */
for (i = 0; i < c->jhead_cnt; i++) {
err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
@@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c)
if (err)
goto out;
+out_cancel:
spin_lock(&c->cs_lock);
c->cmt_state = COMMIT_RESTING;
wake_up(&c->cmt_wq);
dbg_cmt("commit end");
spin_unlock(&c->cs_lock);
-
return 0;
out_up:
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bee4dbffc31..01c2b028e525 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock);
static char dbg_key_buf0[128];
static char dbg_key_buf1[128];
-unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT;
-unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT;
+unsigned int ubifs_msg_flags;
+unsigned int ubifs_chk_flags;
unsigned int ubifs_tst_flags;
module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
@@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
+ void *buf;
if (dbg_failure_mode)
return;
printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
current->pid, lnum);
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
+
+ buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+ if (!buf) {
+ ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
+ return;
+ }
+
+ sleb = ubifs_scan(c, lnum, 0, buf, 0);
if (IS_ERR(sleb)) {
ubifs_err("scan error %d", (int)PTR_ERR(sleb));
- return;
+ goto out;
}
printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
@@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
current->pid, lnum);
ubifs_scan_destroy(sleb);
+
+out:
+ vfree(buf);
return;
}
@@ -2690,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c)
if (!c->dbg)
return -ENOMEM;
- c->dbg->buf = vmalloc(c->leb_size);
- if (!c->dbg->buf)
- goto out;
-
failure_mode_init(c);
return 0;
-
-out:
- kfree(c->dbg);
- return -ENOMEM;
}
/**
@@ -2709,7 +2712,6 @@ out:
void ubifs_debugging_exit(struct ubifs_info *c)
{
failure_mode_exit(c);
- vfree(c->dbg->buf);
kfree(c->dbg);
}
@@ -2813,19 +2815,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
}
fname = "dump_lprops";
- dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+ dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
if (IS_ERR(dent))
goto out_remove;
d->dfs_dump_lprops = dent;
fname = "dump_budg";
- dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+ dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
if (IS_ERR(dent))
goto out_remove;
d->dfs_dump_budg = dent;
fname = "dump_tnc";
- dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+ dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
if (IS_ERR(dent))
goto out_remove;
d->dfs_dump_tnc = dent;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 69ebe4729151..919f0de29d8f 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,6 @@
/**
* ubifs_debug_info - per-FS debugging information.
- * @buf: a buffer of LEB size, used for various purposes
* @old_zroot: old index root - used by 'dbg_check_old_index()'
* @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
* @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
@@ -54,7 +53,6 @@
* dfs_dump_tnc: "dump TNC" debugfs knob
*/
struct ubifs_debug_info {
- void *buf;
struct ubifs_zbranch old_zroot;
int old_zroot_level;
unsigned long long old_zroot_sqnum;
@@ -173,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c,
#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
/*
- * Debugging message type flags (must match msg_type_names in debug.c).
+ * Debugging message type flags.
*
* UBIFS_MSG_GEN: general messages
* UBIFS_MSG_JNL: journal messages
@@ -205,14 +203,8 @@ enum {
UBIFS_MSG_RCVRY = 0x1000,
};
-/* Debugging message type flags for each default debug message level */
-#define UBIFS_MSG_LVL_0 0
-#define UBIFS_MSG_LVL_1 0x1
-#define UBIFS_MSG_LVL_2 0x7f
-#define UBIFS_MSG_LVL_3 0xffff
-
/*
- * Debugging check flags (must match chk_names in debug.c).
+ * Debugging check flags.
*
* UBIFS_CHK_GEN: general checks
* UBIFS_CHK_TNC: check TNC
@@ -233,7 +225,7 @@ enum {
};
/*
- * Special testing flags (must match tst_names in debug.c).
+ * Special testing flags.
*
* UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
* UBIFS_TST_RCVRY: failure mode for recovery testing
@@ -243,22 +235,6 @@ enum {
UBIFS_TST_RCVRY = 0x4,
};
-#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
-#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
-#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
-#else
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
-#endif
-
-#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
-#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
-#else
-#define UBIFS_CHK_FLAGS_DEFAULT 0
-#endif
-
extern spinlock_t dbg_lock;
extern unsigned int ubifs_msg_flags;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d82173182eeb..dfd168b7807e 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -31,6 +31,26 @@
* buffer is full or when it is not used for some time (by timer). This is
* similar to the mechanism is used by JFFS2.
*
+ * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
+ * write size (@c->max_write_size). The latter is the maximum amount of bytes
+ * the underlying flash is able to program at a time, and writing in
+ * @c->max_write_size units should presumably be faster. Obviously,
+ * @c->min_io_size <= @c->max_write_size. Write-buffers are of
+ * @c->max_write_size bytes in size for maximum performance. However, when a
+ * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
+ * boundary) which contains data is written, not the whole write-buffer,
+ * because this is more space-efficient.
+ *
+ * This optimization adds few complications to the code. Indeed, on the one
+ * hand, we want to write in optimal @c->max_write_size bytes chunks, which
+ * also means aligning writes at the @c->max_write_size bytes offsets. On the
+ * other hand, we do not want to waste space when synchronizing the write
+ * buffer, so during synchronization we writes in smaller chunks. And this makes
+ * the next write offset to be not aligned to @c->max_write_size bytes. So the
+ * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
+ * to @c->max_write_size bytes again. We do this by temporarily shrinking
+ * write-buffer size (@wbuf->size).
+ *
* Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
* mutexes defined inside these objects. Since sometimes upper-level code
* has to lock the write-buffer (e.g. journal space reservation code), many
@@ -46,8 +66,8 @@
* UBIFS uses padding when it pads to the next min. I/O unit. In this case it
* uses padding nodes or padding bytes, if the padding node does not fit.
*
- * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
- * every time they are read from the flash media.
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
+ * they are read from the flash media.
*/
#include <linux/crc32.h>
@@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
* This function may skip data nodes CRC checking if @c->no_chk_data_crc is
* true, which is controlled by corresponding UBIFS mount option. However, if
* @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
- * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
- * ignored and CRC is checked.
+ * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
+ * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
+ * is checked. This is because during mounting or re-mounting from R/O mode to
+ * R/W mode we may read journal nodes (when replying the journal or doing the
+ * recovery) and the journal nodes may potentially be corrupted, so checking is
+ * required.
*
* This function returns zero in case of success and %-EUCLEAN in case of bad
* CRC or magic.
@@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
node_len > c->ranges[type].max_len)
goto out_len;
- if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
- c->no_chk_data_crc)
+ if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
+ !c->remounting_rw && c->no_chk_data_crc)
return 0;
crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
@@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
*
* This function synchronizes write-buffer @buf and returns zero in case of
* success or a negative error code in case of failure.
+ *
+ * Note, although write-buffers are of @c->max_write_size, this function does
+ * not necessarily writes all @c->max_write_size bytes to the flash. Instead,
+ * if the write-buffer is only partially filled with data, only the used part
+ * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
+ * This way we waste less space.
*/
int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
{
struct ubifs_info *c = wbuf->c;
- int err, dirt;
+ int err, dirt, sync_len;
cancel_wbuf_timer_nolock(wbuf);
if (!wbuf->used || wbuf->lnum == -1)
@@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
dbg_io("LEB %d:%d, %d bytes, jhead %s",
wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
ubifs_assert(!(wbuf->avail & 7));
- ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+ ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
+ ubifs_assert(wbuf->size >= c->min_io_size);
+ ubifs_assert(wbuf->size <= c->max_write_size);
+ ubifs_assert(wbuf->size % c->min_io_size == 0);
ubifs_assert(!c->ro_media && !c->ro_mount);
+ if (c->leb_size - wbuf->offs >= c->max_write_size)
+ ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
if (c->ro_error)
return -EROFS;
- ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
+ /*
+ * Do not write whole write buffer but write only the minimum necessary
+ * amount of min. I/O units.
+ */
+ sync_len = ALIGN(wbuf->used, c->min_io_size);
+ dirt = sync_len - wbuf->used;
+ if (dirt)
+ ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
- c->min_io_size, wbuf->dtype);
+ sync_len, wbuf->dtype);
if (err) {
ubifs_err("cannot write %d bytes to LEB %d:%d",
- c->min_io_size, wbuf->lnum, wbuf->offs);
+ sync_len, wbuf->lnum, wbuf->offs);
dbg_dump_stack();
return err;
}
- dirt = wbuf->avail;
-
spin_lock(&wbuf->lock);
- wbuf->offs += c->min_io_size;
- wbuf->avail = c->min_io_size;
+ wbuf->offs += sync_len;
+ /*
+ * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
+ * But our goal is to optimize writes and make sure we write in
+ * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
+ * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
+ * sure that @wbuf->offs + @wbuf->size is aligned to
+ * @c->max_write_size. This way we make sure that after next
+ * write-buffer flush we are again at the optimal offset (aligned to
+ * @c->max_write_size).
+ */
+ if (c->leb_size - wbuf->offs < c->max_write_size)
+ wbuf->size = c->leb_size - wbuf->offs;
+ else if (wbuf->offs & (c->max_write_size - 1))
+ wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
+ else
+ wbuf->size = c->max_write_size;
+ wbuf->avail = wbuf->size;
wbuf->used = 0;
wbuf->next_ino = 0;
spin_unlock(&wbuf->lock);
@@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
spin_lock(&wbuf->lock);
wbuf->lnum = lnum;
wbuf->offs = offs;
- wbuf->avail = c->min_io_size;
+ if (c->leb_size - wbuf->offs < c->max_write_size)
+ wbuf->size = c->leb_size - wbuf->offs;
+ else if (wbuf->offs & (c->max_write_size - 1))
+ wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
+ else
+ wbuf->size = c->max_write_size;
+ wbuf->avail = wbuf->size;
wbuf->used = 0;
spin_unlock(&wbuf->lock);
wbuf->dtype = dtype;
@@ -500,8 +562,9 @@ out_timers:
*
* This function writes data to flash via write-buffer @wbuf. This means that
* the last piece of the node won't reach the flash media immediately if it
- * does not take whole minimal I/O unit. Instead, the node will sit in RAM
- * until the write-buffer is synchronized (e.g., by timer).
+ * does not take whole max. write unit (@c->max_write_size). Instead, the node
+ * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
+ * because more data are appended to the write-buffer).
*
* This function returns zero in case of success and a negative error code in
* case of failure. If the node cannot be written because there is no more
@@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
- ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
+ ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
+ ubifs_assert(wbuf->size >= c->min_io_size);
+ ubifs_assert(wbuf->size <= c->max_write_size);
+ ubifs_assert(wbuf->size % c->min_io_size == 0);
ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
ubifs_assert(!c->ro_media && !c->ro_mount);
+ if (c->leb_size - wbuf->offs >= c->max_write_size)
+ ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
err = -ENOSPC;
@@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
dbg_io("flush jhead %s wbuf to LEB %d:%d",
dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
- wbuf->offs, c->min_io_size,
+ wbuf->offs, wbuf->size,
wbuf->dtype);
if (err)
goto out;
spin_lock(&wbuf->lock);
- wbuf->offs += c->min_io_size;
- wbuf->avail = c->min_io_size;
+ wbuf->offs += wbuf->size;
+ if (c->leb_size - wbuf->offs >= c->max_write_size)
+ wbuf->size = c->max_write_size;
+ else
+ wbuf->size = c->leb_size - wbuf->offs;
+ wbuf->avail = wbuf->size;
wbuf->used = 0;
wbuf->next_ino = 0;
spin_unlock(&wbuf->lock);
@@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
goto exit;
}
- /*
- * The node is large enough and does not fit entirely within current
- * minimal I/O unit. We have to fill and flush write-buffer and switch
- * to the next min. I/O unit.
- */
- dbg_io("flush jhead %s wbuf to LEB %d:%d",
- dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
- memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
- err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
- c->min_io_size, wbuf->dtype);
- if (err)
- goto out;
+ offs = wbuf->offs;
+ written = 0;
- offs = wbuf->offs + c->min_io_size;
- len -= wbuf->avail;
- aligned_len -= wbuf->avail;
- written = wbuf->avail;
+ if (wbuf->used) {
+ /*
+ * The node is large enough and does not fit entirely within
+ * current available space. We have to fill and flush
+ * write-buffer and switch to the next max. write unit.
+ */
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
+ memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+ wbuf->size, wbuf->dtype);
+ if (err)
+ goto out;
+
+ offs += wbuf->size;
+ len -= wbuf->avail;
+ aligned_len -= wbuf->avail;
+ written += wbuf->avail;
+ } else if (wbuf->offs & (c->max_write_size - 1)) {
+ /*
+ * The write-buffer offset is not aligned to
+ * @c->max_write_size and @wbuf->size is less than
+ * @c->max_write_size. Write @wbuf->size bytes to make sure the
+ * following writes are done in optimal @c->max_write_size
+ * chunks.
+ */
+ dbg_io("write %d bytes to LEB %d:%d",
+ wbuf->size, wbuf->lnum, wbuf->offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
+ wbuf->size, wbuf->dtype);
+ if (err)
+ goto out;
+
+ offs += wbuf->size;
+ len -= wbuf->size;
+ aligned_len -= wbuf->size;
+ written += wbuf->size;
+ }
/*
- * The remaining data may take more whole min. I/O units, so write the
- * remains multiple to min. I/O unit size directly to the flash media.
+ * The remaining data may take more whole max. write units, so write the
+ * remains multiple to max. write unit size directly to the flash media.
* We align node length to 8-byte boundary because we anyway flash wbuf
* if the remaining space is less than 8 bytes.
*/
- n = aligned_len >> c->min_io_shift;
+ n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->min_io_shift;
+ n <<= c->max_write_shift;
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
wbuf->dtype);
@@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
if (aligned_len)
/*
* And now we have what's left and what does not take whole
- * min. I/O unit, so write it to the write-buffer and we are
+ * max. write unit, so write it to the write-buffer and we are
* done.
*/
memcpy(wbuf->buf, buf + written, len);
wbuf->offs = offs;
+ if (c->leb_size - wbuf->offs >= c->max_write_size)
+ wbuf->size = c->max_write_size;
+ else
+ wbuf->size = c->leb_size - wbuf->offs;
+ wbuf->avail = wbuf->size - aligned_len;
wbuf->used = aligned_len;
- wbuf->avail = c->min_io_size - aligned_len;
wbuf->next_ino = 0;
spin_unlock(&wbuf->lock);
@@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
{
size_t size;
- wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
+ wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
if (!wbuf->buf)
return -ENOMEM;
- size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
+ size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
wbuf->inodes = kmalloc(size, GFP_KERNEL);
if (!wbuf->inodes) {
kfree(wbuf->buf);
@@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
wbuf->used = 0;
wbuf->lnum = wbuf->offs = -1;
- wbuf->avail = c->min_io_size;
+ /*
+ * If the LEB starts at the max. write size aligned address, then
+ * write-buffer size has to be set to @c->max_write_size. Otherwise,
+ * set it to something smaller so that it ends at the closest max.
+ * write size boundary.
+ */
+ size = c->max_write_size - (c->leb_start % c->max_write_size);
+ wbuf->avail = wbuf->size = size;
wbuf->dtype = UBI_UNKNOWN;
wbuf->sync_callback = NULL;
mutex_init(&wbuf->io_mutex);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 914f1bd89e57..aed25e864227 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
{
struct ubifs_data_node *data;
int err, lnum, offs, compr_type, out_len;
- int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
+ int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
dbg_jnl("ino %lu, blk %u, len %d, key %s",
@@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
DBGKEY(key));
ubifs_assert(len <= UBIFS_BLOCK_SIZE);
- data = kmalloc(dlen, GFP_NOFS);
- if (!data)
- return -ENOMEM;
+ data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
+ if (!data) {
+ /*
+ * Fall-back to the write reserve buffer. Note, we might be
+ * currently on the memory reclaim path, when the kernel is
+ * trying to free some memory by writing out dirty pages. The
+ * write reserve buffer helps us to guarantee that we are
+ * always able to write the data.
+ */
+ allocated = 0;
+ mutex_lock(&c->write_reserve_mutex);
+ data = c->write_reserve_buf;
+ }
data->ch.node_type = UBIFS_DATA_NODE;
key_write(c, key, &data->key);
@@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
goto out_ro;
finish_reservation(c);
- kfree(data);
+ if (!allocated)
+ mutex_unlock(&c->write_reserve_mutex);
+ else
+ kfree(data);
return 0;
out_release:
@@ -745,7 +758,10 @@ out_ro:
ubifs_ro_mode(c, err);
finish_reservation(c);
out_free:
- kfree(data);
+ if (!allocated)
+ mutex_unlock(&c->write_reserve_mutex);
+ else
+ kfree(data);
return err;
}
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4d4ca388889b..c7b25e2f7764 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c,
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
struct ubifs_lp_stats *lst = &data->lst;
- int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;
+ int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
+ void *buf = NULL;
cat = lp->flags & LPROPS_CAT_MASK;
if (cat != LPROPS_UNCAT) {
@@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c,
}
}
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
+ buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+ if (!buf) {
+ ubifs_err("cannot allocate memory to scan LEB %d", lnum);
+ goto out;
+ }
+
+ sleb = ubifs_scan(c, lnum, 0, buf, 0);
if (IS_ERR(sleb)) {
/*
* After an unclean unmount, empty and freeable LEBs
@@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c,
lst->empty_lebs += 1;
lst->total_free += c->leb_size;
lst->total_dark += ubifs_calc_dark(c, c->leb_size);
- return LPT_SCAN_CONTINUE;
+ ret = LPT_SCAN_CONTINUE;
+ goto exit;
}
if (lp->free + lp->dirty == c->leb_size &&
@@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c,
lst->total_free += lp->free;
lst->total_dirty += lp->dirty;
lst->total_dark += ubifs_calc_dark(c, c->leb_size);
- return LPT_SCAN_CONTINUE;
+ ret = LPT_SCAN_CONTINUE;
+ goto exit;
}
data->err = PTR_ERR(sleb);
- return LPT_SCAN_STOP;
+ ret = LPT_SCAN_STOP;
+ goto exit;
}
is_idx = -1;
@@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c,
}
ubifs_scan_destroy(sleb);
- return LPT_SCAN_CONTINUE;
+ ret = LPT_SCAN_CONTINUE;
+exit:
+ vfree(buf);
+ return ret;
out_print:
ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1246,6 +1259,7 @@ out_print:
out_destroy:
ubifs_scan_destroy(sleb);
out:
+ vfree(buf);
data->err = -EINVAL;
return LPT_SCAN_STOP;
}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5c90dec5db0b..0a3c2c3f5c4a 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
{
int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
int ret;
- void *buf = c->dbg->buf;
+ void *buf, *p;
if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
return 0;
+ buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+ if (!buf) {
+ ubifs_err("cannot allocate memory for ltab checking");
+ return 0;
+ }
+
dbg_lp("LEB %d", lnum);
err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
if (err) {
dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
- return err;
+ goto out;
}
while (1) {
- if (!is_a_node(c, buf, len)) {
+ if (!is_a_node(c, p, len)) {
int i, pad_len;
- pad_len = get_pad_len(c, buf, len);
+ pad_len = get_pad_len(c, p, len);
if (pad_len) {
- buf += pad_len;
+ p += pad_len;
len -= pad_len;
dirty += pad_len;
continue;
}
- if (!dbg_is_all_ff(buf, len)) {
+ if (!dbg_is_all_ff(p, len)) {
dbg_msg("invalid empty space in LEB %d at %d",
lnum, c->leb_size - len);
err = -EINVAL;
@@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
lnum, dirty, c->ltab[i].dirty);
err = -EINVAL;
}
- return err;
+ goto out;
}
- node_type = get_lpt_node_type(c, buf, &node_num);
+ node_type = get_lpt_node_type(c, p, &node_num);
node_len = get_lpt_node_len(c, node_type);
ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
if (ret == 1)
dirty += node_len;
- buf += node_len;
+ p += node_len;
len -= node_len;
}
+
+ err = 0;
+out:
+ vfree(buf);
+ return err;
}
/**
@@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
{
int err, len = c->leb_size, node_type, node_num, node_len, offs;
- void *buf = c->dbg->buf;
+ void *buf, *p;
printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
current->pid, lnum);
+ buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+ if (!buf) {
+ ubifs_err("cannot allocate memory to dump LPT");
+ return;
+ }
+
err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
if (err) {
ubifs_err("cannot read LEB %d, error %d", lnum, err);
- return;
+ goto out;
}
while (1) {
offs = c->leb_size - len;
- if (!is_a_node(c, buf, len)) {
+ if (!is_a_node(c, p, len)) {
int pad_len;
- pad_len = get_pad_len(c, buf, len);
+ pad_len = get_pad_len(c, p, len);
if (pad_len) {
printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
lnum, offs, pad_len);
- buf += pad_len;
+ p += pad_len;
len -= pad_len;
continue;
}
@@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
break;
}
- node_type = get_lpt_node_type(c, buf, &node_num);
+ node_type = get_lpt_node_type(c, p, &node_num);
switch (node_type) {
case UBIFS_LPT_PNODE:
{
@@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
else
printk(KERN_DEBUG "LEB %d:%d, nnode, ",
lnum, offs);
- err = ubifs_unpack_nnode(c, buf, &nnode);
+ err = ubifs_unpack_nnode(c, p, &nnode);
for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
nnode.nbranch[i].offs);
@@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
break;
default:
ubifs_err("LPT node type %d not recognized", node_type);
- return;
+ goto out;
}
- buf += node_len;
+ p += node_len;
len -= node_len;
}
printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
current->pid, lnum);
+out:
+ vfree(buf);
+ return;
}
/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 82009c74b6a3..2cdbd31641d7 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
{
int lnum, err = 0;
+ void *buf;
/* Check no-orphans flag and skip this if no orphans */
if (c->no_orphs)
return 0;
+ buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+ if (!buf) {
+ ubifs_err("cannot allocate memory to check orphans");
+ return 0;
+ }
+
for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
struct ubifs_scan_leb *sleb;
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
+ sleb = ubifs_scan(c, lnum, 0, buf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
@@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
break;
}
+ vfree(buf);
return err;
}
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77e9b874b6c2..936f2cbfe6b6 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -28,6 +28,23 @@
* UBIFS always cleans away all remnants of an unclean un-mount, so that
* errors do not accumulate. However UBIFS defers recovery if it is mounted
* read-only, and the flash is not modified in that case.
+ *
+ * The general UBIFS approach to the recovery is that it recovers from
+ * corruptions which could be caused by power cuts, but it refuses to recover
+ * from corruption caused by other reasons. And UBIFS tries to distinguish
+ * between these 2 reasons of corruptions and silently recover in the former
+ * case and loudly complain in the latter case.
+ *
+ * UBIFS writes only to erased LEBs, so it writes only to the flash space
+ * containing only 0xFFs. UBIFS also always writes strictly from the beginning
+ * of the LEB to the end. And UBIFS assumes that the underlying flash media
+ * writes in @c->max_write_size bytes at a time.
+ *
+ * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
+ * I/O unit corresponding to offset X to contain corrupted data, all the
+ * following min. I/O units have to contain empty space (all 0xFFs). If this is
+ * not true, the corruption cannot be the result of a power cut, and UBIFS
+ * refuses to mount.
*/
#include <linux/crc32.h>
@@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
* @offs: offset to check
*
* This function returns %1 if @offs was in the last write to the LEB whose data
- * is in @buf, otherwise %0 is returned. The determination is made by checking
- * for subsequent empty space starting from the next @c->min_io_size boundary.
+ * is in @buf, otherwise %0 is returned. The determination is made by checking
+ * for subsequent empty space starting from the next @c->max_write_size
+ * boundary.
*/
static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
{
@@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
uint8_t *p;
/*
- * Round up to the next @c->min_io_size boundary i.e. @offs is in the
- * last wbuf written. After that should be empty space.
+ * Round up to the next @c->max_write_size boundary i.e. @offs is in
+ * the last wbuf written. After that should be empty space.
*/
- empty_offs = ALIGN(offs + 1, c->min_io_size);
+ empty_offs = ALIGN(offs + 1, c->max_write_size);
check_len = c->leb_size - empty_offs;
p = buf + empty_offs - offs;
return is_empty(p, check_len);
@@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
int skip, dlen = le32_to_cpu(ch->len);
/* Check for empty space after the corrupt node's common header */
- skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
+ skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
if (is_empty(buf + skip, len - skip))
return 1;
/*
@@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
return 0;
}
/* Now we know the corrupt node's length we can skip over it */
- skip = ALIGN(offs + dlen, c->min_io_size) - offs;
+ skip = ALIGN(offs + dlen, c->max_write_size) - offs;
/* After which there should be empty space */
if (is_empty(buf + skip, len - skip))
return 1;
@@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
} else {
int corruption = first_non_ff(buf, len);
+ /*
+ * See header comment for this file for more
+ * explanations about the reasons we have this check.
+ */
ubifs_err("corrupt empty space LEB %d:%d, corruption "
"starts at %d", lnum, offs, corruption);
/* Make sure we dump interesting non-0xFF data */
- offs = corruption;
+ offs += corruption;
buf += corruption;
goto corrupted;
}
@@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
static int recover_head(const struct ubifs_info *c, int lnum, int offs,
void *sbuf)
{
- int len, err;
+ int len = c->max_write_size, err;
- if (c->min_io_size > 1)
- len = c->min_io_size;
- else
- len = 512;
if (offs + len > c->leb_size)
len = c->leb_size - offs;
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 3e1ee57dbeaa..36216b46f772 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
if (!quiet)
ubifs_err("empty space starts at non-aligned offset %d",
offs);
- goto corrupted;;
+ goto corrupted;
}
ubifs_end_scan(c, sleb, lnum, offs);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dcf..e5dc1e120e8d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c)
c->leb_cnt = c->vi.size;
c->leb_size = c->vi.usable_leb_size;
+ c->leb_start = c->di.leb_start;
c->half_leb_size = c->leb_size / 2;
c->min_io_size = c->di.min_io_size;
c->min_io_shift = fls(c->min_io_size) - 1;
+ c->max_write_size = c->di.max_write_size;
+ c->max_write_shift = fls(c->max_write_size) - 1;
if (c->leb_size < UBIFS_MIN_LEB_SZ) {
ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
@@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c)
}
/*
+ * Maximum write size has to be greater or equivalent to min. I/O
+ * size, and be multiple of min. I/O size.
+ */
+ if (c->max_write_size < c->min_io_size ||
+ c->max_write_size % c->min_io_size ||
+ !is_power_of_2(c->max_write_size)) {
+ ubifs_err("bad write buffer size %d for %d min. I/O unit",
+ c->max_write_size, c->min_io_size);
+ return -EINVAL;
+ }
+
+ /*
* UBIFS aligns all node to 8-byte boundary, so to make function in
* io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
* less than 8.
@@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c)
if (c->min_io_size < 8) {
c->min_io_size = 8;
c->min_io_shift = 3;
+ if (c->max_write_size < c->min_io_size) {
+ c->max_write_size = c->min_io_size;
+ c->max_write_shift = c->min_io_shift;
+ }
}
c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
@@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c)
if (c->bulk_read == 1)
bu_init(c);
- /*
- * We have to check all CRCs, even for data nodes, when we mount the FS
- * (specifically, when we are replaying).
- */
- c->always_chk_crc = 1;
+ if (!c->ro_mount) {
+ c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
+ GFP_KERNEL);
+ if (!c->write_reserve_buf)
+ goto out_free;
+ }
+
+ c->mounting = 1;
err = ubifs_read_superblock(c);
if (err)
@@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_infos;
- c->always_chk_crc = 0;
+ c->mounting = 0;
ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c)
dbg_msg("compiled on: " __DATE__ " at " __TIME__);
dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
+ dbg_msg("max. write size: %d bytes", c->max_write_size);
dbg_msg("LEB size: %d bytes (%d KiB)",
c->leb_size, c->leb_size >> 10);
dbg_msg("data journal heads: %d",
@@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c)
UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
- dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu",
+ dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
- UBIFS_MAX_DENT_NODE_SZ);
+ UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
dbg_msg("dead watermark: %d", c->dead_wm);
dbg_msg("dark watermark: %d", c->dark_wm);
dbg_msg("LEB overhead: %d", c->leb_overhead);
@@ -1474,6 +1497,7 @@ out_wbufs:
out_cbuf:
kfree(c->cbuf);
out_free:
+ kfree(c->write_reserve_buf);
kfree(c->bu.buf);
vfree(c->ileb_buf);
vfree(c->sbuf);
@@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c)
kfree(c->cbuf);
kfree(c->rcvrd_mst_node);
kfree(c->mst_node);
+ kfree(c->write_reserve_buf);
kfree(c->bu.buf);
vfree(c->ileb_buf);
vfree(c->sbuf);
@@ -1543,7 +1568,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
mutex_lock(&c->umount_mutex);
dbg_save_space_info(c);
c->remounting_rw = 1;
- c->always_chk_crc = 1;
err = check_free_space(c);
if (err)
@@ -1598,6 +1622,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
goto out;
}
+ c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
+ if (!c->write_reserve_buf)
+ goto out;
+
err = ubifs_lpt_init(c, 0, 1);
if (err)
goto out;
@@ -1650,7 +1678,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
dbg_gen("re-mounted read-write");
c->ro_mount = 0;
c->remounting_rw = 0;
- c->always_chk_crc = 0;
err = dbg_check_space_info(c);
mutex_unlock(&c->umount_mutex);
return err;
@@ -1663,11 +1690,12 @@ out:
c->bgt = NULL;
}
free_wbufs(c);
+ kfree(c->write_reserve_buf);
+ c->write_reserve_buf = NULL;
vfree(c->ileb_buf);
c->ileb_buf = NULL;
ubifs_lpt_free(c, 1);
c->remounting_rw = 0;
- c->always_chk_crc = 0;
mutex_unlock(&c->umount_mutex);
return err;
}
@@ -1707,6 +1735,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
free_wbufs(c);
vfree(c->orph_buf);
c->orph_buf = NULL;
+ kfree(c->write_reserve_buf);
+ c->write_reserve_buf = NULL;
vfree(c->ileb_buf);
c->ileb_buf = NULL;
ubifs_lpt_free(c, 1);
@@ -1937,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
mutex_init(&c->mst_mutex);
mutex_init(&c->umount_mutex);
mutex_init(&c->bu_mutex);
+ mutex_init(&c->write_reserve_mutex);
init_waitqueue_head(&c->cmt_wq);
c->buds = RB_ROOT;
c->old_idx = RB_ROOT;
@@ -1954,6 +1985,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&c->old_buds);
INIT_LIST_HEAD(&c->orph_list);
INIT_LIST_HEAD(&c->orph_new);
+ c->no_chk_data_crc = 1;
c->vfs_sb = sb;
c->highest_inum = UBIFS_FIRST_INO;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ad9cf0133622..de485979ca39 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
*
* Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
* is true (it is controlled by corresponding mount option). However, if
- * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
- * checked.
+ * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
+ * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
+ * because during mounting or re-mounting from R/O mode to R/W mode we may read
+ * journal nodes (when replying the journal or doing the recovery) and the
+ * journal nodes may potentially be corrupted, so checking is required.
*/
static int try_read_node(const struct ubifs_info *c, void *buf, int type,
int len, int lnum, int offs)
@@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
if (node_len != len)
return 0;
- if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
+ if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
+ !c->remounting_rw)
return 1;
crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 381d6b207a52..8c40ad3c6721 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -151,6 +151,12 @@
*/
#define WORST_COMPR_FACTOR 2
+/*
+ * How much memory is needed for a buffer where we comress a data node.
+ */
+#define COMPRESSED_DATA_NODE_BUF_SZ \
+ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
+
/* Maximum expected tree height for use by bottom_up_buf */
#define BOTTOM_UP_HEIGHT 64
@@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
* @offs: write-buffer offset in this logical eraseblock
* @avail: number of bytes available in the write-buffer
* @used: number of used bytes in the write-buffer
+ * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
* @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
* %UBI_UNKNOWN)
* @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
@@ -680,6 +687,7 @@ struct ubifs_wbuf {
int offs;
int avail;
int used;
+ int size;
int dtype;
int jhead;
int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
@@ -1003,6 +1011,11 @@ struct ubifs_debug_info;
* @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
* @bu: pre-allocated bulk-read information
*
+ * @write_reserve_mutex: protects @write_reserve_buf
+ * @write_reserve_buf: on the write path we allocate memory, which might
+ * sometimes be unavailable, in which case we use this
+ * write reserve buffer
+ *
* @log_lebs: number of logical eraseblocks in the log
* @log_bytes: log size in bytes
* @log_last: last LEB of the log
@@ -1024,7 +1037,12 @@ struct ubifs_debug_info;
*
* @min_io_size: minimal input/output unit size
* @min_io_shift: number of bits in @min_io_size minus one
+ * @max_write_size: maximum amount of bytes the underlying flash can write at a
+ * time (MTD write buffer size)
+ * @max_write_shift: number of bits in @max_write_size minus one
* @leb_size: logical eraseblock size in bytes
+ * @leb_start: starting offset of logical eraseblocks within physical
+ * eraseblocks
* @half_leb_size: half LEB size
* @idx_leb_size: how many bytes of an LEB are effectively available when it is
* used to store indexing nodes (@leb_size - @max_idx_node_sz)
@@ -1166,22 +1184,21 @@ struct ubifs_debug_info;
* @rp_uid: reserved pool user ID
* @rp_gid: reserved pool group ID
*
- * @empty: if the UBI device is empty
+ * @empty: %1 if the UBI device is empty
+ * @need_recovery: %1 if the file-system needs recovery
+ * @replaying: %1 during journal replay
+ * @mounting: %1 while mounting
+ * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
* @replay_tree: temporary tree used during journal replay
* @replay_list: temporary list used during journal replay
* @replay_buds: list of buds to replay
* @cs_sqnum: sequence number of first node in the log (commit start node)
* @replay_sqnum: sequence number of node currently being replayed
- * @need_recovery: file-system needs recovery
- * @replaying: set to %1 during journal replay
* @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
* mode
* @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
* FS to R/W mode
* @size_tree: inode size information for recovery
- * @remounting_rw: set while re-mounting from R/O mode to R/W mode
- * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
- * mode)
* @mount_opts: UBIFS-specific mount options
*
* @dbg: debugging-related information
@@ -1250,6 +1267,9 @@ struct ubifs_info {
struct mutex bu_mutex;
struct bu_info bu;
+ struct mutex write_reserve_mutex;
+ void *write_reserve_buf;
+
int log_lebs;
long long log_bytes;
int log_last;
@@ -1271,7 +1291,10 @@ struct ubifs_info {
int min_io_size;
int min_io_shift;
+ int max_write_size;
+ int max_write_shift;
int leb_size;
+ int leb_start;
int half_leb_size;
int idx_leb_size;
int leb_cnt;
@@ -1402,19 +1425,19 @@ struct ubifs_info {
gid_t rp_gid;
/* The below fields are used only during mounting and re-mounting */
- int empty;
+ unsigned int empty:1;
+ unsigned int need_recovery:1;
+ unsigned int replaying:1;
+ unsigned int mounting:1;
+ unsigned int remounting_rw:1;
struct rb_root replay_tree;
struct list_head replay_list;
struct list_head replay_buds;
unsigned long long cs_sqnum;
unsigned long long replay_sqnum;
- int need_recovery;
- int replaying;
struct list_head unclean_leb_list;
struct ubifs_mst_node *rcvrd_mst_node;
struct rb_root size_tree;
- int remounting_rw;
- int always_chk_crc;
struct ubifs_mount_opts mount_opts;
#ifdef CONFIG_UBIFS_FS_DEBUG
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c3..8994dd041660 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,7 +31,7 @@
#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
#define udf_find_next_one_bit(addr, size, offset) \
- ext2_find_next_bit(addr, size, offset)
+ ext2_find_next_bit((unsigned long *)(addr), size, offset)
static int read_block_bitmap(struct super_block *sb,
struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +297,7 @@ repeat:
break;
}
} else {
- bit = udf_find_next_one_bit((char *)bh->b_data,
+ bit = udf_find_next_one_bit(bh->b_data,
sb->s_blocksize << 3,
group_start << 3);
if (bit < sb->s_blocksize << 3)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..f391a2adc699 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -123,8 +123,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (inode->i_sb->s_blocksize <
(udf_file_entry_alloc_offset(inode) +
pos + count)) {
- udf_expand_file_adinicb(inode, pos + count, &err);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ err = udf_expand_file_adinicb(inode);
+ if (err) {
udf_debug("udf_expand_adinicb: err=%d\n", err);
up_write(&iinfo->i_data_sem);
return err;
@@ -237,7 +237,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = udf_setsize(inode, attr->ia_size);
if (error)
return error;
}
@@ -249,5 +249,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
const struct inode_operations udf_file_inode_operations = {
.setattr = udf_setattr,
- .truncate = udf_truncate,
};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..ccc814321414 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
struct udf_inode_info *iinfo = UDF_I(inode);
int want_delete = 0;
- truncate_inode_pages(&inode->i_data, 0);
-
if (!inode->i_nlink && !is_bad_inode(inode)) {
want_delete = 1;
- inode->i_size = 0;
- udf_truncate(inode);
+ udf_setsize(inode, 0);
udf_update_inode(inode, IS_SYNC(inode));
- }
+ } else
+ truncate_inode_pages(&inode->i_data, 0);
invalidate_inode_buffers(inode);
end_writeback(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
+ struct inode *inode = mapping->host;
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ loff_t isize = inode->i_size;
+
+ if (pos + len > isize) {
+ truncate_pagecache(inode, pos + len, isize);
+ if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ down_write(&iinfo->i_data_sem);
+ udf_truncate_extents(inode);
+ up_write(&iinfo->i_data_sem);
+ }
+ }
}
return ret;
@@ -139,30 +146,31 @@ const struct address_space_operations udf_aops = {
.bmap = udf_bmap,
};
-void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
+int udf_expand_file_adinicb(struct inode *inode)
{
struct page *page;
char *kaddr;
struct udf_inode_info *iinfo = UDF_I(inode);
+ int err;
struct writeback_control udf_wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = 1,
};
- /* from now on we have normal address_space methods */
- inode->i_data.a_ops = &udf_aops;
-
if (!iinfo->i_lenAlloc) {
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
+ /* from now on we have normal address_space methods */
+ inode->i_data.a_ops = &udf_aops;
mark_inode_dirty(inode);
- return;
+ return 0;
}
- page = grab_cache_page(inode->i_mapping, 0);
- BUG_ON(!PageLocked(page));
+ page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
if (!PageUptodate(page)) {
kaddr = kmap(page);
@@ -181,11 +189,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
-
- inode->i_data.a_ops->writepage(page, &udf_wbc);
+ /* from now on we have normal address_space methods */
+ inode->i_data.a_ops = &udf_aops;
+ err = inode->i_data.a_ops->writepage(page, &udf_wbc);
+ if (err) {
+ /* Restore everything back so that we don't lose data... */
+ lock_page(page);
+ kaddr = kmap(page);
+ memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
+ inode->i_size);
+ kunmap(page);
+ unlock_page(page);
+ iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
+ inode->i_data.a_ops = &udf_adinicb_aops;
+ }
page_cache_release(page);
-
mark_inode_dirty(inode);
+
+ return err;
}
struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +369,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
}
/* Extend the file by 'blocks' blocks, return the number of extents added */
-int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
- struct kernel_long_ad *last_ext, sector_t blocks)
+static int udf_do_extend_file(struct inode *inode,
+ struct extent_position *last_pos,
+ struct kernel_long_ad *last_ext,
+ sector_t blocks)
{
sector_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +380,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
struct kernel_lb_addr prealloc_loc = {};
int prealloc_len = 0;
struct udf_inode_info *iinfo;
+ int err;
/* The previous extent is fake and we should not extend by anything
* - there's nothing to do... */
@@ -422,26 +446,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
/* Create enough extents to cover the whole hole */
while (blocks > add) {
blocks -= add;
- if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
- last_ext->extLength, 1) == -1)
- return -1;
+ err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+ last_ext->extLength, 1);
+ if (err)
+ return err;
count++;
}
if (blocks) {
last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
(blocks << sb->s_blocksize_bits);
- if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
- last_ext->extLength, 1) == -1)
- return -1;
+ err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+ last_ext->extLength, 1);
+ if (err)
+ return err;
count++;
}
out:
/* Do we have some preallocated blocks saved? */
if (prealloc_len) {
- if (udf_add_aext(inode, last_pos, &prealloc_loc,
- prealloc_len, 1) == -1)
- return -1;
+ err = udf_add_aext(inode, last_pos, &prealloc_loc,
+ prealloc_len, 1);
+ if (err)
+ return err;
last_ext->extLocation = prealloc_loc;
last_ext->extLength = prealloc_len;
count++;
@@ -453,11 +480,68 @@ out:
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
last_pos->offset -= sizeof(struct long_ad);
else
- return -1;
+ return -EIO;
return count;
}
+static int udf_extend_file(struct inode *inode, loff_t newsize)
+{
+
+ struct extent_position epos;
+ struct kernel_lb_addr eloc;
+ uint32_t elen;
+ int8_t etype;
+ struct super_block *sb = inode->i_sb;
+ sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
+ int adsize;
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct kernel_long_ad extent;
+ int err;
+
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
+ else
+ BUG();
+
+ etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+
+ /* File has extent covering the new size (could happen when extending
+ * inside a block)? */
+ if (etype != -1)
+ return 0;
+ if (newsize & (sb->s_blocksize - 1))
+ offset++;
+ /* Extended file just to the boundary of the last file block? */
+ if (offset == 0)
+ return 0;
+
+ /* Truncate is extending the file by 'offset' blocks */
+ if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
+ (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
+ /* File has no extents at all or has empty last
+ * indirect extent! Create a fake extent... */
+ extent.extLocation.logicalBlockNum = 0;
+ extent.extLocation.partitionReferenceNum = 0;
+ extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
+ } else {
+ epos.offset -= adsize;
+ etype = udf_next_aext(inode, &epos, &extent.extLocation,
+ &extent.extLength, 0);
+ extent.extLength |= etype << 30;
+ }
+ err = udf_do_extend_file(inode, &epos, &extent, offset);
+ if (err < 0)
+ goto out;
+ err = 0;
+ iinfo->i_lenExtents = newsize;
+out:
+ brelse(epos.bh);
+ return err;
+}
+
static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
int *err, sector_t *phys, int *new)
{
@@ -540,7 +624,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
elen = EXT_RECORDED_ALLOCATED |
((elen + inode->i_sb->s_blocksize - 1) &
~(inode->i_sb->s_blocksize - 1));
- etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
+ udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
}
brelse(prev_epos.bh);
brelse(cur_epos.bh);
@@ -564,19 +648,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
memset(&laarr[0].extLocation, 0x00,
sizeof(struct kernel_lb_addr));
laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
- /* Will udf_extend_file() create real extent from
+ /* Will udf_do_extend_file() create real extent from
a fake one? */
startnum = (offset > 0);
}
/* Create extents for the hole between EOF and offset */
- ret = udf_extend_file(inode, &prev_epos, laarr, offset);
- if (ret == -1) {
+ ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
+ if (ret < 0) {
brelse(prev_epos.bh);
brelse(cur_epos.bh);
brelse(next_epos.bh);
- /* We don't really know the error here so we just make
- * something up */
- *err = -ENOSPC;
+ *err = ret;
return NULL;
}
c = 0;
@@ -1005,52 +1087,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
return NULL;
}
-void udf_truncate(struct inode *inode)
+int udf_setsize(struct inode *inode, loff_t newsize)
{
- int offset;
int err;
struct udf_inode_info *iinfo;
+ int bsize = 1 << inode->i_blkbits;
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
- return;
+ return -EINVAL;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
+ return -EPERM;
iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ if (newsize > inode->i_size) {
down_write(&iinfo->i_data_sem);
- if (inode->i_sb->s_blocksize <
- (udf_file_entry_alloc_offset(inode) +
- inode->i_size)) {
- udf_expand_file_adinicb(inode, inode->i_size, &err);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- inode->i_size = iinfo->i_lenAlloc;
- up_write(&iinfo->i_data_sem);
- return;
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ if (bsize <
+ (udf_file_entry_alloc_offset(inode) + newsize)) {
+ err = udf_expand_file_adinicb(inode);
+ if (err) {
+ up_write(&iinfo->i_data_sem);
+ return err;
+ }
} else
- udf_truncate_extents(inode);
- } else {
- offset = inode->i_size & (inode->i_sb->s_blocksize - 1);
- memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
- 0x00, inode->i_sb->s_blocksize -
- offset - udf_file_entry_alloc_offset(inode));
- iinfo->i_lenAlloc = inode->i_size;
+ iinfo->i_lenAlloc = newsize;
+ }
+ err = udf_extend_file(inode, newsize);
+ if (err) {
+ up_write(&iinfo->i_data_sem);
+ return err;
}
+ truncate_setsize(inode, newsize);
up_write(&iinfo->i_data_sem);
} else {
- block_truncate_page(inode->i_mapping, inode->i_size,
- udf_get_block);
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ down_write(&iinfo->i_data_sem);
+ memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
+ 0x00, bsize - newsize -
+ udf_file_entry_alloc_offset(inode));
+ iinfo->i_lenAlloc = newsize;
+ truncate_setsize(inode, newsize);
+ up_write(&iinfo->i_data_sem);
+ goto update_time;
+ }
+ err = block_truncate_page(inode->i_mapping, newsize,
+ udf_get_block);
+ if (err)
+ return err;
down_write(&iinfo->i_data_sem);
+ truncate_setsize(inode, newsize);
udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
}
-
+update_time:
inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
mark_inode_dirty(inode);
+ return 0;
}
static void __udf_read_inode(struct inode *inode)
@@ -1637,14 +1733,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
return NULL;
}
-int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
- struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
int adsize;
struct short_ad *sad = NULL;
struct long_ad *lad = NULL;
struct allocExtDesc *aed;
- int8_t etype;
uint8_t *ptr;
struct udf_inode_info *iinfo = UDF_I(inode);
@@ -1660,7 +1755,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
adsize = sizeof(struct long_ad);
else
- return -1;
+ return -EIO;
if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
unsigned char *sptr, *dptr;
@@ -1672,12 +1767,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
obloc.partitionReferenceNum,
obloc.logicalBlockNum, &err);
if (!epos->block.logicalBlockNum)
- return -1;
+ return -ENOSPC;
nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
&epos->block,
0));
if (!nbh)
- return -1;
+ return -EIO;
lock_buffer(nbh);
memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
set_buffer_uptodate(nbh);
@@ -1746,7 +1841,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
epos->bh = nbh;
}
- etype = udf_write_aext(inode, epos, eloc, elen, inc);
+ udf_write_aext(inode, epos, eloc, elen, inc);
if (!epos->bh) {
iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1859,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
mark_buffer_dirty_inode(epos->bh, inode);
}
- return etype;
+ return 0;
}
-int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
- struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+void udf_write_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
int adsize;
uint8_t *ptr;
@@ -1798,7 +1893,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
adsize = sizeof(struct long_ad);
break;
default:
- return -1;
+ return;
}
if (epos->bh) {
@@ -1817,8 +1912,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
if (inc)
epos->offset += adsize;
-
- return (elen >> 30);
}
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc885..8424308db4b4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
mark_buffer_dirty_inode(epos->bh, inode);
}
+/*
+ * Truncate extents of inode to inode->i_size. This function can be used only
+ * for making file shorter. For making file longer, udf_extend_file() has to
+ * be used.
+ */
void udf_truncate_extents(struct inode *inode)
{
struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
byte_offset = (offset << sb->s_blocksize_bits) +
(inode->i_size & (sb->s_blocksize - 1));
- if (etype != -1) {
- epos.offset -= adsize;
- extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
- epos.offset += adsize;
- if (byte_offset)
- lenalloc = epos.offset;
- else
- lenalloc = epos.offset - adsize;
-
- if (!epos.bh)
- lenalloc -= udf_file_entry_alloc_offset(inode);
- else
- lenalloc -= sizeof(struct allocExtDesc);
-
- while ((etype = udf_current_aext(inode, &epos, &eloc,
- &elen, 0)) != -1) {
- if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
- udf_write_aext(inode, &epos, &neloc, nelen, 0);
- if (indirect_ext_len) {
- /* We managed to free all extents in the
- * indirect extent - free it too */
- BUG_ON(!epos.bh);
- udf_free_blocks(sb, inode, &epos.block,
- 0, indirect_ext_len);
- } else if (!epos.bh) {
- iinfo->i_lenAlloc = lenalloc;
- mark_inode_dirty(inode);
- } else
- udf_update_alloc_ext_desc(inode,
- &epos, lenalloc);
- brelse(epos.bh);
- epos.offset = sizeof(struct allocExtDesc);
- epos.block = eloc;
- epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, &eloc, 0));
- if (elen)
- indirect_ext_len =
- (elen + sb->s_blocksize - 1) >>
- sb->s_blocksize_bits;
- else
- indirect_ext_len = 1;
- } else {
- extent_trunc(inode, &epos, &eloc, etype,
- elen, 0);
- epos.offset += adsize;
- }
- }
+ if (etype == -1) {
+ /* We should extend the file? */
+ WARN_ON(byte_offset);
+ return;
+ }
+ epos.offset -= adsize;
+ extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
+ epos.offset += adsize;
+ if (byte_offset)
+ lenalloc = epos.offset;
+ else
+ lenalloc = epos.offset - adsize;
- if (indirect_ext_len) {
- BUG_ON(!epos.bh);
- udf_free_blocks(sb, inode, &epos.block, 0,
- indirect_ext_len);
- } else if (!epos.bh) {
- iinfo->i_lenAlloc = lenalloc;
- mark_inode_dirty(inode);
- } else
- udf_update_alloc_ext_desc(inode, &epos, lenalloc);
- } else if (inode->i_size) {
- if (byte_offset) {
- struct kernel_long_ad extent;
+ if (!epos.bh)
+ lenalloc -= udf_file_entry_alloc_offset(inode);
+ else
+ lenalloc -= sizeof(struct allocExtDesc);
- /*
- * OK, there is not extent covering inode->i_size and
- * no extent above inode->i_size => truncate is
- * extending the file by 'offset' blocks.
- */
- if ((!epos.bh &&
- epos.offset ==
- udf_file_entry_alloc_offset(inode)) ||
- (epos.bh && epos.offset ==
- sizeof(struct allocExtDesc))) {
- /* File has no extents at all or has empty last
- * indirect extent! Create a fake extent... */
- extent.extLocation.logicalBlockNum = 0;
- extent.extLocation.partitionReferenceNum = 0;
- extent.extLength =
- EXT_NOT_RECORDED_NOT_ALLOCATED;
- } else {
- epos.offset -= adsize;
- etype = udf_next_aext(inode, &epos,
- &extent.extLocation,
- &extent.extLength, 0);
- extent.extLength |= etype << 30;
- }
- udf_extend_file(inode, &epos, &extent,
- offset +
- ((inode->i_size &
- (sb->s_blocksize - 1)) != 0));
+ while ((etype = udf_current_aext(inode, &epos, &eloc,
+ &elen, 0)) != -1) {
+ if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
+ udf_write_aext(inode, &epos, &neloc, nelen, 0);
+ if (indirect_ext_len) {
+ /* We managed to free all extents in the
+ * indirect extent - free it too */
+ BUG_ON(!epos.bh);
+ udf_free_blocks(sb, inode, &epos.block,
+ 0, indirect_ext_len);
+ } else if (!epos.bh) {
+ iinfo->i_lenAlloc = lenalloc;
+ mark_inode_dirty(inode);
+ } else
+ udf_update_alloc_ext_desc(inode,
+ &epos, lenalloc);
+ brelse(epos.bh);
+ epos.offset = sizeof(struct allocExtDesc);
+ epos.block = eloc;
+ epos.bh = udf_tread(sb,
+ udf_get_lb_pblock(sb, &eloc, 0));
+ if (elen)
+ indirect_ext_len =
+ (elen + sb->s_blocksize - 1) >>
+ sb->s_blocksize_bits;
+ else
+ indirect_ext_len = 1;
+ } else {
+ extent_trunc(inode, &epos, &eloc, etype, elen, 0);
+ epos.offset += adsize;
}
}
+
+ if (indirect_ext_len) {
+ BUG_ON(!epos.bh);
+ udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
+ } else if (!epos.bh) {
+ iinfo->i_lenAlloc = lenalloc;
+ mark_inode_dirty(inode);
+ } else
+ udf_update_alloc_ext_desc(inode, &epos, lenalloc);
iinfo->i_lenExtents = inode->i_size;
brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f3..dbd52d4b5eed 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
extern long udf_ioctl(struct file *, unsigned int, unsigned long);
/* inode.c */
extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
-extern void udf_expand_file_adinicb(struct inode *, int, int *);
+extern int udf_expand_file_adinicb(struct inode *);
extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
-extern void udf_truncate(struct inode *);
+extern int udf_setsize(struct inode *, loff_t);
extern void udf_read_inode(struct inode *);
extern void udf_evict_inode(struct inode *);
extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
-extern int udf_extend_file(struct inode *, struct extent_position *,
- struct kernel_long_ad *, sector_t);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
struct kernel_lb_addr *, uint32_t *, sector_t *);
-extern int8_t udf_add_aext(struct inode *, struct extent_position *,
+extern int udf_add_aext(struct inode *, struct extent_position *,
+ struct kernel_lb_addr *, uint32_t, int);
+extern void udf_write_aext(struct inode *, struct extent_position *,
struct kernel_lb_addr *, uint32_t, int);
-extern int8_t udf_write_aext(struct inode *, struct extent_position *,
- struct kernel_lb_addr *, uint32_t, int);
extern int8_t udf_delete_aext(struct inode *, struct extent_position,
struct kernel_lb_addr, uint32_t);
extern int8_t udf_next_aext(struct inode *, struct extent_position *,