From 474279dc0f7745124fc76b474c8dc1294f8e87ce Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Oct 2013 16:11:26 -0400 Subject: split __lookup_mnt() in two functions Instead of passing the direction as argument (and checking it on every step through the hash chain), just have separate __lookup_mnt() and __lookup_mnt_last(). And use the standard iterators... Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 645268f23eb6..1f844fbfce72 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1111,7 +1111,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (!d_mountpoint(path->dentry)) break; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); + mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; path->mnt = &mounted->mnt; @@ -1132,7 +1132,7 @@ static void follow_mount_rcu(struct nameidata *nd) { while (d_mountpoint(nd->path.dentry)) { struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); if (!mounted) break; nd->path.mnt = &mounted->mnt; -- cgit v1.2.3 From 48a066e72d970a3e225a9c18690d570c736fc455 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Sep 2013 22:06:07 -0400 Subject: RCU'd vfsmounts * RCU-delayed freeing of vfsmounts * vfsmount_lock replaced with a seqlock (mount_lock) * sequence number from mount_lock is stored in nameidata->m_seq and used when we exit RCU mode * new vfsmount flag - MNT_SYNC_UMOUNT. Set by umount_tree() when its caller knows that vfsmount will have no surviving references. * synchronize_rcu() done between unlocking namespace_sem in namespace_unlock() and doing pending mntput(). * new helper: legitimize_mnt(mnt, seq). Checks the mount_lock sequence number against seq, then grabs reference to mnt. Then it rechecks mount_lock again to close the race and either returns success or drops the reference it has acquired. The subtle point is that in case of MNT_SYNC_UMOUNT we can simply decrement the refcount and sod off - aforementioned synchronize_rcu() makes sure that final mntput() won't come until we leave RCU mode. We need that, since we don't want to end up with some lazy pathwalk racing with umount() and stealing the final mntput() from it - caller of umount() may expect it to return only once the fs is shut down and we don't want to break that. In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do full-blown mntput() in case of mount_lock sequence number mismatch happening just as we'd grabbed the reference, but in those cases we won't be stealing the final mntput() from anything that would care. * mntput_no_expire() doesn't lock anything on the fast path now. Incidentally, SMP and UP cases are handled the same way - no ifdefs there. * normal pathname resolution does *not* do any writes to mount_lock. It does, of course, bump the refcounts of vfsmount and dentry in the very end, but that's it. Signed-off-by: Al Viro --- fs/namei.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 1f844fbfce72..cb0ebae07e52 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put); static inline void lock_rcu_walk(void) { - br_read_lock(&vfsmount_lock); rcu_read_lock(); } static inline void unlock_rcu_walk(void) { rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); } /** @@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) BUG_ON(!(nd->flags & LOOKUP_RCU)); /* - * Get a reference to the parent first: we're - * going to make "path_put(nd->path)" valid in - * non-RCU context for "terminate_walk()". - * - * If this doesn't work, return immediately with - * RCU walking still active (and then we will do - * the RCU walk cleanup in terminate_walk()). + * After legitimizing the bastards, terminate_walk() + * will do the right thing for non-RCU mode, and all our + * subsequent exit cases should rcu_read_unlock() + * before returning. Do vfsmount first; if dentry + * can't be legitimized, just set nd->path.dentry to NULL + * and rely on dput(NULL) being a no-op. */ - if (!lockref_get_not_dead(&parent->d_lockref)) + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) return -ECHILD; - - /* - * After the mntget(), we terminate_walk() will do - * the right thing for non-RCU mode, and all our - * subsequent exit cases should unlock_rcu_walk() - * before returning. - */ - mntget(nd->path.mnt); nd->flags &= ~LOOKUP_RCU; + if (!lockref_get_not_dead(&parent->d_lockref)) { + nd->path.dentry = NULL; + unlock_rcu_walk(); + return -ECHILD; + } + /* * For a negative lookup, the lookup sequence point is the parents * sequence point, and it only needs to revalidate the parent dentry. @@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd) if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { + unlock_rcu_walk(); + return -ECHILD; + } if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { unlock_rcu_walk(); + mntput(nd->path.mnt); return -ECHILD; } if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { unlock_rcu_walk(); dput(dentry); + mntput(nd->path.mnt); return -ECHILD; } - mntget(nd->path.mnt); unlock_rcu_walk(); } @@ -909,15 +909,15 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); @@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags) /* Something is mounted on this dentry in another * namespace and/or whatever was mounted there in this - * namespace got unmounted before we managed to get the - * vfsmount_lock */ + * namespace got unmounted before lookup_mnt() could + * get it */ } /* Handle an automount point */ @@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (flags & LOOKUP_RCU) { lock_rcu_walk(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } @@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->root.mnt = NULL; + nd->m_seq = read_seqbegin(&mount_lock); if (*name=='/') { if (flags & LOOKUP_RCU) { lock_rcu_walk(); -- cgit v1.2.3 From 8b61e74ffc6310e1d35a9b51c8463093851f8bcf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Nov 2013 12:45:01 -0500 Subject: get rid of {lock,unlock}_rcu_walk() those have become aliases for rcu_read_{lock,unlock}() Signed-off-by: Al Viro --- fs/namei.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index cb0ebae07e52..e5c0118ba9f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -482,16 +482,6 @@ EXPORT_SYMBOL(path_put); * to restart the path walk from the beginning in ref-walk mode. */ -static inline void lock_rcu_walk(void) -{ - rcu_read_lock(); -} - -static inline void unlock_rcu_walk(void) -{ - rcu_read_unlock(); -} - /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -523,7 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) if (!lockref_get_not_dead(&parent->d_lockref)) { nd->path.dentry = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } @@ -561,17 +551,17 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) spin_unlock(&fs->lock); } - unlock_rcu_walk(); + rcu_read_unlock(); return 0; unlock_and_drop_dentry: spin_unlock(&fs->lock); drop_dentry: - unlock_rcu_walk(); + rcu_read_unlock(); dput(dentry); goto drop_root_mnt; out: - unlock_rcu_walk(); + rcu_read_unlock(); drop_root_mnt: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; @@ -604,21 +594,21 @@ static int complete_walk(struct nameidata *nd) nd->root.mnt = NULL; if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { - unlock_rcu_walk(); + rcu_read_unlock(); mntput(nd->path.mnt); return -ECHILD; } if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { - unlock_rcu_walk(); + rcu_read_unlock(); dput(dentry); mntput(nd->path.mnt); return -ECHILD; } - unlock_rcu_walk(); + rcu_read_unlock(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -1174,7 +1164,7 @@ failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } @@ -1501,7 +1491,7 @@ static void terminate_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); } } @@ -1862,7 +1852,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); nd->m_seq = read_seqbegin(&mount_lock); } else { @@ -1876,7 +1866,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->m_seq = read_seqbegin(&mount_lock); if (*name=='/') { if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); set_root_rcu(nd); } else { set_root(nd); @@ -1888,7 +1878,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - lock_rcu_walk(); + rcu_read_lock(); do { seq = read_seqcount_begin(&fs->seq); @@ -1920,7 +1910,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (f.need_put) *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - lock_rcu_walk(); + rcu_read_lock(); } else { path_get(&nd->path); fdput(f); -- cgit v1.2.3 From b18825a7c8e37a7cf6abb97a12a6ad71af160de7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Sep 2013 19:22:53 +0100 Subject: VFS: Put a small type field into struct dentry::d_flags Put a type field into struct dentry::d_flags to indicate if the dentry is one of the following types that relate particularly to pathwalk: Miss (negative dentry) Directory "Automount" directory (defective - no i_op->lookup()) Symlink Other (regular, socket, fifo, device) The type field is set to one of the first five types on a dentry by calls to __d_instantiate() and d_obtain_alias() from information in the inode (if one is given). The type is cleared by dentry_unlink_inode() when it reconstitutes an existing dentry as a negative dentry. Accessors provided are: d_set_type(dentry, type) d_is_directory(dentry) d_is_autodir(dentry) d_is_symlink(dentry) d_is_file(dentry) d_is_negative(dentry) d_is_positive(dentry) A bunch of checks in pathname resolution switched to those. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 95 +++++++++++++++++++++++++------------------------------------- 1 file changed, 38 insertions(+), 57 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e5c0118ba9f8..e1fa43346c61 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1501,18 +1501,9 @@ static void terminate_walk(struct nameidata *nd) * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct inode *inode, int follow) +static inline int should_follow_link(struct dentry *dentry, int follow) { - if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (likely(inode->i_op->follow_link)) - return follow; - - /* This gets set once for the inode lifetime */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_NOFOLLOW; - spin_unlock(&inode->i_lock); - } - return 0; + return unlikely(d_is_symlink(dentry)) ? follow : 0; } static inline int walk_component(struct nameidata *nd, struct path *path, @@ -1542,7 +1533,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, if (!inode) goto out_path_put; - if (should_follow_link(inode, follow)) { + if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { err = -ECHILD; @@ -1600,26 +1591,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) return res; } -/* - * We really don't want to look at inode->i_op->lookup - * when we don't have to. So we keep a cache bit in - * the inode ->i_opflags field that says "yes, we can - * do lookup on this inode". - */ -static inline int can_lookup(struct inode *inode) -{ - if (likely(inode->i_opflags & IOP_LOOKUP)) - return 1; - if (likely(!inode->i_op->lookup)) - return 0; - - /* We do this once for the lifetime of the inode */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_LOOKUP; - spin_unlock(&inode->i_lock); - return 1; -} - /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: @@ -1823,7 +1794,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } - if (!can_lookup(nd->inode)) { + if (!d_is_directory(nd->path.dentry)) { err = -ENOTDIR; break; } @@ -1841,9 +1812,10 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; if (flags & LOOKUP_ROOT) { - struct inode *inode = nd->root.dentry->d_inode; + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; if (*name) { - if (!can_lookup(inode)) + if (!d_is_directory(root)) return -ENOTDIR; retval = inode_permission(inode, MAY_EXEC); if (retval) @@ -1899,7 +1871,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, dentry = f.file->f_path.dentry; if (*name) { - if (!can_lookup(dentry->d_inode)) { + if (!d_is_directory(dentry)) { fdput(f); return -ENOTDIR; } @@ -1981,7 +1953,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!can_lookup(nd->inode)) { + if (!d_is_directory(nd->path.dentry)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2273,7 +2245,7 @@ done: } path->dentry = dentry; path->mnt = mntget(nd->path.mnt); - if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW)) + if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) return 1; follow_mount(path); error = 0; @@ -2417,12 +2389,14 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { + struct inode *inode = victim->d_inode; int error; - if (!victim->d_inode) + if (d_is_negative(victim)) return -ENOENT; + BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); @@ -2432,15 +2406,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + + if (check_sticky(dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_directory(victim) && !d_is_autodir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_directory(victim) || d_is_autodir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2974,7 +2949,7 @@ retry_lookup: /* * create/update audit record if it already exists. */ - if (path->dentry->d_inode) + if (d_is_positive(path->dentry)) audit_inode(name, path->dentry, 0); /* @@ -3003,12 +2978,12 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (!inode) { + if (d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - if (should_follow_link(inode, !symlink_ok)) { + if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { error = -ECHILD; @@ -3037,10 +3012,11 @@ finish_open: } audit_inode(name, nd->path.dentry, 0); error = -EISDIR; - if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && + (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry))) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) + if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry)) goto out; if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -3266,7 +3242,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, nd.root.mnt = mnt; nd.root.dentry = dentry; - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); @@ -3316,8 +3292,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, goto unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_is_positive(dentry)) goto fail; + /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -3706,7 +3683,7 @@ retry: if (nd.last.name[nd.last.len]) goto slashes; inode = dentry->d_inode; - if (!inode) + if (d_is_negative(dentry)) goto slashes; ihold(inode); error = security_path_unlink(&nd.path, dentry); @@ -3731,8 +3708,12 @@ exit1: return error; slashes: - error = !dentry->d_inode ? -ENOENT : - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + if (d_is_negative(dentry)) + error = -ENOENT; + else if (d_is_directory(dentry) || d_is_autodir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; goto exit2; } @@ -4046,7 +4027,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry); const unsigned char *old_name; if (old_dentry->d_inode == new_dentry->d_inode) @@ -4134,10 +4115,10 @@ retry: goto exit3; /* source must exist */ error = -ENOENT; - if (!old_dentry->d_inode) + if (d_is_negative(old_dentry)) goto exit4; /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) { error = -ENOTDIR; if (oldnd.last.name[oldnd.last.len]) goto exit4; -- cgit v1.2.3 From 13a2c3be03973d61b6cb89ff870e758c86327bb7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 23 Oct 2013 16:09:16 -0400 Subject: dcache: fix outdated DCACHE_NEED_LOOKUP comment The DCACHE_NEED_LOOKUP case referred to here was removed with 39e3c9553f34381a1b664c27b0c696a266a5735e "vfs: remove DCACHE_NEED_LOOKUP". There are only four real_lookup() callers and all of them pass in an unhashed dentry just returned from d_alloc. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e1fa43346c61..2a5a7aa9f43f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1298,8 +1298,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, } /* - * Call i_op->lookup on the dentry. The dentry must be negative but may be - * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * Call i_op->lookup on the dentry. The dentry must be negative and + * unhashed. * * dir->d_inode->i_mutex must be held */ -- cgit v1.2.3 From 6cedba8962f440c72447f811d0d530a8a9dc637a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 5 Mar 2012 11:40:41 -0500 Subject: vfs: take i_mutex on renamed file A read delegation is used by NFSv4 as a guarantee that a client can perform local read opens without informing the server. The open operation takes the last component of the pathname as an argument, thus is also a lookup operation, and giving the client the above guarantee means informing the client before we allow anything that would change the set of names pointing to the inode. Therefore, we need to break delegations on rename, link, and unlink. We also need to prevent new delegations from being acquired while one of these operations is in progress. We could add some completely new locking for that purpose, but it's simpler to use the i_mutex, since that's already taken by all the operations we care about. The single exception is rename. So, modify rename to take the i_mutex on the file that is being renamed. Also fix up lockdep and Documentation/filesystems/directory-locking to reflect the change. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 2a5a7aa9f43f..88cec0330bf7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3918,7 +3918,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _three_ objects - parents and victim (if it exists). + * c) we have to lock _four_ objects - parents and victim (if it exists), + * and source (if it is not a directory). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -3994,6 +3995,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *target = new_dentry->d_inode; + struct inode *source = old_dentry->d_inode; int error; error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); @@ -4001,8 +4003,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, return error; dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); + lock_two_nondirectories(source, target); error = -EBUSY; if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) @@ -4017,8 +4018,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry, new_dentry); out: - if (target) - mutex_unlock(&target->i_mutex); + unlock_two_nondirectories(source, target); dput(new_dentry); return error; } -- cgit v1.2.3 From 9accbb977ab78234b8f298df5f306ed08d06bedb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 28 Aug 2012 07:03:24 -0400 Subject: namei: minor vfs_unlink cleanup We'll be using dentry->d_inode in one more place. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 88cec0330bf7..e633a58d4222 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3617,6 +3617,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) int vfs_unlink(struct inode *dir, struct dentry *dentry) { + struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); if (error) @@ -3625,7 +3626,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&target->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; else { @@ -3636,11 +3637,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) dont_mount(dentry); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { - fsnotify_link_count(dentry->d_inode); + fsnotify_link_count(target); d_delete(dentry); } -- cgit v1.2.3 From b21996e36c8e3b92a84e972378bde80b43acd890 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Sep 2011 09:14:34 -0400 Subject: locks: break delegations on unlink We need to break delegations on any operation that changes the set of links pointing to an inode. Start with unlink. Such operations also hold the i_mutex on a parent directory. Breaking a delegation may require waiting for a timeout (by default 90 seconds) in the case of a unresponsive NFS client. To avoid blocking all directory operations, we therefore drop locks before waiting for the delegation. The logic then looks like: acquire locks ... test for delegation; if found: take reference on inode release locks wait for delegation break drop reference on inode retry It is possible this could never terminate. (Even if we take precautions to prevent another delegation being acquired on the same inode, we could get a different inode on each retry.) But this seems very unlikely. The initial test for a delegation happens after the lock on the target inode is acquired, but the directory inode may have been acquired further up the call stack. We therefore add a "struct inode **" argument to any intervening functions, which we use to pass the inode back up to the caller in the case it needs a delegation synchronously broken. Cc: David Howells Cc: Tyler Hicks Cc: Dustin Kirkland Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e633a58d4222..67ce331a3ed8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3615,7 +3615,25 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) return do_rmdir(AT_FDCWD, pathname); } -int vfs_unlink(struct inode *dir, struct dentry *dentry) +/** + * vfs_unlink - unlink a filesystem object + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_mutex. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); @@ -3632,11 +3650,20 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) else { error = security_inode_unlink(dir, dentry); if (!error) { + error = break_deleg(target, O_WRONLY|O_NONBLOCK); + if (error) { + if (error == -EWOULDBLOCK && delegated_inode) { + *delegated_inode = target; + ihold(target); + } + goto out; + } error = dir->i_op->unlink(dir, dentry); if (!error) dont_mount(dentry); } } +out: mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ @@ -3661,6 +3688,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; + struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: name = user_path_parent(dfd, pathname, &nd, lookup_flags); @@ -3675,7 +3703,7 @@ retry: error = mnt_want_write(nd.path.mnt); if (error) goto exit1; - +retry_deleg: mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); @@ -3690,13 +3718,21 @@ retry: error = security_path_unlink(&nd.path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); + error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + inode = NULL; + if (delegated_inode) { + error = break_deleg(delegated_inode, O_WRONLY); + iput(delegated_inode); + delegated_inode = NULL; + if (!error) + goto retry_deleg; + } mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); -- cgit v1.2.3 From 5a14696c1795d3843673b5cf1982d0e5357a5bbf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 28 Aug 2012 07:50:40 -0700 Subject: locks: helper functions for delegation breaking We'll need the same logic for rename and link. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 67ce331a3ed8..cfaeaae0f2db 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3650,14 +3650,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate else { error = security_inode_unlink(dir, dentry); if (!error) { - error = break_deleg(target, O_WRONLY|O_NONBLOCK); - if (error) { - if (error == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = target; - ihold(target); - } + error = try_break_deleg(target, delegated_inode); + if (error) goto out; - } error = dir->i_op->unlink(dir, dentry); if (!error) dont_mount(dentry); @@ -3727,9 +3722,7 @@ exit2: iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { - error = break_deleg(delegated_inode, O_WRONLY); - iput(delegated_inode); - delegated_inode = NULL; + error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } -- cgit v1.2.3 From 8e6d782cab50884ba94324632700e6233a252f6a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Sep 2011 16:59:58 -0400 Subject: locks: break delegations on rename Cc: David Howells Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index cfaeaae0f2db..ce7e580e4e14 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4022,7 +4022,8 @@ out: } static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode) { struct inode *target = new_dentry->d_inode; struct inode *source = old_dentry->d_inode; @@ -4039,6 +4040,14 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) goto out; + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + if (target) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (error) goto out; @@ -4053,8 +4062,30 @@ out: return error; } +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode) { int error; int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry); @@ -4082,7 +4113,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (is_dir) error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); + error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode); if (!error) fsnotify_move(old_dir, new_dir, old_name, is_dir, new_dentry->d_inode, old_dentry); @@ -4098,6 +4129,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct nameidata oldnd, newnd; + struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; unsigned int lookup_flags = 0; @@ -4137,6 +4169,7 @@ retry: newnd.flags &= ~LOOKUP_PARENT; newnd.flags |= LOOKUP_RENAME_TARGET; +retry_deleg: trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd); @@ -4173,13 +4206,19 @@ retry: if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); + new_dir->d_inode, new_dentry, + &delegated_inode); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(oldnd.path.mnt); exit2: if (retry_estale(error, lookup_flags)) -- cgit v1.2.3 From 146a8595c6399ee6ab4b5cc34c0d28aa4835fdc5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Sep 2011 17:14:31 -0400 Subject: locks: break delegations on link Cc: Tyler Hicks Cc: Dustin Kirkland Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index ce7e580e4e14..251178a1e383 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3819,7 +3819,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn return sys_symlinkat(oldname, AT_FDCWD, newname); } -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_mutex + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -3855,8 +3874,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; - else - error = dir->i_op->link(old_dentry, dir, new_dentry); + else { + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); @@ -3883,6 +3905,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, { struct dentry *new_dentry; struct path old_path, new_path; + struct inode *delegated_inode = NULL; int how = 0; int error; @@ -3921,9 +3944,14 @@ retry: error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, how)) { how |= LOOKUP_REVAL; goto retry; -- cgit v1.2.3