xfs: convert inode cache lookups to use RCU locking

With delayed logging greatly increasing the sustained parallelism of inode operations, the inode cache locking is showing significant read vs write contention when inode reclaim runs at the same time as lookups. There are also a lot more write lock acquisitions than there are read locks (4:1 ratio), so the read locking is not really buying us much in the way of parallelism. To avoid the read vs write contention, change the cache to use RCU locking on the read side. To avoid needing to RCU free every single inode, use the built-in slab RCU freeing mechanism. This requires us to be able to detect lookups of freed inodes, so ensure that every freed inode has an inode number of zero and the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache hit lookup path, but also add a check for a zero inode number as well. We can then convert all the read locking lookups to use RCU read side locking and hence remove all read side locking. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Alex Elder <aelder@sgi.com>
author: Dave Chinner <dchinner@redhat.com> 2010-12-17 17:29:43 +1100
committer: Dave Chinner <david@fromorbit.com> 2010-12-17 17:29:43 +1100
commit: 1a3e8f3da09c7082d25b512a0ffe569391e4c09a (patch)
tree: c717ebe79e1f969f929d1fe6fb044fb59114449f /fs/xfs/xfs_iget.c
parent: d95b7aaf9ab6738bef1ebcc52ab66563085e44ac (diff)
1 files changed, 35 insertions, 12 deletions
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 9fae47556604..04ed09b907b8 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/* prevent anyone from using this yet */
-	VFS_I(ip)->i_state = I_NEW;
-
 	return ip;
 }
 
@@ -159,6 +157,16 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
 	call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
 }
 
@@ -169,14 +177,29 @@ static int
 xfs_iget_cache_hit(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags) __releases(RCU)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
 	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -219,7 +242,7 @@ xfs_iget_cache_hit(
 		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		error = -inode_init_always(mp->m_super, inode);
 		if (error) {
@@ -227,7 +250,7 @@ xfs_iget_cache_hit(
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble.  Try to re-add it to the reclaim list.
 			 */
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
 
 			ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		trace_xfs_iget_hit(ip);
 	}
 
@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -397,7 +420,7 @@ xfs_iget(
 	xfs_agino_t	agino;
 
 	/* reject inode numbers outside existing AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(
 
 again:
 	error = 0;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
 	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	} else {
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		XFS_STATS_INC(xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
author	Dave Chinner <dchinner@redhat.com>	2010-12-17 17:29:43 +1100
committer	Dave Chinner <david@fromorbit.com>	2010-12-17 17:29:43 +1100
commit	1a3e8f3da09c7082d25b512a0ffe569391e4c09a (patch)
tree	c717ebe79e1f969f929d1fe6fb044fb59114449f /fs/xfs/xfs_iget.c
parent	d95b7aaf9ab6738bef1ebcc52ab66563085e44ac (diff)