From 355e3532132b487ebf6a4900fad8f3525fa3e137 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 12 Dec 2018 08:46:32 -0800 Subject: xfs: cache minimum realtime summary level The realtime summary is a two-dimensional array on disk, effectively: u32 rsum[log2(number of realtime extents) + 1][number of blocks in the bitmap] rsum[log][bbno] is the number of extents of size 2**log which start in bitmap block bbno. xfs_rtallocate_extent_near() uses xfs_rtany_summary() to check whether rsum[log][bbno] != 0 for any log level. However, the summary array is stored in row-major order (i.e., like an array in C), so all of these entries are not adjacent, but rather spread across the entire summary file. In the worst case (a full bitmap block), xfs_rtany_summary() has to check every level. This means that on a moderately-used realtime device, an allocation will waste a lot of time finding, reading, and releasing buffers for the realtime summary. In particular, one of our storage services (which runs on servers with 8 very slow CPUs and 15 8 TB XFS realtime filesystems) spends almost 5% of its CPU cycles in xfs_rtbuf_get() and xfs_trans_brelse() called from xfs_rtany_summary(). One solution would be to also store the summary with the dimensions swapped. However, this would require a disk format change to a very old component of XFS. Instead, we can cache the minimum size which contains any extents. We do so lazily; rather than guaranteeing that the cache contains the precise minimum, it always contains a loose lower bound which we tighten when we read or update a summary block. This only uses a few kilobytes of memory and is already serialized via the realtime bitmap and summary inode locks, so the cost is minimal. With this change, the same workload only spends 0.2% of its CPU cycles in the realtime allocator. Signed-off-by: Omar Sandoval Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_rtalloc.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_rtalloc.c') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 926ed314ffba..aefd63d46397 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -64,8 +64,12 @@ xfs_rtany_summary( int log; /* loop counter, log2 of ext. size */ xfs_suminfo_t sum; /* summary data */ + /* There are no extents at levels < m_rsum_cache[bbno]. */ + if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno]) + low = mp->m_rsum_cache[bbno]; + /* - * Loop over logs of extent sizes. Order is irrelevant. + * Loop over logs of extent sizes. */ for (log = low; log <= high; log++) { /* @@ -80,13 +84,17 @@ xfs_rtany_summary( */ if (sum) { *stat = 1; - return 0; + goto out; } } /* * Found nothing, return failure. */ *stat = 0; +out: + /* There were no extents at levels < log. */ + if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; return 0; } @@ -1187,8 +1195,8 @@ xfs_rtmount_init( } /* - * Get the bitmap and summary inodes into the mount structure - * at mount time. + * Get the bitmap and summary inodes and the summary cache into the mount + * structure at mount time. */ int /* error */ xfs_rtmount_inodes( @@ -1211,6 +1219,14 @@ xfs_rtmount_inodes( return error; } ASSERT(mp->m_rsumip != NULL); + /* + * The rsum cache is initialized to all zeroes, which is trivially a + * lower bound on the minimum level with any free extents. We can + * continue without the cache if it couldn't be allocated. + */ + mp->m_rsum_cache = kmem_zalloc_large(sbp->sb_rbmblocks, KM_SLEEP); + if (!mp->m_rsum_cache) + xfs_warn(mp, "could not allocate realtime summary cache"); return 0; } @@ -1218,6 +1234,7 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { + kmem_free(mp->m_rsum_cache); if (mp->m_rbmip) xfs_irele(mp->m_rbmip); if (mp->m_rsumip) -- cgit v1.2.3 From 64bafd2f1e484e27071e7584642005d56516cb77 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 12 Dec 2018 15:18:52 -0800 Subject: xfs: require both realtime inodes to mount Since mkfs always formats the filesystem with the realtime bitmap and summary inodes immediately after the root directory, we should expect that both of them are present and loadable, even if there isn't a realtime volume attached. There's no reason to skip this if rbmino == NULLFSINO; in fact, this causes an immediate crash if the there /is/ a realtime volume and someone writes to it. Signed-off-by: Darrick J. Wong Reviewed-by: Bill O'Donnell --- fs/xfs/xfs_rtalloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/xfs/xfs_rtalloc.c') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index aefd63d46397..afe4d29f7ab4 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1206,13 +1206,11 @@ xfs_rtmount_inodes( xfs_sb_t *sbp; sbp = &mp->m_sb; - if (sbp->sb_rbmino == NULLFSINO) - return 0; error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); if (error) return error; ASSERT(mp->m_rbmip != NULL); - ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); if (error) { xfs_irele(mp->m_rbmip); -- cgit v1.2.3 From 65eed012d1f2d0f0bf0ffc036826d58147de77b8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Dec 2018 18:45:18 -0800 Subject: xfs: reallocate realtime summary cache on growfs At mount time, we allocate m_rsum_cache with the number of realtime bitmap blocks. However, xfs_growfs_rt() can increase the number of realtime bitmap blocks. Using the cache after this happens may access out of the bounds of the cache. Fix it by reallocating the cache in this case. Fixes: 355e3532132b ("xfs: cache minimum realtime summary level") Signed-off-by: Omar Sandoval Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_rtalloc.c | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'fs/xfs/xfs_rtalloc.c') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index afe4d29f7ab4..ac0fcdad0c4e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -861,6 +861,21 @@ out_trans_cancel: return error; } +static void +xfs_alloc_rsum_cache( + xfs_mount_t *mp, /* file system mount structure */ + xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ +{ + /* + * The rsum cache is initialized to all zeroes, which is trivially a + * lower bound on the minimum level with any free extents. We can + * continue without the cache if it couldn't be allocated. + */ + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); + if (!mp->m_rsum_cache) + xfs_warn(mp, "could not allocate realtime summary cache"); +} + /* * Visible (exported) functions. */ @@ -889,6 +904,7 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ xfs_fsblock_t sumbno; /* summary block number */ + uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; /* @@ -945,6 +961,11 @@ xfs_growfs_rt( error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); if (error) return error; + + rsum_cache = mp->m_rsum_cache; + if (nrbmblocks != sbp->sb_rbmblocks) + xfs_alloc_rsum_cache(mp, nrbmblocks); + /* * Allocate a new (fake) mount/sb. */ @@ -1070,6 +1091,20 @@ error_cancel: */ kmem_free(nmp); + /* + * If we had to allocate a new rsum_cache, we either need to free the + * old one (if we succeeded) or free the new one and restore the old one + * (if there was an error). + */ + if (rsum_cache != mp->m_rsum_cache) { + if (error) { + kmem_free(mp->m_rsum_cache); + mp->m_rsum_cache = rsum_cache; + } else { + kmem_free(rsum_cache); + } + } + return error; } @@ -1217,14 +1252,7 @@ xfs_rtmount_inodes( return error; } ASSERT(mp->m_rsumip != NULL); - /* - * The rsum cache is initialized to all zeroes, which is trivially a - * lower bound on the minimum level with any free extents. We can - * continue without the cache if it couldn't be allocated. - */ - mp->m_rsum_cache = kmem_zalloc_large(sbp->sb_rbmblocks, KM_SLEEP); - if (!mp->m_rsum_cache) - xfs_warn(mp, "could not allocate realtime summary cache"); + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; } -- cgit v1.2.3