summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_zone_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_zone_alloc.c')
-rw-r--r--fs/xfs/xfs_zone_alloc.c478
1 files changed, 226 insertions, 252 deletions
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 80add26c0111..8dde444596f1 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -166,10 +174,9 @@ xfs_open_zone_mark_full(
static void
xfs_zone_record_blocks(
struct xfs_trans *tp,
- xfs_fsblock_t fsbno,
- xfs_filblks_t len,
struct xfs_open_zone *oz,
- bool used)
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = oz->oz_rtg;
@@ -179,18 +186,37 @@ xfs_zone_record_blocks(
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
- if (used) {
- rmapip->i_used_blocks += len;
- ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
- } else {
- xfs_add_frextents(mp, len);
- }
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
oz->oz_written += len;
if (oz->oz_written == rtg_blocks(rtg))
xfs_open_zone_mark_full(oz);
xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}
+/*
+ * Called for blocks that have been written to disk, but not actually linked to
+ * an inode, which can happen when garbage collection races with user data
+ * writes to a file.
+ */
+static void
+xfs_zone_skip_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t len)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+
+ trace_xfs_zone_skip_blocks(oz, 0, len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_add_frextents(rtg_mount(rtg), len);
+}
+
static int
xfs_zoned_map_extent(
struct xfs_trans *tp,
@@ -220,6 +246,14 @@ xfs_zoned_map_extent(
* If a data write raced with this GC write, keep the existing data in
* the data fork, mark our newly written GC extent as reclaimable, then
* move on to the next extent.
+ *
+ * Note that this can also happen when racing with operations that do
+ * not actually invalidate the data, but just move it to a different
+ * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
+ * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the
+ * data was just moved around, GC fails to free the zone, but the zone
+ * becomes a GC candidate again as soon as all previous GC I/O has
+ * finished and these blocks will be moved out eventually.
*/
if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock)
@@ -250,8 +284,7 @@ xfs_zoned_map_extent(
}
}
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- true);
+ xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
@@ -259,8 +292,7 @@ xfs_zoned_map_extent(
skip:
trace_xfs_reflink_cow_remap_skip(ip, new);
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- false);
+ xfs_zone_skip_blocks(oz, new->br_blockcount);
return 0;
}
@@ -358,44 +390,6 @@ xfs_zone_free_blocks(
return 0;
}
-/*
- * Check if the zone containing the data just before the offset we are
- * writing to is still open and has space.
- */
-static struct xfs_open_zone *
-xfs_last_used_zone(
- struct iomap_ioend *ioend)
-{
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
- struct xfs_rtgroup *rtg = NULL;
- struct xfs_open_zone *oz = NULL;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
- &icur, &got)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
- if (!rtg)
- return NULL;
-
- xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
- oz = READ_ONCE(rtg->rtg_open_zone);
- if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
- oz = NULL;
- xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
-
- xfs_rtgroup_rele(rtg);
- return oz;
-}
-
static struct xfs_group *
xfs_find_free_zone(
struct xfs_mount *mp,
@@ -434,7 +428,7 @@ xfs_init_open_zone(
spin_lock_init(&oz->oz_alloc_lock);
atomic_set(&oz->oz_ref, 1);
oz->oz_rtg = rtg;
- oz->oz_write_pointer = write_pointer;
+ oz->oz_allocated = write_pointer;
oz->oz_written = write_pointer;
oz->oz_write_hint = write_hint;
oz->oz_is_gc = is_gc;
@@ -515,64 +509,58 @@ xfs_try_open_zone(
return oz;
}
+enum xfs_zone_alloc_score {
+ /* Any open zone will do it, we're desperate */
+ XFS_ZONE_ALLOC_ANY = 0,
+
+ /* It better fit somehow */
+ XFS_ZONE_ALLOC_OK = 1,
+
+ /* Only reuse a zone if it fits really well. */
+ XFS_ZONE_ALLOC_GOOD = 2,
+};
+
/*
- * For data with short or medium lifetime, try to colocated it into an
- * already open zone with a matching temperature.
+ * Life time hint co-location matrix. Fields not set default to 0
+ * aka XFS_ZONE_ALLOC_ANY.
*/
-static bool
-xfs_colocate_eagerly(
- enum rw_hint file_hint)
-{
- switch (file_hint) {
- case WRITE_LIFE_MEDIUM:
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- return true;
- default:
- return false;
- }
-}
-
-static bool
-xfs_good_hint_match(
- struct xfs_open_zone *oz,
- enum rw_hint file_hint)
-{
- switch (oz->oz_write_hint) {
- case WRITE_LIFE_LONG:
- case WRITE_LIFE_EXTREME:
- /* colocate long and extreme */
- if (file_hint == WRITE_LIFE_LONG ||
- file_hint == WRITE_LIFE_EXTREME)
- return true;
- break;
- case WRITE_LIFE_MEDIUM:
- /* colocate medium with medium */
- if (file_hint == WRITE_LIFE_MEDIUM)
- return true;
- break;
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- case WRITE_LIFE_NOT_SET:
- /* colocate short and none */
- if (file_hint <= WRITE_LIFE_SHORT)
- return true;
- break;
- }
- return false;
-}
+static const unsigned int
+xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
+ [WRITE_LIFE_NOT_SET] = {
+ [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_NONE] = {
+ [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_SHORT] = {
+ [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_MEDIUM] = {
+ [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_LONG] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_EXTREME] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+};
static bool
xfs_try_use_zone(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
struct xfs_open_zone *oz,
- bool lowspace)
+ unsigned int goodness)
{
- if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
- if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+
+ if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
return false;
+
if (!atomic_inc_not_zero(&oz->oz_ref))
return false;
@@ -603,14 +591,14 @@ static struct xfs_open_zone *
xfs_select_open_zone_lru(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
- bool lowspace)
+ unsigned int goodness)
{
struct xfs_open_zone *oz;
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+ if (xfs_try_use_zone(zi, file_hint, oz, goodness))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -627,7 +615,7 @@ xfs_select_open_zone_mru(
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, false))
+ if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -642,25 +630,29 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
}
/*
- * Try to pack inodes that are written back after they were closed tight instead
- * of trying to open new zones for them or spread them to the least recently
- * used zone. This optimizes the data layout for workloads that untar or copy
- * a lot of small files. Right now this does not separate multiple such
+ * Try to tightly pack small files that are written back after they were closed
+ * instead of trying to open new zones for them or spread them to the least
+ * recently used zone. This optimizes the data layout for workloads that untar
+ * or copy a lot of small files. Right now this does not separate multiple such
* streams.
*/
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
+ struct xfs_mount *mp = ip->i_mount;
+ size_t zone_capacity =
+ XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /*
+ * Do not pack write files that are already using a full zone to avoid
+ * fragmentation.
+ */
+ if (i_size_read(VFS_I(ip)) >= zone_capacity)
+ return false;
+
return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
-/*
- * Pick a new zone for writes.
- *
- * If we aren't using up our budget of open zones just open a new one from the
- * freelist. Else try to find one that matches the expected data lifetime. If
- * we don't find one that is good pick any zone that is available.
- */
static struct xfs_open_zone *
xfs_select_zone_nowait(
struct xfs_mount *mp,
@@ -680,31 +672,34 @@ xfs_select_zone_nowait(
* data.
*/
spin_lock(&zi->zi_open_zones_lock);
- if (xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- else if (pack_tight)
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
+ if (oz)
+ goto out_unlock;
+
+ if (pack_tight)
oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
goto out_unlock;
/*
- * See if we can open a new zone and use that.
+ * See if we can open a new zone and use that so that data for different
+ * files is mixed as little as possible.
*/
oz = xfs_try_open_zone(mp, write_hint);
if (oz)
goto out_unlock;
/*
- * Try to colocate cold data with other cold data if we failed to open a
- * new zone for it.
+ * Try to find an zone that is an ok match to colocate data with.
*/
- if (write_hint != WRITE_LIFE_NOT_SET &&
- !xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * Pick the least recently used zone, regardless of hint match
+ */
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
spin_unlock(&zi->zi_open_zones_lock);
return oz;
@@ -727,7 +722,7 @@ xfs_select_zone(
for (;;) {
prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
- if (oz)
+ if (oz || xfs_is_shutdown(mp))
break;
schedule();
}
@@ -744,25 +739,25 @@ xfs_zone_alloc_blocks(
{
struct xfs_rtgroup *rtg = oz->oz_rtg;
struct xfs_mount *mp = rtg_mount(rtg);
- xfs_rgblock_t rgbno;
+ xfs_rgblock_t allocated;
spin_lock(&oz->oz_alloc_lock);
count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
- (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
+ (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
if (!count_fsb) {
spin_unlock(&oz->oz_alloc_lock);
return 0;
}
- rgbno = oz->oz_write_pointer;
- oz->oz_write_pointer += count_fsb;
+ allocated = oz->oz_allocated;
+ oz->oz_allocated += count_fsb;
spin_unlock(&oz->oz_alloc_lock);
- trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
+ trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);
*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
if (!*is_seq)
- *sector += XFS_FSB_TO_BB(mp, rgbno);
+ *sector += XFS_FSB_TO_BB(mp, allocated);
return XFS_FSB_TO_B(mp, count_fsb);
}
@@ -777,118 +772,75 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
-static void
-xfs_submit_zoned_bio(
- struct iomap_ioend *ioend,
- struct xfs_open_zone *oz,
- bool is_seq)
-{
- ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
- ioend->io_private = oz;
- atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
-
- if (is_seq) {
- ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
- ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
- } else {
- xfs_mark_rtg_boundary(ioend);
- }
-
- submit_bio(&ioend->io_bio);
-}
-
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
}
- xfs_mru_cache_done(mp->m_zone_cache);
+ rcu_read_unlock();
+
return oz;
}
/*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that is is reused for future allocations.
*
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it. This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were every written to, but significantly simplifies the cached zone
+ * lookup. Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
+}
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone *oz,
+ bool is_seq)
+{
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+ ioend->io_private = oz;
+ atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ } else {
+ xfs_mark_rtg_boundary(ioend);
}
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+
+ submit_bio(&ioend->io_bio);
}
void
@@ -908,22 +860,18 @@ xfs_zone_alloc_and_submit(
goto out_error;
/*
- * If we don't have a cached zone in this write context, see if the
- * last extent before the one we are writing to points to an active
- * zone. If so, just continue writing to it.
+ * If we don't have a locally cached zone in this write context, see if
+ * the inode is still associated with a zone and use that if so.
*/
- if (!*oz && ioend->io_offset)
- *oz = xfs_last_used_zone(ioend);
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -983,7 +931,7 @@ xfs_zone_rgbno_is_valid(
lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
if (rtg->rtg_open_zone)
- return rgbno < rtg->rtg_open_zone->oz_write_pointer;
+ return rgbno < rtg->rtg_open_zone->oz_allocated;
return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
rtg_rgno(rtg), XFS_RTG_FREE);
}
@@ -1001,6 +949,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1017,7 +971,7 @@ xfs_init_zone(
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_zone_info *zi = mp->m_zone_info;
- uint64_t used = rtg_rmap(rtg)->i_used_blocks;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
xfs_rgblock_t write_pointer, highest_rgbno;
int error;
@@ -1114,24 +1068,27 @@ xfs_get_zone_info_cb(
}
/*
- * Calculate the max open zone limit based on the of number of
- * backing zones available
+ * Calculate the max open zone limit based on the of number of backing zones
+ * available.
*/
static inline uint32_t
xfs_max_open_zones(
struct xfs_mount *mp)
{
unsigned int max_open, max_open_data_zones;
+
/*
- * We need two zones for every open data zone,
- * one in reserve as we don't reclaim open zones. One data zone
- * and its spare is included in XFS_MIN_ZONES.
+ * We need two zones for every open data zone, one in reserve as we
+ * don't reclaim open zones. One data zone and its spare is included
+ * in XFS_MIN_ZONES to support at least one user data writer.
*/
max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
/*
- * Cap the max open limit to 1/4 of available space
+ * Cap the max open limit to 1/4 of available space. Without this we'd
+ * run out of easy reclaim targets too quickly and storage devices don't
+ * handle huge numbers of concurrent write streams overly well.
*/
max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
@@ -1163,7 +1120,7 @@ xfs_calc_open_zones(
if (bdev_open_zones)
mp->m_max_open_zones = bdev_open_zones;
else
- mp->m_max_open_zones = xfs_max_open_zones(mp);
+ mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
}
if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
@@ -1247,6 +1204,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1276,11 +1234,34 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
- xfs_info(mp, "%u zones of %u blocks size (%u max open)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that both to reduce seeks, but also more
+ * importantly so that workloads that writes files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase is to the zone size capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also get applied to non-zoned files on the data device if
+ * there are any. On typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
@@ -1292,8 +1273,10 @@ xfs_mount_zones(
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
error = xfs_init_zone(&iz, rtg, NULL);
- if (error)
+ if (error) {
+ xfs_rtgroup_rele(rtg);
goto out_free_zone_info;
+ }
}
}
@@ -1311,14 +1294,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1332,5 +1307,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}