summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-13 17:03:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-13 17:03:48 -0700
commit0b0128e64af056a7dd29fa3bc780af654e53f861 (patch)
treef5d027e0daf5c349c367da3720cc5273fd488784 /fs
parent230fb3a33efd52613910a3970495b20295557731 (diff)
parent2ffc6900d5c3a7cd59becda2aa67581d9bd3858e (diff)
Merge tag 'xfs-merge-7.1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Carlos Maiolino: "There aren't any new features. The whole series is just a collection of bug fixes and code refactoring. There is some new information added — a couple of new tracepoints, new data added to mountstats — but no big changes" * tag 'xfs-merge-7.1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (41 commits) xfs: fix number of GC bvecs xfs: untangle the open zones reporting in mountinfo xfs: expose the number of open zones in sysfs xfs: reduce special casing for the open GC zone xfs: streamline GC zone selection xfs: refactor GC zone selection helpers xfs: rename xfs_zone_gc_iter_next to xfs_zone_gc_iter_irec xfs: put the open zone later xfs_open_zone_put xfs: add a separate tracepoint for stealing an open zone for GC xfs: delay initial open of the GC zone xfs: fix a resource leak in xfs_alloc_buftarg() xfs: handle too many open zones when mounting xfs: refactor xfs_mount_zones xfs: fix integer overflow in busy extent sort comparator xfs: fix integer overflow in deferred intent sort comparators xfs: fold xfs_setattr_size into xfs_vn_setattr_size xfs: remove a duplicate assert in xfs_setattr_size xfs: return default quota limits for IDs without a dquot xfs: start gc on zonegc_low_space attribute updates xfs: don't decrement the buffer LRU count for in-use buffers ...
Diffstat (limited to 'fs')
-rw-r--r--fs/iomap/buffered-io.c6
-rw-r--r--fs/xfs/libxfs/xfs_ag.c13
-rw-r--r--fs/xfs/libxfs/xfs_ag.h2
-rw-r--r--fs/xfs/libxfs/xfs_fs.h5
-rw-r--r--fs/xfs/xfs_buf.c236
-rw-r--r--fs/xfs/xfs_buf.h20
-rw-r--r--fs/xfs/xfs_buf_mem.c11
-rw-r--r--fs/xfs/xfs_extent_busy.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c112
-rw-r--r--fs/xfs/xfs_ioctl.c19
-rw-r--r--fs/xfs/xfs_iomap.c146
-rw-r--r--fs/xfs/xfs_iops.c39
-rw-r--r--fs/xfs/xfs_mount.c75
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_qm_syscalls.c43
-rw-r--r--fs/xfs/xfs_refcount_item.c2
-rw-r--r--fs/xfs/xfs_rmap_item.c2
-rw-r--r--fs/xfs/xfs_sysfs.c20
-rw-r--r--fs/xfs/xfs_trace.h12
-rw-r--r--fs/xfs/xfs_zone_alloc.c194
-rw-r--r--fs/xfs/xfs_zone_alloc.h4
-rw-r--r--fs/xfs/xfs_zone_gc.c271
-rw-r--r--fs/xfs/xfs_zone_info.c23
-rw-r--r--fs/xfs/xfs_zone_priv.h15
25 files changed, 740 insertions, 539 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e4b6886e5c3c..d7b648421a70 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1647,16 +1647,12 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
while ((ret = iomap_iter(&iter, ops)) > 0) {
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
- if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) &&
- srcmap->type != IOMAP_UNWRITTEN))
- return -EIO;
-
if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) &&
(srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN)) {
s64 status;
- if (range_dirty) {
+ if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) {
range_dirty = false;
status = iomap_zero_iter_flush_and_stale(&iter);
} else {
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index bd8fbb40b49e..dcd2f93b6a6c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -110,10 +110,7 @@ xfs_perag_uninit(
struct xfs_group *xg)
{
#ifdef __KERNEL__
- struct xfs_perag *pag = to_perag(xg);
-
- cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_cache_destroy(&pag->pag_bcache);
+ cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
#endif
}
@@ -235,10 +232,6 @@ xfs_perag_alloc(
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
#endif /* __KERNEL__ */
- error = xfs_buf_cache_init(&pag->pag_bcache);
- if (error)
- goto out_free_perag;
-
/*
* Pre-calculated geometry
*/
@@ -250,12 +243,10 @@ xfs_perag_alloc(
error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
if (error)
- goto out_buf_cache_destroy;
+ goto out_free_perag;
return 0;
-out_buf_cache_destroy:
- xfs_buf_cache_destroy(&pag->pag_bcache);
out_free_perag:
kfree(pag);
return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 3cd4790768ff..16a9b43a3c27 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -85,8 +85,6 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
- struct xfs_buf_cache pag_bcache;
-
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
#endif /* __KERNEL__ */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index d165de607d17..185f09f327c0 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -995,7 +995,8 @@ struct xfs_rtgroup_geometry {
__u32 rg_sick; /* o: sick things in ag */
__u32 rg_checked; /* o: checked metadata in ag */
__u32 rg_flags; /* i/o: flags for this ag */
- __u32 rg_reserved[27]; /* o: zero */
+ __u32 rg_writepointer; /* o: write pointer block offset for zoned */
+ __u32 rg_reserved[26]; /* o: zero */
};
#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */
#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */
@@ -1003,6 +1004,8 @@ struct xfs_rtgroup_geometry {
#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */
#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */
+#define XFS_RTGROUP_GEOM_WRITEPOINTER (1U << 0) /* write pointer */
+
/* Health monitor event domains */
/* affects the whole fs */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d2f3c50d80e7..580d40a5ee57 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_stale:
* b_sema (caller holds)
- * b_lock
+ * b_lockref.lock
* lru_lock
*
* xfs_buf_rele:
- * b_lock
+ * b_lockref.lock
* lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
- * b_lock (trylock due to inversion)
+ * b_lockref.lock (trylock due to inversion)
*
* xfs_buftarg_isolate
* lru_lock
- * b_lock (trylock due to inversion)
+ * b_lockref.lock (trylock due to inversion)
*/
static void xfs_buf_submit(struct xfs_buf *bp);
@@ -78,14 +78,11 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- spin_lock(&bp->b_lock);
+ spin_lock(&bp->b_lockref.lock);
atomic_set(&bp->b_lru_ref, 0);
- if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- bp->b_hold--;
-
- ASSERT(bp->b_hold >= 1);
- spin_unlock(&bp->b_lock);
+ if (!__lockref_is_dead(&bp->b_lockref))
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lockref.lock);
}
static void
@@ -277,10 +274,8 @@ xfs_buf_alloc(
* inserting into the hash table are safe (and will have to wait for
* the unlock to do anything non-trivial).
*/
- bp->b_hold = 1;
+ lockref_init(&bp->b_lockref);
sema_init(&bp->b_sema, 0); /* held, no waiters */
-
- spin_lock_init(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
@@ -368,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
.obj_cmpfn = _xfs_buf_obj_cmp,
};
-int
-xfs_buf_cache_init(
- struct xfs_buf_cache *bch)
-{
- return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
-}
-
-void
-xfs_buf_cache_destroy(
- struct xfs_buf_cache *bch)
-{
- rhashtable_destroy(&bch->bc_hash);
-}
-
static int
xfs_buf_map_verify(
struct xfs_buftarg *btp,
@@ -437,23 +418,9 @@ xfs_buf_find_lock(
return 0;
}
-static bool
-xfs_buf_try_hold(
- struct xfs_buf *bp)
-{
- spin_lock(&bp->b_lock);
- if (bp->b_hold == 0) {
- spin_unlock(&bp->b_lock);
- return false;
- }
- bp->b_hold++;
- spin_unlock(&bp->b_lock);
- return true;
-}
-
static inline int
xfs_buf_lookup(
- struct xfs_buf_cache *bch,
+ struct xfs_buftarg *btp,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -462,8 +429,8 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
- bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
- if (!bp || !xfs_buf_try_hold(bp)) {
+ bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
+ if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -487,7 +454,6 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
- struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -507,14 +473,14 @@ xfs_buf_find_insert(
new_bp->b_pag = pag;
rcu_read_lock();
- bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
+ bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
rcu_read_unlock();
error = PTR_ERR(bp);
goto out_free_buf;
}
- if (bp && xfs_buf_try_hold(bp)) {
+ if (bp && lockref_get_not_dead(&bp->b_lockref)) {
/* found an existing buffer */
rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
@@ -549,16 +515,6 @@ xfs_buftarg_get_pag(
return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}
-static inline struct xfs_buf_cache *
-xfs_buftarg_buf_cache(
- struct xfs_buftarg *btp,
- struct xfs_perag *pag)
-{
- if (pag)
- return &pag->pag_bcache;
- return btp->bt_cache;
-}
-
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -572,7 +528,6 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -589,9 +544,8 @@ xfs_buf_get_map(
return error;
pag = xfs_buftarg_get_pag(btp, &cmap);
- bch = xfs_buftarg_buf_cache(btp, pag);
- error = xfs_buf_lookup(bch, &cmap, flags, &bp);
+ error = xfs_buf_lookup(btp, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -603,7 +557,7 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
- error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
@@ -856,82 +810,25 @@ xfs_buf_hold(
{
trace_xfs_buf_hold(bp, _RET_IP_);
- spin_lock(&bp->b_lock);
- bp->b_hold++;
- spin_unlock(&bp->b_lock);
+ lockref_get(&bp->b_lockref);
}
static void
-xfs_buf_rele_uncached(
+xfs_buf_destroy(
struct xfs_buf *bp)
{
- ASSERT(list_empty(&bp->b_lru));
-
- spin_lock(&bp->b_lock);
- if (--bp->b_hold) {
- spin_unlock(&bp->b_lock);
- return;
- }
- spin_unlock(&bp->b_lock);
- xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
- struct xfs_perag *pag = bp->b_pag;
- struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool freebuf = false;
-
- trace_xfs_buf_rele(bp, _RET_IP_);
-
- spin_lock(&bp->b_lock);
- ASSERT(bp->b_hold >= 1);
- if (bp->b_hold > 1) {
- bp->b_hold--;
- goto out_unlock;
- }
+ ASSERT(__lockref_is_dead(&bp->b_lockref));
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* we are asked to drop the last reference */
- if (atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU, keep the reference to the
- * buffer for the LRU and clear the (now stale) dispose list
- * state flag, else drop the reference.
- */
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- else
- bp->b_hold--;
- } else {
- bp->b_hold--;
- /*
- * most of the time buffers will already be removed from the
- * LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
- * was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
+ if (!xfs_buf_is_uncached(bp)) {
+ rhashtable_remove_fast(&bp->b_target->bt_hash,
+ &bp->b_rhash_head, xfs_buf_hash_params);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
- xfs_buf_hash_params);
- if (pag)
- xfs_perag_put(pag);
- freebuf = true;
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
}
-out_unlock:
- spin_unlock(&bp->b_lock);
-
- if (freebuf)
- xfs_buf_free(bp);
+ xfs_buf_free(bp);
}
/*
@@ -942,10 +839,23 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
- if (xfs_buf_is_uncached(bp))
- xfs_buf_rele_uncached(bp);
- else
- xfs_buf_rele_cached(bp);
+
+ if (lockref_put_or_lock(&bp->b_lockref))
+ return;
+ if (!--bp->b_lockref.count) {
+ if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+ goto kill;
+ list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ }
+ spin_unlock(&bp->b_lockref.lock);
+ return;
+
+kill:
+ lockref_mark_dead(&bp->b_lockref);
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lockref.lock);
+
+ xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1164,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1512,23 +1424,18 @@ xfs_buftarg_drain_rele(
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
struct list_head *dispose = arg;
- if (!spin_trylock(&bp->b_lock))
+ if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
- if (bp->b_hold > 1) {
+ if (bp->b_lockref.count > 0) {
/* need to wait, so skip it this pass */
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1581,7 +1488,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1610,24 +1517,37 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * we are inverting the lru lock/bp->b_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock vs bp->b_lockref.lock order here, so
+ * use a trylock. If we fail to get the lock, just skip the buffer.
*/
- if (!spin_trylock(&bp->b_lock))
+ if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
+
+ /*
+	 * If the buffer is in use, remove it from the LRU for now. We can't
+	 * free it while someone is using it, and we should also not count
+	 * eviction passes for it, just as if it hadn't been added to the LRU
+ * yet.
+ */
+ if (bp->b_lockref.count > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lockref.lock);
+ return LRU_REMOVED;
+ }
+
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
* buffer, otherwise it gets another trip through the LRU.
*/
if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_ROTATE;
}
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1647,7 +1567,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
return freed;
@@ -1670,6 +1590,7 @@ xfs_destroy_buftarg(
ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
percpu_counter_destroy(&btp->bt_readahead_count);
list_lru_destroy(&btp->bt_lru);
+ rhashtable_destroy(&btp->bt_hash);
}
void
@@ -1764,8 +1685,10 @@ xfs_init_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (list_lru_init(&btp->bt_lru))
+ if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
return -ENOMEM;
+ if (list_lru_init(&btp->bt_lru))
+ goto out_destroy_hash;
if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
goto out_destroy_lru;
@@ -1783,6 +1706,8 @@ out_destroy_io_count:
percpu_counter_destroy(&btp->bt_readahead_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
+out_destroy_hash:
+ rhashtable_destroy(&btp->bt_hash);
return -ENOMEM;
}
@@ -1831,6 +1756,7 @@ xfs_alloc_buftarg(
return btp;
error_free:
+ fs_put_dax(btp->bt_daxdev, mp);
kfree(btp);
return ERR_PTR(error);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..bf39d89f0f6d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -14,6 +14,7 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
+#include <linux/lockref.h>
extern struct kmem_cache *xfs_buf_cache;
@@ -69,18 +70,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_TRYLOCK, "TRYLOCK" }
/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-
-struct xfs_buf_cache {
- struct rhashtable bc_hash;
-};
-
-int xfs_buf_cache_init(struct xfs_buf_cache *bch);
-void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
-
-/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
* 1) The metadata sector size, which is the minimum unit and
@@ -117,8 +106,7 @@ struct xfs_buftarg {
unsigned int bt_awu_min;
unsigned int bt_awu_max;
- /* built-in cache, if we're not using the perag one */
- struct xfs_buf_cache bt_cache[];
+ struct rhashtable bt_hash;
};
struct xfs_buf_map {
@@ -159,7 +147,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- unsigned int b_hold; /* reference count */
+ struct lockref b_lockref; /* refcount + lock */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -169,8 +157,6 @@ struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- spinlock_t b_lock; /* internal state lock */
- unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index b0b3696bf599..b2fd7276b131 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -58,7 +58,7 @@ xmbuf_alloc(
struct xfs_buftarg *btp;
int error;
- btp = kzalloc_flex(*btp, bt_cache, 1);
+ btp = kzalloc_obj(*btp);
if (!btp)
return -ENOMEM;
@@ -81,10 +81,6 @@ xmbuf_alloc(
/* ensure all writes are below EOF to avoid pagecache zeroing */
i_size_write(inode, inode->i_sb->s_maxbytes);
- error = xfs_buf_cache_init(btp->bt_cache);
- if (error)
- goto out_file;
-
/* Initialize buffer target */
btp->bt_mount = mp;
btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
if (error)
- goto out_bcache;
+ goto out_file;
trace_xmbuf_create(btp);
*btpp = btp;
return 0;
-out_bcache:
- xfs_buf_cache_destroy(btp->bt_cache);
out_file:
fput(file);
out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
trace_xmbuf_free(btp);
xfs_destroy_buftarg(btp);
- xfs_buf_cache_destroy(btp->bt_cache);
fput(btp->bt_file);
kfree(btp);
}
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 3efdca3d675b..41cf0605ec22 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -690,9 +690,9 @@ xfs_extent_busy_ag_cmp(
container_of(l2, struct xfs_extent_busy, list);
s32 diff;
- diff = b1->group->xg_gno - b2->group->xg_gno;
+ diff = cmp_int(b1->group->xg_gno, b2->group->xg_gno);
if (!diff)
- diff = b1->bno - b2->bno;
+ diff = cmp_int(b1->bno, b2->bno);
return diff;
}
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 749a4eb9793c..2266d56e37dc 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -387,7 +387,7 @@ xfs_extent_free_diff_items(
struct xfs_extent_free_item *ra = xefi_entry(a);
struct xfs_extent_free_item *rb = xefi_entry(b);
- return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno;
+ return cmp_int(ra->xefi_group->xg_gno, rb->xefi_group->xg_gno);
}
/* Log a free extent to the intent item. */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6246f34df9fd..845a97c9b063 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -560,6 +560,72 @@ xfs_zoned_write_space_reserve(
flags, ac);
}
+/*
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ *
+ * As IO completion only ever extends EOF, we can do an unlocked check
+ * here to avoid taking the spinlock. If we land within the current EOF,
+ * then we do not need to do an extending update at all, and we don't
+ * need to take the lock to check this. If we race with an update moving
+ * EOF, then we'll either still be beyond EOF and need to take the lock,
+ * or we'll be within EOF and we don't need to take it at all.
+ */
+static int
+xfs_dio_endio_set_isize(
+ struct inode *inode,
+ loff_t offset,
+ ssize_t size)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (offset + size <= i_size_read(inode))
+ return 0;
+
+ spin_lock(&ip->i_flags_lock);
+ if (offset + size <= i_size_read(inode)) {
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+ }
+
+ i_size_write(inode, offset + size);
+ spin_unlock(&ip->i_flags_lock);
+
+ return xfs_setfilesize(ip, offset, size);
+}
+
+static int
+xfs_zoned_dio_write_end_io(
+ struct kiocb *iocb,
+ ssize_t size,
+ int error,
+ unsigned flags)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int nofs_flag;
+
+ ASSERT(!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
+ trace_xfs_end_io_direct_write(ip, iocb->ki_pos, size);
+
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+
+ if (error || !size)
+ return error;
+
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+
+ nofs_flag = memalloc_nofs_save();
+ error = xfs_dio_endio_set_isize(inode, iocb->ki_pos, size);
+ memalloc_nofs_restore(nofs_flag);
+
+ return error;
+}
+
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -572,8 +638,7 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
- ASSERT(!xfs_is_zoned_inode(ip) ||
- !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+ ASSERT(!xfs_is_zoned_inode(ip));
trace_xfs_end_io_direct_write(ip, offset, size);
@@ -623,30 +688,8 @@ xfs_dio_write_end_io(
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
* if necessary.
- *
- * We need to lock the test/set EOF update as we can be racing with
- * other IO completions here to update the EOF. Failing to serialise
- * here can result in EOF moving backwards and Bad Things Happen when
- * that occurs.
- *
- * As IO completion only ever extends EOF, we can do an unlocked check
- * here to avoid taking the spinlock. If we land within the current EOF,
- * then we do not need to do an extending update at all, and we don't
- * need to take the lock to check this. If we race with an update moving
- * EOF, then we'll either still be beyond EOF and need to take the lock,
- * or we'll be within EOF and we don't need to take it at all.
*/
- if (offset + size <= i_size_read(inode))
- goto out;
-
- spin_lock(&ip->i_flags_lock);
- if (offset + size > i_size_read(inode)) {
- i_size_write(inode, offset + size);
- spin_unlock(&ip->i_flags_lock);
- error = xfs_setfilesize(ip, offset, size);
- } else {
- spin_unlock(&ip->i_flags_lock);
- }
+ error = xfs_dio_endio_set_isize(inode, offset, size);
out:
memalloc_nofs_restore(nofs_flag);
@@ -688,7 +731,7 @@ xfs_dio_zoned_submit_io(
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
.bio_set = &iomap_ioend_bioset,
.submit_io = xfs_dio_zoned_submit_io,
- .end_io = xfs_dio_write_end_io,
+ .end_io = xfs_zoned_dio_write_end_io,
};
/*
@@ -1263,6 +1306,23 @@ xfs_falloc_insert_range(
if (offset >= isize)
return -EINVAL;
+ /*
+ * Let writeback clean up EOF folio state before we bump i_size. The
+ * insert flushes before it starts shifting and under certain
+ * circumstances we can write back blocks that should technically be
+ * considered post-eof (and thus should not be submitted for writeback).
+ *
+ * For example, a large, dirty folio that spans EOF and is backed by
+ * post-eof COW fork preallocation can cause block remap into the data
+ * fork. This shifts back out beyond EOF, but creates an expectedly
+ * written post-eof block. The insert is going to flush, unmap and
+ * cancel prealloc across this whole range, so flush EOF now before we
+ * bump i_size to provide consistent behavior.
+ */
+ error = filemap_write_and_wait_range(inode->i_mapping, isize, isize);
+ if (error)
+ return error;
+
error = xfs_falloc_setsize(file, isize + len);
if (error)
return error;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index facffdc8dca8..46e234863644 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -37,12 +37,15 @@
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
#include "xfs_file.h"
#include "xfs_exchrange.h"
#include "xfs_handle.h"
#include "xfs_rtgroup.h"
#include "xfs_healthmon.h"
#include "xfs_verify_media.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zone_alloc.h"
#include <linux/mount.h>
#include <linux/fileattr.h>
@@ -413,6 +416,7 @@ xfs_ioc_rtgroup_geometry(
{
struct xfs_rtgroup *rtg;
struct xfs_rtgroup_geometry rgeo;
+ xfs_rgblock_t highest_rgbno;
int error;
if (copy_from_user(&rgeo, arg, sizeof(rgeo)))
@@ -433,6 +437,21 @@ xfs_ioc_rtgroup_geometry(
if (error)
return error;
+ if (xfs_has_zoned(mp)) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ if (rtg->rtg_open_zone) {
+ rgeo.rg_writepointer = rtg->rtg_open_zone->oz_allocated;
+ } else {
+ highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
+ if (highest_rgbno == NULLRGBLOCK)
+ rgeo.rg_writepointer = 0;
+ else
+ rgeo.rg_writepointer = highest_rgbno + 1;
+ }
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ rgeo.rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER;
+ }
+
if (copy_to_user(arg, &rgeo, sizeof(rgeo)))
return -EFAULT;
return 0;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9c2f12d5fec9..f20a02f49ed9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1593,6 +1593,7 @@ xfs_zoned_buffered_write_iomap_begin(
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
+ struct address_space *mapping = inode->i_mapping;
struct xfs_zone_alloc_ctx *ac = iter->private;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1617,6 +1618,7 @@ xfs_zoned_buffered_write_iomap_begin(
if (error)
return error;
+restart:
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -1654,14 +1656,6 @@ xfs_zoned_buffered_write_iomap_begin(
&smap))
smap.br_startoff = end_fsb; /* fake hole until EOF */
if (smap.br_startoff > offset_fsb) {
- /*
- * We never need to allocate blocks for zeroing a hole.
- */
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb,
- smap.br_startoff);
- goto out_unlock;
- }
end_fsb = min(end_fsb, smap.br_startoff);
} else {
end_fsb = min(end_fsb,
@@ -1694,6 +1688,33 @@ xfs_zoned_buffered_write_iomap_begin(
XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
/*
+ * When zeroing, don't allocate blocks for holes as they are already
+ * zeroes, but we need to ensure that no extents exist in both the data
+ * and COW fork to ensure this really is a hole.
+ *
+ * A window exists where we might observe a hole in both forks with
+ * valid data in cache. Writeback removes the COW fork blocks on
+ * submission but doesn't remap into the data fork until completion. If
+ * the data fork was previously a hole, we'll fail to zero. Until we
+ * find a way to avoid this transient state, check for dirty pagecache
+ * and flush to wait on blocks to land in the data fork.
+ */
+ if ((flags & IOMAP_ZERO) && srcmap->type == IOMAP_HOLE) {
+ if (filemap_range_needs_writeback(mapping, offset,
+ offset + count - 1)) {
+ xfs_iunlock(ip, lockmode);
+ error = filemap_write_and_wait_range(mapping, offset,
+ offset + count - 1);
+ if (error)
+ return error;
+ goto restart;
+ }
+
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, end_fsb);
+ goto out_unlock;
+ }
+
+ /*
* The block reservation is supposed to cover all blocks that the
* operation could possible write, but there is a nasty corner case
* where blocks could be stolen from underneath us:
@@ -1767,6 +1788,8 @@ xfs_buffered_write_iomap_begin(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF;
+ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
struct xfs_bmbt_irec imap, cmap;
struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
@@ -1811,30 +1834,96 @@ xfs_buffered_write_iomap_begin(
goto out_unlock;
/*
- * Search the data fork first to look up our source mapping. We
- * always need the data fork map, as we have to return it to the
- * iomap code so that the higher level write code can read data in to
- * perform read-modify-write cycles for unaligned writes.
+ * Search the data fork first to look up our source mapping. We always
+ * need the data fork map, as we have to return it to the iomap code so
+ * that the higher level write code can read data in to perform
+ * read-modify-write cycles for unaligned writes.
+ *
+ * Then search the COW fork extent list even if we did not find a data
+ * fork extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * block adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
*/
eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
imap.br_startoff = end_fsb; /* fake hole until the end */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof)
+ cow_fsb = cmap.br_startoff;
+ }
- /* We never need to allocate blocks for zeroing or unsharing a hole. */
- if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) &&
- imap.br_startoff > offset_fsb) {
+ /* We never need to allocate blocks for unsharing a hole. */
+ if ((flags & IOMAP_UNSHARE) && imap.br_startoff > offset_fsb) {
xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
goto out_unlock;
}
/*
+ * We may need to zero over a hole in the data fork if it's fronted by
+ * COW blocks and dirty pagecache. Scan such file ranges for dirty
+ * cache and fill the iomap batch with folios that need zeroing.
+ */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ loff_t start, end;
+ unsigned int fbatch_count;
+
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+
+ if (cow_fsb == NULLFILEOFF)
+ goto found_imap;
+ if (cow_fsb > offset_fsb) {
+ xfs_trim_extent(&imap, offset_fsb,
+ cow_fsb - offset_fsb);
+ goto found_imap;
+ }
+
+ /* no zeroing beyond eof, so split at the boundary */
+ if (offset_fsb >= eof_fsb)
+ goto found_imap;
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
+ xfs_trim_extent(&imap, offset_fsb,
+ eof_fsb - offset_fsb);
+
+ /* COW fork blocks overlap the hole */
+ xfs_trim_extent(&imap, offset_fsb,
+ cmap.br_startoff + cmap.br_blockcount - offset_fsb);
+ start = XFS_FSB_TO_B(mp, imap.br_startoff);
+ end = XFS_FSB_TO_B(mp, imap.br_startoff + imap.br_blockcount);
+ fbatch_count = iomap_fill_dirty_folios(iter, &start, end,
+ &iomap_flags);
+ xfs_trim_extent(&imap, offset_fsb,
+ XFS_B_TO_FSB(mp, start) - offset_fsb);
+
+ /*
+ * Report the COW mapping if we have folios to zero. Otherwise
+ * ignore the COW blocks as preallocation and report a hole.
+ */
+ if (fbatch_count) {
+ xfs_trim_extent(&cmap, imap.br_startoff,
+ imap.br_blockcount);
+ imap.br_startoff = end_fsb; /* fake hole */
+ goto found_cow;
+ }
+ goto found_imap;
+ }
+
+ /*
* For zeroing, trim extents that extend beyond the EOF block. If a
* delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
if (flags & IOMAP_ZERO) {
- xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-
if (isnullstartblock(imap.br_startblock) &&
offset_fsb >= eof_fsb)
goto convert_delay;
@@ -1867,24 +1956,13 @@ xfs_buffered_write_iomap_begin(
}
/*
- * Search the COW fork extent list even if we did not find a data fork
- * extent. This serves two purposes: first this implements the
- * speculative preallocation using cowextsize, so that we also unshare
- * block adjacent to shared blocks instead of just the shared blocks
- * themselves. Second the lookup in the extent list is generally faster
- * than going out to the shared extent tree.
+ * Now that we've handled any operation specific special cases, at this
+ * point we can report a COW mapping if found.
*/
- if (xfs_is_cow_inode(ip)) {
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
- cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
- &ccur, &cmap);
- if (!cow_eof && cmap.br_startoff <= offset_fsb) {
- trace_xfs_reflink_cow_found(ip, &cmap);
- goto found_cow;
- }
+ if (xfs_is_cow_inode(ip) &&
+ !cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ goto found_cow;
}
if (imap.br_startoff <= offset_fsb) {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 208543e57eda..325c2200c501 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -901,20 +901,18 @@ out_dqrele:
/*
* Truncate file. Must have write permission and not be a directory.
- *
- * Caution: The caller of this function is responsible for calling
- * setattr_prepare() or otherwise verifying the change is fine.
*/
-STATIC int
-xfs_setattr_size(
+int
+xfs_vn_setattr_size(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct xfs_inode *ip,
struct iattr *iattr)
{
+ struct inode *inode = d_inode(dentry);
+ struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct inode *inode = VFS_I(ip);
- xfs_off_t oldsize, newsize;
+ xfs_off_t oldsize = inode->i_size;
+ xfs_off_t newsize = iattr->ia_size;
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
@@ -927,8 +925,11 @@ xfs_setattr_size(
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
- oldsize = inode->i_size;
- newsize = iattr->ia_size;
+ trace_xfs_setattr(ip);
+
+ error = xfs_vn_change_ok(idmap, dentry, iattr);
+ if (error)
+ return error;
/*
* Short circuit the truncate case for zero length files.
@@ -1109,7 +1110,6 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
setattr_copy(idmap, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1129,23 +1129,6 @@ out_trans_cancel:
goto out_unlock;
}
-int
-xfs_vn_setattr_size(
- struct mnt_idmap *idmap,
- struct dentry *dentry,
- struct iattr *iattr)
-{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
- int error;
-
- trace_xfs_setattr(ip);
-
- error = xfs_vn_change_ok(idmap, dentry, iattr);
- if (error)
- return error;
- return xfs_setattr_size(idmap, dentry, ip, iattr);
-}
-
STATIC int
xfs_vn_setattr(
struct mnt_idmap *idmap,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ef1ea8a1238c..b24195f570cd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,17 +44,36 @@
#include "xfs_healthmon.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
-static int xfs_uuid_table_size;
-static uuid_t *xfs_uuid_table;
+static DEFINE_XARRAY_ALLOC(xfs_uuid_table);
+
+static uuid_t *
+xfs_uuid_search(
+ uuid_t *new_uuid)
+{
+ unsigned long index = 0;
+ uuid_t *uuid;
+
+ xa_for_each(&xfs_uuid_table, index, uuid) {
+ if (uuid_equal(uuid, new_uuid))
+ return uuid;
+ }
+ return NULL;
+}
+
+static void
+xfs_uuid_delete(
+ uuid_t *uuid,
+ unsigned int index)
+{
+ ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid));
+ xa_erase(&xfs_uuid_table, index);
+}
void
xfs_uuid_table_free(void)
{
- if (xfs_uuid_table_size == 0)
- return;
- kfree(xfs_uuid_table);
- xfs_uuid_table = NULL;
- xfs_uuid_table_size = 0;
+ ASSERT(xa_empty(&xfs_uuid_table));
+ xa_destroy(&xfs_uuid_table);
}
/*
@@ -66,7 +85,7 @@ xfs_uuid_mount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
- int hole, i;
+ int ret;
/* Publish UUID in struct super_block */
super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
@@ -80,30 +99,17 @@ xfs_uuid_mount(
}
mutex_lock(&xfs_uuid_table_mutex);
- for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
- if (uuid_is_null(&xfs_uuid_table[i])) {
- hole = i;
- continue;
- }
- if (uuid_equal(uuid, &xfs_uuid_table[i]))
- goto out_duplicate;
- }
-
- if (hole < 0) {
- xfs_uuid_table = krealloc(xfs_uuid_table,
- (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- GFP_KERNEL | __GFP_NOFAIL);
- hole = xfs_uuid_table_size++;
+ if (unlikely(xfs_uuid_search(uuid))) {
+ xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount",
+ uuid);
+ mutex_unlock(&xfs_uuid_table_mutex);
+ return -EINVAL;
}
- xfs_uuid_table[hole] = *uuid;
- mutex_unlock(&xfs_uuid_table_mutex);
- return 0;
-
- out_duplicate:
+ ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid,
+ xa_limit_32b, GFP_KERNEL);
mutex_unlock(&xfs_uuid_table_mutex);
- xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
- return -EINVAL;
+ return ret;
}
STATIC void
@@ -111,21 +117,12 @@ xfs_uuid_unmount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
- int i;
if (xfs_has_nouuid(mp))
return;
mutex_lock(&xfs_uuid_table_mutex);
- for (i = 0; i < xfs_uuid_table_size; i++) {
- if (uuid_is_null(&xfs_uuid_table[i]))
- continue;
- if (!uuid_equal(uuid, &xfs_uuid_table[i]))
- continue;
- memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
- break;
- }
- ASSERT(i < xfs_uuid_table_size);
+ xfs_uuid_delete(uuid, mp->m_uuid_table_index);
mutex_unlock(&xfs_uuid_table_mutex);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ddd4028be8d6..d964bae096ef 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -346,6 +346,9 @@ typedef struct xfs_mount {
/* Private data referring to a health monitor object. */
struct xfs_healthmon __rcu *m_healthmon;
+
+ /* Index of uuid record in the uuid xarray. */
+ unsigned int m_uuid_table_index;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index d50b7318cb5c..21a784986828 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -391,6 +391,38 @@ out_rele:
return error;
}
+/*
+ * Fill out the default quota limits for an ID that has no dquot on disk.
+ * Returns 0 if default limits are configured
+ * and were filled in, -ENOENT otherwise.
+ */
+static int
+xfs_qm_scall_getquota_fill_defaults(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_def_quota *defq;
+
+ defq = xfs_get_defquota(mp->m_quotainfo, type);
+
+ if (!defq->blk.soft && !defq->blk.hard &&
+ !defq->ino.soft && !defq->ino.hard &&
+ !defq->rtb.soft && !defq->rtb.hard) {
+ return -ENOENT;
+ }
+
+ memset(dst, 0, sizeof(*dst));
+ dst->d_spc_softlimit = XFS_FSB_TO_B(mp, defq->blk.soft);
+ dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, defq->blk.hard);
+ dst->d_ino_softlimit = defq->ino.soft;
+ dst->d_ino_hardlimit = defq->ino.hard;
+ dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, defq->rtb.soft);
+ dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, defq->rtb.hard);
+
+ return 0;
+}
+
/* Fill out the quota context. */
static void
xfs_qm_scall_getquota_fill_qc(
@@ -451,8 +483,17 @@ xfs_qm_scall_getquota(
* set doalloc. If it doesn't exist, we'll get ENOENT back.
*/
error = xfs_qm_dqget(mp, id, type, false, &dqp);
- if (error)
+ if (error) {
+ /*
+ * If there is no dquot on disk and default limits are
+ * configured, return them with zero usage so that
+ * unprivileged users can see what limits apply to them.
+ */
+ if (error == -ENOENT && id != 0 &&
+ !xfs_qm_scall_getquota_fill_defaults(mp, type, dst))
+ return 0;
return error;
+ }
/*
* If everything's NULL, this dquot doesn't quite exist as far as
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 881c3f3a6a24..8bccf89a7766 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -266,7 +266,7 @@ xfs_refcount_update_diff_items(
struct xfs_refcount_intent *ra = ci_entry(a);
struct xfs_refcount_intent *rb = ci_entry(b);
- return ra->ri_group->xg_gno - rb->ri_group->xg_gno;
+ return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno);
}
/* Log refcount updates in the intent item. */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a39fe08dcd8f..2a3a73a8566d 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -267,7 +267,7 @@ xfs_rmap_update_diff_items(
struct xfs_rmap_intent *ra = ri_entry(a);
struct xfs_rmap_intent *rb = ri_entry(b);
- return ra->ri_group->xg_gno - rb->ri_group->xg_gno;
+ return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno);
}
/* Log rmap updates in the intent item. */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 6c7909838234..676777064c2d 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -13,7 +13,9 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
+#include "xfs_zone_priv.h"
#include "xfs_zones.h"
+#include "xfs_zone_alloc.h"
struct xfs_sysfs_attr {
struct attribute attr;
@@ -719,11 +721,23 @@ max_open_zones_show(
XFS_SYSFS_ATTR_RO(max_open_zones);
static ssize_t
+nr_open_zones_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ struct xfs_zone_info *zi = zoned_to_mp(kobj)->m_zone_info;
+
+ return sysfs_emit(buf, "%u\n", READ_ONCE(zi->zi_nr_open_zones));
+}
+XFS_SYSFS_ATTR_RO(nr_open_zones);
+
+static ssize_t
zonegc_low_space_store(
struct kobject *kobj,
const char *buf,
size_t count)
{
+ struct xfs_mount *mp = zoned_to_mp(kobj);
int ret;
unsigned int val;
@@ -734,7 +748,10 @@ zonegc_low_space_store(
if (val > 100)
return -EINVAL;
- zoned_to_mp(kobj)->m_zonegc_low_space = val;
+ if (mp->m_zonegc_low_space != val) {
+ mp->m_zonegc_low_space = val;
+ xfs_zone_gc_wakeup(mp);
+ }
return count;
}
@@ -751,6 +768,7 @@ XFS_SYSFS_ATTR_RW(zonegc_low_space);
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
+ ATTR_LIST(nr_open_zones),
ATTR_LIST(zonegc_low_space),
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5e8190fe2be9..1c098cfc5c00 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -394,6 +394,7 @@ DEFINE_ZONE_EVENT(xfs_zone_full);
DEFINE_ZONE_EVENT(xfs_zone_opened);
DEFINE_ZONE_EVENT(xfs_zone_reset);
DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
+DEFINE_ZONE_EVENT(xfs_zone_gc_target_stolen);
TRACE_EVENT(xfs_zone_free_blocks,
TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
@@ -461,6 +462,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_spurious_open);
TRACE_EVENT(xfs_zone_gc_select_victim,
TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
@@ -740,7 +742,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
@@ -814,7 +816,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
__entry->flags = flags;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
@@ -858,7 +860,7 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->error = error;
@@ -902,7 +904,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
__entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
- __entry->buf_hold = bip->bli_buf->b_hold;
+ __entry->buf_hold = bip->bli_buf->b_lockref.count;
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
@@ -5206,7 +5208,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index e3d19b6dc64a..a851b98143c0 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -174,42 +174,33 @@ xfs_open_zone_mark_full(
WRITE_ONCE(rtg->rtg_open_zone, NULL);
spin_lock(&zi->zi_open_zones_lock);
- if (oz->oz_is_gc) {
- ASSERT(current == zi->zi_gc_thread);
- zi->zi_open_gc_zone = NULL;
- } else {
+ if (oz->oz_is_gc)
+ zi->zi_nr_open_gc_zones--;
+ else
zi->zi_nr_open_zones--;
- list_del_init(&oz->oz_entry);
- }
+ list_del_init(&oz->oz_entry);
spin_unlock(&zi->zi_open_zones_lock);
- xfs_open_zone_put(oz);
- wake_up_all(&zi->zi_zone_wait);
+ if (oz->oz_is_gc)
+ wake_up_process(zi->zi_gc_thread);
+ else
+ wake_up_all(&zi->zi_zone_wait);
+
if (used < rtg_blocks(rtg))
xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
+ xfs_open_zone_put(oz);
}
-static void
-xfs_zone_record_blocks(
- struct xfs_trans *tp,
+static inline void
+xfs_zone_inc_written(
struct xfs_open_zone *oz,
- xfs_fsblock_t fsbno,
xfs_filblks_t len)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_rtgroup *rtg = oz->oz_rtg;
- struct xfs_inode *rmapip = rtg_rmap(rtg);
-
- trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);
+ xfs_assert_ilocked(rtg_rmap(oz->oz_rtg), XFS_ILOCK_EXCL);
- xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
- xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
- rmapip->i_used_blocks += len;
- ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
oz->oz_written += len;
- if (oz->oz_written == rtg_blocks(rtg))
+ if (oz->oz_written == rtg_blocks(oz->oz_rtg))
xfs_open_zone_mark_full(oz);
- xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}
/*
@@ -227,9 +218,7 @@ xfs_zone_skip_blocks(
trace_xfs_zone_skip_blocks(oz, 0, len);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
- oz->oz_written += len;
- if (oz->oz_written == rtg_blocks(rtg))
- xfs_open_zone_mark_full(oz);
+ xfs_zone_inc_written(oz, len);
xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
xfs_add_frextents(rtg_mount(rtg), len);
@@ -244,6 +233,8 @@ xfs_zoned_map_extent(
xfs_fsblock_t old_startblock)
{
struct xfs_bmbt_irec data;
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_inode *rmapip = rtg_rmap(rtg);
int nmaps = 1;
int error;
@@ -302,7 +293,15 @@ xfs_zoned_map_extent(
}
}
- xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
+ trace_xfs_zone_record_blocks(oz,
+ xfs_rtb_to_rgbno(tp->t_mountp, new->br_startblock),
+ new->br_blockcount);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ rmapip->i_used_blocks += new->br_blockcount;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
+ xfs_zone_inc_written(oz, new->br_blockcount);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
@@ -560,6 +559,9 @@ xfs_try_use_zone(
struct xfs_open_zone *oz,
unsigned int goodness)
{
+ if (oz->oz_is_gc)
+ return false;
+
if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
@@ -681,10 +683,11 @@ xfs_select_zone_nowait(
if (oz)
goto out_unlock;
- if (pack_tight)
+ if (pack_tight) {
oz = xfs_select_open_zone_mru(zi, write_hint);
- if (oz)
- goto out_unlock;
+ if (oz)
+ goto out_unlock;
+ }
/*
* See if we can open a new zone and use that so that data for different
@@ -695,7 +698,7 @@ xfs_select_zone_nowait(
goto out_unlock;
/*
- * Try to find an zone that is an ok match to colocate data with.
+ * Try to find a zone that is an ok match to colocate data with.
*/
oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
if (oz)
@@ -1232,6 +1235,100 @@ xfs_free_zone_info(
kfree(zi);
}
+static int
+xfs_report_zones(
+ struct xfs_mount *mp,
+ struct xfs_init_zones *iz)
+{
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_rgblock_t write_pointer;
+ int error;
+
+ error = xfs_query_write_pointer(iz, rtg, &write_pointer);
+ if (!error)
+ error = xfs_init_zone(iz, rtg, write_pointer);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+static inline bool
+xfs_zone_is_conv(
+ struct xfs_rtgroup *rtg)
+{
+ return !bdev_zone_is_seq(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+ xfs_gbno_to_daddr(rtg_group(rtg), 0));
+}
+
+static struct xfs_open_zone *
+xfs_find_fullest_conventional_open_zone(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *found = NULL, *oz;
+
+ spin_lock(&zi->zi_open_zones_lock);
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
+ if (!xfs_zone_is_conv(oz->oz_rtg))
+ continue;
+ if (!found || oz->oz_allocated > found->oz_allocated)
+ found = oz;
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+
+ return found;
+}
+
+/*
+ * Find the fullest conventional zones and remove them from the open zone pool
+ * until we are at the open zone limit.
+ *
+ * We can end up with spurious "open" zones when the last blocks in a fully
+ * written zone were invalidate as there is no write pointer for conventional
+ * zones.
+ *
+ * If we are still over the limit when there is no conventional open zone left,
+ * the user overrode the max open zones limit using the max_open_zones mount
+ * option we should fail.
+ */
+static int
+xfs_finish_spurious_open_zones(
+ struct xfs_mount *mp,
+ struct xfs_init_zones *iz)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ while (zi->zi_nr_open_zones > mp->m_max_open_zones) {
+ struct xfs_open_zone *oz;
+ xfs_filblks_t adjust;
+
+ oz = xfs_find_fullest_conventional_open_zone(mp);
+ if (!oz) {
+ xfs_err(mp,
+"too many open zones for max_open_zones limit (%u/%u)",
+ zi->zi_nr_open_zones, mp->m_max_open_zones);
+ return -EINVAL;
+ }
+
+ xfs_rtgroup_lock(oz->oz_rtg, XFS_RTGLOCK_RMAP);
+ adjust = rtg_blocks(oz->oz_rtg) - oz->oz_written;
+ trace_xfs_zone_spurious_open(oz, oz->oz_written, adjust);
+ oz->oz_written = rtg_blocks(oz->oz_rtg);
+ xfs_open_zone_mark_full(oz);
+ xfs_rtgroup_unlock(oz->oz_rtg, XFS_RTGLOCK_RMAP);
+ iz->available -= adjust;
+ iz->reclaimable += adjust;
+ }
+
+ return 0;
+}
+
int
xfs_mount_zones(
struct xfs_mount *mp)
@@ -1240,7 +1337,6 @@ xfs_mount_zones(
.zone_capacity = mp->m_groups[XG_TYPE_RTG].blocks,
.zone_size = xfs_rtgroup_raw_size(mp),
};
- struct xfs_rtgroup *rtg = NULL;
int error;
if (!mp->m_rtdev_targp) {
@@ -1270,9 +1366,17 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
- xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones);
- trace_xfs_zones_mount(mp);
+ error = xfs_report_zones(mp, &iz);
+ if (error)
+ goto out_free_zone_info;
+
+ error = xfs_finish_spurious_open_zones(mp, &iz);
+ if (error)
+ goto out_free_zone_info;
+
+ xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ iz.available + iz.reclaimable);
/*
* The writeback code switches between inodes regularly to provide
@@ -1298,22 +1402,6 @@ xfs_mount_zones(
XFS_FSB_TO_B(mp, min(iz.zone_capacity, XFS_MAX_BMBT_EXTLEN)) >>
PAGE_SHIFT;
- while ((rtg = xfs_rtgroup_next(mp, rtg))) {
- xfs_rgblock_t write_pointer;
-
- error = xfs_query_write_pointer(&iz, rtg, &write_pointer);
- if (!error)
- error = xfs_init_zone(&iz, rtg, write_pointer);
- if (error) {
- xfs_rtgroup_rele(rtg);
- goto out_free_zone_info;
- }
- }
-
- xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
- xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
- iz.available + iz.reclaimable);
-
/*
* The user may configure GC to free up a percentage of unused blocks.
* By default this is 0. GC will always trigger at the minimum level
@@ -1324,6 +1412,10 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
+
+ xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
+ mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones);
+ trace_xfs_zones_mount(mp);
return 0;
out_free_zone_info:
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
index 4db02816d0fd..8b2ef98c81ef 100644
--- a/fs/xfs/xfs_zone_alloc.h
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -51,6 +51,7 @@ int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
void xfs_zone_gc_start(struct xfs_mount *mp);
void xfs_zone_gc_stop(struct xfs_mount *mp);
+void xfs_zone_gc_wakeup(struct xfs_mount *mp);
#else
static inline int xfs_mount_zones(struct xfs_mount *mp)
{
@@ -65,6 +66,9 @@ static inline void xfs_zone_gc_start(struct xfs_mount *mp)
static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
{
}
+static inline void xfs_zone_gc_wakeup(struct xfs_mount *mp)
+{
+}
#endif /* CONFIG_XFS_RT */
#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index b2626a482563..fedcc47048af 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -125,6 +125,7 @@ struct xfs_zone_gc_iter {
*/
struct xfs_zone_gc_data {
struct xfs_mount *mp;
+ struct xfs_open_zone *oz;
/* bioset used to allocate the gc_bios */
struct bio_set bio_set;
@@ -170,25 +171,37 @@ xfs_zoned_need_gc(
s64 available, free, threshold;
s32 remainder;
+ /* If we have no reclaimable blocks, running GC is useless. */
if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
return false;
+ /*
+ * In order to avoid file fragmentation as much as possible, we should
+ * make sure that we can open enough zones. So trigger GC if the number
+ * of blocks immediately available for writes is lower than the total
+ * number of blocks from all possible open zones.
+ */
available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
-
if (available <
xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
return true;
- free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
+ /*
+ * For cases where the user wants to be more aggressive with GC,
+ * the sysfs attribute zonegc_low_space may be set to a non zero value,
+ * to indicate that GC should try to maintain at least zonegc_low_space
+ * percent of the free space to be directly available for writing. Check
+ * this here.
+ */
+ if (!mp->m_zonegc_low_space)
+ return false;
+ free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
threshold = div_s64_rem(free, 100, &remainder);
threshold = threshold * mp->m_zonegc_low_space +
remainder * div_s64(mp->m_zonegc_low_space, 100);
- if (available < threshold)
- return true;
-
- return false;
+ return available < threshold;
}
static struct xfs_zone_gc_data *
@@ -362,7 +375,7 @@ done:
}
static bool
-xfs_zone_gc_iter_next(
+xfs_zone_gc_iter_irec(
struct xfs_mount *mp,
struct xfs_zone_gc_iter *iter,
struct xfs_rmap_irec *chunk_rec,
@@ -371,9 +384,6 @@ xfs_zone_gc_iter_next(
struct xfs_rmap_irec *irec;
int error;
- if (!iter->victim_rtg)
- return false;
-
retry:
if (iter->rec_idx == iter->rec_count) {
error = xfs_zone_gc_query(mp, iter);
@@ -515,10 +525,11 @@ xfs_zone_gc_select_victim(
return true;
}
-static struct xfs_open_zone *
-xfs_zone_gc_steal_open(
- struct xfs_zone_info *zi)
+static int
+xfs_zone_gc_steal_open_zone(
+ struct xfs_zone_gc_data *data)
{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
struct xfs_open_zone *oz, *found = NULL;
spin_lock(&zi->zi_open_zones_lock);
@@ -526,56 +537,64 @@ xfs_zone_gc_steal_open(
if (!found || oz->oz_allocated < found->oz_allocated)
found = oz;
}
-
- if (found) {
- found->oz_is_gc = true;
- list_del_init(&found->oz_entry);
- zi->zi_nr_open_zones--;
+ if (!found) {
+ spin_unlock(&zi->zi_open_zones_lock);
+ return -EIO;
}
+ trace_xfs_zone_gc_target_stolen(found->oz_rtg);
+ found->oz_is_gc = true;
+ zi->zi_nr_open_zones--;
+ zi->zi_nr_open_gc_zones++;
spin_unlock(&zi->zi_open_zones_lock);
- return found;
+
+ atomic_inc(&found->oz_ref);
+ data->oz = found;
+ return 0;
}
-static struct xfs_open_zone *
+/*
+ * Ensure we have a valid open zone to write to.
+ */
+static bool
xfs_zone_gc_select_target(
- struct xfs_mount *mp)
+ struct xfs_zone_gc_data *data)
{
- struct xfs_zone_info *zi = mp->m_zone_info;
- struct xfs_open_zone *oz = zi->zi_open_gc_zone;
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+
+ if (data->oz) {
+ /*
+ * If we have space available, just keep using the existing
+ * zone.
+ */
+ if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg))
+ return true;
+
+ /*
+ * Wait for all writes to the current zone to finish before
+ * picking a new one.
+ */
+ if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg))
+ return false;
+
+ xfs_open_zone_put(data->oz);
+ }
/*
- * We need to wait for pending writes to finish.
+ * Open a new zone when there is none currently in use.
*/
- if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
- return NULL;
-
ASSERT(zi->zi_nr_open_zones <=
- mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
- oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
- if (oz)
- trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+ data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
+ data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true);
+ if (!data->oz)
+ return false;
+ trace_xfs_zone_gc_target_opened(data->oz->oz_rtg);
+ atomic_inc(&data->oz->oz_ref);
spin_lock(&zi->zi_open_zones_lock);
- zi->zi_open_gc_zone = oz;
+ zi->zi_nr_open_gc_zones++;
+ list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones);
spin_unlock(&zi->zi_open_zones_lock);
- return oz;
-}
-
-/*
- * Ensure we have a valid open zone to write the GC data to.
- *
- * If the current target zone has space keep writing to it, else first wait for
- * all pending writes and then pick a new one.
- */
-static struct xfs_open_zone *
-xfs_zone_gc_ensure_target(
- struct xfs_mount *mp)
-{
- struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
-
- if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
- return xfs_zone_gc_select_target(mp);
- return oz;
+ return true;
}
static void
@@ -590,7 +609,7 @@ xfs_zone_gc_end_io(
wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}
-static struct xfs_open_zone *
+static bool
xfs_zone_gc_alloc_blocks(
struct xfs_zone_gc_data *data,
xfs_extlen_t *count_fsb,
@@ -598,11 +617,7 @@ xfs_zone_gc_alloc_blocks(
bool *is_seq)
{
struct xfs_mount *mp = data->mp;
- struct xfs_open_zone *oz;
-
- oz = xfs_zone_gc_ensure_target(mp);
- if (!oz)
- return NULL;
+ struct xfs_open_zone *oz = data->oz;
*count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));
@@ -624,7 +639,7 @@ xfs_zone_gc_alloc_blocks(
spin_unlock(&mp->m_sb_lock);
if (!*count_fsb)
- return NULL;
+ return false;
*daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
@@ -632,7 +647,7 @@ xfs_zone_gc_alloc_blocks(
*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
oz->oz_allocated += *count_fsb;
atomic_inc(&oz->oz_ref);
- return oz;
+ return true;
}
static void
@@ -658,13 +673,34 @@ xfs_zone_gc_add_data(
}
static bool
+xfs_zone_gc_can_start_chunk(
+ struct xfs_zone_gc_data *data)
+{
+
+ if (xfs_is_shutdown(data->mp))
+ return false;
+ if (!data->scratch_available)
+ return false;
+
+ if (!data->iter.victim_rtg) {
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+ if (!xfs_zoned_need_gc(data->mp))
+ return false;
+ if (!xfs_zone_gc_select_victim(data))
+ return false;
+ }
+
+ return xfs_zone_gc_select_target(data);
+}
+
+static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
{
struct xfs_zone_gc_iter *iter = &data->iter;
struct xfs_mount *mp = data->mp;
struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
- struct xfs_open_zone *oz;
struct xfs_rmap_irec irec;
struct xfs_gc_bio *chunk;
struct xfs_inode *ip;
@@ -672,14 +708,15 @@ xfs_zone_gc_start_chunk(
xfs_daddr_t daddr;
bool is_seq;
- if (xfs_is_shutdown(mp))
+ if (!xfs_zone_gc_can_start_chunk(data))
return false;
- if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+ set_current_state(TASK_RUNNING);
+ if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip))
return false;
- oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
- &is_seq);
- if (!oz) {
+
+ if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
+ &is_seq)) {
xfs_irele(ip);
return false;
}
@@ -699,7 +736,7 @@ xfs_zone_gc_start_chunk(
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
chunk->data = data;
- chunk->oz = oz;
+ chunk->oz = data->oz;
chunk->victim_rtg = iter->victim_rtg;
atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
atomic_inc(&chunk->victim_rtg->rtg_gccount);
@@ -985,33 +1022,6 @@ xfs_zone_gc_reset_zones(
} while (next);
}
-static bool
-xfs_zone_gc_should_start_new_work(
- struct xfs_zone_gc_data *data)
-{
- struct xfs_open_zone *oz;
-
- if (xfs_is_shutdown(data->mp))
- return false;
- if (!data->scratch_available)
- return false;
-
- oz = xfs_zone_gc_ensure_target(data->mp);
- if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
- return false;
-
- if (!data->iter.victim_rtg) {
- if (kthread_should_stop() || kthread_should_park())
- return false;
- if (!xfs_zoned_need_gc(data->mp))
- return false;
- if (!xfs_zone_gc_select_victim(data))
- return false;
- }
-
- return true;
-}
-
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@@ -1061,13 +1071,10 @@ xfs_zone_gc_handle_work(
}
blk_finish_plug(&plug);
- if (xfs_zone_gc_should_start_new_work(data)) {
- set_current_state(TASK_RUNNING);
- blk_start_plug(&plug);
- while (xfs_zone_gc_start_chunk(data))
- ;
- blk_finish_plug(&plug);
- }
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
}
/*
@@ -1127,6 +1134,8 @@ xfs_zoned_gcd(
}
xfs_clear_zonegc_running(mp);
+ if (data->oz)
+ xfs_open_zone_put(data->oz);
if (data->iter.victim_rtg)
xfs_rtgroup_rele(data->iter.victim_rtg);
@@ -1151,41 +1160,49 @@ xfs_zone_gc_stop(
kthread_park(mp->m_zone_info->zi_gc_thread);
}
+void
+xfs_zone_gc_wakeup(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+
+ /*
+ * If we are unmounting the file system we must not try to
+ * wake gc as m_zone_info might have been freed already.
+ */
+ if (down_read_trylock(&sb->s_umount)) {
+ if (!xfs_is_readonly(mp))
+ wake_up_process(mp->m_zone_info->zi_gc_thread);
+ up_read(&sb->s_umount);
+ }
+}
+
int
xfs_zone_gc_mount(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_gc_data *data;
- struct xfs_open_zone *oz;
int error;
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data)
+ return -ENOMEM;
+
/*
- * If there are no free zones available for GC, pick the open zone with
+ * If there are no free zones available for GC, or the number of open
+ * zones has reached the open zone limit, pick the open zone with
* the least used space to GC into. This should only happen after an
- * unclean shutdown near ENOSPC while GC was ongoing.
- *
- * We also need to do this for the first gc zone allocation if we
- * unmounted while at the open limit.
+ * unclean shutdown while GC was ongoing. Otherwise a GC zone will
+ * be selected from the free zone pool on demand.
*/
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
- zi->zi_nr_open_zones == mp->m_max_open_zones)
- oz = xfs_zone_gc_steal_open(zi);
- else
- oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
- if (!oz) {
- xfs_warn(mp, "unable to allocate a zone for gc");
- error = -EIO;
- goto out;
- }
-
- trace_xfs_zone_gc_target_opened(oz->oz_rtg);
- zi->zi_open_gc_zone = oz;
-
- data = xfs_zone_gc_data_alloc(mp);
- if (!data) {
- error = -ENOMEM;
- goto out_put_gc_zone;
+ zi->zi_nr_open_zones >= mp->m_max_open_zones) {
+ error = xfs_zone_gc_steal_open_zone(data);
+ if (error) {
+ xfs_warn(mp, "unable to steal an open zone for gc");
+ goto out_free_gc_data;
+ }
}
zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
@@ -1193,18 +1210,18 @@ xfs_zone_gc_mount(
if (IS_ERR(zi->zi_gc_thread)) {
xfs_warn(mp, "unable to create zone gc thread");
error = PTR_ERR(zi->zi_gc_thread);
- goto out_free_gc_data;
+ goto out_put_oz;
}
/* xfs_zone_gc_start will unpark for rw mounts */
kthread_park(zi->zi_gc_thread);
return 0;
+out_put_oz:
+ if (data->oz)
+ xfs_open_zone_put(data->oz);
out_free_gc_data:
kfree(data);
-out_put_gc_zone:
- xfs_open_zone_put(zi->zi_open_gc_zone);
-out:
return error;
}
@@ -1215,6 +1232,4 @@ xfs_zone_gc_unmount(
struct xfs_zone_info *zi = mp->m_zone_info;
kthread_stop(zi->zi_gc_thread);
- if (zi->zi_open_gc_zone)
- xfs_open_zone_put(zi->zi_open_gc_zone);
}
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
index 53eabbc3334c..47b475e21af8 100644
--- a/fs/xfs/xfs_zone_info.c
+++ b/fs/xfs/xfs_zone_info.c
@@ -30,11 +30,12 @@ xfs_show_open_zone(
struct seq_file *m,
struct xfs_open_zone *oz)
{
- seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
+ seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s %s\n",
rtg_rgno(oz->oz_rtg),
oz->oz_allocated, oz->oz_written,
rtg_rmap(oz->oz_rtg)->i_used_blocks,
- xfs_write_hint_to_str(oz->oz_write_hint));
+ xfs_write_hint_to_str(oz->oz_write_hint),
+ oz->oz_is_gc ? "(GC)" : "");
}
static void
@@ -58,9 +59,8 @@ xfs_show_full_zone_used_distribution(
spin_unlock(&zi->zi_used_buckets_lock);
full = mp->m_sb.sb_rgcount;
- if (zi->zi_open_gc_zone)
- full--;
full -= zi->zi_nr_open_zones;
+ full -= zi->zi_nr_open_gc_zones;
full -= atomic_read(&zi->zi_nr_free_zones);
full -= reclaimable;
@@ -90,15 +90,20 @@ xfs_zoned_show_stats(
seq_printf(m, "\tRT GC required: %d\n",
xfs_zoned_need_gc(mp));
+ seq_printf(m, "\ttotal number of zones: %u\n",
+ mp->m_sb.sb_rgcount);
seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
- seq_puts(m, "\topen zones:\n");
+
spin_lock(&zi->zi_open_zones_lock);
+ seq_printf(m, "\tmax open zones: %u\n",
+ mp->m_max_open_zones);
+ seq_printf(m, "\tnr open zones: %u\n",
+ zi->zi_nr_open_zones);
+ seq_printf(m, "\tnr open GC zones: %u\n",
+ zi->zi_nr_open_gc_zones);
+ seq_puts(m, "\topen zones:\n");
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
xfs_show_open_zone(m, oz);
- if (zi->zi_open_gc_zone) {
- seq_puts(m, "\topen gc zone:\n");
- xfs_show_open_zone(m, zi->zi_open_gc_zone);
- }
spin_unlock(&zi->zi_open_zones_lock);
seq_puts(m, "\tused blocks distribution (fully written zones):\n");
xfs_show_full_zone_used_distribution(m, mp);
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 8fbf9a52964e..fcb57506d8e6 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -32,11 +32,7 @@ struct xfs_open_zone {
*/
enum rw_hint oz_write_hint;
- /*
- * Is this open zone used for garbage collection? There can only be a
- * single open GC zone, which is pointed to by zi_open_gc_zone in
- * struct xfs_zone_info. Constant over the life time of an open zone.
- */
+ /* Is this open zone used for garbage collection? */
bool oz_is_gc;
/*
@@ -68,6 +64,7 @@ struct xfs_zone_info {
spinlock_t zi_open_zones_lock;
struct list_head zi_open_zones;
unsigned int zi_nr_open_zones;
+ unsigned int zi_nr_open_gc_zones;
/*
* Free zone search cursor and number of free zones:
@@ -81,15 +78,9 @@ struct xfs_zone_info {
wait_queue_head_t zi_zone_wait;
/*
- * Pointer to the GC thread, and the current open zone used by GC
- * (if any).
- *
- * zi_open_gc_zone is mostly private to the GC thread, but can be read
- * for debugging from other threads, in which case zi_open_zones_lock
- * must be taken to access it.
+ * Pointer to the GC thread.
*/
struct task_struct *zi_gc_thread;
- struct xfs_open_zone *zi_open_gc_zone;
/*
* List of zones that need a reset: