author     Jens Axboe <axboe@kernel.dk>  2025-11-05 08:07:41 -0700
committer  Jens Axboe <axboe@kernel.dk>  2025-11-05 08:07:41 -0700
commit     55de535e0769cbc61c2ac415673365ee0efa96e0 (patch)
tree       001fe4ec17675dd053e75dc151618774bbb99691
parent     f68ff6bc0d0893d959aac39c662895096d866c84 (diff)
parent     e04ccfc28252f181ea8d469d834b48e7dece65b2 (diff)
Merge branch 'cached-zones' into for-6.19/block
This patch series implements cached report zones using information from the block layer zone write plugs and new zone condition tracking. This avoids having to execute slow report zones commands on the device when, for instance, mounting file systems, which can significantly speed things up, especially in setups with multiple SMR HDDs (e.g. a BTRFS RAID volume).

The first patch improves the handling of zone management commands. Patch 2 fixes zone resource updates and the following 3 patches clean up the zone code in preparation for introducing cached zone report support. Patches 6 to 13 implement cached report zones and make it available to users with a new ioctl() command. Finally, patches 14 and 15 introduce the use of cached report zones in the mount operation of XFS and BTRFS.

Link: https://lore.kernel.org/linux-block/20251104212249.1075412-1-dlemoal@kernel.org/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* cached-zones:
  xfs: use blkdev_report_zones_cached()
  btrfs: use blkdev_report_zones_cached()
  block: add zone write plug condition to debugfs zone_wplugs
  block: improve zone_wplugs debugfs attribute output
  block: introduce BLKREPORTZONESV2 ioctl
  block: introduce blkdev_report_zones_cached()
  block: introduce blkdev_get_zone_info()
  block: refactor blkdev_report_zones() code
  block: track zone conditions
  block: use zone condition to determine conventional zones
  block: reorganize struct blk_zone_wplug
  block: introduce disk_report_zone()
  block: cleanup blkdev_report_zones()
  block: freeze queue when updating zone resources
  block: handle zone management operations completions
-rw-r--r--  block/blk-zoned.c                    798
-rw-r--r--  block/blk.h                           14
-rw-r--r--  block/ioctl.c                          1
-rw-r--r--  drivers/block/null_blk/null_blk.h      3
-rw-r--r--  drivers/block/null_blk/zoned.c         4
-rw-r--r--  drivers/block/ublk_drv.c               4
-rw-r--r--  drivers/block/virtio_blk.c            11
-rw-r--r--  drivers/block/zloop.c                  4
-rw-r--r--  drivers/md/dm-zone.c                  54
-rw-r--r--  drivers/md/dm.h                        3
-rw-r--r--  drivers/nvme/host/core.c               5
-rw-r--r--  drivers/nvme/host/multipath.c          4
-rw-r--r--  drivers/nvme/host/nvme.h               2
-rw-r--r--  drivers/nvme/host/zns.c               10
-rw-r--r--  drivers/scsi/sd.h                      2
-rw-r--r--  drivers/scsi/sd_zbc.c                 20
-rw-r--r--  fs/btrfs/zoned.c                      11
-rw-r--r--  fs/xfs/libxfs/xfs_zones.c              1
-rw-r--r--  fs/xfs/xfs_zone_alloc.c                2
-rw-r--r--  include/linux/blkdev.h                49
-rw-r--r--  include/linux/device-mapper.h         10
-rw-r--r--  include/uapi/linux/blkzoned.h         46
-rw-r--r--  include/uapi/linux/fs.h                2
23 files changed, 784 insertions, 276 deletions
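
For illustration, a minimal userspace sketch of the new cached zone report ioctl added by this series. It assumes the BLKREPORTZONEV2 ioctl and struct blk_zone_report definitions from this series are available through <linux/blkzoned.h>; the report size of 16 zones is arbitrary and error handling is kept to a minimum.

/*
 * Sketch only: issue a cached zone report with the new BLKREPORTZONEV2
 * ioctl and print the start, write pointer and condition of each zone.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
	unsigned int nr_zones = 16;	/* arbitrary report size */
	struct blk_zone_report *rep;
	unsigned int i;
	int fd, ret;

	if (argc != 2) {
		fprintf(stderr, "Usage: %s <zoned block device>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	rep = calloc(1, sizeof(*rep) + nr_zones * sizeof(struct blk_zone));
	if (!rep) {
		close(fd);
		return 1;
	}

	rep->sector = 0;		/* report from the first zone */
	rep->nr_zones = nr_zones;
	rep->flags = 0;			/* no optional input flags */

	ret = ioctl(fd, BLKREPORTZONEV2, rep);
	if (ret)
		perror("BLKREPORTZONEV2");
	else
		for (i = 0; i < rep->nr_zones; i++)
			printf("zone %u: start %llu wp %llu cond 0x%x\n", i,
			       (unsigned long long)rep->zones[i].start,
			       (unsigned long long)rep->zones[i].wp,
			       rep->zones[i].cond);

	free(rep);
	close(fd);
	return ret ? 1 : 0;
}

As with the existing BLKREPORTZONE ioctl, the kernel updates rep->nr_zones to the number of zones actually reported.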
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 5e2a5788dc3b..bba64b427082 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -33,12 +33,18 @@ static const char *const zone_cond_name[] = {
ZONE_COND_NAME(READONLY),
ZONE_COND_NAME(FULL),
ZONE_COND_NAME(OFFLINE),
+ ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME
/*
* Per-zone write plug.
* @node: hlist_node structure for managing the plug using a hash table.
+ * @bio_list: The list of BIOs that are currently plugged.
+ * @bio_work: Work struct to handle issuing of plugged BIOs
+ * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
+ * @lock: Spinlock to atomically manipulate the plug.
* @ref: Zone write plug reference counter. A zone write plug reference is
* always at least 1 when the plug is hashed in the disk plug hash table.
* The reference is incremented whenever a new BIO needing plugging is
@@ -48,29 +54,44 @@ static const char *const zone_cond_name[] = {
* reference is dropped whenever the zone of the zone write plug is reset,
* finished and when the zone becomes full (last write BIO to the zone
* completes).
- * @lock: Spinlock to atomically manipulate the plug.
* @flags: Flags indicating the plug state.
* @zone_no: The number of the zone the plug is managing.
* @wp_offset: The zone write pointer location relative to the start of the zone
* as a number of 512B sectors.
- * @bio_list: The list of BIOs that are currently plugged.
- * @bio_work: Work struct to handle issuing of plugged BIOs
- * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
- * @disk: The gendisk the plug belongs to.
+ * @cond: Condition of the zone
*/
struct blk_zone_wplug {
struct hlist_node node;
- refcount_t ref;
- spinlock_t lock;
- unsigned int flags;
- unsigned int zone_no;
- unsigned int wp_offset;
struct bio_list bio_list;
struct work_struct bio_work;
struct rcu_head rcu_head;
struct gendisk *disk;
+ spinlock_t lock;
+ refcount_t ref;
+ unsigned int flags;
+ unsigned int zone_no;
+ unsigned int wp_offset;
+ enum blk_zone_cond cond;
};
+static inline bool disk_need_zone_resources(struct gendisk *disk)
+{
+ /*
+ * All request-based zoned devices need zone resources so that the
+ * block layer can automatically handle write BIO plugging. BIO-based
+ * device drivers (e.g. DM devices) are normally responsible for
+ * handling zone write ordering and do not need zone resources, unless
+ * the driver requires zone append emulation.
+ */
+ return queue_is_mq(disk->queue) ||
+ queue_emulates_zone_append(disk->queue);
+}
+
+static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
+{
+ return 1U << disk->zone_wplugs_hash_bits;
+}
+
/*
* Zone write plug flags bits:
* - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
@@ -109,28 +130,108 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
-struct disk_report_zones_cb_args {
- struct gendisk *disk;
- report_zones_cb user_cb;
- void *user_data;
-};
+static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
+ enum blk_zone_cond cond)
+{
+ if (!zones_cond)
+ return;
-static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
- struct blk_zone *zone);
+ switch (cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
+ return;
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ zones_cond[zno] = cond;
+ return;
+ }
+}
-static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
+static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
+ enum blk_zone_cond cond)
{
- struct disk_report_zones_cb_args *args = data;
- struct gendisk *disk = args->disk;
+ u8 *zones_cond;
- if (disk->zone_wplugs_hash)
- disk_zone_wplug_sync_wp_offset(disk, zone);
+ rcu_read_lock();
+ zones_cond = rcu_dereference(disk->zones_cond);
+ if (zones_cond) {
+ unsigned int zno = disk_zone_no(disk, sector);
+
+ /*
+ * The condition of conventional, readonly and offline zones
+ * never changes, so do nothing if the target zone is in one of
+ * these conditions.
+ */
+ switch (zones_cond[zno]) {
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_READONLY:
+ case BLK_ZONE_COND_OFFLINE:
+ break;
+ default:
+ blk_zone_set_cond(zones_cond, zno, cond);
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
+ * @bdev: block device to check
+ * @sector: sector number
+ *
+ * Check if @sector on @bdev is contained in a sequential write required zone.
+ */
+bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ unsigned int zno = disk_zone_no(disk, sector);
+ bool is_seq = false;
+ u8 *zones_cond;
+
+ if (!bdev_is_zoned(bdev))
+ return false;
+
+ rcu_read_lock();
+ zones_cond = rcu_dereference(disk->zones_cond);
+ if (zones_cond && zno < disk->nr_zones)
+ is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
+ rcu_read_unlock();
+
+ return is_seq;
+}
+EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
+
+/*
+ * Zone report arguments for a block device driver's report_zones operation.
+ * @cb: report_zones_cb callback for each reported zone.
+ * @data: Private data passed to report_zones_cb.
+ * @report_active: If true, collapse the implicit open, explicit open and
+ *    closed zone conditions into BLK_ZONE_COND_ACTIVE.
+ */
+struct blk_report_zones_args {
+ report_zones_cb cb;
+ void *data;
+ bool report_active;
+};
+
+static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
+{
+ struct gendisk *disk = bdev->bd_disk;
+
+ if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
+ return -EOPNOTSUPP;
- if (!args->user_cb)
+ if (!nr_zones || sector >= get_capacity(disk))
return 0;
- return args->user_cb(zone, idx, args->user_data);
+ return disk->fops->report_zones(disk, sector, nr_zones, args);
}
/**
@@ -155,22 +256,12 @@ static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
{
- struct gendisk *disk = bdev->bd_disk;
- sector_t capacity = get_capacity(disk);
- struct disk_report_zones_cb_args args = {
- .disk = disk,
- .user_cb = cb,
- .user_data = data,
+ struct blk_report_zones_args args = {
+ .cb = cb,
+ .data = data,
};
- if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
- return -EOPNOTSUPP;
-
- if (!nr_zones || sector >= capacity)
- return 0;
-
- return disk->fops->report_zones(disk, sector, nr_zones,
- disk_report_zones_cb, &args);
+ return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
@@ -266,7 +357,12 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
}
/*
- * BLKREPORTZONE ioctl processing.
+ * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
+ */
+#define BLK_ZONE_REPV2_INPUT_FLAGS BLK_ZONE_REP_CACHED
+
+/*
+ * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
* Called from blkdev_ioctl.
*/
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
@@ -290,8 +386,22 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
return -EINVAL;
args.zones = argp + sizeof(struct blk_zone_report);
- ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
- blkdev_copy_zone_to_user, &args);
+
+ switch (cmd) {
+ case BLKREPORTZONE:
+ ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
+ blkdev_copy_zone_to_user, &args);
+ break;
+ case BLKREPORTZONEV2:
+ if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
+ return -EINVAL;
+ ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
+ blkdev_copy_zone_to_user, &args);
+ break;
+ default:
+ return -EINVAL;
+ }
+
if (ret < 0)
return ret;
@@ -401,6 +511,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
{
struct blk_zone_wplug *zwplg;
unsigned long flags;
+ u8 *zones_cond;
unsigned int idx =
hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
@@ -416,6 +527,20 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
return false;
}
}
+
+ /*
+ * Set the zone condition: if we do not yet have a zones_cond array
+ * attached to the disk, then this is a zone write plug insert from the
+ * first call to blk_revalidate_disk_zones(), in which case the zone is
+ * necessarily in the active condition.
+ */
+ zones_cond = rcu_dereference_check(disk->zones_cond,
+ lockdep_is_held(&disk->zone_wplugs_lock));
+ if (zones_cond)
+ zwplug->cond = zones_cond[zwplug->zone_no];
+ else
+ zwplug->cond = BLK_ZONE_COND_ACTIVE;
+
hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
atomic_inc(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
@@ -515,10 +640,15 @@ static void disk_remove_zone_wplug(struct gendisk *disk,
/*
* Mark the zone write plug as unhashed and drop the extra reference we
- * took when the plug was inserted in the hash table.
+ * took when the plug was inserted in the hash table. Also update the
+ * disk zone condition array with the current condition of the zone
+ * write plug.
*/
zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
+ lockdep_is_held(&disk->zone_wplugs_lock)),
+ zwplug->zone_no, zwplug->cond);
hlist_del_init_rcu(&zwplug->node);
atomic_dec(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
@@ -621,6 +751,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
}
/*
+ * Update a zone write plug condition based on the write pointer offset.
+ */
+static void disk_zone_wplug_update_cond(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ lockdep_assert_held(&zwplug->lock);
+
+ if (disk_zone_wplug_is_full(disk, zwplug))
+ zwplug->cond = BLK_ZONE_COND_FULL;
+ else if (!zwplug->wp_offset)
+ zwplug->cond = BLK_ZONE_COND_EMPTY;
+ else
+ zwplug->cond = BLK_ZONE_COND_ACTIVE;
+}
+
+/*
* Set a zone write plug write pointer offset to the specified value.
* This aborts all plugged BIOs, which is fine as this function is called for
* a zone reset operation, a zone finish operation or if the zone needs a wp
@@ -635,6 +781,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
/* Update the zone write pointer and abort all plugged BIOs. */
zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
zwplug->wp_offset = wp_offset;
+ disk_zone_wplug_update_cond(disk, zwplug);
+
disk_zone_wplug_abort(zwplug);
/*
@@ -688,81 +836,333 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
disk_put_zone_wplug(zwplug);
}
-static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
+/**
+ * disk_report_zone - Report one zone
+ * @disk: Target disk
+ * @zone: The zone to report
+ * @idx: The index of the zone in the overall zone report
+ * @args: report zones callback and data
+ *
+ * Description:
+ * Helper function for block device drivers to report one zone of a zone
+ * report initiated with blkdev_report_zones(). The zone being reported is
+ * specified by @zone and used to update, if necessary, the zone write plug
+ * information for the zone. If @args specifies a user callback function,
+ * this callback is executed.
+ */
+int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
+ unsigned int idx, struct blk_report_zones_args *args)
+{
+ if (args->report_active) {
+ /*
+ * If we come here, then this is a report zones as a fallback
+ * for a cached report. So collapse the implicit open, explicit
+ * open and closed conditions into the active zone condition.
+ */
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ zone->cond = BLK_ZONE_COND_ACTIVE;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (disk->zone_wplugs_hash)
+ disk_zone_wplug_sync_wp_offset(disk, zone);
+
+ if (args && args->cb)
+ return args->cb(zone, idx, args->data);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(disk_report_zone);
+
+static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ memcpy(data, zone, sizeof(struct blk_zone));
+ return 0;
+}
+
+static int blkdev_report_zone_fallback(struct block_device *bdev,
+ sector_t sector, struct blk_zone *zone)
{
- struct disk_report_zones_cb_args args = {
- .disk = disk,
+ struct blk_report_zones_args args = {
+ .cb = blkdev_report_zone_cb,
+ .data = zone,
+ .report_active = true,
};
- return disk->fops->report_zones(disk, sector, 1,
- disk_report_zones_cb, &args);
+ return blkdev_do_report_zones(bdev, sector, 1, &args);
}
-static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
- unsigned int wp_offset)
+/**
+ * blkdev_get_zone_info - Get a single zone information from cached data
+ * @bdev: Target block device
+ * @sector: Sector contained by the target zone
+ * @zone: zone structure to return the zone information
+ *
+ * Description:
+ * Get the zone information for the zone containing @sector using the zone
+ *    write plug of the target zone, if one exists, or the disk zone condition
+ *    array otherwise. The zone condition may be reported as
+ *    BLK_ZONE_COND_ACTIVE for a zone that is in the implicit open,
+ *    explicit open or closed condition.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
+ struct blk_zone *zone)
{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- sector_t sector = bio->bi_iter.bi_sector;
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
struct blk_zone_wplug *zwplug;
unsigned long flags;
+ u8 *zones_cond;
- /* Conventional zones cannot be reset nor finished. */
- if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
- bio_io_error(bio);
- return true;
+ if (!bdev_is_zoned(bdev))
+ return -EOPNOTSUPP;
+
+ if (sector >= get_capacity(disk))
+ return -EINVAL;
+
+ memset(zone, 0, sizeof(*zone));
+ sector = ALIGN_DOWN(sector, zone_sectors);
+
+ rcu_read_lock();
+ zones_cond = rcu_dereference(disk->zones_cond);
+ if (!disk->zone_wplugs_hash || !zones_cond) {
+ rcu_read_unlock();
+ return blkdev_report_zone_fallback(bdev, sector, zone);
}
+ zone->cond = zones_cond[disk_zone_no(disk, sector)];
+ rcu_read_unlock();
+
+ zone->start = sector;
+ zone->len = zone_sectors;
/*
- * No-wait reset or finish BIOs do not make much sense as the callers
- * issue these as blocking operations in most cases. To avoid issues
- * the BIO execution potentially failing with BLK_STS_AGAIN, warn about
- * REQ_NOWAIT being set and ignore that flag.
+ * If this is a conventional zone, we do not have a zone write plug and
+ * can report the zone immediately.
*/
- if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
- bio->bi_opf &= ~REQ_NOWAIT;
+ if (zone->cond == BLK_ZONE_COND_NOT_WP) {
+ zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone->capacity = zone_sectors;
+ zone->wp = ULLONG_MAX;
+ return 0;
+ }
+
+ /*
+ * This is a sequential write required zone. If the zone is read-only or
+ * offline, only set the zone write pointer to an invalid value and
+ * report the zone.
+ */
+ zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ if (disk_zone_is_last(disk, zone))
+ zone->capacity = disk->last_zone_capacity;
+ else
+ zone->capacity = disk->zone_capacity;
+
+ if (zone->cond == BLK_ZONE_COND_READONLY ||
+ zone->cond == BLK_ZONE_COND_OFFLINE) {
+ zone->wp = ULLONG_MAX;
+ return 0;
+ }
/*
- * If we have a zone write plug, set its write pointer offset to 0
- * (reset case) or to the zone size (finish case). This will abort all
- * BIOs plugged for the target zone. It is fine as resetting or
- * finishing zones while writes are still in-flight will result in the
+ * If the zone does not have a zone write plug, it is either full or
+ * empty, as we otherwise would have a zone write plug for it. In this
+ * case, set the write pointer accordingly and report the zone.
+ * Otherwise, if we have a zone write plug, use it.
+ */
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (!zwplug) {
+ if (zone->cond == BLK_ZONE_COND_FULL)
+ zone->wp = ULLONG_MAX;
+ else
+ zone->wp = sector;
+ return 0;
+ }
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+ return blkdev_report_zone_fallback(bdev, sector, zone);
+ }
+ zone->cond = zwplug->cond;
+ zone->wp = sector + zwplug->wp_offset;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ disk_put_zone_wplug(zwplug);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
+
+/**
+ * blkdev_report_zones_cached - Get cached zones information
+ * @bdev: Target block device
+ * @sector: Sector from which to report zones
+ * @nr_zones: Maximum number of zones to report
+ * @cb: Callback function called for each reported zone
+ * @data: Private data for the callback function
+ *
+ * Description:
+ * Similar to blkdev_report_zones() but instead of calling into the low level
+ * device driver to get the zone report from the device, use
+ * blkdev_get_zone_info() to generate the report from the disk zone write
+ * plugs and zones condition array. Since calling this function without a
+ * callback does not make sense, @cb must be specified.
+ */
+int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t capacity = get_capacity(disk);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
+ unsigned int idx = 0;
+ struct blk_zone zone;
+ int ret;
+
+ if (!cb || !bdev_is_zoned(bdev) ||
+ WARN_ON_ONCE(!disk->fops->report_zones))
+ return -EOPNOTSUPP;
+
+ if (!nr_zones || sector >= capacity)
+ return 0;
+
+ /*
+ * If we do not have any zone write plug resources, fallback to using
+ * the regular zone report.
+ */
+ if (!disk_need_zone_resources(disk)) {
+ struct blk_report_zones_args args = {
+ .cb = cb,
+ .data = data,
+ .report_active = true,
+ };
+
+ return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
+ }
+
+ for (sector = ALIGN_DOWN(sector, zone_sectors);
+ sector < capacity && idx < nr_zones;
+ sector += zone_sectors, idx++) {
+ ret = blkdev_get_zone_info(bdev, sector, &zone);
+ if (ret)
+ return ret;
+
+ ret = cb(&zone, idx, data);
+ if (ret)
+ return ret;
+ }
+
+ return idx;
+}
+EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
+
+static void blk_zone_reset_bio_endio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+
+ /*
+ * If we have a zone write plug, set its write pointer offset to 0.
+ * This will abort all BIOs plugged for the target zone. It is fine as
+ * resetting zones while writes are still in-flight will result in the
* writes failing anyway.
*/
zwplug = disk_get_zone_wplug(disk, sector);
if (zwplug) {
+ unsigned long flags;
+
spin_lock_irqsave(&zwplug->lock, flags);
- disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
+ disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
+ } else {
+ disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
}
-
- return false;
}
-static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
+static void blk_zone_reset_all_bio_endio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t capacity = get_capacity(disk);
struct blk_zone_wplug *zwplug;
unsigned long flags;
sector_t sector;
+ unsigned int i;
- /*
- * Set the write pointer offset of all zone write plugs to 0. This will
- * abort all plugged BIOs. It is fine as resetting zones while writes
- * are still in-flight will result in the writes failing anyway.
- */
- for (sector = 0; sector < get_capacity(disk);
- sector += disk->queue->limits.chunk_sectors) {
- zwplug = disk_get_zone_wplug(disk, sector);
- if (zwplug) {
+ /* Update the condition of all zone write plugs. */
+ rcu_read_lock();
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
+ node) {
spin_lock_irqsave(&zwplug->lock, flags);
disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
spin_unlock_irqrestore(&zwplug->lock, flags);
- disk_put_zone_wplug(zwplug);
}
}
+ rcu_read_unlock();
- return false;
+ /* Update the cached zone conditions. */
+ for (sector = 0; sector < capacity;
+ sector += bdev_zone_sectors(bio->bi_bdev))
+ disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
+}
+
+static void blk_zone_finish_bio_endio(struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+
+ /*
+ * If we have a zone write plug, set its write pointer offset to the
+ * zone size. This will abort all BIOs plugged for the target zone. It
+ * is fine as finishing zones while writes are still in-flight will
+ * result in the writes failing anyway.
+ */
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_set_wp_offset(disk, zwplug,
+ bdev_zone_sectors(bdev));
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+ } else {
+ disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
+ }
+}
+
+void blk_zone_mgmt_bio_endio(struct bio *bio)
+{
+ /* If the BIO failed, we have nothing to do. */
+ if (bio->bi_status != BLK_STS_OK)
+ return;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_RESET:
+ blk_zone_reset_bio_endio(bio);
+ return;
+ case REQ_OP_ZONE_RESET_ALL:
+ blk_zone_reset_all_bio_endio(bio);
+ return;
+ case REQ_OP_ZONE_FINISH:
+ blk_zone_finish_bio_endio(bio);
+ return;
+ default:
+ return;
+ }
}
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
@@ -836,6 +1236,7 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
*/
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
struct blk_zone_wplug *zwplug;
unsigned long flags;
@@ -857,13 +1258,13 @@ void blk_zone_write_plug_bio_merged(struct bio *bio)
* have at least one request and one BIO referencing the zone write
* plug. So this should not fail.
*/
- zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
- bio->bi_iter.bi_sector);
+ zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
if (WARN_ON_ONCE(!zwplug))
return;
spin_lock_irqsave(&zwplug->lock, flags);
zwplug->wp_offset += bio_sectors(bio);
+ disk_zone_wplug_update_cond(disk, zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
}
@@ -922,6 +1323,7 @@ void blk_zone_write_plug_init_request(struct request *req)
/* Drop the reference taken by disk_zone_wplug_add_bio(). */
blk_queue_exit(q);
zwplug->wp_offset += bio_sectors(bio);
+ disk_zone_wplug_update_cond(disk, zwplug);
req_back_sector += bio_sectors(bio);
}
@@ -985,6 +1387,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
/* Advance the zone write pointer offset. */
zwplug->wp_offset += bio_sectors(bio);
+ disk_zone_wplug_update_cond(disk, zwplug);
return true;
}
@@ -1106,6 +1509,30 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
disk_put_zone_wplug(zwplug);
}
+static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
+{
+ if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
+ !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+ /*
+ * Zone reset and zone finish operations do not apply to
+ * conventional zones.
+ */
+ bio_io_error(bio);
+ return true;
+ }
+
+ /*
+ * No-wait zone management BIOs do not make much sense as the callers
+ * issue these as blocking operations in most cases. To avoid issues
+ * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
+ * about REQ_NOWAIT being set and ignore that flag.
+ */
+ if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
+ bio->bi_opf &= ~REQ_NOWAIT;
+
+ return false;
+}
+
/**
* blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
* @bio: The BIO being submitted
@@ -1153,12 +1580,9 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
case REQ_OP_WRITE_ZEROES:
return blk_zone_wplug_handle_write(bio, nr_segs);
case REQ_OP_ZONE_RESET:
- return blk_zone_wplug_handle_reset_or_finish(bio, 0);
case REQ_OP_ZONE_FINISH:
- return blk_zone_wplug_handle_reset_or_finish(bio,
- bdev_zone_sectors(bdev));
case REQ_OP_ZONE_RESET_ALL:
- return blk_zone_wplug_handle_reset_all(bio);
+ return blk_zone_wplug_handle_zone_mgmt(bio);
default:
return false;
}
@@ -1332,11 +1756,6 @@ put_zwplug:
disk_put_zone_wplug(zwplug);
}
-static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
-{
- return 1U << disk->zone_wplugs_hash_bits;
-}
-
void disk_init_zone_resources(struct gendisk *disk)
{
spin_lock_init(&disk->zone_wplugs_lock);
@@ -1417,22 +1836,16 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
disk->zone_wplugs_hash_bits = 0;
}
-static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
- unsigned long *bitmap)
+static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
{
- unsigned int nr_conv_zones = 0;
unsigned long flags;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
- if (bitmap)
- nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
- bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
- lockdep_is_held(&disk->zone_wplugs_lock));
+ zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
+ lockdep_is_held(&disk->zone_wplugs_lock));
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
- kfree_rcu_mightsleep(bitmap);
-
- return nr_conv_zones;
+ kfree_rcu_mightsleep(zones_cond);
}
void disk_free_zone_resources(struct gendisk *disk)
@@ -1456,31 +1869,37 @@ void disk_free_zone_resources(struct gendisk *disk)
mempool_destroy(disk->zone_wplugs_pool);
disk->zone_wplugs_pool = NULL;
- disk_set_conv_zones_bitmap(disk, NULL);
+ disk_set_zones_cond_array(disk, NULL);
disk->zone_capacity = 0;
disk->last_zone_capacity = 0;
disk->nr_zones = 0;
}
-static inline bool disk_need_zone_resources(struct gendisk *disk)
-{
- /*
- * All mq zoned devices need zone resources so that the block layer
- * can automatically handle write BIO plugging. BIO-based device drivers
- * (e.g. DM devices) are normally responsible for handling zone write
- * ordering and do not need zone resources, unless the driver requires
- * zone append emulation.
- */
- return queue_is_mq(disk->queue) ||
- queue_emulates_zone_append(disk->queue);
-}
+struct blk_revalidate_zone_args {
+ struct gendisk *disk;
+ u8 *zones_cond;
+ unsigned int nr_zones;
+ unsigned int nr_conv_zones;
+ unsigned int zone_capacity;
+ unsigned int last_zone_capacity;
+ sector_t sector;
+};
static int disk_revalidate_zone_resources(struct gendisk *disk,
- unsigned int nr_zones)
+ struct blk_revalidate_zone_args *args)
{
struct queue_limits *lim = &disk->queue->limits;
unsigned int pool_size;
+ args->disk = disk;
+ args->nr_zones =
+ DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
+
+ /* Cached zone conditions: 1 byte per zone */
+ args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
+ if (!args->zones_cond)
+ return -ENOMEM;
+
if (!disk_need_zone_resources(disk))
return 0;
@@ -1490,7 +1909,8 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
*/
pool_size = max(lim->max_open_zones, lim->max_active_zones);
if (!pool_size)
- pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+ pool_size =
+ min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
if (!disk->zone_wplugs_hash)
return disk_alloc_zone_resources(disk, pool_size);
@@ -1498,15 +1918,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
return 0;
}
-struct blk_revalidate_zone_args {
- struct gendisk *disk;
- unsigned long *conv_zones_bitmap;
- unsigned int nr_zones;
- unsigned int zone_capacity;
- unsigned int last_zone_capacity;
- sector_t sector;
-};
-
/*
* Update the disk zone resources information and device queue limits.
* The disk queue is frozen when this is executed.
@@ -1515,30 +1926,34 @@ static int disk_update_zone_resources(struct gendisk *disk,
struct blk_revalidate_zone_args *args)
{
struct request_queue *q = disk->queue;
- unsigned int nr_seq_zones, nr_conv_zones;
- unsigned int pool_size;
+ unsigned int nr_seq_zones;
+ unsigned int pool_size, memflags;
struct queue_limits lim;
+ int ret = 0;
+
+ lim = queue_limits_start_update(q);
+
+ memflags = blk_mq_freeze_queue(q);
disk->nr_zones = args->nr_zones;
- disk->zone_capacity = args->zone_capacity;
- disk->last_zone_capacity = args->last_zone_capacity;
- nr_conv_zones =
- disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
- if (nr_conv_zones >= disk->nr_zones) {
+ if (args->nr_conv_zones >= disk->nr_zones) {
pr_warn("%s: Invalid number of conventional zones %u / %u\n",
- disk->disk_name, nr_conv_zones, disk->nr_zones);
- return -ENODEV;
+ disk->disk_name, args->nr_conv_zones, disk->nr_zones);
+ ret = -ENODEV;
+ goto unfreeze;
}
- lim = queue_limits_start_update(q);
+ disk->zone_capacity = args->zone_capacity;
+ disk->last_zone_capacity = args->last_zone_capacity;
+ disk_set_zones_cond_array(disk, args->zones_cond);
/*
- * Some devices can advertize zone resource limits that are larger than
+ * Some devices can advertise zone resource limits that are larger than
* the number of sequential zones of the zoned block device, e.g. a
* small ZNS namespace. For such case, assume that the zoned device has
* no zone resource limits.
*/
- nr_seq_zones = disk->nr_zones - nr_conv_zones;
+ nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
if (lim.max_open_zones >= nr_seq_zones)
lim.max_open_zones = 0;
if (lim.max_active_zones >= nr_seq_zones)
@@ -1568,7 +1983,53 @@ static int disk_update_zone_resources(struct gendisk *disk,
}
commit:
- return queue_limits_commit_update_frozen(q, &lim);
+ ret = queue_limits_commit_update(q, &lim);
+
+unfreeze:
+ if (ret)
+ disk_free_zone_resources(disk);
+
+ blk_mq_unfreeze_queue(q, memflags);
+
+ return ret;
+}
+
+static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ enum blk_zone_cond cond = zone->cond;
+
+ /* Check that the zone condition is consistent with the zone type. */
+ switch (cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
+ goto invalid_condition;
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+ goto invalid_condition;
+ break;
+ default:
+ pr_warn("%s: Invalid zone condition 0x%X\n",
+ args->disk->disk_name, cond);
+ return -ENODEV;
+ }
+
+ blk_zone_set_cond(args->zones_cond, idx, cond);
+
+ return 0;
+
+invalid_condition:
+ pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
+ args->disk->disk_name, cond, zone->type);
+
+ return -ENODEV;
}
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
@@ -1585,17 +2046,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
if (disk_zone_is_last(disk, zone))
args->last_zone_capacity = zone->capacity;
- if (!disk_need_zone_resources(disk))
- return 0;
-
- if (!args->conv_zones_bitmap) {
- args->conv_zones_bitmap =
- bitmap_zalloc(args->nr_zones, GFP_NOIO);
- if (!args->conv_zones_bitmap)
- return -ENOMEM;
- }
-
- set_bit(idx, args->conv_zones_bitmap);
+ args->nr_conv_zones++;
return 0;
}
@@ -1693,6 +2144,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
+ /* Check zone condition */
+ ret = blk_revalidate_zone_cond(zone, idx, args);
+ if (ret)
+ return ret;
+
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
@@ -1733,7 +2189,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
sector_t zone_sectors = q->limits.chunk_sectors;
sector_t capacity = get_capacity(disk);
struct blk_revalidate_zone_args args = { };
- unsigned int noio_flag;
+ unsigned int memflags, noio_flag;
+ struct blk_report_zones_args rep_args = {
+ .cb = blk_revalidate_zone_cb,
+ .data = &args,
+ };
int ret = -ENOMEM;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
@@ -1756,17 +2216,14 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
* Ensure that all memory allocations in this context are done as if
* GFP_NOIO was specified.
*/
- args.disk = disk;
- args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
noio_flag = memalloc_noio_save();
- ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+ ret = disk_revalidate_zone_resources(disk, &args);
if (ret) {
memalloc_noio_restore(noio_flag);
return ret;
}
- ret = disk->fops->report_zones(disk, 0, UINT_MAX,
- blk_revalidate_zone_cb, &args);
+ ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
if (!ret) {
pr_warn("%s: No zones reported\n", disk->disk_name);
ret = -ENODEV;
@@ -1783,20 +2240,14 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
ret = -ENODEV;
}
- /*
- * Set the new disk zone parameters only once the queue is frozen and
- * all I/Os are completed.
- */
if (ret > 0)
- ret = disk_update_zone_resources(disk, &args);
- else
- pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- if (ret) {
- unsigned int memflags = blk_mq_freeze_queue(q);
+ return disk_update_zone_resources(disk, &args);
- disk_free_zone_resources(disk);
- blk_mq_unfreeze_queue(q, memflags);
- }
+ pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
+
+ memflags = blk_mq_freeze_queue(q);
+ disk_free_zone_resources(disk);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -1817,6 +2268,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask)
{
+ struct gendisk *disk = bdev->bd_disk;
int ret;
if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
@@ -1832,7 +2284,7 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
* pointer. Undo this using a report zone to update the zone write
* pointer to the correct current value.
*/
- ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
+ ret = disk->fops->report_zones(disk, sector, 1, NULL);
if (ret != 1)
return ret < 0 ? ret : -EIO;
@@ -1851,18 +2303,22 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
unsigned int zwp_wp_offset, zwp_flags;
unsigned int zwp_zone_no, zwp_ref;
unsigned int zwp_bio_list_size;
+ enum blk_zone_cond zwp_cond;
unsigned long flags;
spin_lock_irqsave(&zwplug->lock, flags);
zwp_zone_no = zwplug->zone_no;
zwp_flags = zwplug->flags;
zwp_ref = refcount_read(&zwplug->ref);
+ zwp_cond = zwplug->cond;
zwp_wp_offset = zwplug->wp_offset;
zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
spin_unlock_irqrestore(&zwplug->lock, flags);
- seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
- zwp_wp_offset, zwp_bio_list_size);
+ seq_printf(m,
+ "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
+ zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
+ zwp_wp_offset, zwp_bio_list_size);
}
int queue_zone_wplugs_show(void *data, struct seq_file *m)
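
The kernel-doc above describes blkdev_report_zones_cached() as a drop-in alternative to blkdev_report_zones() that builds the report from the zone write plugs and the zone condition array. A hypothetical in-kernel caller, shown here only to illustrate the calling convention used by the btrfs and xfs conversions further down (the helper and callback names are not part of this series):

#include <linux/blkdev.h>

/* Count the zones of @bdev that are neither conventional nor empty. */
static int count_used_zones_cb(struct blk_zone *zone, unsigned int idx,
			       void *data)
{
	unsigned int *used = data;

	if (zone->cond != BLK_ZONE_COND_NOT_WP &&
	    zone->cond != BLK_ZONE_COND_EMPTY)
		(*used)++;

	return 0;
}

static int count_used_zones(struct block_device *bdev, unsigned int *used)
{
	int ret;

	*used = 0;

	/*
	 * The report is generated from the cached zone state: open and
	 * closed zones are reported with the BLK_ZONE_COND_ACTIVE condition.
	 */
	ret = blkdev_report_zones_cached(bdev, 0, bdev_nr_zones(bdev),
					 count_used_zones_cb, used);
	if (ret < 0)
		return ret;

	return 0;
}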
diff --git a/block/blk.h b/block/blk.h
index 32a10024efba..4d809588b771 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -489,10 +489,24 @@ static inline bool blk_req_bio_is_zone_append(struct request *rq,
void blk_zone_write_plug_bio_merged(struct bio *bio);
void blk_zone_write_plug_init_request(struct request *rq);
void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio);
+void blk_zone_mgmt_bio_endio(struct bio *bio);
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
/*
+ * Zone management BIOs may impact zone write plugs (e.g. a zone reset
+ * changes a zone write plug zone write pointer offset), but these
+ * operations do not go through zone write plugging as they may operate
+ * on zones that do not have a zone write plug.
+ * blk_zone_mgmt_bio_endio() handles the potential changes to zone write
+ * plugs that are present.
+ */
+ if (op_is_zone_mgmt(bio_op(bio))) {
+ blk_zone_mgmt_bio_endio(bio);
+ return;
+ }
+
+ /*
* For write BIOs to zoned devices, signal the completion of the BIO so
* that the next write BIO can be submitted by zone write plugging.
*/
diff --git a/block/ioctl.c b/block/ioctl.c
index 3927ca4707d0..698629e4c619 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -581,6 +581,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
case BLKGETDISKSEQ:
return put_u64(argp, bdev->bd_disk->diskseq);
case BLKREPORTZONE:
+ case BLKREPORTZONEV2:
return blkdev_report_zones_ioctl(bdev, cmd, arg);
case BLKRESETZONE:
case BLKOPENZONE:
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 7bb6128dbaaf..6c4c4bbe7dad 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, sector_t nr_sectors);
size_t null_zone_valid_read_len(struct nullb *nullb,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f45989..6a93b12a06ff 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev)
}
int null_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nullb *nullb = disk->private_data;
struct nullb_device *dev = nullb->dev;
@@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
blkz.capacity = zone->capacity;
null_unlock_zone(dev, zone);
- error = cb(&blkz, i, data);
+ error = disk_report_zone(disk, &blkz, i, args);
if (error)
return error;
}
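
The driver conversions in this series all follow the pattern visible in the null_blk change above: the ->report_zones() method now takes a struct blk_report_zones_args pointer and hands each filled struct blk_zone to disk_report_zone(), which syncs the zone write plug state and invokes the callback carried in the arguments. A bare skeleton of that contract (the mydrv_ name and the hard-coded zone state are hypothetical):

#include <linux/blkdev.h>
#include <linux/string.h>

static int mydrv_report_zones(struct gendisk *disk, sector_t sector,
			      unsigned int nr_zones,
			      struct blk_report_zones_args *args)
{
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_zone zone;
	unsigned int i = 0;
	int ret;

	for (sector = ALIGN_DOWN(sector, zone_sectors);
	     sector < capacity && i < nr_zones;
	     sector += zone_sectors, i++) {
		/* Fill @zone from the device state (values illustrative). */
		memset(&zone, 0, sizeof(zone));
		zone.start = sector;
		zone.len = zone_sectors;
		zone.capacity = zone_sectors;
		zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone.cond = BLK_ZONE_COND_EMPTY;
		zone.wp = sector;

		/* Updates the zone write plug and calls the user callback. */
		ret = disk_report_zone(disk, &zone, i, args);
		if (ret)
			return ret;
	}

	/* Like the converted drivers, return the number of zones reported. */
	return i;
}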
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 96e07763cd28..97cc4bc0a6ce 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -367,7 +367,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
}
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct ublk_device *ub = disk->private_data;
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
@@ -430,7 +430,7 @@ free_req:
if (!zone->len)
break;
- ret = cb(zone, i, data);
+ ret = disk_report_zone(disk, zone, i, args);
if (ret)
goto out;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f061420dfb10..a5e97f03dbf0 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -584,7 +584,8 @@ out:
static int virtblk_parse_zone(struct virtio_blk *vblk,
struct virtio_blk_zone_descriptor *entry,
- unsigned int idx, report_zones_cb cb, void *data)
+ unsigned int idx,
+ struct blk_report_zones_args *args)
{
struct blk_zone zone = { };
@@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk,
* The callback below checks the validity of the reported
* entry data, no need to further validate it here.
*/
- return cb(&zone, idx, data);
+ return disk_report_zone(vblk->disk, &zone, idx, args);
}
static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb,
- void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct virtio_blk *vblk = disk->private_data;
struct virtio_blk_zone_report *report;
@@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
ret = virtblk_parse_zone(vblk, &report->zones[i],
- zone_idx, cb, data);
+ zone_idx, args);
if (ret)
goto fail_report;
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index a423228e201b..92be9f0af00a 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -647,7 +647,7 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode)
}
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct zloop_device *zlo = disk->private_data;
struct blk_zone blkz = {};
@@ -687,7 +687,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
mutex_unlock(&zone->lock);
- ret = cb(&blkz, i, data);
+ ret = disk_report_zone(disk, &blkz, i, args);
if (ret)
return ret;
}
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 78e17dd4d01b..984fb621b0e9 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -17,33 +17,26 @@
* For internal zone reports bypassing the top BIO submission path.
*/
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
- sector_t sector, unsigned int nr_zones,
- report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct dm_report_zones_args *args)
{
- struct gendisk *disk = md->disk;
- int ret;
- struct dm_report_zones_args args = {
- .next_sector = sector,
- .orig_data = data,
- .orig_cb = cb,
- };
-
do {
struct dm_target *tgt;
+ int ret;
- tgt = dm_table_find_target(t, args.next_sector);
+ tgt = dm_table_find_target(t, args->next_sector);
if (WARN_ON_ONCE(!tgt->type->report_zones))
return -EIO;
- args.tgt = tgt;
- ret = tgt->type->report_zones(tgt, &args,
- nr_zones - args.zone_idx);
+ args->tgt = tgt;
+ ret = tgt->type->report_zones(tgt, args,
+ nr_zones - args->zone_idx);
if (ret < 0)
return ret;
- } while (args.zone_idx < nr_zones &&
- args.next_sector < get_capacity(disk));
+ } while (args->zone_idx < nr_zones &&
+ args->next_sector < get_capacity(md->disk));
- return args.zone_idx;
+ return args->zone_idx;
}
/*
@@ -52,7 +45,8 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
* generally implemented by targets using dm_report_zones().
*/
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct mapped_device *md = disk->private_data;
struct dm_table *map;
@@ -76,9 +70,14 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
map = zone_revalidate_map;
}
- if (map)
- ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb,
- data);
+ if (map) {
+ struct dm_report_zones_args dm_args = {
+ .disk = md->disk,
+ .next_sector = sector,
+ .rep_args = args,
+ };
+ ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args);
+ }
if (put_table)
dm_put_live_table(md, srcu_idx);
@@ -113,7 +112,9 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
}
args->next_sector = zone->start + zone->len;
- return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+
+ return disk_report_zone(args->disk, zone, args->zone_idx++,
+ args->rep_args);
}
/*
@@ -492,10 +493,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
unsigned long *need_reset)
{
+ struct dm_report_zones_args args = {
+ .disk = md->disk,
+ .next_sector = sector,
+ .cb = dm_zone_need_reset_cb,
+ .data = need_reset,
+ };
int ret;
- ret = dm_blk_do_report_zones(md, t, sector, nr_zones,
- dm_zone_need_reset_cb, need_reset);
+ ret = dm_blk_do_report_zones(md, t, nr_zones, &args);
if (ret != nr_zones) {
DMERR("Get %s zone reset bitmap failed\n",
md->disk->disk_name);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 245f52b59215..7a795979ec72 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -109,7 +109,8 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fa4181d7de73..c0fe50fb7b08 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2599,10 +2599,9 @@ static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
- return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
- data);
+ return nvme_ns_report_zones(disk->private_data, sector, nr_zones, args);
}
#else
#define nvme_report_zones NULL
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 543e17aead12..0b7ac0735bd0 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -576,7 +576,7 @@ static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nvme_ns_head *head = disk->private_data;
struct nvme_ns *ns;
@@ -585,7 +585,7 @@ static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
if (ns)
- ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
+ ret = nvme_ns_report_zones(ns, sector, nr_zones, args);
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 102fae6a231c..928c748ccbd1 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1108,7 +1108,7 @@ struct nvme_zone_info {
};
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones, struct blk_report_zones_args *args);
int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
struct nvme_zone_info *zi);
void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index cce4c5b55aa9..deea2dbef5b8 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -148,8 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
static int nvme_zone_parse_entry(struct nvme_ns *ns,
struct nvme_zone_descriptor *entry,
- unsigned int idx, report_zones_cb cb,
- void *data)
+ unsigned int idx,
+ struct blk_report_zones_args *args)
{
struct nvme_ns_head *head = ns->head;
struct blk_zone zone = { };
@@ -169,11 +169,11 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
else
zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp));
- return cb(&zone, idx, data);
+ return disk_report_zone(ns->disk, &zone, idx, args);
}
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nvme_zone_report *report;
struct nvme_command c = { };
@@ -213,7 +213,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
ret = nvme_zone_parse_entry(ns, &report->entries[i],
- zone_idx, cb, data);
+ zone_idx, args);
if (ret)
goto out_free;
zone_idx++;
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 36382eca941c..574af8243016 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -240,7 +240,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
struct scsi_sense_hdr *sshdr);
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones, struct blk_report_zones_args *args);
#else /* CONFIG_BLK_DEV_ZONED */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a8db66428f80..56e455fb5add 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -35,8 +35,7 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64])
* @buf: SCSI zone descriptor.
* @idx: Index of the zone relative to the first zone reported by the current
* sd_zbc_report_zones() call.
- * @cb: Callback function pointer.
- * @data: Second argument passed to @cb.
+ * @args: Report zones arguments (callback and private callback data).
*
* Return: Value returned by @cb.
*
@@ -44,12 +43,11 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64])
* call @cb(blk_zone, @data).
*/
static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
- unsigned int idx, report_zones_cb cb, void *data)
+ unsigned int idx, struct blk_report_zones_args *args)
{
struct scsi_device *sdp = sdkp->device;
struct blk_zone zone = { 0 };
sector_t start_lba, gran;
- int ret;
if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf)))
return -EINVAL;
@@ -87,11 +85,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
else
zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24]));
- ret = cb(&zone, idx, data);
- if (ret)
- return ret;
-
- return 0;
+ return disk_report_zone(sdkp->disk, &zone, idx, args);
}
/**
@@ -217,14 +211,14 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
* @disk: Disk to report zones for.
* @sector: Start sector.
* @nr_zones: Maximum number of zones to report.
- * @cb: Callback function called to report zone information.
- * @data: Second argument passed to @cb.
+ * @args: Callback arguments.
*
* Called by the block layer to iterate over zone information. See also the
* disk->fops->report_zones() calls in block/blk-zoned.c.
*/
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct scsi_disk *sdkp = scsi_disk(disk);
sector_t lba = sectors_to_logical(sdkp->device, sector);
@@ -283,7 +277,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
}
ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx,
- cb, data);
+ args);
if (ret)
goto out;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 0ea0df18a8e4..a16b1a896c78 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -264,8 +264,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
}
}
- ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
- copy_zone_info_cb, zones);
+ ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT,
+ *nr_zones, copy_zone_info_cb, zones);
if (ret < 0) {
btrfs_err(device->fs_info,
"zoned: failed to read zone %llu on %s (devid %llu)",
@@ -494,6 +494,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_ACTIVE:
__set_bit(nreported, zone_info->active_zones);
nactive++;
break;
@@ -896,9 +897,9 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
- BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
- zones);
+ ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev),
+ BTRFS_NR_SB_LOG_ZONES,
+ copy_zone_info_cb, zones);
if (ret < 0)
return ret;
if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
index b0791a71931c..b40f71f878b5 100644
--- a/fs/xfs/libxfs/xfs_zones.c
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -95,6 +95,7 @@ xfs_zone_validate_seq(
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_ACTIVE:
return xfs_zone_validate_wp(zone, rtg, write_pointer);
case BLK_ZONE_COND_FULL:
return xfs_zone_validate_full(zone, rtg, write_pointer);
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..d121768dbccb 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1250,7 +1250,7 @@ xfs_mount_zones(
trace_xfs_zones_mount(mp);
if (bdev_is_zoned(bt->bt_bdev)) {
- error = blkdev_report_zones(bt->bt_bdev,
+ error = blkdev_report_zones_cached(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
if (error < 0)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99be263b31ab..f0ab02e0a673 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -38,6 +38,7 @@ struct blk_flush_queue;
struct kiocb;
struct pr_ops;
struct rq_qos;
+struct blk_report_zones_args;
struct blk_queue_stats;
struct blk_stat_callback;
struct blk_crypto_profile;
@@ -195,7 +196,7 @@ struct gendisk {
unsigned int nr_zones;
unsigned int zone_capacity;
unsigned int last_zone_capacity;
- unsigned long __rcu *conv_zones_bitmap;
+ u8 __rcu *zones_cond;
unsigned int zone_wplugs_hash_bits;
atomic_t nr_zone_wplugs;
spinlock_t zone_wplugs_lock;
@@ -432,9 +433,17 @@ struct queue_limits {
typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
void *data);
+int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
+ unsigned int idx, struct blk_report_zones_args *args);
+
+int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
+ struct blk_zone *zone);
+
#define BLK_ALL_ZONES ((unsigned int)-1)
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
+int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sectors, sector_t nr_sectors);
int blk_revalidate_disk_zones(struct gendisk *disk);
@@ -921,12 +930,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev,
{
return disk_zone_capacity(bdev->bd_disk, pos);
}
+
+bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector);
+
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
return 0;
}
+static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+ return false;
+}
+
static inline bool bio_needs_zone_write_plugging(struct bio *bio)
{
return false;
@@ -1529,33 +1546,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev,
return bdev_is_zone_start(bdev, sector);
}
-/**
- * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
- * @bdev: block device to check
- * @sector: sector number
- *
- * Check if @sector on @bdev is contained in a sequential write required zone.
- */
-static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
-{
- bool is_seq = false;
-
-#if IS_ENABLED(CONFIG_BLK_DEV_ZONED)
- if (bdev_is_zoned(bdev)) {
- struct gendisk *disk = bdev->bd_disk;
- unsigned long *bitmap;
-
- rcu_read_lock();
- bitmap = rcu_dereference(disk->conv_zones_bitmap);
- is_seq = !bitmap ||
- !test_bit(disk_zone_no(disk, sector), bitmap);
- rcu_read_unlock();
- }
-#endif
-
- return is_seq;
-}
-
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask);
@@ -1662,7 +1652,8 @@ struct block_device_operations {
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
int (*report_zones)(struct gendisk *, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
char *(*devnode)(struct gendisk *disk, umode_t *mode);
/* returns the length of the identifier or a negative errno: */
int (*get_unique_id)(struct gendisk *disk, u8 id[16],
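
blkdev_get_zone_info() declared above reads like the single-zone companion to the bulk report helpers: it fills one struct blk_zone for the zone containing the given sector. A hedged usage sketch, assuming the usual 0/-errno return convention and that the information comes from the block layer's cached zone state:

    /* Illustrative helper, not part of the patch. */
    static bool my_zone_is_full(struct block_device *bdev, sector_t sector)
    {
            struct blk_zone zone;

            /* Fill @zone with information for the zone containing @sector. */
            if (blkdev_get_zone_info(bdev, sector, &zone))
                    return false;   /* be conservative on error */

            return zone.cond == BLK_ZONE_COND_FULL;
    }
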
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 84fdc3a6a19a..38f625af6ab4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone);
#ifdef CONFIG_BLK_DEV_ZONED
struct dm_report_zones_args {
struct dm_target *tgt;
+ struct gendisk *disk;
sector_t next_sector;
- void *orig_data;
- report_zones_cb orig_cb;
unsigned int zone_idx;
+ /* for block layer ->report_zones */
+ struct blk_report_zones_args *rep_args;
+
+ /* for internal users */
+ report_zones_cb cb;
+ void *data;
+
/* must be filled by ->report_zones before calling dm_report_zones_cb */
sector_t start;
};
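
The reorganized struct dm_report_zones_args carries both a block layer report context (@rep_args) and an internal callback pair (@cb/@data), so the same device-mapper report path can serve either consumer. A hypothetical sketch of the per-zone dispatch this layout suggests; the real logic lives in drivers/md/dm-zone.c, which is not shown in this section:

    /* Hypothetical dispatch helper, not the dm-zone.c implementation. */
    static int dm_forward_zone(struct dm_report_zones_args *args,
                               struct blk_zone *zone)
    {
            /* Zone report requested through the block layer ->report_zones path. */
            if (args->rep_args)
                    return disk_report_zone(args->disk, zone, args->zone_idx++,
                                            args->rep_args);

            /* Internal user, e.g. a target revalidating its zones. */
            return args->cb(zone, args->zone_idx++, args->data);
    }
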
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index f85743ef6e7d..e33f02703350 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -48,6 +48,8 @@ enum blk_zone_type {
* FINISH ZONE command.
* @BLK_ZONE_COND_READONLY: The zone is read-only.
* @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written).
+ * @BLK_ZONE_COND_ACTIVE: The zone is either implicitly open, explicitly open,
+ * or closed.
*
* The Zone Condition state machine in the ZBC/ZAC standards maps the above
* definitions as:
@@ -61,6 +63,13 @@ enum blk_zone_type {
*
* Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should
* be considered invalid.
+ *
+ * The condition BLK_ZONE_COND_ACTIVE is used only with cached zone reports,
+ * where it stands in for any of the BLK_ZONE_COND_IMP_OPEN,
+ * BLK_ZONE_COND_EXP_OPEN and BLK_ZONE_COND_CLOSED conditions. A regular
+ * (non-cached) zone report never uses BLK_ZONE_COND_ACTIVE and instead
+ * reports BLK_ZONE_COND_IMP_OPEN, BLK_ZONE_COND_EXP_OPEN or
+ * BLK_ZONE_COND_CLOSED as returned by the device.
*/
enum blk_zone_cond {
BLK_ZONE_COND_NOT_WP = 0x0,
@@ -71,15 +80,27 @@ enum blk_zone_cond {
BLK_ZONE_COND_READONLY = 0xD,
BLK_ZONE_COND_FULL = 0xE,
BLK_ZONE_COND_OFFLINE = 0xF,
+
+ BLK_ZONE_COND_ACTIVE = 0xFF,
};
/**
* enum blk_zone_report_flags - Feature flags of reported zone descriptors.
*
- * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field.
+ * @BLK_ZONE_REP_CAPACITY: Output only. Indicates that zone descriptors in a
+ * zone report have a valid capacity field.
+ * @BLK_ZONE_REP_CACHED: Input only. Indicates that the zone report should be
+ * generated using cached zone information. In this case,
+ * the implicit open, explicit open and closed zone
+ * conditions are all reported with the
+ * BLK_ZONE_COND_ACTIVE condition.
*/
enum blk_zone_report_flags {
- BLK_ZONE_REP_CAPACITY = (1 << 0),
+ /* Output flags */
+ BLK_ZONE_REP_CAPACITY = (1U << 0),
+
+ /* Input flags */
+ BLK_ZONE_REP_CACHED = (1U << 31),
};
/**
@@ -122,6 +143,9 @@ struct blk_zone {
* @sector: starting sector of report
* @nr_zones: IN maximum / OUT actual
* @flags: one or more flags as defined by enum blk_zone_report_flags.
+ *	With BLKREPORTZONE, this field is ignored as an input and is valid
+ *	only as an output. With BLKREPORTZONEV2, this field is used as both
+ * input and output.
* @zones: Space to hold @nr_zones @zones entries on reply.
*
* The array of at most @nr_zones must follow this structure in memory.
@@ -148,9 +173,19 @@ struct blk_zone_range {
/**
* Zoned block device ioctl's:
*
- * @BLKREPORTZONE: Get zone information. Takes a zone report as argument.
- * The zone report will start from the zone containing the
- * sector specified in the report request structure.
+ * @BLKREPORTZONE: Get zone information from a zoned device. Takes a zone report
+ * as argument. The zone report will start from the zone
+ * containing the sector specified in struct blk_zone_report.
+ * The flags field of struct blk_zone_report is used as an
+ * output only and ignored as an input.
+ * DEPRECATED, use BLKREPORTZONEV2 instead.
+ * @BLKREPORTZONEV2: Same as @BLKREPORTZONE, but also uses the flags field of
+ *		struct blk_zone_report as an input, allowing a zone report
+ *		to be generated from cached zone information when the
+ *		BLK_ZONE_REP_CACHED flag is set. In that case, the zone
+ *		report may include zones with the condition
+ *		@BLK_ZONE_COND_ACTIVE (see the description of this condition
+ *		above).
* @BLKRESETZONE: Reset the write pointer of the zones in the specified
* sector range. The sector range must be zone aligned.
* @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
@@ -169,5 +204,6 @@ struct blk_zone_range {
#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range)
#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range)
#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range)
+#define BLKREPORTZONEV2 _IOWR(0x12, 142, struct blk_zone_report)
#endif /* _UAPI_BLKZONED_H */
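
From user space, BLKREPORTZONEV2 is invoked like BLKREPORTZONE, with the flags field now honored on input. A minimal sketch, assuming UAPI headers that carry this series and a zoned device at the made-up path /dev/sdX; partial reports are acceptable here:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/blkzoned.h>

    int main(void)
    {
            unsigned int i, nr_zones = 128;
            struct blk_zone_report *rep;
            int fd;

            fd = open("/dev/sdX", O_RDONLY);        /* hypothetical device node */
            if (fd < 0)
                    return 1;

            rep = calloc(1, sizeof(*rep) + nr_zones * sizeof(struct blk_zone));
            if (!rep)
                    return 1;
            rep->sector = 0;
            rep->nr_zones = nr_zones;
            rep->flags = BLK_ZONE_REP_CACHED;       /* request a cached report */

            if (ioctl(fd, BLKREPORTZONEV2, rep) < 0)
                    return 1;

            for (i = 0; i < rep->nr_zones; i++)
                    printf("zone %u: start %llu cond 0x%x\n", i,
                           (unsigned long long)rep->zones[i].start,
                           rep->zones[i].cond);

            free(rep);
            close(fd);
            return 0;
    }
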
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 957ce3343a4f..66ca526cf786 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -298,7 +298,7 @@ struct file_attr {
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
-/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
+/* 130-136 and 142 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
/* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */
#define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2)