summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2025-06-23 12:45:32 +0200
committerChristian Brauner <brauner@kernel.org>2025-06-23 12:45:32 +0200
commit4f984fe7b4d9aea332c7ff59827a4e168f0e4e1b (patch)
treee8420d212f12379e272fa36e4cf03493a83f771c
parente04c78d86a9699d136910cfc0bdcf01087e3267e (diff)
parentf4265b8d32c49ff95711e6fef7d05245a2905b30 (diff)
Merge patch series "fallocate: introduce FALLOC_FL_WRITE_ZEROES flag"
Zhang Yi <yi.zhang@huawei.com> says: Currently, we can use the fallocate command to quickly create a pre-allocated file. However, on most filesystems, such as ext4 and XFS, fallocate create pre-allocation blocks in an unwritten state, and the FALLOC_FL_ZERO_RANGE flag also behaves similarly. The extent state must be converted to a written state when the user writes data into this range later, which can trigger numerous metadata changes and consequent journal I/O. This may leads to significant write amplification and performance degradation in synchronous write mode. Therefore, we need a method to create a pre-allocated file with written extents that can be used for pure overwriting. At the monent, the only method available is to create an empty file and write zero data into it (for example, using 'dd' with a large block size). However, this method is slow and consumes a considerable amount of disk bandwidth, we must pre-allocate files in advance but cannot add pre-allocated files while user business services are running. Fortunately, with the development and more and more widely used of flash-based storage devices, we can efficiently write zeros to SSDs using the unmap write zeroes command if the devices do not write physical zeroes to the media. For example, if SCSI SSDs support the UMMAP bit or NVMe SSDs support the DEAC bit[1], the write zeroes command does not write actual data to the device, instead, NVMe converts the zeroed range to a deallocated state, which works fast and consumes almost no disk write bandwidth. Consequently, this feature can provide us with a faster method for creating pre-allocated files with written extents and zeroed data. However, please note that this may be a best-effort optimization rather than a mandatory requirement, some devices may partially fall back to writing physical zeroes due to factors such as receiving unaligned commands. This series aims to implement this by: 1. Introduce a new feature BLK_FEAT_WRITE_ZEROES_UNMAP to the block device queue limit features, which indicates whether the storage is device explicitly supports the unmapped write zeroes command. This flag should be set to 1 by the driver if the attached disk supports this command. 2. Introduce a queue limit flag, BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED, along with a corresponding sysfs entry. Users can query the support status of the unmap write zeroes operation and disable this operation if the write zeroes operation is very slow. /sys/block/<disk>/queue/write_zeroes_unmap 3. Introduce a new flag, FALLOC_FL_WRITE_ZEROES, into the fallocate. Filesystems that support this operation should allocate written extents and issue zeroes to the specified range of the device. For local block device filesystems, this operation should depend on the write_zeroes_unmap operaion of the underlying block device. It should return -EOPNOTSUPP if the device doesn't enable unmap write zeroes operaion. This series implements the BLK_FEAT_WRITE_ZEROES_UNMAP feature and BLK_FLAG_WRITE_ZEROES_UNMAP_DISABLED flag for SCSI, NVMe and device-mapper drivers, and add the FALLOC_FL_WRITE_ZEROES and STATX_ATTR_WRITE_ZEROES_UNMAP support for ext4 and raw bdev devices. Any comments are welcome. I've tested performance with this series on ext4 filesystem on my machine with an Intel Xeon Gold 6248R CPU, a 7TB KCD61LUL7T68 NVMe SSD which supports unmap write zeroes command with the Deallocated state and the DEAC bit. Feel free to give it a try. 0. Ensure the NVMe device supports WRITE_ZERO command. $ cat /sys/block/nvme5n1/queue/write_zeroes_max_bytes 8388608 $ nvme id-ns -H /dev/nvme5n1 | grep -i -A 3 "dlfeat" dlfeat : 25 [4:4] : 0x1 Guard Field of Deallocated Logical Blocks is set to CRC of The Value Read [3:3] : 0x1 Deallocate Bit in the Write Zeroes Command is Supported [2:0] : 0x1 Bytes Read From a Deallocated Logical Block and its Metadata are 0x00 1. Compare 'dd' and fallocate with unmap write zeroes, the later one is significantly faster than 'dd'. Create a 1GB and 10GB zeroed file. $dd if=/dev/zero of=foo bs=2M count=$count oflag=direct $time fallocate -w -l $size bar #1G dd: 0.5s FALLOC_FL_WRITE_ZEROES: 0.17s #10G dd: 5.0s FALLOC_FL_WRITE_ZEROES: 1.7s 2. Run fio overwrite and fallocate with unmap write zeroes simultaneously, fallocate has little impact on write bandwidth and only slightly affects write latency. a) Test bandwidth costs. $ fio -directory=/test -direct=1 -iodepth=10 -fsync=0 -rw=write \ -numjobs=10 -bs=2M -ioengine=libaio -size=20G -runtime=20 \ -fallocate=none -overwrite=1 -group_reportin -name=bw_test Without background zero range: bw (MiB/s): min= 2068, max= 2280, per=100.00%, avg=2186.40 With background zero range: bw (MiB/s): min= 2056, max= 2308, per=100.00%, avg=2186.20 b) Test write latency costs. $ fio -filename=/test/foo -direct=1 -iodepth=1 -fsync=0 -rw=write \ -numjobs=1 -bs=4k -ioengine=psync -size=5G -runtime=20 \ -fallocate=none -overwrite=1 -group_reportin -name=lat_test Without background zero range: lat (nsec): min=9269, max=71635, avg=9840.65 With a background zero range: lat (usec): min=9, max=982, avg=11.03 3. Compare overwriting in a pre-allocated unwritten file and a written file in O_DSYNC mode. Write to a file with written extents is much faster. # First mkfs and create a test file according to below three cases, # and then run fio. $ fio -filename=/test/foo -direct=1 -iodepth=1 -fdatasync=1 \ -rw=write -numjobs=1 -bs=4k -ioengine=psync -size=5G \ -runtime=20 -fallocate=none -group_reportin -name=test unwritten file: IOPS=20.1k, BW=78.7MiB/s unwritten file + fast_commit: IOPS=42.9k, BW=167MiB/s written file: IOPS=98.8k, BW=386MiB/s * patches from https://lore.kernel.org/20250619111806.3546162-1-yi.zhang@huaweicloud.com: ext4: add FALLOC_FL_WRITE_ZEROES support block: add FALLOC_FL_WRITE_ZEROES support block: factor out common part in blkdev_fallocate() fs: introduce FALLOC_FL_WRITE_ZEROES to fallocate dm: clear unmap write zeroes limits when disabling write zeroes scsi: sd: set max_hw_wzeroes_unmap_sectors if device supports SD_ZERO_*_UNMAP nvmet: set WZDS and DRB if device enables unmap write zeroes operation nvme: set max_hw_wzeroes_unmap_sectors if device supports DEAC bit block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits Link: https://lore.kernel.org/20250619111806.3546162-1-yi.zhang@huaweicloud.com Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r--Documentation/ABI/stable/sysfs-block33
-rw-r--r--block/blk-settings.c20
-rw-r--r--block/blk-sysfs.c26
-rw-r--r--block/fops.c44
-rw-r--r--drivers/md/dm-table.c4
-rw-r--r--drivers/nvme/host/core.c20
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c4
-rw-r--r--drivers/scsi/sd.c5
-rw-r--r--fs/ext4/extents.c66
-rw-r--r--fs/open.c1
-rw-r--r--include/linux/blkdev.h10
-rw-r--r--include/linux/falloc.h3
-rw-r--r--include/trace/events/ext4.h3
-rw-r--r--include/uapi/linux/falloc.h17
14 files changed, 212 insertions, 44 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 4ba771b56b3b..803f578dc023 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -778,6 +778,39 @@ Description:
0, write zeroes is not supported by the device.
+What: /sys/block/<disk>/queue/write_zeroes_unmap_max_hw_bytes
+Date: January 2025
+Contact: Zhang Yi <yi.zhang@huawei.com>
+Description:
+ [RO] This file indicates whether a device supports zeroing data
+ in a specified block range without incurring the cost of
+ physically writing zeroes to the media for each individual
+ block. If this parameter is set to write_zeroes_max_bytes, the
+ device implements a zeroing operation which opportunistically
+ avoids writing zeroes to media while still guaranteeing that
+ subsequent reads from the specified block range will return
+ zeroed data. This operation is a best-effort optimization, a
+ device may fall back to physically writing zeroes to the media
+ due to other factors such as misalignment or being asked to
+ clear a block range smaller than the device's internal
+ allocation unit. If this parameter is set to 0, the device may
+ have to write each logical block media during a zeroing
+ operation.
+
+
+What: /sys/block/<disk>/queue/write_zeroes_unmap_max_bytes
+Date: January 2025
+Contact: Zhang Yi <yi.zhang@huawei.com>
+Description:
+ [RW] While write_zeroes_unmap_max_hw_bytes is the hardware limit
+ for the device, this setting is the software limit. Since the
+ unmap write zeroes operation is a best-effort optimization, some
+ devices may still physically writing zeroes to media. So the
+ speed of this operation is not guaranteed. Writing a value of
+ '0' to this file disables this operation. Otherwise, this
+ parameter should be equal to write_zeroes_unmap_max_hw_bytes.
+
+
What: /sys/block/<disk>/queue/zone_append_max_bytes
Date: May 2020
Contact: linux-block@vger.kernel.org
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a000daafbfb4..b5c75f0ac3e9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -50,6 +50,8 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
+ lim->max_hw_wzeroes_unmap_sectors = UINT_MAX;
+ lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
@@ -333,6 +335,12 @@ int blk_validate_limits(struct queue_limits *lim)
if (!lim->max_segments)
lim->max_segments = BLK_MAX_SEGMENTS;
+ if (lim->max_hw_wzeroes_unmap_sectors &&
+ lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors)
+ return -EINVAL;
+ lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors,
+ lim->max_user_wzeroes_unmap_sectors);
+
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
@@ -418,10 +426,11 @@ int blk_set_default_limits(struct queue_limits *lim)
{
/*
* Most defaults are set by capping the bounds in blk_validate_limits,
- * but max_user_discard_sectors is special and needs an explicit
- * initialization to the max value here.
+ * but these limits are special and need an explicit initialization to
+ * the max value here.
*/
lim->max_user_discard_sectors = UINT_MAX;
+ lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
return blk_validate_limits(lim);
}
@@ -708,6 +717,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
+ t->max_user_wzeroes_unmap_sectors =
+ min(t->max_user_wzeroes_unmap_sectors,
+ b->max_user_wzeroes_unmap_sectors);
+ t->max_hw_wzeroes_unmap_sectors =
+ min(t->max_hw_wzeroes_unmap_sectors,
+ b->max_hw_wzeroes_unmap_sectors);
+
t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
b->max_hw_zone_append_sectors);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b2b9b89d6967..48c7ecbb531f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -161,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
@@ -205,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk,
return 0;
}
+static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk,
+ const char *page, size_t count, struct queue_limits *lim)
+{
+ unsigned long max_zeroes_bytes, max_hw_zeroes_bytes;
+ ssize_t ret;
+
+ ret = queue_var_store(&max_zeroes_bytes, page, count);
+ if (ret < 0)
+ return ret;
+
+ max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT;
+ if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes)
+ return -EINVAL;
+
+ lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT;
+ return 0;
+}
+
static int
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
struct queue_limits *lim)
@@ -514,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors,
+ "write_zeroes_unmap_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors,
+ "write_zeroes_unmap_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
@@ -662,6 +686,8 @@ static struct attribute *queue_attrs[] = {
&queue_atomic_write_unit_min_entry.attr,
&queue_atomic_write_unit_max_entry.attr,
&queue_max_write_zeroes_sectors_entry.attr,
+ &queue_max_hw_wzeroes_unmap_sectors_entry.attr,
+ &queue_max_wzeroes_unmap_sectors_entry.attr,
&queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
diff --git a/block/fops.c b/block/fops.c
index 1309861d4c2c..474656a0de70 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -841,7 +841,7 @@ reexpand:
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE)
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -850,11 +850,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
struct block_device *bdev = I_BDEV(inode);
loff_t end = start + len - 1;
loff_t isize;
+ unsigned int flags;
int error;
/* Fail if we don't recognize the flags. */
if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the device does not enable the
+ * unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(bdev))
+ return -EOPNOTSUPP;
/* Don't go off the end of the device. */
isize = bdev_nr_bytes(bdev);
@@ -877,34 +885,32 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
- /*
- * Invalidate the page cache, including dirty pages, for valid
- * de-allocate mode calls to fallocate().
- */
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ flags = BLKDEV_ZERO_NOUNMAP;
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOFALLBACK);
+ flags = BLKDEV_ZERO_NOFALLBACK;
+ break;
+ case FALLOC_FL_WRITE_ZEROES:
+ flags = 0;
break;
default:
error = -EOPNOTSUPP;
+ goto fail;
}
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL, flags);
fail:
filemap_invalidate_unlock(inode->i_mapping);
inode_unlock(inode);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 24a857ff6d0b..d9d5e6aa5707 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2065,8 +2065,10 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
limits->discard_alignment = 0;
}
- if (!dm_table_supports_write_zeroes(t))
+ if (!dm_table_supports_write_zeroes(t)) {
limits->max_write_zeroes_sectors = 0;
+ limits->max_hw_wzeroes_unmap_sectors = 0;
+ }
if (!dm_table_supports_secure_erase(t))
limits->max_secure_erase_sectors = 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 92697f98c601..90af4a15b696 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2420,22 +2420,24 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
else
lim.write_stream_granularity = 0;
- ret = queue_limits_commit_update(ns->disk->queue, &lim);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue, memflags);
- goto out;
- }
-
- set_capacity_and_notify(ns->disk, capacity);
-
/*
* Only set the DEAC bit if the device guarantees that reads from
* deallocated data return zeroes. While the DEAC bit does not
* require that, it must be a no-op if reads from deallocated data
* do not return zeroes.
*/
- if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
+ if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) {
ns->head->features |= NVME_NS_DEAC;
+ lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
+ }
+
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
+ goto out;
+ }
+
+ set_capacity_and_notify(ns->disk, capacity);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index eba42df2f821..31d6c4a63fa5 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -46,6 +46,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
id->npda = id->npdg;
/* NOWS = Namespace Optimal Write Size */
id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev));
+
+ /* Set WZDS and DRB if device supports unmapped write zeroes */
+ if (bdev_write_zeroes_unmap_sectors(bdev))
+ id->dlfeat = (1 << 3) | 0x1;
}
void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3f6e87705b62..877dc34e8f37 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1141,6 +1141,11 @@ static void sd_config_write_same(struct scsi_disk *sdkp,
out:
lim->max_write_zeroes_sectors =
sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT);
+
+ if (sdkp->zeroing_mode == SD_ZERO_WS16_UNMAP ||
+ sdkp->zeroing_mode == SD_ZERO_WS10_UNMAP)
+ lim->max_hw_wzeroes_unmap_sectors =
+ lim->max_write_zeroes_sectors;
}
static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b543a46fc809..b43aa82c1b39 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4501,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4514,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
+ * Do the actual write zero during a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
+ /*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
@@ -4549,9 +4562,7 @@ retry:
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ epos = (loff_t)(map.m_lblk + ret) << blkbits;
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -4571,6 +4582,21 @@ retry:
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4636,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (end_lblk > start_lblk) {
ext4_lblk_t zero_blks = end_lblk - start_lblk;
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
new_size, flags);
if (ret)
@@ -4745,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * enable the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
inode_lock(inode);
@@ -4780,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
goto out_invalidate_lock;
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
ret = ext4_punch_hole(file, offset, len);
- else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
ret = ext4_collapse_range(file, offset, len);
- else if (mode & FALLOC_FL_INSERT_RANGE)
+ break;
+ case FALLOC_FL_INSERT_RANGE:
ret = ext4_insert_range(file, offset, len);
- else if (mode & FALLOC_FL_ZERO_RANGE)
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
ret = ext4_zero_range(file, offset, len, mode);
- else
+ break;
+ default:
ret = -EOPNOTSUPP;
+ }
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
diff --git a/fs/open.c b/fs/open.c
index 7828234a7caa..b777e11e5522 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -281,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
break;
case FALLOC_FL_COLLAPSE_RANGE:
case FALLOC_FL_INSERT_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
if (mode & FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
break;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a59880c809c7..1a5725c1f93d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -383,6 +383,9 @@ struct queue_limits {
unsigned int max_user_discard_sectors;
unsigned int max_secure_erase_sectors;
unsigned int max_write_zeroes_sectors;
+ unsigned int max_wzeroes_unmap_sectors;
+ unsigned int max_hw_wzeroes_unmap_sectors;
+ unsigned int max_user_wzeroes_unmap_sectors;
unsigned int max_hw_zone_append_sectors;
unsigned int max_zone_append_sectors;
unsigned int discard_granularity;
@@ -1042,6 +1045,7 @@ static inline void blk_queue_disable_secure_erase(struct request_queue *q)
static inline void blk_queue_disable_write_zeroes(struct request_queue *q)
{
q->limits.max_write_zeroes_sectors = 0;
+ q->limits.max_wzeroes_unmap_sectors = 0;
}
/*
@@ -1378,6 +1382,12 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
return bdev_limits(bdev)->max_write_zeroes_sectors;
}
+static inline unsigned int
+bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
+{
+ return bdev_limits(bdev)->max_wzeroes_unmap_sectors;
+}
+
static inline bool bdev_nonrot(struct block_device *bdev)
{
return blk_queue_nonrot(bdev_get_queue(bdev));
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 3f49f3df6af5..7c38c6b76b60 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -36,7 +36,8 @@ struct space_resv {
FALLOC_FL_COLLAPSE_RANGE | \
FALLOC_FL_ZERO_RANGE | \
FALLOC_FL_INSERT_RANGE | \
- FALLOC_FL_UNSHARE_RANGE)
+ FALLOC_FL_UNSHARE_RANGE | \
+ FALLOC_FL_WRITE_ZEROES)
/* on ia32 l_start is on a 32-bit boundary */
#if defined(CONFIG_X86_64)
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 156908641e68..6f9cf2811733 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -92,7 +92,8 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B);
{ FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \
{ FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \
{ FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \
- { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
+ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}, \
+ { FALLOC_FL_WRITE_ZEROES, "WRITE_ZEROES"})
TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 5810371ed72b..1f9ca757d02d 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -78,4 +78,21 @@
*/
#define FALLOC_FL_UNSHARE_RANGE 0x40
+/*
+ * FALLOC_FL_WRITE_ZEROES zeroes a specified file range in such a way that
+ * subsequent writes to that range do not require further changes to the file
+ * mapping metadata. This flag is beneficial for subsequent pure overwriting
+ * within this range, as it can save on block allocation and, consequently,
+ * significant metadata changes. Therefore, filesystems that always require
+ * out-of-place writes should not support this flag.
+ *
+ * Different filesystems may implement different limitations on the
+ * granularity of the zeroing operation. Most will preferably be accelerated
+ * by submitting write zeroes command if the backing storage supports, which
+ * may not physically write zeros to the media.
+ *
+ * This flag cannot be specified in conjunction with the FALLOC_FL_KEEP_SIZE.
+ */
+#define FALLOC_FL_WRITE_ZEROES 0x80
+
#endif /* _UAPI_FALLOC_H_ */