From 6640d552185f7c11235c64a832004db9af119b2d Mon Sep 17 00:00:00 2001 From: Ranganath V N Date: Sat, 11 Oct 2025 12:08:29 +0530 Subject: fs: ext4: fix uninitialized symbols Fix the issue detected by the smatch tool. fs/ext4/inode.c:3583 ext4_map_blocks_atomic_write_slow() error: uninitialized symbol 'next_pblk'. fs/ext4/namei.c:1776 ext4_lookup() error: uninitialized symbol 'de'. fs/ext4/namei.c:1829 ext4_get_parent() error: uninitialized symbol 'de'. fs/ext4/namei.c:3162 ext4_rmdir() error: uninitialized symbol 'de'. fs/ext4/namei.c:3242 __ext4_unlink() error: uninitialized symbol 'de'. fs/ext4/namei.c:3697 ext4_find_delete_entry() error: uninitialized symbol 'de'. These changes enhance code clarity, address static analysis tool errors. Signed-off-by: Ranganath V N Message-ID: <20251011063830.47485-1-vnranganath.20@gmail.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e99306a8f47c..6356340b768d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3544,7 +3544,7 @@ static int ext4_map_blocks_atomic_write_slow(handle_t *handle, ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; unsigned int mapped_len = 0, m_flags = 0; - ext4_fsblk_t next_pblk; + ext4_fsblk_t next_pblk = 0; bool check_next_pblk = false; int ret = 0; -- cgit v1.2.3 From 7da5565cab4069b2b171dbfa7554b596a7fdf827 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:19 +0800 Subject: ext4: make ext4_es_lookup_extent() pass out the extent seq counter When querying extents in the extent status tree, we should hold the data_sem if we want to obtain the sequence number as a valid cookie simultaneously. However, currently, ext4_map_blocks() calls ext4_es_lookup_extent() without holding data_sem. Therefore, we should acquire i_es_lock instead, which also ensures that the sequence cookie and the extent remain consistent. 
Consequently, make ext4_es_lookup_extent() to pass out the sequence number when necessary. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-4-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6356340b768d..b62c1a87ed6b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -649,7 +649,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { if (ext4_es_is_written(&es)) return retval; } @@ -723,7 +723,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -1908,7 +1908,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); @@ -1961,7 +1961,7 @@ add_delayed: * is held in write mode, before inserting a new da entry in * the extent status tree. 
*/ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); -- cgit v1.2.3 From 07c440e8da8fee5b3512a5742ddc71776a0041ac Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:20 +0800 Subject: ext4: pass out extent seq counter when mapping blocks When creating or querying mapping blocks using the ext4_map_blocks() and ext4_map_{query|create}_blocks() helpers, also pass out the extent sequence number of the block mapping info through the ext4_map_blocks structure. This sequence number can later serve as a valid cookie within iomap infrastructure and the move extents procedure. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-5-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b62c1a87ed6b..783c883d4d5e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -550,10 +550,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval <= 0) + if (retval < 0) return retval; + /* A hole? 
*/ + if (retval == 0) + goto out; + if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " @@ -573,11 +576,13 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); - return retval; + } else { + retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } - - return ext4_map_query_blocks_next_in_leaf(handle, inode, map, - orig_mlen); +out: + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); + return retval; } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -649,7 +654,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es)) return retval; } @@ -658,6 +663,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } @@ -723,7 +729,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -1979,6 +1985,8 @@ add_delayed: map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); + if (!retval) + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 
up_write(&EXT4_I(inode)->i_data_sem); return retval; -- cgit v1.2.3 From 4091c8206cfd2e3bb529ef260887296b90d9b6a2 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Tue, 4 Nov 2025 16:12:24 +0800 Subject: ext4: clear i_state_flags when alloc inode i_state_flags is used on 32-bit archs and needs to be cleared when an inode is allocated. This issue was found when unmounting ext4: the inode was sometimes accidentally tracked as an orphan, causing an ext4 message dump. Fixes: acf943e9768e ("ext4: fix checks for orphan inodes") Signed-off-by: Haibo Chen Reviewed-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251104-ext4-v1-1-73691a0800f9@nxp.com> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 783c883d4d5e..32d9f0b36c33 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5296,7 +5296,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); -- cgit v1.2.3 From dac092195b6a35bc7c9f11e2884cfecb1b25e20c Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 12 Nov 2025 16:45:36 +0800 Subject: ext4: rename EXT4_GET_BLOCKS_PRE_IO This flag has been generalized to split an unwritten extent when we do dio or dioread_nolock writeback, or to avoid merging new extents which were created by an extent split. Update some related comments too. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Yang Erkun Message-ID: <20251112084538.1658232-2-yangerkun@huawei.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 32d9f0b36c33..3883793425cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -653,7 +653,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, * If the extent has been zeroed out, we don't need to update * extent status tree. */ - if (flags & EXT4_GET_BLOCKS_PRE_IO && + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es)) return retval; -- cgit v1.2.3 From a9272422316f6c0ddbdfd03e695079e2b3655995 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 12 Nov 2025 16:45:37 +0800 Subject: ext4: cleanup for ext4_map_blocks A positive retval from ext4_map_create_blocks() means we really created some blocks, which cannot happen with m_flags lacking both EXT4_MAP_UNWRITTEN and EXT4_MAP_MAPPED. 
Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Yang Erkun Message-ID: <20251112084538.1658232-3-yangerkun@huawei.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3883793425cb..8e694c56d3b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -816,7 +816,13 @@ found: down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); + if (retval <= 0) + return retval; + + if (map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; @@ -845,12 +851,8 @@ found: return ret; } } - if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || - map->m_flags & EXT4_MAP_MAPPED)) - ext4_fc_track_range(handle, inode, map->m_lblk, - map->m_lblk + map->m_len - 1); - if (retval < 0) - ext_debug(inode, "failed with err %d\n", retval); + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + + map->m_len - 1); return retval; } -- cgit v1.2.3 From 5835b1339e33549d9e7342fae56243b4fcd758c9 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 21 Nov 2025 17:06:31 +0800 Subject: ext4: remove page offset calculation in ext4_block_zero_page_range() For bs <= ps scenarios, calculating the offset within the block is sufficient. For bs > ps, an initial page offset calculation can lead to incorrect behavior. Thus this redundant calculation has been removed. 
Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-2-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8e694c56d3b6..4afe227fd03f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4167,9 +4167,8 @@ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; - unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; - unsigned max = blocksize - (offset & (blocksize - 1)); + unsigned int max = blocksize - (from & (blocksize - 1)); /* * correct length if it does not fall between -- cgit v1.2.3 From b73f45a32420a8393e92fb2dec3b7d109e565127 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:32 +0800 Subject: ext4: remove page offset calculation in ext4_block_truncate_page() For bs <= ps scenarios, calculating the offset within the block is sufficient. For bs > ps, an initial page offset calculation can lead to incorrect behavior. Thus this redundant calculation has been removed. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-3-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4afe227fd03f..d232154cc14d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4193,7 +4193,6 @@ static int ext4_block_zero_page_range(handle_t *handle, static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { - unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; @@ -4202,8 +4201,8 @@ static int ext4_block_truncate_page(handle_t *handle, if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); + blocksize = i_blocksize(inode); + length = blocksize - (from & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } -- cgit v1.2.3 From d37a7ddd3a384bd34f985273d6e776d3d50b0edd Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:34 +0800 Subject: ext4: make ext4_punch_hole() support large block size When preparing for bs > ps support, clean up unnecessary PAGE_SIZE references in ext4_punch_hole(). Previously, when a hole extended beyond i_size, we aligned the hole end upwards to PAGE_SIZE to handle partial folio invalidation. Now that truncate_inode_pages_range() already handles partial folio invalidation correctly, this alignment is no longer required. However, to save pointless tail block zeroing, we still keep rounding up to the block size here. In addition, as Honza pointed out, when the hole end equals i_size, it should also be rounded up to the block size. This patch fixes that as well. 
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-5-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d232154cc14d..5cf392142c8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4408,10 +4408,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) /* * If the hole extends beyond i_size, set the hole to end after - * the page that contains i_size. + * the block that contains i_size to save pointless tail block zeroing. */ - if (end > inode->i_size) - end = round_up(inode->i_size, PAGE_SIZE); + if (end >= inode->i_size) + end = round_up(inode->i_size, sb->s_blocksize); if (end > max_end) end = max_end; length = end - offset; -- cgit v1.2.3 From 8611e608a8fa01e8b82c9008b4dac9f24531ae0f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:36 +0800 Subject: ext4: introduce s_min_folio_order for future BS > PS support This commit introduces the s_min_folio_order field to the ext4_sb_info structure. This field will store the minimum folio order required by the current filesystem, laying groundwork for future support of block sizes greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Pankaj Raghav Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-7-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cf392142c8c..ff697adab5f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5183,7 +5183,8 @@ void ext4_set_inode_mapping_order(struct inode *inode) if (!ext4_should_enable_large_folio(inode)) return; - mapping_set_folio_order_range(inode->i_mapping, 0, + mapping_set_folio_order_range(inode->i_mapping, + EXT4_SB(inode->i_sb)->s_min_folio_order, EXT4_MAX_PAGECACHE_ORDER(inode)); } -- cgit v1.2.3 From 125d1f6a5a77ed6a1af3eb0957240f54e4124af2 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:39 +0800 Subject: ext4: add EXT4_LBLK_TO_B macro for logical block to bytes conversion No functional changes. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-10-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff697adab5f6..29259e10d78e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -837,9 +837,8 @@ found: !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { - loff_t start_byte = - (loff_t)map->m_lblk << inode->i_blkbits; - loff_t length = (loff_t)map->m_len << inode->i_blkbits; + loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); + loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, @@ -2235,7 +2234,6 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; - int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); @@ -2261,7 +2259,8 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, err = PTR_ERR(io_end_vec); goto out; } - io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, + mpd->map.m_lblk); } *map_bh = true; goto out; @@ -2271,7 +2270,7 @@ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - io_end_size += (1 << blkbits); + io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; @@ -2473,7 +2472,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); - 
io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3513,8 +3512,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = (u64) map->m_lblk << blkbits; - iomap->length = (u64) map->m_len << blkbits; + iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); + iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -3688,7 +3687,6 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; - u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; @@ -3747,7 +3745,7 @@ retry: * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ - else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) + else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; -- cgit v1.2.3 From bff6235d623a022260b8af5559ced3534fb7fc2e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:47 +0800 Subject: ext4: support large block size in ext4_block_write_begin() Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-18-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 29259e10d78e..269c1ded169b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1170,8 +1170,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, unsigned block_start, block_end; sector_t block; int err = 0; - unsigned blocksize = inode->i_sb->s_blocksize; - unsigned bbits; + unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; @@ -1180,12 +1179,12 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); + WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); - bbits = ilog2(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + block = EXT4_PG_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { -- cgit v1.2.3 From b967ab748765bf2cf9512efaa8aa987ab4482c7d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:48 +0800 Subject: ext4: support large block size in mpage_map_and_submit_buffers() Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the conversion between folio indexes and blocks to avoid negative left/right shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-19-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 269c1ded169b..847770f57bfc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2299,15 +2299,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; - int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; - start = mpd->map.m_lblk >> bpp_bits; - end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); + end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2318,7 +2317,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - lblk = folio->index << bpp_bits; + lblk = EXT4_PG_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* -- cgit v1.2.3 From 8e50e23b769ace4885fc132e6fca2b4343c27fb1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:49 +0800 Subject: ext4: support large block size in mpage_prepare_extent_to_map() Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the conversion between folio indexes and blocks to avoid negative left/right shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-20-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 847770f57bfc..8cf33a58d1a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2620,7 +2620,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; - int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; @@ -2659,7 +2658,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= - mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. */ @@ -2737,8 +2736,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)folio->index) << - (PAGE_SHIFT - blkbits); + lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); -- cgit v1.2.3 From c00a6292d0616c304cb712d823370f1a82f899b2 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 21 Nov 2025 17:06:50 +0800 Subject: ext4: support large block size in __ext4_block_zero_page_range() Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid negative left shifts after supporting blocksize greater than PAGE_SIZE. 
Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-21-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8cf33a58d1a6..3b6f66463add 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4076,7 +4076,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = EXT4_PG_TO_LBLK(inode, folio->index); bh = folio_buffers(folio); if (!bh) -- cgit v1.2.3 From 58fd191f99f3791c6687e98041c89a6477d9f64d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:51 +0800 Subject: ext4: make data=journal support large block size Currently, ext4_set_inode_mapping_order() does not set max folio order for files with the data journalling flag. For files that already have large folios enabled, ext4_inode_journal_mode() ignores the data journalling flag once max folio order is set. This is not because data journalling cannot work with large folios, but because credit estimates will go through the roof if there are too many blocks per folio. Since the real constraint is blocks-per-folio, to support data=journal under LBS, we now set max folio order to be equal to min folio order for files with the journalling flag. When LBS is disabled, the max folio order remains unset as before. Therefore, before ext4_change_inode_journal_flag() switches the journalling mode, we call truncate_pagecache() to drop all page cache for that inode, and filemap_write_and_wait() is called unconditionally. After that, once the journalling mode has been switched, we can safely reset the inode mapping order, and the mapping_large_folio_support() check in ext4_inode_journal_mode() can be removed. 
Suggested-by: Jan Kara Suggested-by: Dan Carpenter Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-22-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3b6f66463add..1eab837e47c5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5154,9 +5154,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode) if (!S_ISREG(inode->i_mode)) return false; - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return false; if (ext4_has_feature_verity(sb)) return false; if (ext4_has_feature_encrypt(sb)) @@ -5174,12 +5171,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode) umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) void ext4_set_inode_mapping_order(struct inode *inode) { + u32 max_order; + if (!ext4_should_enable_large_folio(inode)) return; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + max_order = EXT4_SB(inode->i_sb)->s_min_folio_order; + else + max_order = EXT4_MAX_PAGECACHE_ORDER(inode); + mapping_set_folio_order_range(inode->i_mapping, EXT4_SB(inode->i_sb)->s_min_folio_order, - EXT4_MAX_PAGECACHE_ORDER(inode)); + max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -6554,14 +6559,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). 
*/ - if (val) { - filemap_invalidate_lock(inode->i_mapping); - err = filemap_write_and_wait(inode->i_mapping); - if (err < 0) { - filemap_invalidate_unlock(inode->i_mapping); - return err; - } + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + filemap_invalidate_unlock(inode->i_mapping); + return err; } + /* Before switch the inode journalling mode evict all the page cache. */ + truncate_pagecache(inode, 0); alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); @@ -6581,17 +6586,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); + filemap_invalidate_unlock(inode->i_mapping); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + ext4_set_inode_mapping_order(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); - - if (val) - filemap_invalidate_unlock(inode->i_mapping); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ -- cgit v1.2.3 From 1a3e9e8aa4f72440b00ef6171b4198f82822d679 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:52 +0800 Subject: ext4: support verifying data from large folios with fs-verity Eric Biggers already added support for verifying data from large folios several years ago in commit 5d0f0e57ed90 ("fsverity: support verifying data from large folios"). With ext4 now supporting large block sizes, the fs-verity tests `kvm-xfstests -c ext4/64k -g verity -x encrypt` pass without issues. Therefore, remove the restriction and allow large folios to be enabled together with fs-verity. 
Cc: Eric Biggers Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-23-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1eab837e47c5..a566469ae07a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5154,8 +5154,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode) if (!S_ISREG(inode->i_mode)) return false; - if (ext4_has_feature_verity(sb)) - return false; if (ext4_has_feature_encrypt(sb)) return false; -- cgit v1.2.3 From 709f0f1f1bf5ca62a000084e5446ca6b57c8678c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Nov 2025 17:06:53 +0800 Subject: ext4: add checks for large folio incompatibilities when BS > PS Supporting a block size greater than the page size (BS > PS) requires support for large folios. However, several features (e.g., encrypt) do not yet support large folios. To prevent conflicts, this patch adds checks at mount time to prohibit these features from being used when BS > PS. Since these features cannot be changed on remount, there is no need to check on remount. This patch adds s_max_folio_order, initialized during mount according to filesystem features and mount options. If s_max_folio_order is 0, large folios are disabled. With this in place, ext4_set_inode_mapping_order() can be simplified by checking s_max_folio_order, avoiding redundant checks. 
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Reviewed-by: Ojaswin Mujoo Message-ID: <20251121090654.631996-24-libaokun@huaweicloud.com> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a566469ae07a..7510fce3d0f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5148,41 +5148,23 @@ error: return -EFSCORRUPTED; } -static bool ext4_should_enable_large_folio(struct inode *inode) +void ext4_set_inode_mapping_order(struct inode *inode) { struct super_block *sb = inode->i_sb; + u16 min_order, max_order; - if (!S_ISREG(inode->i_mode)) - return false; - if (ext4_has_feature_encrypt(sb)) - return false; - - return true; -} - -/* - * Limit the maximum folio order to 2048 blocks to prevent overestimation - * of reserve handle credits during the folio writeback in environments - * where the PAGE_SIZE exceeds 4KB. 
- */ -#define EXT4_MAX_PAGECACHE_ORDER(i) \ - umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) -void ext4_set_inode_mapping_order(struct inode *inode) -{ - u32 max_order; + max_order = EXT4_SB(sb)->s_max_folio_order; + if (!max_order) + return; - if (!ext4_should_enable_large_folio(inode)) + min_order = EXT4_SB(sb)->s_min_folio_order; + if (!min_order && !S_ISREG(inode->i_mode)) return; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - max_order = EXT4_SB(inode->i_sb)->s_min_folio_order; - else - max_order = EXT4_MAX_PAGECACHE_ORDER(inode); + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + max_order = min_order; - mapping_set_folio_order_range(inode->i_mapping, - EXT4_SB(inode->i_sb)->s_min_folio_order, - max_order); + mapping_set_folio_order_range(inode->i_mapping, min_order, max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, -- cgit v1.2.3 From 91ef18b567dae84c0cea9b996d933c856e366f52 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Nov 2025 11:13:41 +0100 Subject: ext4: mark inodes without acls in __ext4_iget() Mark inodes without acls with cache_no_acl() in __ext4_iget() so that path lookup can run in RCU mode from the start. This is interesting in particular for the case where the file owner does the lookup because in that case we end up constantly hitting the slow path otherwise. We drop out from the fast path (because ACL state is unknown) but never end up calling check_acl() to cache ACL state. The problem was originally analyzed by Linus and the fix was tested by Mateusz, I'm just putting it into mergeable form :).
Link: https://lore.kernel.org/all/CAHk-=whSzc75TLLPWskV0xuaHR4tpWBr=LduqhcCFr4kCmme_w@mail.gmail.com Reported-by: Mateusz Guzik Reported-by: Linus Torvalds Signed-off-by: Jan Kara Reviewed-by: Baokun Li Message-ID: <20251125101340.24276-2-jack@suse.cz> Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7510fce3d0f0..eeb3ec4c2a9a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5508,7 +5508,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (ret) goto bad_inode; brelse(iloc.bh); - + /* Initialize the "no ACL's" state for the simple cases */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) + cache_no_acl(inode); unlock_new_inode(inode); return inode; -- cgit v1.2.3