From 01d5d96542fd4e383da79593f8a3450995ce2257 Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Tue, 18 May 2021 15:13:25 +0000 Subject: ext4: add discard/zeroout flags to journal flush Add a flags argument to jbd2_journal_flush to enable discarding or zero-filling the journal blocks while flushing the journal. Signed-off-by: Leah Rumancik Link: https://lore.kernel.org/r/20210518151327.130198-1-leah.rumancik@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index db0e1920cb12..8543233b0388 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1370,6 +1370,10 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) * mode */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ +#define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 +#define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 +#define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ + JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer @@ -1500,7 +1504,7 @@ extern int jbd2_journal_invalidatepage(journal_t *, struct page *, unsigned int, unsigned int); extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page); extern int jbd2_journal_stop(handle_t *); -extern int jbd2_journal_flush (journal_t *); +extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); -- cgit v1.2.3 From fcf37549ae19e904bc6a5eadf5c25eca36100c5e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:34 +0800 Subject: jbd2: ensure abort the journal if detect IO error when writing original buffer back Although we merged c044f3d8360 ("jbd2: abort journal if free a async write error metadata buffer"), there is a race between jbd2_journal_try_to_free_buffers() and jbd2_journal_destroy(), so the jbd2_log_do_checkpoint() may still fail to detect the buffer write io error flag which may lead to filesystem inconsistency. jbd2_journal_try_to_free_buffers() ext4_put_super() jbd2_journal_destroy() __jbd2_journal_remove_checkpoint() detect buffer write error jbd2_log_do_checkpoint() jbd2_cleanup_journal_tail() <--- lead to inconsistency jbd2_journal_abort() Fix this issue by introducing a new atomic flag which only have one JBD2_CHECKPOINT_IO_ERROR bit now, and set it in __jbd2_journal_remove_checkpoint() when freeing a checkpoint buffer which has write_io_error flag. Then jbd2_journal_destroy() will detect this mark and abort the journal to prevent updating log tail. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8543233b0388..d5db408ae064 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -779,6 +779,11 @@ struct journal_s */ unsigned long j_flags; + /** + * @j_atomic_flags: Atomic journaling state flags. + */ + unsigned long j_atomic_flags; + /** * @j_errno: * @@ -1375,6 +1380,12 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) +/* + * Journal atomic flag definitions + */ +#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing + * buffer back to disk */ + /* * Function declarations for the journaling transaction and buffer * management -- cgit v1.2.3 From 4ba3fcdde7e36af93610ceb3cc38365b14539865 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:37 +0800 Subject: jbd2,ext4: add a shrinker to release checkpointed buffers Current metadata buffer release logic in bdev_try_to_free_page() have a lot of use-after-free issues when umount filesystem concurrently, and it is difficult to fix directly because ext4 is the only user of s_op->bdev_try_to_free_page callback and we may have to add more special refcount or lock that is only used by ext4 into the common vfs layer, which is unacceptable. One better solution is remove the bdev_try_to_free_page callback, but the real problem is we cannot easily release journal_head on the checkpointed buffer, so try_to_free_buffers() cannot release buffers and page under memory pressure, which is more likely to trigger out-of-memory. So we cannot remove the callback directly before we find another way to release journal_head. This patch introduce a shrinker to free journal_head on the checkpointed transaction. After the journal_head got freed, try_to_free_buffers() could free buffer properly. Signed-off-by: Zhang Yi Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-6-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 26 ++++++++++++ include/trace/events/jbd2.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d5db408ae064..6cc035321562 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -909,6 +909,29 @@ struct journal_s */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; + /** + * @j_shrinker: + * + * Journal head shrinker, reclaim buffer's journal head which + * has been written back. + */ + struct shrinker j_shrinker; + + /** + * @j_jh_shrink_count: + * + * Number of journal buffers on the checkpoint list. [j_list_lock] + */ + struct percpu_counter j_jh_shrink_count; + + /** + * @j_shrink_transaction: + * + * Record next transaction will shrink on the checkpoint list. + * [j_list_lock] + */ + transaction_t *j_shrink_transaction; + /** * @j_head: * @@ -1422,6 +1445,7 @@ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); @@ -1532,6 +1556,8 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); +extern int jbd2_journal_register_shrinker(journal_t *journal); +extern void jbd2_journal_unregister_shrinker(journal_t *journal); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index d16a32867f3a..a4dfe005983d 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -394,6 +394,107 @@ TRACE_EVENT(jbd2_lock_buffer_stall, __entry->stall_ms) ); +DECLARE_EVENT_CLASS(jbd2_journal_shrink, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, + unsigned long count), + + TP_ARGS(journal, nr_to_scan, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(unsigned long, count) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->nr_to_scan = nr_to_scan; + __entry->count = count; + ), + + TP_printk("dev %d,%d nr_to_scan %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, __entry->count) +); + +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), + + TP_ARGS(journal, nr_to_scan, count) +); + +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), + + TP_ARGS(journal, nr_to_scan, count) +); + +TRACE_EVENT(jbd2_shrink_scan_exit, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, + unsigned long nr_shrunk, unsigned long count), + + TP_ARGS(journal, nr_to_scan, nr_shrunk, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(unsigned long, nr_shrunk) + __field(unsigned long, count) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->nr_to_scan = nr_to_scan; + __entry->nr_shrunk = nr_shrunk; + __entry->count = count; + ), + + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, __entry->nr_shrunk, + __entry->count) +); + +TRACE_EVENT(jbd2_shrink_checkpoint_list, + + TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, + unsigned long nr_freed, unsigned long nr_scanned, + tid_t next_tid), + + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, + nr_scanned, next_tid), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(tid_t, first_tid) + __field(tid_t, tid) + __field(tid_t, last_tid) + __field(unsigned long, nr_freed) + __field(unsigned long, nr_scanned) + __field(tid_t, next_tid) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->first_tid = first_tid; + __entry->tid = tid; + __entry->last_tid = last_tid; + __entry->nr_freed = nr_freed; + __entry->nr_scanned = nr_scanned; + __entry->next_tid = next_tid; + ), + + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " + "scanned %lu next transaction %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->first_tid, __entry->tid, __entry->last_tid, + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ -- cgit v1.2.3 From acc6100d3ffa24bdd2add8ea85fb66811bcce5d4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 10 Jun 2021 19:24:40 +0800 Subject: fs: remove bdev_try_to_free_page callback After remove the unique user of sop->bdev_try_to_free_page() callback, we could remove the callback and the corresponding blkdev_releasepage() at all. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210610112440.3438139-9-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c3c88fdb9b2a..c3277b445f96 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2171,7 +2171,6 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot **(*get_dquots)(struct inode *); #endif - int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, -- cgit v1.2.3