summaryrefslogtreecommitdiff
path: root/fs/buffer.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/buffer.c')
-rw-r--r--fs/buffer.c454
1 files changed, 173 insertions, 281 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 22b43642ba57..cbed175f418b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,7 +54,6 @@
#include "internal.h"
-static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
enum rw_hint hint, struct writeback_control *wbc);
@@ -468,146 +467,187 @@ EXPORT_SYMBOL(mark_buffer_async_write);
* a successful fsync(). For example, ext2 indirect blocks need to be
* written back and waited upon before fsync() returns.
*
- * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
- * inode_has_buffers() and invalidate_inode_buffers() are provided for the
- * management of a list of dependent buffers at ->i_mapping->i_private_list.
- *
- * Locking is a little subtle: try_to_free_buffers() will remove buffers
- * from their controlling inode's queue when they are being freed. But
- * try_to_free_buffers() will be operating against the *blockdev* mapping
- * at the time, not against the S_ISREG file which depends on those buffers.
- * So the locking for i_private_list is via the i_private_lock in the address_space
- * which backs the buffers. Which is different from the address_space
- * against which the buffers are listed. So for a particular address_space,
- * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact,
- * mapping->i_private_list will always be protected by the backing blockdev's
- * ->i_private_lock.
- *
- * Which introduces a requirement: all buffers on an address_space's
- * ->i_private_list must be from the same address_space: the blockdev's.
- *
- * address_spaces which do not place buffers at ->i_private_list via these
- * utility functions are free to use i_private_lock and i_private_list for
- * whatever they want. The only requirement is that list_empty(i_private_list)
- * be true at clear_inode() time.
- *
- * FIXME: clear_inode should not call invalidate_inode_buffers(). The
- * filesystems should do that. invalidate_inode_buffers() should just go
- * BUG_ON(!list_empty).
- *
- * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
- * take an address_space, not an inode. And it should be called
- * mark_buffer_dirty_fsync() to clearly define why those buffers are being
- * queued up.
- *
- * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
- * list if it is already on a list. Because if the buffer is on a list,
- * it *must* already be on the right one. If not, the filesystem is being
- * silly. This will save a ton of locking. But first we have to ensure
- * that buffers are taken *off* the old inode's list when they are freed
- * (presumably in truncate). That requires careful auditing of all
- * filesystems (do it inside bforget()). It could also be done by bringing
- * b_inode back.
+ * The functions mmb_mark_buffer_dirty(), mmb_sync(), mmb_has_buffers()
+ * and mmb_invalidate() are provided for the management of a list of dependent
+ * buffers in mapping_metadata_bhs struct.
+ *
+ * The locking is a little subtle: The list of buffer heads is protected by
+ * the lock in mapping_metadata_bhs so functions coming from bdev mapping
+ * (such as try_to_free_buffers()) need to safely get to mapping_metadata_bhs
+ * using RCU, grab the lock, verify we didn't race with somebody detaching the
+ * bh / moving it to different inode and only then proceeding.
*/
-/*
- * The buffer's backing address_space's i_private_lock must be held
- */
-static void __remove_assoc_queue(struct buffer_head *bh)
+void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
{
- list_del_init(&bh->b_assoc_buffers);
- WARN_ON(!bh->b_assoc_map);
- bh->b_assoc_map = NULL;
+ spin_lock_init(&mmb->lock);
+ INIT_LIST_HEAD(&mmb->list);
+ mmb->mapping = mapping;
}
+EXPORT_SYMBOL(mmb_init);
-int inode_has_buffers(struct inode *inode)
+static void __remove_assoc_queue(struct mapping_metadata_bhs *mmb,
+ struct buffer_head *bh)
{
- return !list_empty(&inode->i_data.i_private_list);
+ lockdep_assert_held(&mmb->lock);
+ list_del_init(&bh->b_assoc_buffers);
+ WARN_ON(!bh->b_mmb);
+ bh->b_mmb = NULL;
}
-/*
- * osync is designed to support O_SYNC io. It waits synchronously for
- * all already-submitted IO to complete, but does not queue any new
- * writes to the disk.
- *
- * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
- * as you dirty the buffers, and then use osync_inode_buffers to wait for
- * completion. Any other dirty buffers which are not yet queued for
- * write will not be flushed to disk by the osync.
- */
-static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
+static void remove_assoc_queue(struct buffer_head *bh)
{
- struct buffer_head *bh;
- struct list_head *p;
- int err = 0;
+ struct mapping_metadata_bhs *mmb;
- spin_lock(lock);
-repeat:
- list_for_each_prev(p, list) {
- bh = BH_ENTRY(p);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(lock);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- err = -EIO;
- brelse(bh);
- spin_lock(lock);
- goto repeat;
+ /*
+ * The locking dance is ugly here. We need to acquire the lock
+ * protecting the metadata bh list while possibly racing with bh
+ * being removed from the list or moved to a different one. We
+ * use RCU to pin mapping_metadata_bhs in memory to
+ * opportunistically acquire the lock and then recheck the bh
+ * didn't move under us.
+ */
+ while (bh->b_mmb) {
+ rcu_read_lock();
+ mmb = READ_ONCE(bh->b_mmb);
+ if (mmb) {
+ spin_lock(&mmb->lock);
+ if (bh->b_mmb == mmb)
+ __remove_assoc_queue(mmb, bh);
+ spin_unlock(&mmb->lock);
}
+ rcu_read_unlock();
}
- spin_unlock(lock);
- return err;
}
+bool mmb_has_buffers(struct mapping_metadata_bhs *mmb)
+{
+ return !list_empty(&mmb->list);
+}
+EXPORT_SYMBOL_GPL(mmb_has_buffers);
+
/**
- * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
- * @mapping: the mapping which wants those buffers written
+ * mmb_sync - write out & wait upon all buffers in a list
+ * @mmb: the list of buffers to write
+ *
+ * Starts I/O against the buffers in the given list and waits upon
+ * that I/O. Basically, this is a convenience function for fsync(). @mmb is
+ * for a file or directory which needs those buffers to be written for a
+ * successful fsync().
*
- * Starts I/O against the buffers at mapping->i_private_list, and waits upon
- * that I/O.
+ * We have conflicting pressures: we want to make sure that all
+ * initially dirty buffers get waited on, but that any subsequently
+ * dirtied buffers don't. After all, we don't want fsync to last
+ * forever if somebody is actively writing to the file.
*
- * Basically, this is a convenience function for fsync().
- * @mapping is a file or directory which needs those buffers to be written for
- * a successful fsync().
+ * Do this in two main stages: first we copy dirty buffers to a
+ * temporary inode list, queueing the writes as we go. Then we clean
+ * up, waiting for those writes to complete. mark_buffer_dirty_inode()
+ * doesn't touch b_assoc_buffers list if b_mmb is not NULL so we are sure the
+ * buffer stays on our list until IO completes (at which point it can be
+ * reaped).
*/
-int sync_mapping_buffers(struct address_space *mapping)
+int mmb_sync(struct mapping_metadata_bhs *mmb)
{
- struct address_space *buffer_mapping = mapping->i_private_data;
+ struct buffer_head *bh;
+ int err = 0;
+ struct blk_plug plug;
+ LIST_HEAD(tmp);
- if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
+ if (!mmb_has_buffers(mmb))
return 0;
- return fsync_buffers_list(&buffer_mapping->i_private_lock,
- &mapping->i_private_list);
+ blk_start_plug(&plug);
+
+ spin_lock(&mmb->lock);
+ while (!list_empty(&mmb->list)) {
+ bh = BH_ENTRY(mmb->list.next);
+ WARN_ON_ONCE(bh->b_mmb != mmb);
+ __remove_assoc_queue(mmb, bh);
+ /* Avoid race with mark_buffer_dirty_inode() which does
+ * a lockless check and we rely on seeing the dirty bit */
+ smp_mb();
+ if (buffer_dirty(bh) || buffer_locked(bh)) {
+ list_add(&bh->b_assoc_buffers, &tmp);
+ bh->b_mmb = mmb;
+ if (buffer_dirty(bh)) {
+ get_bh(bh);
+ spin_unlock(&mmb->lock);
+ /*
+ * Ensure any pending I/O completes so that
+ * write_dirty_buffer() actually writes the
+ * current contents - it is a noop if I/O is
+ * still in flight on potentially older
+ * contents.
+ */
+ write_dirty_buffer(bh, REQ_SYNC);
+
+ /*
+ * Kick off IO for the previous mapping. Note
+ * that we will not run the very last mapping,
+ * wait_on_buffer() will do that for us
+ * through sync_buffer().
+ */
+ brelse(bh);
+ spin_lock(&mmb->lock);
+ }
+ }
+ }
+
+ spin_unlock(&mmb->lock);
+ blk_finish_plug(&plug);
+ spin_lock(&mmb->lock);
+
+ while (!list_empty(&tmp)) {
+ bh = BH_ENTRY(tmp.prev);
+ get_bh(bh);
+ __remove_assoc_queue(mmb, bh);
+ /* Avoid race with mark_buffer_dirty_inode() which does
+ * a lockless check and we rely on seeing the dirty bit */
+ smp_mb();
+ if (buffer_dirty(bh)) {
+ list_add(&bh->b_assoc_buffers, &mmb->list);
+ bh->b_mmb = mmb;
+ }
+ spin_unlock(&mmb->lock);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ spin_lock(&mmb->lock);
+ }
+ spin_unlock(&mmb->lock);
+ return err;
}
-EXPORT_SYMBOL(sync_mapping_buffers);
+EXPORT_SYMBOL(mmb_sync);
/**
- * generic_buffers_fsync_noflush - generic buffer fsync implementation
- * for simple filesystems with no inode lock
+ * mmb_fsync_noflush - fsync implementation for simple filesystems with
+ * metadata buffers list
*
* @file: file to synchronize
+ * @mmb: list of metadata bhs to flush
* @start: start offset in bytes
* @end: end offset in bytes (inclusive)
* @datasync: only synchronize essential metadata if true
*
- * This is a generic implementation of the fsync method for simple
- * filesystems which track all non-inode metadata in the buffers list
- * hanging off the address_space structure.
+ * This is an implementation of the fsync method for simple filesystems which
+ * track all non-inode metadata in the buffers list hanging off the @mmb
+ * structure.
*/
-int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
- bool datasync)
+int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
+ loff_t start, loff_t end, bool datasync)
{
struct inode *inode = file->f_mapping->host;
int err;
- int ret;
+ int ret = 0;
err = file_write_and_wait_range(file, start, end);
if (err)
return err;
- ret = sync_mapping_buffers(inode->i_mapping);
+ if (mmb)
+ ret = mmb_sync(mmb);
if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
goto out;
if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
@@ -624,34 +664,35 @@ out:
ret = err;
return ret;
}
-EXPORT_SYMBOL(generic_buffers_fsync_noflush);
+EXPORT_SYMBOL(mmb_fsync_noflush);
/**
- * generic_buffers_fsync - generic buffer fsync implementation
- * for simple filesystems with no inode lock
+ * mmb_fsync - fsync implementation for simple filesystems with metadata
+ * buffers list
*
* @file: file to synchronize
+ * @mmb: list of metadata bhs to flush
* @start: start offset in bytes
* @end: end offset in bytes (inclusive)
* @datasync: only synchronize essential metadata if true
*
- * This is a generic implementation of the fsync method for simple
- * filesystems which track all non-inode metadata in the buffers list
- * hanging off the address_space structure. This also makes sure that
- * a device cache flush operation is called at the end.
+ * This is an implementation of the fsync method for simple filesystems which
+ * track all non-inode metadata in the buffers list hanging off the @mmb
+ * structure. This also makes sure that a device cache flush operation is
+ * called at the end.
*/
-int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
- bool datasync)
+int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb,
+ loff_t start, loff_t end, bool datasync)
{
struct inode *inode = file->f_mapping->host;
int ret;
- ret = generic_buffers_fsync_noflush(file, start, end, datasync);
+ ret = mmb_fsync_noflush(file, mmb, start, end, datasync);
if (!ret)
ret = blkdev_issue_flush(inode->i_sb->s_bdev);
return ret;
}
-EXPORT_SYMBOL(generic_buffers_fsync);
+EXPORT_SYMBOL(mmb_fsync);
/*
* Called when we've recently written block `bblock', and it is known that
@@ -672,26 +713,18 @@ void write_boundary_block(struct block_device *bdev,
}
}
-void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
+void mmb_mark_buffer_dirty(struct buffer_head *bh,
+ struct mapping_metadata_bhs *mmb)
{
- struct address_space *mapping = inode->i_mapping;
- struct address_space *buffer_mapping = bh->b_folio->mapping;
-
mark_buffer_dirty(bh);
- if (!mapping->i_private_data) {
- mapping->i_private_data = buffer_mapping;
- } else {
- BUG_ON(mapping->i_private_data != buffer_mapping);
- }
- if (!bh->b_assoc_map) {
- spin_lock(&buffer_mapping->i_private_lock);
- list_move_tail(&bh->b_assoc_buffers,
- &mapping->i_private_list);
- bh->b_assoc_map = mapping;
- spin_unlock(&buffer_mapping->i_private_lock);
+ if (!bh->b_mmb) {
+ spin_lock(&mmb->lock);
+ list_move_tail(&bh->b_assoc_buffers, &mmb->list);
+ bh->b_mmb = mmb;
+ spin_unlock(&mmb->lock);
}
}
-EXPORT_SYMBOL(mark_buffer_dirty_inode);
+EXPORT_SYMBOL(mmb_mark_buffer_dirty);
/**
* block_dirty_folio - Mark a folio as dirty.
@@ -758,153 +791,20 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
EXPORT_SYMBOL(block_dirty_folio);
/*
- * Write out and wait upon a list of buffers.
- *
- * We have conflicting pressures: we want to make sure that all
- * initially dirty buffers get waited on, but that any subsequently
- * dirtied buffers don't. After all, we don't want fsync to last
- * forever if somebody is actively writing to the file.
- *
- * Do this in two main stages: first we copy dirty buffers to a
- * temporary inode list, queueing the writes as we go. Then we clean
- * up, waiting for those writes to complete.
- *
- * During this second stage, any subsequent updates to the file may end
- * up refiling the buffer on the original inode's dirty list again, so
- * there is a chance we will end up with a buffer queued for write but
- * not yet completed on that list. So, as a final cleanup we go through
- * the osync code to catch these locked, dirty buffers without requeuing
- * any newly dirty buffers for write.
- */
-static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
-{
- struct buffer_head *bh;
- struct address_space *mapping;
- int err = 0, err2;
- struct blk_plug plug;
- LIST_HEAD(tmp);
-
- blk_start_plug(&plug);
-
- spin_lock(lock);
- while (!list_empty(list)) {
- bh = BH_ENTRY(list->next);
- mapping = bh->b_assoc_map;
- __remove_assoc_queue(bh);
- /* Avoid race with mark_buffer_dirty_inode() which does
- * a lockless check and we rely on seeing the dirty bit */
- smp_mb();
- if (buffer_dirty(bh) || buffer_locked(bh)) {
- list_add(&bh->b_assoc_buffers, &tmp);
- bh->b_assoc_map = mapping;
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(lock);
- /*
- * Ensure any pending I/O completes so that
- * write_dirty_buffer() actually writes the
- * current contents - it is a noop if I/O is
- * still in flight on potentially older
- * contents.
- */
- write_dirty_buffer(bh, REQ_SYNC);
-
- /*
- * Kick off IO for the previous mapping. Note
- * that we will not run the very last mapping,
- * wait_on_buffer() will do that for us
- * through sync_buffer().
- */
- brelse(bh);
- spin_lock(lock);
- }
- }
- }
-
- spin_unlock(lock);
- blk_finish_plug(&plug);
- spin_lock(lock);
-
- while (!list_empty(&tmp)) {
- bh = BH_ENTRY(tmp.prev);
- get_bh(bh);
- mapping = bh->b_assoc_map;
- __remove_assoc_queue(bh);
- /* Avoid race with mark_buffer_dirty_inode() which does
- * a lockless check and we rely on seeing the dirty bit */
- smp_mb();
- if (buffer_dirty(bh)) {
- list_add(&bh->b_assoc_buffers,
- &mapping->i_private_list);
- bh->b_assoc_map = mapping;
- }
- spin_unlock(lock);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- err = -EIO;
- brelse(bh);
- spin_lock(lock);
- }
-
- spin_unlock(lock);
- err2 = osync_buffers_list(lock, list);
- if (err)
- return err;
- else
- return err2;
-}
-
-/*
- * Invalidate any and all dirty buffers on a given inode. We are
+ * Invalidate any and all dirty buffers on a given buffers list. We are
* probably unmounting the fs, but that doesn't mean we have already
* done a sync(). Just drop the buffers from the inode list.
- *
- * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
- * assumes that all the buffers are against the blockdev.
- */
-void invalidate_inode_buffers(struct inode *inode)
-{
- if (inode_has_buffers(inode)) {
- struct address_space *mapping = &inode->i_data;
- struct list_head *list = &mapping->i_private_list;
- struct address_space *buffer_mapping = mapping->i_private_data;
-
- spin_lock(&buffer_mapping->i_private_lock);
- while (!list_empty(list))
- __remove_assoc_queue(BH_ENTRY(list->next));
- spin_unlock(&buffer_mapping->i_private_lock);
- }
-}
-EXPORT_SYMBOL(invalidate_inode_buffers);
-
-/*
- * Remove any clean buffers from the inode's buffer list. This is called
- * when we're trying to free the inode itself. Those buffers can pin it.
- *
- * Returns true if all buffers were removed.
*/
-int remove_inode_buffers(struct inode *inode)
+void mmb_invalidate(struct mapping_metadata_bhs *mmb)
{
- int ret = 1;
-
- if (inode_has_buffers(inode)) {
- struct address_space *mapping = &inode->i_data;
- struct list_head *list = &mapping->i_private_list;
- struct address_space *buffer_mapping = mapping->i_private_data;
-
- spin_lock(&buffer_mapping->i_private_lock);
- while (!list_empty(list)) {
- struct buffer_head *bh = BH_ENTRY(list->next);
- if (buffer_dirty(bh)) {
- ret = 0;
- break;
- }
- __remove_assoc_queue(bh);
- }
- spin_unlock(&buffer_mapping->i_private_lock);
+ if (mmb_has_buffers(mmb)) {
+ spin_lock(&mmb->lock);
+ while (!list_empty(&mmb->list))
+ __remove_assoc_queue(mmb, BH_ENTRY(mmb->list.next));
+ spin_unlock(&mmb->lock);
}
- return ret;
}
+EXPORT_SYMBOL(mmb_invalidate);
/*
* Create the appropriate buffers when given a folio for data area and
@@ -1214,8 +1114,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
/* FIXME: do we need to set this in both places? */
if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO);
- if (bh->b_assoc_map)
- mapping_set_error(bh->b_assoc_map, -EIO);
+ if (bh->b_mmb)
+ mapping_set_error(bh->b_mmb->mapping, -EIO);
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
@@ -1245,14 +1145,7 @@ EXPORT_SYMBOL(__brelse);
void __bforget(struct buffer_head *bh)
{
clear_buffer_dirty(bh);
- if (bh->b_assoc_map) {
- struct address_space *buffer_mapping = bh->b_folio->mapping;
-
- spin_lock(&buffer_mapping->i_private_lock);
- list_del_init(&bh->b_assoc_buffers);
- bh->b_assoc_map = NULL;
- spin_unlock(&buffer_mapping->i_private_lock);
- }
+ remove_assoc_queue(bh);
__brelse(bh);
}
EXPORT_SYMBOL(__bforget);
@@ -2900,8 +2793,7 @@ drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
do {
struct buffer_head *next = bh->b_this_page;
- if (bh->b_assoc_map)
- __remove_assoc_queue(bh);
+ remove_assoc_queue(bh);
bh = next;
} while (bh != head);
*buffers_to_free = head;