diff options
Diffstat (limited to 'fs/buffer.c')
| -rw-r--r-- | fs/buffer.c | 454 |
1 files changed, 173 insertions, 281 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index 22b43642ba57..cbed175f418b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -54,7 +54,6 @@ #include "internal.h" -static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, enum rw_hint hint, struct writeback_control *wbc); @@ -468,146 +467,187 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), - * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->i_private_list. - * - * Locking is a little subtle: try_to_free_buffers() will remove buffers - * from their controlling inode's queue when they are being freed. But - * try_to_free_buffers() will be operating against the *blockdev* mapping - * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for i_private_list is via the i_private_lock in the address_space - * which backs the buffers. Which is different from the address_space - * against which the buffers are listed. So for a particular address_space, - * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, - * mapping->i_private_list will always be protected by the backing blockdev's - * ->i_private_lock. - * - * Which introduces a requirement: all buffers on an address_space's - * ->i_private_list must be from the same address_space: the blockdev's. - * - * address_spaces which do not place buffers at ->i_private_list via these - * utility functions are free to use i_private_lock and i_private_list for - * whatever they want. The only requirement is that list_empty(i_private_list) - * be true at clear_inode() time. - * - * FIXME: clear_inode should not call invalidate_inode_buffers(). 
The - * filesystems should do that. invalidate_inode_buffers() should just go - * BUG_ON(!list_empty). - * - * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should - * take an address_space, not an inode. And it should be called - * mark_buffer_dirty_fsync() to clearly define why those buffers are being - * queued up. - * - * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the - * list if it is already on a list. Because if the buffer is on a list, - * it *must* already be on the right one. If not, the filesystem is being - * silly. This will save a ton of locking. But first we have to ensure - * that buffers are taken *off* the old inode's list when they are freed - * (presumably in truncate). That requires careful auditing of all - * filesystems (do it inside bforget()). It could also be done by bringing - * b_inode back. + * The functions mmb_mark_buffer_dirty(), mmb_sync(), mmb_has_buffers() + * and mmb_invalidate() are provided for the management of a list of dependent + * buffers in the mapping_metadata_bhs struct. + * + * The locking is a little subtle: The list of buffer heads is protected by + * the lock in mapping_metadata_bhs so functions coming from bdev mapping + * (such as try_to_free_buffers()) need to safely get to mapping_metadata_bhs + * using RCU, grab the lock, verify we didn't race with somebody detaching the + * bh / moving it to a different inode and only then proceed. 
*/ -/* - * The buffer's backing address_space's i_private_lock must be held - */ -static void __remove_assoc_queue(struct buffer_head *bh) +void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping) { - list_del_init(&bh->b_assoc_buffers); - WARN_ON(!bh->b_assoc_map); - bh->b_assoc_map = NULL; + spin_lock_init(&mmb->lock); + INIT_LIST_HEAD(&mmb->list); + mmb->mapping = mapping; } +EXPORT_SYMBOL(mmb_init); -int inode_has_buffers(struct inode *inode) +static void __remove_assoc_queue(struct mapping_metadata_bhs *mmb, + struct buffer_head *bh) { - return !list_empty(&inode->i_data.i_private_list); + lockdep_assert_held(&mmb->lock); + list_del_init(&bh->b_assoc_buffers); + WARN_ON(!bh->b_mmb); + bh->b_mmb = NULL; } -/* - * osync is designed to support O_SYNC io. It waits synchronously for - * all already-submitted IO to complete, but does not queue any new - * writes to the disk. - * - * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer - * as you dirty the buffers, and then use osync_inode_buffers to wait for - * completion. Any other dirty buffers which are not yet queued for - * write will not be flushed to disk by the osync. - */ -static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +static void remove_assoc_queue(struct buffer_head *bh) { - struct buffer_head *bh; - struct list_head *p; - int err = 0; + struct mapping_metadata_bhs *mmb; - spin_lock(lock); -repeat: - list_for_each_prev(p, list) { - bh = BH_ENTRY(p); - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(lock); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - err = -EIO; - brelse(bh); - spin_lock(lock); - goto repeat; + /* + * The locking dance is ugly here. We need to acquire the lock + * protecting the metadata bh list while possibly racing with bh + * being removed from the list or moved to a different one. 
We + * use RCU to pin mapping_metadata_bhs in memory to + * opportunistically acquire the lock and then recheck the bh + * didn't move under us. + */ + while (bh->b_mmb) { + rcu_read_lock(); + mmb = READ_ONCE(bh->b_mmb); + if (mmb) { + spin_lock(&mmb->lock); + if (bh->b_mmb == mmb) + __remove_assoc_queue(mmb, bh); + spin_unlock(&mmb->lock); } + rcu_read_unlock(); } - spin_unlock(lock); - return err; } +bool mmb_has_buffers(struct mapping_metadata_bhs *mmb) +{ + return !list_empty(&mmb->list); +} +EXPORT_SYMBOL_GPL(mmb_has_buffers); + /** - * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers - * @mapping: the mapping which wants those buffers written + * mmb_sync - write out & wait upon all buffers in a list + * @mmb: the list of buffers to write + * + * Starts I/O against the buffers in the given list and waits upon + * that I/O. Basically, this is a convenience function for fsync(). @mmb is + * for a file or directory which needs those buffers to be written for a + * successful fsync(). * - * Starts I/O against the buffers at mapping->i_private_list, and waits upon - * that I/O. + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. * - * Basically, this is a convenience function for fsync(). - * @mapping is a file or directory which needs those buffers to be written for - * a successful fsync(). + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. mark_buffer_dirty_inode() + * doesn't touch b_assoc_buffers list if b_mmb is not NULL so we are sure the + * buffer stays on our list until IO completes (at which point it can be + * reaped). 
*/ -int sync_mapping_buffers(struct address_space *mapping) +int mmb_sync(struct mapping_metadata_bhs *mmb) { - struct address_space *buffer_mapping = mapping->i_private_data; + struct buffer_head *bh; + int err = 0; + struct blk_plug plug; + LIST_HEAD(tmp); - if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) + if (!mmb_has_buffers(mmb)) return 0; - return fsync_buffers_list(&buffer_mapping->i_private_lock, - &mapping->i_private_list); + blk_start_plug(&plug); + + spin_lock(&mmb->lock); + while (!list_empty(&mmb->list)) { + bh = BH_ENTRY(mmb->list.next); + WARN_ON_ONCE(bh->b_mmb != mmb); + __remove_assoc_queue(mmb, bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + bh->b_mmb = mmb; + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&mmb->lock); + /* + * Ensure any pending I/O completes so that + * write_dirty_buffer() actually writes the + * current contents - it is a noop if I/O is + * still in flight on potentially older + * contents. + */ + write_dirty_buffer(bh, REQ_SYNC); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). 
+ */ + brelse(bh); + spin_lock(&mmb->lock); + } + } + } + + spin_unlock(&mmb->lock); + blk_finish_plug(&plug); + spin_lock(&mmb->lock); + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + get_bh(bh); + __remove_assoc_queue(mmb, bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh)) { + list_add(&bh->b_assoc_buffers, &mmb->list); + bh->b_mmb = mmb; + } + spin_unlock(&mmb->lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&mmb->lock); + } + spin_unlock(&mmb->lock); + return err; } -EXPORT_SYMBOL(sync_mapping_buffers); +EXPORT_SYMBOL(mmb_sync); /** - * generic_buffers_fsync_noflush - generic buffer fsync implementation - * for simple filesystems with no inode lock + * mmb_fsync_noflush - fsync implementation for simple filesystems with + * metadata buffers list * * @file: file to synchronize + * @mmb: list of metadata bhs to flush * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * - * This is a generic implementation of the fsync method for simple - * filesystems which track all non-inode metadata in the buffers list - * hanging off the address_space structure. + * This is an implementation of the fsync method for simple filesystems which + * track all non-inode metadata in the buffers list hanging off the @mmb + * structure. 
*/ -int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, - bool datasync) +int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb, + loff_t start, loff_t end, bool datasync) { struct inode *inode = file->f_mapping->host; int err; - int ret; + int ret = 0; err = file_write_and_wait_range(file, start, end); if (err) return err; - ret = sync_mapping_buffers(inode->i_mapping); + if (mmb) + ret = mmb_sync(mmb); if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) @@ -624,34 +664,35 @@ out: ret = err; return ret; } -EXPORT_SYMBOL(generic_buffers_fsync_noflush); +EXPORT_SYMBOL(mmb_fsync_noflush); /** - * generic_buffers_fsync - generic buffer fsync implementation - * for simple filesystems with no inode lock + * mmb_fsync - fsync implementation for simple filesystems with metadata + * buffers list * * @file: file to synchronize + * @mmb: list of metadata bhs to flush * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * - * This is a generic implementation of the fsync method for simple - * filesystems which track all non-inode metadata in the buffers list - * hanging off the address_space structure. This also makes sure that - * a device cache flush operation is called at the end. + * This is an implementation of the fsync method for simple filesystems which + * track all non-inode metadata in the buffers list hanging off the @mmb + * structure. This also makes sure that a device cache flush operation is + * called at the end. 
*/ -int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, - bool datasync) +int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb, + loff_t start, loff_t end, bool datasync) { struct inode *inode = file->f_mapping->host; int ret; - ret = generic_buffers_fsync_noflush(file, start, end, datasync); + ret = mmb_fsync_noflush(file, mmb, start, end, datasync); if (!ret) ret = blkdev_issue_flush(inode->i_sb->s_bdev); return ret; } -EXPORT_SYMBOL(generic_buffers_fsync); +EXPORT_SYMBOL(mmb_fsync); /* * Called when we've recently written block `bblock', and it is known that @@ -672,26 +713,18 @@ void write_boundary_block(struct block_device *bdev, } } -void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +void mmb_mark_buffer_dirty(struct buffer_head *bh, + struct mapping_metadata_bhs *mmb) { - struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_folio->mapping; - mark_buffer_dirty(bh); - if (!mapping->i_private_data) { - mapping->i_private_data = buffer_mapping; - } else { - BUG_ON(mapping->i_private_data != buffer_mapping); - } - if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->i_private_lock); - list_move_tail(&bh->b_assoc_buffers, - &mapping->i_private_list); - bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->i_private_lock); + if (!bh->b_mmb) { + spin_lock(&mmb->lock); + list_move_tail(&bh->b_assoc_buffers, &mmb->list); + bh->b_mmb = mmb; + spin_unlock(&mmb->lock); } } -EXPORT_SYMBOL(mark_buffer_dirty_inode); +EXPORT_SYMBOL(mmb_mark_buffer_dirty); /** * block_dirty_folio - Mark a folio as dirty. @@ -758,153 +791,20 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) EXPORT_SYMBOL(block_dirty_folio); /* - * Write out and wait upon a list of buffers. - * - * We have conflicting pressures: we want to make sure that all - * initially dirty buffers get waited on, but that any subsequently - * dirtied buffers don't. 
After all, we don't want fsync to last - * forever if somebody is actively writing to the file. - * - * Do this in two main stages: first we copy dirty buffers to a - * temporary inode list, queueing the writes as we go. Then we clean - * up, waiting for those writes to complete. - * - * During this second stage, any subsequent updates to the file may end - * up refiling the buffer on the original inode's dirty list again, so - * there is a chance we will end up with a buffer queued for write but - * not yet completed on that list. So, as a final cleanup we go through - * the osync code to catch these locked, dirty buffers without requeuing - * any newly dirty buffers for write. - */ -static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) -{ - struct buffer_head *bh; - struct address_space *mapping; - int err = 0, err2; - struct blk_plug plug; - LIST_HEAD(tmp); - - blk_start_plug(&plug); - - spin_lock(lock); - while (!list_empty(list)) { - bh = BH_ENTRY(list->next); - mapping = bh->b_assoc_map; - __remove_assoc_queue(bh); - /* Avoid race with mark_buffer_dirty_inode() which does - * a lockless check and we rely on seeing the dirty bit */ - smp_mb(); - if (buffer_dirty(bh) || buffer_locked(bh)) { - list_add(&bh->b_assoc_buffers, &tmp); - bh->b_assoc_map = mapping; - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(lock); - /* - * Ensure any pending I/O completes so that - * write_dirty_buffer() actually writes the - * current contents - it is a noop if I/O is - * still in flight on potentially older - * contents. - */ - write_dirty_buffer(bh, REQ_SYNC); - - /* - * Kick off IO for the previous mapping. Note - * that we will not run the very last mapping, - * wait_on_buffer() will do that for us - * through sync_buffer(). 
- */ - brelse(bh); - spin_lock(lock); - } - } - } - - spin_unlock(lock); - blk_finish_plug(&plug); - spin_lock(lock); - - while (!list_empty(&tmp)) { - bh = BH_ENTRY(tmp.prev); - get_bh(bh); - mapping = bh->b_assoc_map; - __remove_assoc_queue(bh); - /* Avoid race with mark_buffer_dirty_inode() which does - * a lockless check and we rely on seeing the dirty bit */ - smp_mb(); - if (buffer_dirty(bh)) { - list_add(&bh->b_assoc_buffers, - &mapping->i_private_list); - bh->b_assoc_map = mapping; - } - spin_unlock(lock); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - err = -EIO; - brelse(bh); - spin_lock(lock); - } - - spin_unlock(lock); - err2 = osync_buffers_list(lock, list); - if (err) - return err; - else - return err2; -} - -/* - * Invalidate any and all dirty buffers on a given inode. We are + * Invalidate any and all dirty buffers on a given buffers list. We are * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. - * - * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which - * assumes that all the buffers are against the blockdev. - */ -void invalidate_inode_buffers(struct inode *inode) -{ - if (inode_has_buffers(inode)) { - struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->i_private_list; - struct address_space *buffer_mapping = mapping->i_private_data; - - spin_lock(&buffer_mapping->i_private_lock); - while (!list_empty(list)) - __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->i_private_lock); - } -} -EXPORT_SYMBOL(invalidate_inode_buffers); - -/* - * Remove any clean buffers from the inode's buffer list. This is called - * when we're trying to free the inode itself. Those buffers can pin it. - * - * Returns true if all buffers were removed. 
*/ -int remove_inode_buffers(struct inode *inode) +void mmb_invalidate(struct mapping_metadata_bhs *mmb) { - int ret = 1; - - if (inode_has_buffers(inode)) { - struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->i_private_list; - struct address_space *buffer_mapping = mapping->i_private_data; - - spin_lock(&buffer_mapping->i_private_lock); - while (!list_empty(list)) { - struct buffer_head *bh = BH_ENTRY(list->next); - if (buffer_dirty(bh)) { - ret = 0; - break; - } - __remove_assoc_queue(bh); - } - spin_unlock(&buffer_mapping->i_private_lock); + if (mmb_has_buffers(mmb)) { + spin_lock(&mmb->lock); + while (!list_empty(&mmb->list)) + __remove_assoc_queue(mmb, BH_ENTRY(mmb->list.next)); + spin_unlock(&mmb->lock); } - return ret; } +EXPORT_SYMBOL(mmb_invalidate); /* * Create the appropriate buffers when given a folio for data area and @@ -1214,8 +1114,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh) /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); - if (bh->b_assoc_map) - mapping_set_error(bh->b_assoc_map, -EIO); + if (bh->b_mmb) + mapping_set_error(bh->b_mmb->mapping, -EIO); } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1245,14 +1145,7 @@ EXPORT_SYMBOL(__brelse); void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); - if (bh->b_assoc_map) { - struct address_space *buffer_mapping = bh->b_folio->mapping; - - spin_lock(&buffer_mapping->i_private_lock); - list_del_init(&bh->b_assoc_buffers); - bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->i_private_lock); - } + remove_assoc_queue(bh); __brelse(bh); } EXPORT_SYMBOL(__bforget); @@ -2900,8 +2793,7 @@ drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free) do { struct buffer_head *next = bh->b_this_page; - if (bh->b_assoc_map) - __remove_assoc_queue(bh); + remove_assoc_queue(bh); bh = next; } while (bh != head); *buffers_to_free = head; |
