summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2025-04-09 17:12:59 +0200
committerChristian Brauner <brauner@kernel.org>2025-09-19 13:11:00 +0200
commite1b849cfa6b61f1c866a908c9e8dd9b5aaab820b (patch)
treec4ec0f4d3e0ecfc88637d1ddb9fd5d4021890735
parent8f5ae30d69d7543eee0d70083daf4de8fe15d585 (diff)
writeback: Avoid contention on wb->list_lock when switching inodes
There can be multiple inode switch works that are trying to switch inodes to / from the same wb. This can happen in particular if some cgroup exits which owns many (thousands) inodes and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will be just spinning on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one work item per wb and manage a queue of isw items switching to this wb. Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jan Kara <jack@suse.cz>
-rw-r--r--fs/fs-writeback.c99
-rw-r--r--include/linux/backing-dev-defs.h4
-rw-r--r--include/linux/writeback.h2
-rw-r--r--mm/backing-dev.c5
4 files changed, 74 insertions, 36 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cc57367fb641..b0e9092ccf04 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct rcu_work work;
+ /* List of queued switching contexts for the wb */
+ struct llist_node list;
/*
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
* array embedded into struct inode_switch_wbs_context. Otherwise
* an inode could be left in a non-consistent state.
*/
- struct bdi_writeback *new_wb;
struct inode *inodes[];
};
@@ -486,13 +486,11 @@ skip_switch:
return switched;
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw)
{
- struct inode_switch_wbs_context *isw =
- container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
unsigned long nr_switched = 0;
struct inode **inodep;
@@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
atomic_dec(&isw_nr_in_flight);
}
+void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
+ switch_work);
+ struct inode_switch_wbs_context *isw, *next_isw;
+ struct llist_node *list;
+
+ /*
+ * Grab out reference to wb so that it cannot get freed under us
+ * after we process all the isw items.
+ */
+ wb_get(new_wb);
+ while (1) {
+ list = llist_del_all(&new_wb->switch_wbs_ctxs);
+ /* Nothing to do? */
+ if (!list)
+ break;
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH
+ * tells the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be
+ * visible.
+ */
+ synchronize_rcu();
+
+ llist_for_each_entry_safe(isw, next_isw, list, list)
+ process_inode_switch_wbs(new_wb, isw);
+ }
+ wb_put(new_wb);
+}
+
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
@@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
return true;
}
+static void wb_queue_isw(struct bdi_writeback *wb,
+ struct inode_switch_wbs_context *isw)
+{
+ if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
+ queue_work(isw_wq, &wb->switch_work);
+}
+
/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
@@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb = NULL;
/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (!memcg_css)
goto out_free;
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
- if (!isw->new_wb)
+ if (!new_wb)
goto out_free;
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
goto out_free;
isw->inodes[0] = inode;
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ wb_queue_isw(new_wb, isw);
return;
out_free:
atomic_dec(&isw_nr_in_flight);
- if (isw->new_wb)
- wb_put(isw->new_wb);
+ if (new_wb)
+ wb_put(new_wb);
kfree(isw);
}
-static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw,
struct list_head *list, int *nr)
{
struct inode *inode;
list_for_each_entry(inode, list, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
continue;
isw->inodes[*nr] = inode;
@@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb;
int nr;
bool restart = false;
@@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
for (memcg_css = wb->memcg_css->parent; memcg_css;
memcg_css = memcg_css->parent) {
- isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
- if (isw->new_wb)
+ new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+ if (new_wb)
break;
}
- if (unlikely(!isw->new_wb))
- isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+ if (unlikely(!new_wb))
+ new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
nr = 0;
spin_lock(&wb->list_lock);
@@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* bandwidth restrictions, as writeback of inode metadata is not
* accounted for.
*/
- restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
if (!restart)
- restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
+ &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
if (nr == 0) {
atomic_dec(&isw_nr_in_flight);
- wb_put(isw->new_wb);
+ wb_put(new_wb);
kfree(isw);
return restart;
}
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ wb_queue_isw(new_wb, isw);
return restart;
}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..c5c9d89c73ed 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -152,6 +152,10 @@ struct bdi_writeback {
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
struct list_head b_attached; /* attached inodes, protected by list_lock */
struct list_head offline_node; /* anchored at offline_cgwbs */
+ struct work_struct switch_work; /* work used to perform inode switching
+ * to this wb */
+ struct llist_head switch_wbs_ctxs; /* queued contexts for
+ * writeback switching */
union {
struct work_struct release_work;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a2848d731a46..15a4bc4ab819 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}
+void inode_switch_wbs_work_fn(struct work_struct *work);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..0beaca6bacf7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
+ WARN_ON_ONCE(work_pending(&wb->switch_work));
call_rcu(&wb->rcu, cgwb_free_rcu);
}
@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
INIT_LIST_HEAD(&wb->b_attached);
+ INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&wb->switch_wbs_ctxs);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
+ INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&bdi->wb.switch_wbs_ctxs);
}
return ret;
}