summaryrefslogtreecommitdiff
path: root/fs/bcachefs/btree_io.c
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-07-10 10:08:47 -0700
committerJakub Kicinski <kuba@kernel.org>2025-07-10 10:10:49 -0700
commit3321e97eab71df7d632b35276da9f8503e6e040f (patch)
tree6a4ec5e7556d99a0ff25f4035f078d8e0186f359 /fs/bcachefs/btree_io.c
parente090f978054e1cfcd970234589168fcbcba33976 (diff)
parentbc9ff192a6c940d9a26e21a0a82f2667067aaf5f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.16-rc6). No conflicts. Adjacent changes: Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml 0a12c435a1d6 ("dt-bindings: net: sun8i-emac: Add A100 EMAC compatible") b3603c0466a8 ("dt-bindings: net: sun8i-emac: Rename A523 EMAC0 to GMAC0") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'fs/bcachefs/btree_io.c')
-rw-r--r--fs/bcachefs/btree_io.c43
1 files changed, 35 insertions, 8 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 08b22bddd747..e874a4357f64 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior updates (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer (for
+ * the backpointer updates - this needs some real thought on locking in
+ * order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we another index (all
+ * extents/btree nodes, by replicas entry) in order to fix properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);