Merge tag 'bcachefs-2024-03-19' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Assorted bugfixes.

  Most are fixes for simple assertion pops; the most significant fix is
  for a deadlock in recovery when we have to rewrite large numbers of
  btree nodes to fix errors. This was incorrectly running out of the
  same workqueue as the core interior btree update path - we now give it
  its own single-threaded workqueue.

  This was visible to users as "bch2_btree_update_start(): error:
  BCH_ERR_journal_reclaim_would_deadlock" - and then recovery hanging"

* tag 'bcachefs-2024-03-19' of https://evilpiepirate.org/git/bcachefs:
  bcachefs: Fix lost wakeup on journal shutdown
  bcachefs: Fix deadlock in bch2_btree_update_start()
  bcachefs: ratelimit errors from async_btree_node_rewrite
  bcachefs: Run check_topology() first
  bcachefs: Improve bch2_fatal_error()
  bcachefs: Fix lost transaction restart error
  bcachefs: Don't corrupt journal keys gap buffer when dropping alloc info
  bcachefs: fix for building in userspace
  bcachefs: bch2_snapshot_is_ancestor() now safe to call in early recovery
  bcachefs: Fix nested transaction restart handling in bch2_bucket_gens_init()
  bcachefs: Improve sysfs internal/btree_updates
  bcachefs: Split out btree_node_rewrite_worker
  bcachefs: Fix locking in bch2_alloc_write_key()
  bcachefs: Avoid extent entry type assertions in .invalid()
  bcachefs: Fix spurious -BCH_ERR_transaction_restart_nested
  bcachefs: Fix check_key_has_snapshot() call
  bcachefs: Change "accounting overran journal reservation" to a warning
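
Below is a minimal, illustrative sketch of the dedicated-workqueue pattern the
deadlock fix applies (all names here are hypothetical stand-ins; the real change
adds c->btree_node_rewrite_worker, see the btree_update_interior.c hunks in the
diff below). An ordered, unbound workqueue keeps node rewrites off the shared
interior btree update workqueue, so rewrites queued during recovery can block
without stalling the interior update path they depend on:

  /* Illustrative sketch only -- not the actual bcachefs code. */
  #include <linux/errno.h>
  #include <linux/workqueue.h>

  struct rewrite_ctx {        /* hypothetical stand-in for struct async_btree_rewrite */
          struct work_struct work;
  };

  static struct workqueue_struct *rewrite_wq;

  static void rewrite_work_fn(struct work_struct *work)
  {
          /* the (possibly blocking) btree node rewrite would run here */
  }

  static int rewrite_wq_init(void)
  {
          /* ordered + unbound: rewrites run one at a time on their own worker */
          rewrite_wq = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
          return rewrite_wq ? 0 : -ENOMEM;
  }

  static void rewrite_queue(struct rewrite_ctx *ctx)
  {
          INIT_WORK(&ctx->work, rewrite_work_fn);
          /* queue on the dedicated workqueue, not the shared interior-update one */
          queue_work(rewrite_wq, &ctx->work);
  }

  static void rewrite_wq_exit(void)
  {
          if (rewrite_wq)
                  destroy_workqueue(rewrite_wq);
  }
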
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c47f72f2..893e38f 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -532,13 +532,13 @@
 		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
 		unsigned offset;
 		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+		int ret2 = 0;
 
 		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-			ret = commit_do(trans, NULL, NULL,
-					BCH_TRANS_COMMIT_no_enospc,
-				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-			if (ret)
-				break;
+			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+			if (ret2)
+				goto iter_err;
 			have_bucket_gens_key = false;
 		}
 
@@ -549,7 +549,8 @@
 		}
 
 		g.v.gens[offset] = gen;
-		0;
+iter_err:
+		ret2;
 	}));
 
 	if (have_bucket_gens_key && !ret)
@@ -852,7 +853,7 @@
 					bucket_journal_seq);
 			if (ret) {
 				bch2_fs_fatal_error(c,
-					"error setting bucket_needs_journal_commit: %i", ret);
+					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
 				return ret;
 			}
 		}
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index ca58193..214b15c8 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1356,15 +1356,17 @@
 
 		/* Don't retry from all devices if we're out of open buckets: */
 		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-			int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, watermark,
 					      flags, cl);
-			if (!ret ||
-			    bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+			if (!ret2 ||
+			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+				ret = ret2;
 				goto alloc_done;
+			}
 		}
 
 		/*
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 339dc3e..799aa32 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -849,6 +849,8 @@
 	struct workqueue_struct	*btree_interior_update_worker;
 	struct work_struct	btree_interior_update_work;
 
+	struct workqueue_struct	*btree_node_rewrite_worker;
+
 	struct list_head	pending_node_rewrites;
 	struct mutex		pending_node_rewrites_lock;
 
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 584aee7..bdaed29 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1392,11 +1392,11 @@
 					 *old,
 					 b->data_type);
 	gc = *b;
-	percpu_up_read(&c->mark_lock);
 
 	if (gc.data_type != old_gc.data_type ||
 	    gc.dirty_sectors != old_gc.dirty_sectors)
 		bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
+	percpu_up_read(&c->mark_lock);
 
 	if (metadata_only &&
 	    gc.data_type != BCH_DATA_sb &&
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 624c828..34df8cc 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1066,7 +1066,7 @@
 
 			ret = bset_encrypt(c, i, b->written << 9);
 			if (bch2_fs_fatal_err_on(ret, c,
-					"error decrypting btree node: %i", ret))
+					"decrypting btree node: %s", bch2_err_str(ret)))
 				goto fsck_err;
 
 			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
@@ -1107,7 +1107,7 @@
 
 			ret = bset_encrypt(c, i, b->written << 9);
 			if (bch2_fs_fatal_err_on(ret, c,
-					"error decrypting btree node: %i\n", ret))
+					"decrypting btree node: %s", bch2_err_str(ret)))
 				goto fsck_err;
 
 			sectors = vstruct_sectors(bne, c->block_bits);
@@ -1338,7 +1338,7 @@
 	if (saw_error && !btree_node_read_error(b)) {
 		printbuf_reset(&buf);
 		bch2_bpos_to_text(&buf, b->key.k.p);
-		bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
 			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
 
 		bch2_btree_node_rewrite_async(c, b);
@@ -1874,8 +1874,8 @@
 	return;
 err:
 	set_btree_node_noevict(b);
-	if (!bch2_err_matches(ret, EROFS))
-		bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+			     "writing btree node: %s", bch2_err_str(ret));
 	goto out;
 }
 
@@ -2131,7 +2131,7 @@
 
 	ret = bset_encrypt(c, i, b->written << 9);
 	if (bch2_fs_fatal_err_on(ret, c,
-			"error encrypting btree node: %i\n", ret))
+			"encrypting btree node: %s", bch2_err_str(ret)))
 		goto err;
 
 	nonce = btree_nonce(i, b->written << 9);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 8a71d43..581edcb 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -676,7 +676,7 @@
 			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
 			     !bch2_journal_error(j), c,
-			     "error flushing key cache: %s", bch2_err_str(ret));
+			     "flushing key cache: %s", bch2_err_str(ret));
 	if (ret)
 		goto out;
 
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 642213e..b2f5f2e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -646,7 +646,7 @@
 	bch2_trans_unlock(trans);
 
 	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
-			     "%s(): error %s", __func__, bch2_err_str(ret));
+			     "%s", bch2_err_str(ret));
 err:
 	if (as->b) {
 
@@ -1067,13 +1067,18 @@
 	flags &= ~BCH_WATERMARK_MASK;
 	flags |= watermark;
 
-	if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-	    watermark < c->journal.watermark) {
+	if (watermark < c->journal.watermark) {
 		struct journal_res res = { 0 };
+		unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
+
+		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+		    watermark != BCH_WATERMARK_reclaim)
+			journal_flags |= JOURNAL_RES_GET_NONBLOCK;
 
 		ret = drop_locks_do(trans,
-			bch2_journal_res_get(&c->journal, &res, 1,
-					     watermark|JOURNAL_RES_GET_CHECK));
+			bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
+		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			ret = -BCH_ERR_journal_reclaim_would_deadlock;
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -1117,6 +1122,7 @@
 	closure_init(&as->cl, NULL);
 	as->c		= c;
 	as->start_time	= start_time;
+	as->ip_started	= _RET_IP_;
 	as->mode	= BTREE_INTERIOR_NO_UPDATE;
 	as->took_gc_lock = true;
 	as->btree_id	= path->btree_id;
@@ -1192,7 +1198,8 @@
 err:
 	bch2_btree_update_free(as, trans);
 	if (!bch2_err_matches(ret, ENOSPC) &&
-	    !bch2_err_matches(ret, EROFS))
+	    !bch2_err_matches(ret, EROFS) &&
+	    ret != -BCH_ERR_journal_reclaim_would_deadlock)
 		bch_err_fn_ratelimited(c, ret);
 	return ERR_PTR(ret);
 }
@@ -2114,7 +2121,7 @@
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
 		      async_btree_node_rewrite_trans(trans, a));
-	bch_err_fn(c, ret);
+	bch_err_fn_ratelimited(c, ret);
 	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
 	kfree(a);
 }
@@ -2161,7 +2168,7 @@
 		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
 	}
 
-	queue_work(c->btree_interior_update_worker, &a->work);
+	queue_work(c->btree_node_rewrite_worker, &a->work);
 }
 
 void bch2_do_pending_node_rewrites(struct bch_fs *c)
@@ -2173,7 +2180,7 @@
 		list_del(&a->list);
 
 		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
-		queue_work(c->btree_interior_update_worker, &a->work);
+		queue_work(c->btree_node_rewrite_worker, &a->work);
 	}
 	mutex_unlock(&c->pending_node_rewrites_lock);
 }
@@ -2441,12 +2448,12 @@
 
 	mutex_lock(&c->btree_interior_update_lock);
 	list_for_each_entry(as, &c->btree_interior_update_list, list)
-		prt_printf(out, "%p m %u w %u r %u j %llu\n",
-		       as,
-		       as->mode,
-		       as->nodes_written,
-		       closure_nr_remaining(&as->cl),
-		       as->journal.seq);
+		prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+			   (void *) as->ip_started,
+			   as->mode,
+			   as->nodes_written,
+			   closure_nr_remaining(&as->cl),
+			   as->journal.seq);
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
@@ -2510,6 +2517,8 @@
 
 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
 {
+	if (c->btree_node_rewrite_worker)
+		destroy_workqueue(c->btree_node_rewrite_worker);
 	if (c->btree_interior_update_worker)
 		destroy_workqueue(c->btree_interior_update_worker);
 	mempool_exit(&c->btree_interior_update_pool);
@@ -2534,6 +2543,11 @@
 	if (!c->btree_interior_update_worker)
 		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
 
+	c->btree_node_rewrite_worker =
+		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+	if (!c->btree_node_rewrite_worker)
+		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
 	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
 				      sizeof(struct btree_update)))
 		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 3439b03..f651dd48 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -32,6 +32,7 @@
 	struct closure			cl;
 	struct bch_fs			*c;
 	u64				start_time;
+	unsigned long			ip_started;
 
 	struct list_head		list;
 	struct list_head		unwritten_list;
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index b77e7b3..5cbad84 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -378,7 +378,7 @@
 		}
 	}
 err:
-	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
 	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
 	bch2_journal_pin_drop(j, &wb->flushing.pin);
 	wb->flushing.keys.nr = 0;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index c2f46b2..96edf2c 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -990,8 +990,8 @@
 				ret = !gc
 					? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
 					: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
-				bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
-						     __func__);
+				bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
+						     bch2_err_str(ret));
 				if (ret)
 					return ret;
 			}
@@ -1020,7 +1020,7 @@
 			struct printbuf buf = PRINTBUF;
 
 			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
 			printbuf_exit(&buf);
 		}
 		if (ret)
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index b1f147e..208ce6f 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -170,7 +170,7 @@
 		struct printbuf buf = PRINTBUF;
 
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+		bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
 		printbuf_exit(&buf);
 	}
 out:
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index b98e2c2..0820752 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -448,7 +448,7 @@
 			struct printbuf buf = PRINTBUF;
 
 			bch2_bkey_val_to_text(&buf, c, new);
-			bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
 			printbuf_exit(&buf);
 			return ret;
 		}
@@ -1868,10 +1868,10 @@
 		return -BCH_ERR_stripe_alloc_blocked;
 
 	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+			     "reading stripe key: %s", bch2_err_str(ret));
 	if (ret) {
 		bch2_stripe_close(c, h->s);
-		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
 		return ret;
 	}
 
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 94491190..ae1d667 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -191,9 +191,9 @@
 
 void bch2_fatal_error(struct bch_fs *);
 
-#define bch2_fs_fatal_error(c, ...)					\
+#define bch2_fs_fatal_error(c, _msg, ...)				\
 do {									\
-	bch_err(c, __VA_ARGS__);					\
+	bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__);	\
 	bch2_fatal_error(c);						\
 } while (0)
 
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 6219f2c..fd2669c 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -108,17 +108,17 @@
 
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
-	return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
 }
 
 static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
 {
-	return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
 }
 
 static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
 {
-	switch (extent_entry_type(e)) {
+	switch (__extent_entry_type(e)) {
 	case BCH_EXTENT_ENTRY_crc32:
 	case BCH_EXTENT_ENTRY_crc64:
 	case BCH_EXTENT_ENTRY_crc128:
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 3f07384..0ccee05 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,8 @@
 		goto retry;
 
 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
-			     "inode %u:%llu not found when updating",
+			     "%s: inode %u:%llu not found when updating",
+			     bch2_err_str(ret),
 			     inode_inum(inode).subvol,
 			     inode_inum(inode).inum);
 
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index f48033b..47d4eef 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1114,10 +1114,9 @@
 	return ret;
 }
 
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
 {
 	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
 	int ret = 0;
 	s64 count2;
 
@@ -1149,7 +1148,14 @@
 	}
 fsck_err:
 	bch_err_fn(c, ret);
-	return ret ?: trans_was_restarted(trans, restart_count);
+	return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+	u32 restart_count = trans->restart_count;
+	return check_i_sectors_notnested(trans, w) ?:
+		trans_was_restarted(trans, restart_count);
 }
 
 struct extent_end {
@@ -1533,7 +1539,7 @@
 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
 			check_extent_overbig(trans, &iter, k);
 		})) ?:
-		check_i_sectors(trans, &w));
+		check_i_sectors_notnested(trans, &w));
 
 	bch2_disk_reservation_put(c, &res);
 	extent_ends_exit(&extent_ends);
@@ -1563,10 +1569,9 @@
 	return ret;
 }
 
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
 {
 	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
 	int ret = 0;
 	s64 count2;
 
@@ -1598,7 +1603,14 @@
 	}
 fsck_err:
 	bch_err_fn(c, ret);
-	return ret ?: trans_was_restarted(trans, restart_count);
+	return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+	u32 restart_count = trans->restart_count;
+	return check_subdir_count_notnested(trans, w) ?:
+		trans_was_restarted(trans, restart_count);
 }
 
 static int check_dirent_inode_dirent(struct btree_trans *trans,
@@ -2003,7 +2015,8 @@
 				k,
 				NULL, NULL,
 				BCH_TRANS_COMMIT_no_enospc,
-			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+		check_subdir_count_notnested(trans, &dir));
 
 	snapshots_seen_exit(&s);
 	inode_walker_exit(&dir);
@@ -2022,8 +2035,10 @@
 	int ret;
 
 	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret)
+	if (ret < 0)
 		return ret;
+	if (ret)
+		return 0;
 
 	i = walk_inode(trans, inode, k);
 	ret = PTR_ERR_OR_ZERO(i);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index f314b2e..9c9a25d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -511,18 +511,18 @@
 	if (journal_res_get_fast(j, res, flags))
 		return 0;
 
+	if (bch2_journal_error(j))
+		return -BCH_ERR_erofs_journal_err;
+
+	if (j->blocked)
+		return -BCH_ERR_journal_res_get_blocked;
+
 	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
 		ret = JOURNAL_ERR_journal_full;
 		can_discard = j->can_discard;
 		goto out;
 	}
 
-	if (j->blocked)
-		return -BCH_ERR_journal_res_get_blocked;
-
-	if (bch2_journal_error(j))
-		return -BCH_ERR_erofs_journal_err;
-
 	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
 		ret = JOURNAL_ERR_max_in_flight;
 		goto out;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index d76c3c0..725fcf4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1082,9 +1082,7 @@
 		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
 			     j->encrypted_start,
 			     vstruct_end(j) - (void *) j->encrypted_start);
-		bch2_fs_fatal_err_on(ret, c,
-				"error decrypting journal entry: %s",
-				bch2_err_str(ret));
+		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
 
 		mutex_lock(&jlist->lock);
 		ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1820,7 +1818,8 @@
 			jset_entry_for_each_key(i, k) {
 				ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
 				if (ret) {
-					bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+					bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+							    bch2_err_str(ret));
 					bch2_journal_keys_to_write_buffer_end(c, &wb);
 					return ret;
 				}
@@ -1848,7 +1847,8 @@
 
 	bch2_journal_super_entries_add_common(c, &end, seq);
 	u64s	= (u64 *) end - (u64 *) start;
-	BUG_ON(u64s > j->entry_u64s_reserved);
+
+	WARN_ON(u64s > j->entry_u64s_reserved);
 
 	le32_add_cpu(&jset->u64s, u64s);
 
@@ -1856,7 +1856,7 @@
 	bytes	= vstruct_bytes(jset);
 
 	if (sectors > w->sectors) {
-		bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
 				    vstruct_bytes(jset), w->sectors << 9,
 				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
 		return -EINVAL;
@@ -1884,8 +1884,7 @@
 	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
 		    jset->encrypted_start,
 		    vstruct_end(jset) - (void *) jset->encrypted_start);
-	if (bch2_fs_fatal_err_on(ret, c,
-			"error decrypting journal entry: %i", ret))
+	if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
 		return ret;
 
 	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index ad59810..9fac838 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -101,8 +101,8 @@
 		struct printbuf buf = PRINTBUF;
 
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-		bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
-				     __func__, buf.buf, bch2_err_str(ret));
+		bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+				    buf.buf, bch2_err_str(ret));
 		printbuf_exit(&buf);
 	}
 }
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 69e06a8..0d2b82d 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -155,8 +155,7 @@
 	if (bch2_err_matches(ret, EROFS))
 		return ret;
 
-	if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
-				 __func__, bch2_err_str(ret)))
+	if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
 		return ret;
 
 	ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 2af219a..03f9d6af 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -90,10 +90,12 @@
 	struct journal_keys *keys = &c->journal_keys;
 	size_t src, dst;
 
+	move_gap(keys, keys->nr);
+
 	for (src = 0, dst = 0; src < keys->nr; src++)
 		if (!btree_id_is_alloc(keys->data[src].btree_id))
 			keys->data[dst++] = keys->data[src];
-	keys->nr = dst;
+	keys->nr = keys->gap = dst;
 }
 
 /*
@@ -203,6 +205,8 @@
 
 	BUG_ON(!atomic_read(&keys->ref));
 
+	move_gap(keys, keys->nr);
+
 	/*
 	 * First, attempt to replay keys in sorted order. This is more
 	 * efficient - better locality of btree access -  but some might fail if
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index 1361e34..4959e95 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -13,11 +13,11 @@
  * must never change:
  */
 #define BCH_RECOVERY_PASSES()							\
+	x(check_topology,			 4, 0)				\
 	x(alloc_read,				 0, PASS_ALWAYS)		\
 	x(stripes_read,				 1, PASS_ALWAYS)		\
 	x(initialize_subvolumes,		 2, 0)				\
 	x(snapshots_read,			 3, PASS_ALWAYS)		\
-	x(check_topology,			 4, 0)				\
 	x(check_allocations,			 5, PASS_FSCK)			\
 	x(trans_mark_dev_sbs,			 6, PASS_ALWAYS|PASS_SILENT)	\
 	x(fs_journal_alloc,			 7, PASS_ALWAYS|PASS_SILENT)	\
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ac6ba04..39debe8 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -91,18 +91,20 @@
 
 /* Snapshot nodes: */
 
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
 {
-	struct snapshot_table *t;
-
-	rcu_read_lock();
-	t = rcu_dereference(c->snapshots);
-
 	while (id && id < ancestor)
 		id = __snapshot_t(t, id)->parent;
+	return id == ancestor;
+}
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+	rcu_read_lock();
+	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
 	rcu_read_unlock();
 
-	return id == ancestor;
+	return ret;
 }
 
 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -120,13 +122,15 @@
 
 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 {
-	struct snapshot_table *t;
 	bool ret;
 
-	EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
 	rcu_read_lock();
-	t = rcu_dereference(c->snapshots);
+	struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+	if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) {
+		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+		goto out;
+	}
 
 	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
 		id = get_ancestor_below(t, id, ancestor);
@@ -134,11 +138,11 @@
 	if (id && id < ancestor) {
 		ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
 
-		EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+		EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
 	} else {
 		ret = id == ancestor;
 	}
-
+out:
 	rcu_read_unlock();
 
 	return ret;
@@ -547,7 +551,7 @@
 			"snapshot tree points to missing subvolume:\n  %s",
 			(printbuf_reset(&buf),
 			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
 						le32_to_cpu(subvol.snapshot),
 						root_id),
 			c, snapshot_tree_to_wrong_subvol,
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index bceac29..ad28e37 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -985,7 +985,7 @@
 		prt_str(&buf, " > ");
 		bch2_version_to_text(&buf, bcachefs_metadata_version_current);
 		prt_str(&buf, ")");
-		bch2_fs_fatal_error(c, "%s", buf.buf);
+		bch2_fs_fatal_error(c, ": %s", buf.buf);
 		printbuf_exit(&buf);
 		return -BCH_ERR_sb_not_downgraded;
 	}
@@ -1005,7 +1005,7 @@
 
 		if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
 			bch2_fs_fatal_error(c,
-				"Superblock write was silently dropped! (seq %llu expected %llu)",
+				": Superblock write was silently dropped! (seq %llu expected %llu)",
 				le64_to_cpu(ca->sb_read_scratch->seq),
 				ca->disk_sb.seq);
 			percpu_ref_put(&ca->io_ref);
@@ -1015,7 +1015,7 @@
 
 		if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
 			bch2_fs_fatal_error(c,
-				"Superblock modified by another process (seq %llu expected %llu)",
+				": Superblock modified by another process (seq %llu expected %llu)",
 				le64_to_cpu(ca->sb_read_scratch->seq),
 				ca->disk_sb.seq);
 			percpu_ref_put(&ca->io_ref);
@@ -1066,7 +1066,7 @@
 				 !can_mount_with_written ||
 				 (can_mount_without_written &&
 				  !can_mount_with_written), c,
-		"Unable to write superblock to sufficient devices (from %ps)",
+		": Unable to write superblock to sufficient devices (from %ps)",
 		(void *) _RET_IP_))
 		ret = -1;
 out:
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 233f864..1ad6e5c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -87,20 +87,28 @@
 	NULL
 };
 
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
+{
+#ifdef __KERNEL__
+	if (unlikely(stdio)) {
+		if (fmt[0] == KERN_SOH[0])
+			fmt += 2;
+
+		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+		return;
+	}
+#endif
+	vprintk(fmt, args);
+}
+
 void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
 {
 	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
 
 	va_list args;
 	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
-		if (fmt[0] == KERN_SOH[0])
-			fmt += 2;
-
-		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
-	}
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }
 
@@ -110,14 +118,7 @@
 
 	va_list args;
 	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
-		if (fmt[0] == KERN_SOH[0])
-			fmt += 2;
-
-		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
-	}
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }
 
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 7ffbddb..175aee3 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -683,6 +683,9 @@
 /* Move the gap in a gap buffer: */
 #define move_gap(_d, _new_gap)						\
 do {									\
+	BUG_ON(_new_gap > (_d)->nr);					\
+	BUG_ON((_d)->gap > (_d)->nr);					\
+									\
 	__move_gap((_d)->data, sizeof((_d)->data[0]),			\
 		   (_d)->nr, (_d)->size, (_d)->gap, _new_gap);		\
 	(_d)->gap = _new_gap;						\