bcachefs: Add BCH_SUBVOLUME_UNLINKED

Snapshot deletion needs to become a multi-step process, where we unlink,
then tear down the page cache, then delete the subvolume - the new unlinked
flag is equivalent to an inode with i_nlink = 0.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 1608faa..56727001 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -353,6 +353,7 @@ enum bch_time_stats {
 #include "quota_types.h"
 #include "rebalance_types.h"
 #include "replicas_types.h"
+#include "subvolume_types.h"
 #include "super_types.h"
 
 /* Number of nodes btree coalesce will try to coalesce at once */
@@ -657,6 +658,9 @@ struct bch_fs {
 	struct bch_snapshot_table __rcu *snapshot_table;
 	struct mutex		snapshot_table_lock;
 	struct work_struct	snapshot_delete_work;
+	struct work_struct	snapshot_wait_for_pagecache_and_delete_work;
+	struct snapshot_id_list	snapshots_unlinked;
+	struct mutex		snapshots_unlinked_lock;
 
 	/* BTREE CACHE */
 	struct bio_set		btree_bio;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 481bf64..8e1423b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -974,6 +974,7 @@ LE32_BITMASK(BCH_SUBVOLUME_RO,		struct bch_subvolume, flags,  0,  1)
  * can delete it (or whether it should just be rm -rf'd)
  */
 LE32_BITMASK(BCH_SUBVOLUME_SNAP,	struct bch_subvolume, flags,  1,  2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,	struct bch_subvolume, flags,  2,  3)
 
 /* Snapshots */
 
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index c49de74..5f3429e99 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -239,7 +239,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
 		      struct bch_inode_unpacked *dir_u,
 		      struct bch_inode_unpacked *inode_u,
 		      const struct qstr *name,
-		      int deleting_snapshot)
+		      bool deleting_snapshot)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter dir_iter = { NULL };
@@ -267,35 +267,19 @@ int bch2_unlink_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) {
+	if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
 		ret = bch2_empty_dir_trans(trans, inum);
 		if (ret)
 			goto err;
 	}
 
-	if (deleting_snapshot < 0 &&
-	    inode_u->bi_subvol) {
-		struct bch_subvolume s;
-
-		ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true,
-					 BTREE_ITER_CACHED|
-					 BTREE_ITER_WITH_UPDATES,
-					 &s);
-		if (ret)
-			goto err;
-
-		if (BCH_SUBVOLUME_SNAP(&s))
-			deleting_snapshot = 1;
+	if (deleting_snapshot && !inode_u->bi_subvol) {
+		ret = -ENOENT;
+		goto err;
 	}
 
-	if (deleting_snapshot == 1) {
-		if (!inode_u->bi_subvol) {
-			ret = -ENOENT;
-			goto err;
-		}
-
-		ret = bch2_subvolume_delete(trans, inode_u->bi_subvol,
-					    deleting_snapshot);
+	if (deleting_snapshot || inode_u->bi_subvol) {
+		ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
 		if (ret)
 			goto err;
 
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index 9bb0a96..dde2378 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -26,7 +26,7 @@ int bch2_link_trans(struct btree_trans *,
 int bch2_unlink_trans(struct btree_trans *, subvol_inum,
 		      struct bch_inode_unpacked *,
 		      struct bch_inode_unpacked *,
-		      const struct qstr *, int);
+		      const struct qstr *, bool);
 
 int bch2_rename_trans(struct btree_trans *,
 		      subvol_inum, struct bch_inode_unpacked *,
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index a12b591..de94895 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -441,7 +441,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
 
 	dir = path.dentry->d_parent->d_inode;
 
-	ret = __bch2_unlink(dir, path.dentry, 1);
+	ret = __bch2_unlink(dir, path.dentry, true);
 	if (!ret) {
 		fsnotify_rmdir(dir, path.dentry);
 		d_delete(path.dentry);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 334cd33..c325e5c 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -490,7 +490,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 }
 
 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
-		  int deleting_snapshot)
+		  bool deleting_snapshot)
 {
 	struct bch_fs *c = vdir->i_sb->s_fs_info;
 	struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -527,7 +527,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 
 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 {
-	return __bch2_unlink(vdir, dentry, -1);
+	return __bch2_unlink(vdir, dentry, false);
 }
 
 static int bch2_symlink(struct mnt_idmap *idmap,
@@ -1292,6 +1292,12 @@ static int bch2_vfs_write_inode(struct inode *vinode,
 	return ret;
 }
 
+static int bch2_drop_inode(struct inode *vinode)
+{
+	/* Pure pass-through to generic_drop_inode(); no bcachefs-specific policy */
+	return generic_drop_inode(vinode);
+}
+
 static void bch2_evict_inode(struct inode *vinode)
 {
 	struct bch_fs *c = vinode->i_sb->s_fs_info;
@@ -1496,6 +1502,7 @@ static const struct super_operations bch_super_operations = {
 	.alloc_inode	= bch2_alloc_inode,
 	.destroy_inode	= bch2_destroy_inode,
 	.write_inode	= bch2_vfs_write_inode,
+	.drop_inode	= bch2_drop_inode,
 	.evict_inode	= bch2_evict_inode,
 	.sync_fs	= bch2_sync_fs,
 	.statfs		= bch2_statfs,
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 40898c4..2616b15 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -183,7 +183,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 int bch2_setattr_nonsize(struct mnt_idmap *,
 			 struct bch_inode_info *,
 			 struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, int);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index a61d380a..6b3eecd 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -256,7 +256,7 @@ static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
 
 	/* Subvolume root? */
 	if (inode_u.bi_subvol) {
-		ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1);
+		ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
 		if (ret)
 			goto err;
 	}
@@ -992,12 +992,28 @@ static int check_subvols(struct bch_fs *c)
 	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
+	struct bkey_s_c_subvolume subvol;
 	int ret;
 
 	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
 			   0, k, ret) {
+		if (k.k->type != KEY_TYPE_subvolume)
+			continue;
+
+		subvol = bkey_s_c_to_subvolume(k);
+
+		if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+			ret = __bch2_trans_do(&trans,  NULL, NULL,
+					      BTREE_INSERT_LAZY_RW,
+					bch2_subvolume_delete(&trans, iter.pos.offset));
+			if (ret) {
+				bch_err(c, "error deleting subvolume %llu: %i",
+					iter.pos.offset, ret);
+				break;
+			}
+		}
 	}
 	bch2_trans_iter_exit(&trans, &iter);
 
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 7fccf84..3ae321a 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -709,11 +709,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
 	bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
 
 	/* Subvolume root? */
-	if (inode_u.bi_subvol) {
-		ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1);
-		if (ret)
-			goto err;
-	}
+	BUG_ON(inode_u.bi_subvol);
 
 	bkey_inode_generation_init(&delete.k_i);
 	delete.k.p = iter.pos;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 9bd8d61..58cda98 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -4,6 +4,7 @@
 #include "btree_key_cache.h"
 #include "btree_update.h"
 #include "error.h"
+#include "fs.h"
 #include "subvolume.h"
 
 /* Snapshot tree: */
@@ -541,13 +542,6 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
 	return ret;
 }
 
-/* List of snapshot IDs that are being deleted: */
-struct snapshot_id_list {
-	u32		nr;
-	u32		size;
-	u32		*d;
-};
-
 static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
 {
 	unsigned i;
@@ -819,9 +813,11 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
 	return ret;
 }
 
-/* XXX: mark snapshot id for deletion, walk btree and delete: */
-int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
-			  int deleting_snapshot)
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
@@ -849,12 +845,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
 	subvol = bkey_s_c_to_subvolume(k);
 	snapid = le32_to_cpu(subvol.v->snapshot);
 
-	if (deleting_snapshot >= 0 &&
-	    deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) {
-		ret = -ENOENT;
-		goto err;
-	}
-
 	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
 	ret = PTR_ERR_OR_ZERO(delete);
 	if (ret)
@@ -880,6 +870,212 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
 	return ret;
 }
 
+/*
+ * Push the cached VFS inodes of every subvolume listed in @s towards
+ * eviction, then wait until no such inode (other than ones already being
+ * freed) remains on the superblock's inode list.
+ *
+ * The first pass marks each matching inode dontcache and prunes its dentry
+ * aliases.  The second pass rescans the list: whenever it finds a matching
+ * inode that is not I_FREEING it sleeps on that inode's __I_NEW bit
+ * waitqueue, and on wakeup restarts the scan from the top.
+ *
+ * NOTE(review): the wakeup is presumably the one the VFS issues when the
+ * inode is torn down -- confirm against fs/inode.c.
+ */
+static void bch2_evict_subvolume_inodes(struct bch_fs *c,
+				 struct snapshot_id_list *s)
+{
+	struct super_block *sb = c->vfs_sb;
+	struct inode *inode;
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+		    (inode->i_state & I_FREEING))
+			continue;
+
+		d_mark_dontcache(inode);
+		d_prune_aliases(inode);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+again:
+	cond_resched();
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+		    (inode->i_state & I_FREEING))
+			continue;
+
+		if (!(inode->i_state & I_DONTCACHE)) {
+			d_mark_dontcache(inode);
+			d_prune_aliases(inode);
+		}
+
+		/* Recheck with i_lock held before committing to the sleep */
+		spin_lock(&inode->i_lock);
+		if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
+		    !(inode->i_state & I_FREEING)) {
+			wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
+			DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+			spin_unlock(&inode->i_lock);
+			spin_unlock(&sb->s_inode_list_lock);
+			schedule();
+			finish_wait(wq, &wait.wq_entry);
+			goto again;
+		}
+
+		spin_unlock(&inode->i_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+}
+
+/*
+ * Work item that finishes deleting subvolumes which have been unlinked.
+ *
+ * Each iteration takes over the pending c->snapshots_unlinked list
+ * (resetting the shared one to empty under snapshots_unlinked_lock), evicts
+ * the cached inodes of the listed subvolumes, then deletes the subvolumes
+ * one by one.  The loop ends when the list is found empty or a deletion
+ * fails; on failure the rest of that batch is dropped along with the list.
+ *
+ * Finally drops the c->writes reference that the commit hook took when it
+ * queued this work.
+ */
+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs,
+				snapshot_wait_for_pagecache_and_delete_work);
+	struct snapshot_id_list s;
+	u32 *id;
+	int ret = 0;
+
+	while (!ret) {
+		mutex_lock(&c->snapshots_unlinked_lock);
+		/* Take over the pending batch; the shared list restarts empty */
+		s = c->snapshots_unlinked;
+		memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+		mutex_unlock(&c->snapshots_unlinked_lock);
+
+		if (!s.nr)
+			break;
+
+		bch2_evict_subvolume_inodes(c, &s);
+
+		for (id = s.d; id < s.d + s.nr; id++) {
+			ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+				      bch2_subvolume_delete(&trans, *id));
+			if (ret) {
+				bch_err(c, "error %i deleting subvolume %u", ret, *id);
+				break;
+			}
+		}
+
+		kfree(s.d);
+	}
+
+	percpu_ref_put(&c->writes);
+}
+
+/* Commit hook payload: the ID of the subvolume being unlinked. */
+struct subvolume_unlink_hook {
+	struct btree_trans_commit_hook	h;
+	u32				subvol;
+};
+
+/*
+ * Transaction commit hook set up by bch2_subvolume_unlink(): adds h->subvol
+ * to c->snapshots_unlinked unless it is already there, then queues the
+ * deletion worker on system_long_wq.
+ *
+ * A c->writes reference is taken on the worker's behalf; it is put again
+ * right here when queue_work() returns false (work already pending), since
+ * the pending instance holds its own reference.
+ *
+ * Returns 0 on success, the snapshot_id_add() error, or -EROFS when the
+ * c->writes reference cannot be obtained.
+ */
+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+							     struct btree_trans_commit_hook *_h)
+{
+	struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+	struct bch_fs *c = trans->c;
+	int ret = 0;
+
+	mutex_lock(&c->snapshots_unlinked_lock);
+	if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+		ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+	mutex_unlock(&c->snapshots_unlinked_lock);
+
+	if (ret)
+		return ret;
+
+	if (unlikely(!percpu_ref_tryget(&c->writes)))
+		return -EROFS;
+
+	if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+		percpu_ref_put(&c->writes);
+	return 0;
+}
+
+/*
+ * Flag subvolume @subvolid as unlinked within @trans: rewrites its key with
+ * BCH_SUBVOLUME_UNLINKED set and attaches the commit hook that schedules
+ * the deferred deletion.
+ *
+ * Returns 0 on success; -EIO, after marking the filesystem inconsistent,
+ * when there is no subvolume key at @subvolid; otherwise the error from the
+ * lookup, a transaction allocation or the update.
+ */
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_i_subvolume *n;
+	struct subvolume_unlink_hook *h;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+			     POS(0, subvolid),
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != KEY_TYPE_subvolume) {
+		bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+		ret = -EIO;
+		goto err;
+	}
+
+	n = bch2_trans_kmalloc(trans, sizeof(*n));
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		goto err;
+
+	bkey_reassemble(&n->k_i, k);
+	SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+
+	ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+	if (ret)
+		goto err;
+
+	h = bch2_trans_kmalloc(trans, sizeof(*h));
+	ret = PTR_ERR_OR_ZERO(h);
+	if (ret)
+		goto err;
+
+	h->h.fn		= bch2_subvolume_wait_for_pagecache_and_delete_hook;
+	h->subvol	= subvolid;
+	bch2_trans_commit_hook(trans, &h->h);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
 			  u32 src_subvolid,
 			  u32 *new_subvolid,
@@ -977,5 +1124,8 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
 int bch2_fs_subvolumes_init(struct bch_fs *c)
 {
 	INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+	INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+		  bch2_subvolume_wait_for_pagecache_and_delete);
+	mutex_init(&c->snapshots_unlinked_lock);	/* guards c->snapshots_unlinked */
 	return 0;
 }
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index f98c8c0..45234c9 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -2,6 +2,8 @@
 #ifndef _BCACHEFS_SUBVOLUME_H
 #define _BCACHEFS_SUBVOLUME_H
 
+#include "subvolume_types.h"
+
 void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
 
@@ -108,7 +110,8 @@ int bch2_subvolume_get(struct btree_trans *, unsigned,
 		       bool, int, struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
-int bch2_subvolume_delete(struct btree_trans *, u32, int);
+int bch2_subvolume_delete(struct btree_trans *, u32);
+int bch2_subvolume_unlink(struct btree_trans *, u32);
 int bch2_subvolume_create(struct btree_trans *, u64, u32,
 			  u32 *, u32 *, bool);
 
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
new file mode 100644
index 0000000..9410b95
--- /dev/null
+++ b/fs/bcachefs/subvolume_types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+struct snapshot_id_list {
+	u32		nr;	/* number of IDs currently stored in d[] */
+	u32		size;	/* presumably allocated capacity of d[] -- confirm */
+	u32		*d;	/* array of snapshot or subvolume IDs; kfree()d by users */
+};
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */