bcachefs: Erasure coding

Add erasure coding: RAID5/6 style striping with one or two parity blocks,
built on the kernel's xor_blocks and raid6 library code. This adds a new
stripes btree (BTREE_ID_EC) of bch_stripe keys describing each stripe's
blocks and per-block checksums, a new bch_extent_stripe_ptr extent entry
linking an extent back to a block of a stripe, and ec.c, which implements
stripe creation, the reconstruct read path, and stripe accounting and
deletion. The write path can now allocate buckets from a stripe under
construction, and new BCH_SB_ERASURE_CODE and bi_erasure_code fields are
added to the superblock and inode options.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index c13f2cf..2f8300b 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -16,6 +16,8 @@
 	select CRYPTO_CHACHA20
 	select CRYPTO_POLY1305
 	select KEYS
+	select RAID6_PQ
+	select XOR_BLOCKS
 	help
 	The bcachefs filesystem - a modern, copy on write filesystem, with
 	support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 5318287..b9521d7 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -22,6 +22,7 @@
 	debug.o			\
 	dirent.o		\
 	disk_groups.o		\
+	ec.o			\
 	error.o			\
 	extents.o		\
 	fs.o			\
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 291d352..b49d0cd 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -10,6 +10,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "ec.h"
 #include "error.h"
 #include "journal_io.h"
 #include "trace.h"
@@ -1113,6 +1114,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 	}
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
+	while (1) {
+		struct open_bucket *ob;
+
+		spin_lock(&c->freelist_lock);
+		if (!ca->open_buckets_partial_nr) {
+			spin_unlock(&c->freelist_lock);
+			break;
+		}
+		ob = c->open_buckets +
+			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+		ob->on_partial_list = false;
+		spin_unlock(&c->freelist_lock);
+
+		bch2_open_bucket_put(c, ob);
+	}
+
+	bch2_ec_stop_dev(c, ca);
+
 	/*
 	 * Wake up threads that were blocked on allocation, so they can notice
 	 * the device can no longer be removed and the capacity has changed:
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index df74e41..6e5f6e5 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -62,6 +62,7 @@
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "io.h"
 #include "trace.h"
 
@@ -95,6 +96,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
+	if (ob->ec) {
+		bch2_ec_bucket_written(c, ob);
+		return;
+	}
+
 	percpu_down_read(&c->usage_lock);
 	spin_lock(&ob->lock);
 
@@ -114,6 +120,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 	closure_wake_up(&c->open_buckets_wait);
 }
 
+void bch2_open_bucket_write_error(struct bch_fs *c,
+				  struct open_buckets *obs,
+				  unsigned dev)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, obs, ob, i)
+		if (ob->ptr.dev == dev &&
+		    ob->ec)
+			bch2_ec_bucket_cancel(c, ob);
+}
+
 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 {
 	struct open_bucket *ob;
@@ -129,15 +148,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 }
 
 static void open_bucket_free_unused(struct bch_fs *c,
-				    struct write_point *wp,
-				    struct open_bucket *ob)
+				    struct open_bucket *ob,
+				    bool may_realloc)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
 	BUG_ON(ca->open_buckets_partial_nr >=
 	       ARRAY_SIZE(ca->open_buckets_partial));
 
-	if (wp->type == BCH_DATA_USER) {
+	if (ca->open_buckets_partial_nr <
+	    ARRAY_SIZE(ca->open_buckets_partial) &&
+	    may_realloc) {
 		spin_lock(&c->freelist_lock);
 		ob->on_partial_list = true;
 		ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
@@ -285,18 +306,18 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 	return ob;
 }
 
-static int __dev_alloc_cmp(struct write_point *wp,
-			   unsigned l, unsigned r)
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+			    unsigned l, unsigned r)
 {
-	return ((wp->next_alloc[l] > wp->next_alloc[r]) -
-		(wp->next_alloc[l] < wp->next_alloc[r]));
+	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+		(stripe->next_alloc[l] < stripe->next_alloc[r]));
 }
 
-#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
-					 struct write_point *wp,
-					 struct bch_devs_mask *devs)
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+					  struct dev_stripe_state *stripe,
+					  struct bch_devs_mask *devs)
 {
 	struct dev_alloc_list ret = { .nr = 0 };
 	struct bch_dev *ca;
@@ -305,14 +326,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
 	for_each_member_device_rcu(ca, c, i, devs)
 		ret.devs[ret.nr++] = i;
 
-	bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
+	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
 	return ret;
 }
 
-void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
-		     struct write_point *wp)
+void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
+			       struct dev_stripe_state *stripe)
 {
-	u64 *v = wp->next_alloc + ca->dev_idx;
+	u64 *v = stripe->next_alloc + ca->dev_idx;
 	u64 free_space = dev_buckets_free(c, ca);
 	u64 free_space_inv = free_space
 		? div64_u64(1ULL << 48, free_space)
@@ -324,26 +345,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
 	else
 		*v = U64_MAX;
 
-	for (v = wp->next_alloc;
-	     v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+	for (v = stripe->next_alloc;
+	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
 		*v = *v < scale ? 0 : *v - scale;
 }
 
+#define BUCKET_MAY_ALLOC_PARTIAL	(1 << 0)
+#define BUCKET_ALLOC_USE_DURABILITY	(1 << 1)
+
 static int bch2_bucket_alloc_set(struct bch_fs *c,
 				 struct open_buckets *ptrs,
-				 struct write_point *wp,
+				 struct dev_stripe_state *stripe,
 				 struct bch_devs_mask *devs_may_alloc,
 				 unsigned nr_replicas,
 				 unsigned *nr_effective,
 				 bool *have_cache,
 				 enum alloc_reserve reserve,
+				 unsigned flags,
 				 struct closure *cl)
 {
 	struct dev_alloc_list devs_sorted =
-		bch2_wp_alloc_list(c, wp, devs_may_alloc);
+		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
 	struct bch_dev *ca;
 	bool alloc_failure = false;
-	unsigned i;
+	unsigned i, durability;
 
 	BUG_ON(*nr_effective >= nr_replicas);
 
@@ -354,13 +379,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
 		if (!ca)
 			continue;
 
-		if (!ca->mi.durability &&
-		    (*have_cache ||
-		     wp->type != BCH_DATA_USER))
+		if (!ca->mi.durability && *have_cache)
 			continue;
 
 		ob = bch2_bucket_alloc(c, ca, reserve,
-				       wp->type == BCH_DATA_USER, cl);
+				flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
 		if (IS_ERR(ob)) {
 			enum bucket_alloc_ret ret = -PTR_ERR(ob);
 
@@ -375,13 +398,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
 			continue;
 		}
 
+		durability = (flags & BUCKET_ALLOC_USE_DURABILITY)
+			? ca->mi.durability : 1;
+
 		__clear_bit(ca->dev_idx, devs_may_alloc->d);
-		*nr_effective	+= ca->mi.durability;
-		*have_cache	|= !ca->mi.durability;
+		*nr_effective	+= durability;
+		*have_cache	|= !durability;
 
 		ob_push(c, ptrs, ob);
 
-		bch2_wp_rescale(c, ca, wp);
+		bch2_dev_stripe_increment(c, ca, stripe);
 
 		if (*nr_effective >= nr_replicas)
 			return 0;
@@ -390,15 +416,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
 	return alloc_failure ? -ENOSPC : -EROFS;
 }
 
+/* Allocate from stripes: */
+
+/*
+ * XXX: use a higher watermark for allocating open buckets here:
+ */
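+/* Allocate any missing data and parity buckets for a stripe being created: */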
+static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct bch_devs_mask devs;
+	struct open_bucket *ob;
+	unsigned i, nr_have = 0, nr_data =
+		min_t(unsigned, h->nr_active_devs,
+		      EC_STRIPE_MAX) - h->redundancy;
+	bool have_cache = true;
+	int ret = 0;
+
+	BUG_ON(h->blocks.nr > nr_data);
+	BUG_ON(h->parity.nr > h->redundancy);
+
+	devs = h->devs;
+
+	open_bucket_for_each(c, &h->parity, ob, i)
+		__clear_bit(ob->ptr.dev, devs.d);
+	open_bucket_for_each(c, &h->blocks, ob, i)
+		__clear_bit(ob->ptr.dev, devs.d);
+
+	percpu_down_read(&c->usage_lock);
+	rcu_read_lock();
+
+	if (h->parity.nr < h->redundancy) {
+		nr_have = h->parity.nr;
+
+		ret = bch2_bucket_alloc_set(c, &h->parity,
+					    &h->parity_stripe,
+					    &devs,
+					    h->redundancy,
+					    &nr_have,
+					    &have_cache,
+					    RESERVE_NONE,
+					    0,
+					    NULL);
+		if (ret)
+			goto err;
+	}
+
+	if (h->blocks.nr < nr_data) {
+		nr_have = h->blocks.nr;
+
+		ret = bch2_bucket_alloc_set(c, &h->blocks,
+					    &h->block_stripe,
+					    &devs,
+					    nr_data,
+					    &nr_have,
+					    &have_cache,
+					    RESERVE_NONE,
+					    0,
+					    NULL);
+		if (ret)
+			goto err;
+	}
+
+	rcu_read_unlock();
+	percpu_up_read(&c->usage_lock);
+
+	return bch2_ec_stripe_new_alloc(c, h);
+err:
+	rcu_read_unlock();
+	percpu_up_read(&c->usage_lock);
+	return -1;
+}
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static void bucket_alloc_from_stripe(struct bch_fs *c,
+				     struct open_buckets *ptrs,
+				     struct write_point *wp,
+				     struct bch_devs_mask *devs_may_alloc,
+				     u16 target,
+				     unsigned erasure_code,
+				     unsigned nr_replicas,
+				     unsigned *nr_effective,
+				     bool *have_cache)
+{
+	struct dev_alloc_list devs_sorted;
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	struct bch_dev *ca;
+	unsigned i, ec_idx;
+
+	if (!erasure_code)
+		return;
+
+	if (nr_replicas < 2)
+		return;
+
+	if (ec_open_bucket(c, ptrs))
+		return;
+
+	h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
+	if (!h)
+		return;
+
+	if (!h->s && ec_stripe_alloc(c, h))
+		goto out_put_head;
+
+	rcu_read_lock();
+	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+	rcu_read_unlock();
+
+	for (i = 0; i < devs_sorted.nr; i++)
+		open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+			if (ob->ptr.dev == devs_sorted.devs[i] &&
+			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+				goto got_bucket;
+	goto out_put_head;
+got_bucket:
+	ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+	ob->ec_idx	= ec_idx;
+	ob->ec		= h->s;
+
+	__clear_bit(ob->ptr.dev, devs_may_alloc->d);
+	*nr_effective	+= ca->mi.durability;
+	*have_cache	|= !ca->mi.durability;
+
+	ob_push(c, ptrs, ob);
+	atomic_inc(&h->s->pin);
+out_put_head:
+	bch2_ec_stripe_head_put(h);
+}
+
 /* Sector allocator */
 
-static int get_buckets_from_writepoint(struct bch_fs *c,
-				       struct open_buckets *ptrs,
-				       struct write_point *wp,
-				       struct bch_devs_mask *devs_may_alloc,
-				       unsigned nr_replicas,
-				       unsigned *nr_effective,
-				       bool *have_cache)
+static void get_buckets_from_writepoint(struct bch_fs *c,
+					struct open_buckets *ptrs,
+					struct write_point *wp,
+					struct bch_devs_mask *devs_may_alloc,
+					unsigned nr_replicas,
+					unsigned *nr_effective,
+					bool *have_cache,
+					bool need_ec)
 {
 	struct open_buckets ptrs_skip = { .nr = 0 };
 	struct open_bucket *ob;
@@ -410,7 +571,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
 		if (*nr_effective < nr_replicas &&
 		    test_bit(ob->ptr.dev, devs_may_alloc->d) &&
 		    (ca->mi.durability ||
-		     (wp->type == BCH_DATA_USER && !*have_cache))) {
+		     (wp->type == BCH_DATA_USER && !*have_cache)) &&
+		    (ob->ec || !need_ec)) {
 			__clear_bit(ob->ptr.dev, devs_may_alloc->d);
 			*nr_effective	+= ca->mi.durability;
 			*have_cache	|= !ca->mi.durability;
@@ -421,8 +583,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
 		}
 	}
 	wp->ptrs = ptrs_skip;
-
-	return *nr_effective < nr_replicas ? -ENOSPC : 0;
 }
 
 static int open_bucket_add_buckets(struct bch_fs *c,
@@ -430,22 +590,25 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 				   struct write_point *wp,
 				   struct bch_devs_list *devs_have,
 				   u16 target,
+				   unsigned erasure_code,
 				   unsigned nr_replicas,
 				   unsigned *nr_effective,
 				   bool *have_cache,
 				   enum alloc_reserve reserve,
-				   struct closure *cl)
+				   struct closure *_cl)
 {
 	struct bch_devs_mask devs;
-	const struct bch_devs_mask *t;
 	struct open_bucket *ob;
-	unsigned i;
+	struct closure *cl = NULL;
+	unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY;
 	int ret;
 
-	percpu_down_read(&c->usage_lock);
-	rcu_read_lock();
+	if (wp->type == BCH_DATA_USER)
+		flags |= BUCKET_MAY_ALLOC_PARTIAL;
 
-	devs = c->rw_devs[wp->type];
+	rcu_read_lock();
+	devs = target_rw_devs(c, wp->type, target);
+	rcu_read_unlock();
 
 	/* Don't allocate from devices we already have pointers to: */
 	for (i = 0; i < devs_have->nr; i++)
@@ -454,50 +617,83 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 	open_bucket_for_each(c, ptrs, ob, i)
 		__clear_bit(ob->ptr.dev, devs.d);
 
-	t = bch2_target_to_mask(c, target);
-	if (t)
-		bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+	if (erasure_code) {
+		get_buckets_from_writepoint(c, ptrs, wp, &devs,
+					    nr_replicas, nr_effective,
+					    have_cache, true);
+		if (*nr_effective >= nr_replicas)
+			return 0;
 
-	ret = get_buckets_from_writepoint(c, ptrs, wp, &devs,
-				nr_replicas, nr_effective, have_cache);
-	if (!ret)
-		goto out;
+		bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+					 target, erasure_code,
+					 nr_replicas, nr_effective,
+					 have_cache);
+		if (*nr_effective >= nr_replicas)
+			return 0;
+	}
 
+	get_buckets_from_writepoint(c, ptrs, wp, &devs,
+				    nr_replicas, nr_effective,
+				    have_cache, false);
+	if (*nr_effective >= nr_replicas)
+		return 0;
+
+	percpu_down_read(&c->usage_lock);
+	rcu_read_lock();
+
+retry_blocking:
 	/*
 	 * Try nonblocking first, so that if one device is full we'll try from
 	 * other devices:
 	 */
-	ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
+	ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
 				nr_replicas, nr_effective, have_cache,
-				reserve, NULL);
-	if (!ret || ret == -EROFS || !cl)
-		goto out;
+				reserve, flags, cl);
+	if (ret && ret != -EROFS && !cl && _cl) {
+		cl = _cl;
+		goto retry_blocking;
+	}
 
-	ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
-				nr_replicas, nr_effective, have_cache,
-				reserve, cl);
-out:
 	rcu_read_unlock();
 	percpu_up_read(&c->usage_lock);
 
 	return ret;
 }
 
+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
+				struct open_buckets *obs,
+				enum bch_data_type data_type)
+{
+	struct open_buckets ptrs = { .nr = 0 };
+	struct open_bucket *ob, *ob2;
+	unsigned i, j;
+
+	open_bucket_for_each(c, obs, ob, i) {
+		bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+
+		if (!drop && ob->ec) {
+			mutex_lock(&ob->ec->lock);
+			open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
+				drop |= ob2->ptr.dev == ca->dev_idx;
+			open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+				drop |= ob2->ptr.dev == ca->dev_idx;
+			mutex_unlock(&ob->ec->lock);
+		}
+
+		if (drop)
+			bch2_open_bucket_put(c, ob);
+		else
+			ob_push(c, &ptrs, ob);
+	}
+
+	*obs = ptrs;
+}
+
 void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
 			  struct write_point *wp)
 {
-	struct open_buckets ptrs = { .nr = 0 };
-	struct open_bucket *ob;
-	unsigned i;
-
 	mutex_lock(&wp->lock);
-	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		if (!ca || ob->ptr.dev == ca->dev_idx)
-			open_bucket_free_unused(c, wp, ob);
-		else
-			ob_push(c, &ptrs, ob);
-
-	wp->ptrs = ptrs;
+	bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type);
 	mutex_unlock(&wp->lock);
 }
 
@@ -630,6 +826,7 @@ static struct write_point *writepoint_find(struct bch_fs *c,
  */
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 				unsigned target,
+				unsigned erasure_code,
 				struct write_point_specifier write_point,
 				struct bch_devs_list *devs_have,
 				unsigned nr_replicas,
@@ -649,26 +846,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 	BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
 	write_points_nr = c->write_points_nr;
+
 	wp = writepoint_find(c, write_point.v);
 
+	/* metadata may not allocate on cache devices: */
+	if (wp->type != BCH_DATA_USER)
+		have_cache = true;
+
 	if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
+		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve, cl);
 	} else {
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
+		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve, NULL);
 		if (!ret)
 			goto alloc_done;
 
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0,
+		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+					      0, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve, cl);
 	}
 alloc_done:
 	BUG_ON(!ret && nr_effective < nr_replicas);
 
+	if (erasure_code && !ec_open_bucket(c, &ptrs))
+		pr_debug("failed to get ec bucket: ret %u", ret);
+
 	if (ret == -EROFS &&
 	    nr_effective >= nr_replicas_required)
 		ret = 0;
@@ -678,7 +886,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 
 	/* Free buckets we didn't use: */
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		open_bucket_free_unused(c, wp, ob);
+		open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
 
 	wp->ptrs = ptrs;
 
@@ -697,7 +905,8 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
 			ob_push(c, &ptrs, ob);
 		else
-			open_bucket_free_unused(c, wp, ob);
+			open_bucket_free_unused(c, ob,
+					wp->type == BCH_DATA_USER);
 	wp->ptrs = ptrs;
 
 	mutex_unlock(&wp->lock);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 6672101..c71cf73 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -17,11 +17,11 @@ struct dev_alloc_list {
 	u8		devs[BCH_SB_MEMBERS_MAX];
 };
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
-					 struct write_point *,
-					 struct bch_devs_mask *);
-void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
-		     struct write_point *);
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+					  struct dev_stripe_state *,
+					  struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
+			       struct dev_stripe_state *);
 
 long bch2_bucket_alloc_new_fs(struct bch_dev *);
 
@@ -43,6 +43,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
 	     ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true);	\
 	     (_i)++)
 
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+						 struct open_buckets *obs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, obs, ob, i)
+		if (ob->ec)
+			return ob;
+
+	return NULL;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+			struct open_buckets *, unsigned);
+
 void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
 
 static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
@@ -76,7 +92,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
 }
 
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
-					     unsigned,
+					     unsigned, unsigned,
 					     struct write_point_specifier,
 					     struct bch_devs_list *,
 					     unsigned, unsigned,
@@ -88,6 +104,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
 				    struct bkey_i_extent *, unsigned);
 void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 
+void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
+				struct open_buckets *, enum bch_data_type);
+
 void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
 			  struct write_point *);
 
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 2a9c6f0..ef3e400 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -8,6 +8,8 @@
 #include "clock_types.h"
 #include "fifo.h"
 
+struct ec_bucket_buf;
+
 /* There's two of these clocks, one for reads and one for writes: */
 struct bucket_clock {
 	/*
@@ -56,8 +58,10 @@ struct open_bucket {
 	u8			freelist;
 	bool			valid;
 	bool			on_partial_list;
+	u8			ec_idx;
 	unsigned		sectors_free;
 	struct bch_extent_ptr	ptr;
+	struct ec_stripe_new	*ec;
 };
 
 #define OPEN_BUCKET_LIST_MAX	15
@@ -67,18 +71,23 @@ struct open_buckets {
 	u8			v[OPEN_BUCKET_LIST_MAX];
 };
 
+struct dev_stripe_state {
+	u64			next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
 struct write_point {
 	struct hlist_node	node;
 	struct mutex		lock;
 	u64			last_used;
 	unsigned long		write_point;
 	enum bch_data_type	type;
+	bool			is_ec;
 
 	/* calculated based on how many pointers we're actually going to use: */
 	unsigned		sectors_free;
 
 	struct open_buckets	ptrs;
-	u64			next_alloc[BCH_SB_MEMBERS_MAX];
+	struct dev_stripe_state	stripe;
 };
 
 struct write_point_specifier {
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 22df84b..b33fbf7 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -204,7 +204,7 @@
 #define dynamic_fault(...)		0
 #define race_fault(...)			0
 
-#define bch2_fs_init_fault(name)						\
+#define bch2_fs_init_fault(name)					\
 	dynamic_fault("bcachefs:bch_fs_init:" name)
 #define bch2_meta_read_fault(name)					\
 	 dynamic_fault("bcachefs:meta:read:" name)
@@ -273,7 +273,10 @@ do {									\
 	BCH_DEBUG_PARAM(test_alloc_startup,				\
 		"Force allocator startup to use the slowpath where it"	\
 		"can't find enough free buckets without invalidating"	\
-		"cached data")
+		"cached data")						\
+	BCH_DEBUG_PARAM(force_reconstruct_read,				\
+		"Force reads to use the reconstruct path when reading "	\
+		"from erasure coded extents")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -311,6 +314,7 @@ enum bch_time_stats {
 #include "btree_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
+#include "ec_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
 #include "quota_types.h"
@@ -333,9 +337,13 @@ enum gc_phase {
 	GC_PHASE_START,
 	GC_PHASE_SB,
 
-#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
-	DEFINE_BCH_BTREE_IDS()
-#undef DEF_BTREE_ID
+	GC_PHASE_BTREE_EC,
+	GC_PHASE_BTREE_EXTENTS,
+	GC_PHASE_BTREE_INODES,
+	GC_PHASE_BTREE_DIRENTS,
+	GC_PHASE_BTREE_XATTRS,
+	GC_PHASE_BTREE_ALLOC,
+	GC_PHASE_BTREE_QUOTAS,
 
 	GC_PHASE_PENDING_DELETE,
 	GC_PHASE_ALLOC,
@@ -684,6 +692,21 @@ struct bch_fs {
 	/* REBALANCE */
 	struct bch_fs_rebalance	rebalance;
 
+	/* ERASURE CODING */
+	struct list_head	ec_new_stripe_list;
+	struct mutex		ec_new_stripe_lock;
+
+	GENRADIX(struct ec_stripe) ec_stripes;
+	struct mutex		ec_stripes_lock;
+
+	ec_stripes_heap		ec_stripes_heap;
+	spinlock_t		ec_stripes_heap_lock;
+
+	struct bio_set		ec_bioset;
+
+	struct work_struct	ec_stripe_delete_work;
+	struct llist_head	ec_stripe_delete_list;
+
 	/* VFS IO PATH - fs-io.c */
 	struct bio_set		writepage_bioset;
 	struct bio_set		dio_write_bioset;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index ecb7a97..a00e77f 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -238,6 +238,9 @@ struct bkey_packed {
 } __attribute__((packed, aligned(8)));
 
 #define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX			U8_MAX
+#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)
+
 #define KEY_PACKED_BITS_START		24
 
 #define KEY_FORMAT_LOCAL_BTREE		0
@@ -465,8 +468,9 @@ enum bch_compression_type {
 	x(ptr,			0)		\
 	x(crc32,		1)		\
 	x(crc64,		2)		\
-	x(crc128,		3)
-#define BCH_EXTENT_ENTRY_MAX	4
+	x(crc128,		3)		\
+	x(stripe_ptr,		4)
+#define BCH_EXTENT_ENTRY_MAX	5
 
 enum bch_extent_entry_type {
 #define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -557,7 +561,7 @@ struct bch_extent_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u64			type:1,
 				cached:1,
-				erasure_coded:1,
+				unused:1,
 				reservation:1,
 				offset:44, /* 8 petabytes */
 				dev:8,
@@ -567,23 +571,35 @@ struct bch_extent_ptr {
 				dev:8,
 				offset:44,
 				reservation:1,
-				erasure_coded:1,
+				unused:1,
 				cached:1,
 				type:1;
 #endif
 } __attribute__((packed, aligned(8)));
 
-struct bch_extent_reservation {
+struct bch_extent_stripe_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u64			type:5,
-				unused:23,
+				block:8,
+				idx:51;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			idx:51,
+				block:8,
+				type:5;
+#endif
+};
+
+struct bch_extent_reservation {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:6,
+				unused:22,
 				replicas:4,
 				generation:32;
 #elif defined (__BIG_ENDIAN_BITFIELD)
 	__u64			generation:32,
 				replicas:4,
-				unused:23,
-				type:5;
+				unused:22,
+				type:6;
 #endif
 };
 
@@ -706,7 +722,8 @@ BKEY_VAL_TYPE(inode_generation,	BCH_INODE_GENERATION);
 	BCH_INODE_FIELD(bi_data_replicas,		8)	\
 	BCH_INODE_FIELD(bi_promote_target,		16)	\
 	BCH_INODE_FIELD(bi_foreground_target,		16)	\
-	BCH_INODE_FIELD(bi_background_target,		16)
+	BCH_INODE_FIELD(bi_background_target,		16)	\
+	BCH_INODE_FIELD(bi_erasure_code,		16)
 
 #define BCH_INODE_FIELDS_INHERIT()				\
 	BCH_INODE_FIELD(bi_data_checksum)			\
@@ -716,7 +733,8 @@ BKEY_VAL_TYPE(inode_generation,	BCH_INODE_GENERATION);
 	BCH_INODE_FIELD(bi_data_replicas)			\
 	BCH_INODE_FIELD(bi_promote_target)			\
 	BCH_INODE_FIELD(bi_foreground_target)			\
-	BCH_INODE_FIELD(bi_background_target)
+	BCH_INODE_FIELD(bi_background_target)			\
+	BCH_INODE_FIELD(bi_erasure_code)
 
 enum {
 	/*
@@ -876,6 +894,27 @@ struct bch_quota {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(quota,	BCH_QUOTA);
 
+/* Erasure coding */
+
+enum {
+	BCH_STRIPE		= 128,
+};
+
+struct bch_stripe {
+	struct bch_val		v;
+	__le16			sectors;
+	__u8			algorithm;
+	__u8			nr_blocks;
+	__u8			nr_redundant;
+
+	__u8			csum_granularity_bits;
+	__u8			csum_type;
+	__u8			pad;
+
+	struct bch_extent_ptr	ptrs[0];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(stripe,	BCH_STRIPE);
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1065,7 +1104,7 @@ struct bch_sb_field_quota {
 struct bch_disk_group {
 	__u8			label[BCH_SB_LABEL_SIZE];
 	__le64			flags[2];
-};
+} __attribute__((packed, aligned(8)));
 
 LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
 LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
@@ -1074,7 +1113,7 @@ LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)
 struct bch_sb_field_disk_groups {
 	struct bch_sb_field	field;
 	struct bch_disk_group	entries[0];
-};
+} __attribute__((packed, aligned(8)));
 
 /*
  * On clean shutdown, store btree roots and current journal sequence number in
@@ -1242,12 +1281,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
 					struct bch_sb, flags[2],  0,  4);
 LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);
 
+LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
+
 /* Features: */
 enum bch_sb_features {
 	BCH_FEATURE_LZ4			= 0,
 	BCH_FEATURE_GZIP		= 1,
 	BCH_FEATURE_ZSTD		= 2,
 	BCH_FEATURE_ATOMIC_NLINK	= 3, /* should have gone under compat */
+	BCH_FEATURE_EC			= 4,
 	BCH_FEATURE_NR,
 };
 
@@ -1417,7 +1459,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
 	DEF_BTREE_ID(DIRENTS,	2, "dirents")			\
 	DEF_BTREE_ID(XATTRS,	3, "xattrs")			\
 	DEF_BTREE_ID(ALLOC,	4, "alloc")			\
-	DEF_BTREE_ID(QUOTAS,	5, "quotas")
+	DEF_BTREE_ID(QUOTAS,	5, "quotas")			\
+	DEF_BTREE_ID(EC,	6, "erasure_coding")
 
 #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
 
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 9a0286d..9679631 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -588,6 +588,8 @@ BKEY_VAL_ACCESSORS(alloc,		BCH_ALLOC);
 
 BKEY_VAL_ACCESSORS(quota,		BCH_QUOTA);
 
+BKEY_VAL_ACCESSORS(stripe,		BCH_STRIPE);
+
 /* byte order helpers */
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 7335fbb..81c6695 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -5,6 +5,7 @@
 #include "btree_types.h"
 #include "alloc_background.h"
 #include "dirent.h"
+#include "ec.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
@@ -18,6 +19,7 @@ const struct bkey_ops bch2_bkey_ops[] = {
 	[BKEY_TYPE_XATTRS]	= bch2_bkey_xattr_ops,
 	[BKEY_TYPE_ALLOC]	= bch2_bkey_alloc_ops,
 	[BKEY_TYPE_QUOTAS]	= bch2_bkey_quota_ops,
+	[BKEY_TYPE_EC]		= bch2_bkey_ec_ops,
 	[BKEY_TYPE_BTREE]	= bch2_bkey_btree_ops,
 };
 
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 92b82ea..e900fd4 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -15,6 +15,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "ec.h"
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
@@ -116,6 +117,7 @@ static bool bkey_type_needs_gc(enum bkey_type type)
 	switch (type) {
 	case BKEY_TYPE_BTREE:
 	case BKEY_TYPE_EXTENTS:
+	case BKEY_TYPE_EC:
 		return true;
 	default:
 		return false;
@@ -156,6 +158,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c,
 		}
 		}
 		break;
+	case BKEY_TYPE_EC:
+		switch (k.k->type) {
+		case BCH_STRIPE: {
+			struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+			for (ptr = s.v->ptrs;
+			     ptr < s.v->ptrs + s.v->nr_blocks;
+			     ptr++)
+				ptr_gen_recalc_oldest(c, ptr, &max_stale);
+		}
+		}
+		break;
 	default:
 		break;
 	}
@@ -217,6 +230,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type,
 		}
 		}
 		break;
+	case BKEY_TYPE_EC:
+		switch (k.k->type) {
+		case BCH_STRIPE: {
+			struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+			for (ptr = s.v->ptrs;
+			     ptr < s.v->ptrs + s.v->nr_blocks;
+			     ptr++) {
+				ret = ptr_gen_check(c, type, ptr);
+				if (ret)
+					return ret;
+			}
+		}
+		}
+		break;
 	default:
 		break;
 	}
@@ -362,15 +390,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 	return 0;
 }
 
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+	return  (int) btree_id_to_gc_phase(l) -
+		(int) btree_id_to_gc_phase(r);
+}
+
 static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
 			  bool initial)
 {
+	enum btree_id ids[BTREE_ID_NR];
 	unsigned i;
 
-	for (i = 0; i < BTREE_ID_NR; i++) {
-		enum bkey_type type = bkey_type(0, i);
+	for (i = 0; i < BTREE_ID_NR; i++)
+		ids[i] = i;
+	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
 
-		int ret = bch2_gc_btree(c, i, initial);
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		enum btree_id id = ids[i];
+		enum bkey_type type = bkey_type(0, id);
+
+		int ret = bch2_gc_btree(c, id, initial);
 		if (ret)
 			return ret;
 
@@ -602,6 +642,7 @@ static void bch2_gc_start(struct bch_fs *c)
 				new.data_type		= 0;
 				new.cached_sectors	= 0;
 				new.dirty_sectors	= 0;
+				new.stripe		= 0;
 			}));
 			ca->oldest_gens[b] = new.gen;
 		}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 86b80e3..47a5900 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -55,11 +55,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
 	return 0;
 }
 
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
+{
+	switch (id) {
+#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
+	DEFINE_BCH_BTREE_IDS()
+#undef DEF_BTREE_ID
+	default:
+		BUG();
+	}
+}
+
 static inline struct gc_pos gc_pos_btree(enum btree_id id,
 					 struct bpos pos, unsigned level)
 {
 	return (struct gc_pos) {
-		.phase	= GC_PHASE_BTREE_EXTENTS + id,
+		.phase	= btree_id_to_gc_phase(id),
 		.pos	= pos,
 		.level	= level,
 	};
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 92bacd16..01e476d 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -340,7 +340,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-	wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
+	wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
 				      writepoint_ptr(&c->btree_write_point),
 				      &devs_have,
 				      res->nr_replicas,
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index ea28788..9558129 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -69,6 +69,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "ec.h"
 #include "error.h"
 #include "movinggc.h"
 #include "trace.h"
@@ -270,6 +271,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
 	for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
 		sum.data	+= stats.replicas[i].data[BCH_DATA_BTREE];
 		sum.data	+= stats.replicas[i].data[BCH_DATA_USER];
+		sum.data	+= stats.replicas[i].ec_data;
 		sum.cached	+= stats.replicas[i].data[BCH_DATA_CACHED];
 		sum.reserved	+= stats.replicas[i].persistent_reserved;
 	}
@@ -400,6 +402,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 
 	dev_usage->buckets_alloc +=
 		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
+	dev_usage->buckets_ec +=
+		(int) new.stripe - (int) old.stripe;
 	dev_usage->buckets_unavailable +=
 		is_unavailable_bucket(new) - is_unavailable_bucket(old);
 
@@ -639,6 +643,49 @@ static void bch2_mark_pointer(struct bch_fs *c,
 	       bucket_became_unavailable(c, old, new));
 }
 
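+/*
+ * Account sectors referenced via a stripe pointer: charge parity overhead
+ * proportionally, and track per block occupancy for the stripes heap:
+ */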
+static void bch2_mark_stripe_ptr(struct bch_fs *c,
+				 struct bch_extent_stripe_ptr p,
+				 s64 sectors, unsigned flags,
+				 s64 *adjusted_disk_sectors,
+				 unsigned *redundancy)
+{
+	struct ec_stripe *m;
+	unsigned old, new, nr_data;
+	int blocks_nonempty_delta;
+	s64 parity_sectors;
+
+	m = genradix_ptr(&c->ec_stripes, p.idx);
+	if (WARN_ON(!m))
+		return;
+
+	if (WARN_ON(!m->alive))
+		return;
+
+	nr_data = m->nr_blocks - m->nr_redundant;
+
+	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
+
+	if (sectors < 0)
+		parity_sectors = -parity_sectors;
+
+	*adjusted_disk_sectors += parity_sectors;
+
+	*redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
+
+	new = atomic_add_return(sectors, &m->block_sectors[p.block]);
+	old = new - sectors;
+
+	blocks_nonempty_delta = (int) !!new - (int) !!old;
+	if (!blocks_nonempty_delta)
+		return;
+
+	atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+
+	BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
+
+	bch2_stripes_heap_update(c, m, p.idx);
+}
+
 static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 			     s64 sectors, enum bch_data_type data_type,
 			     struct gc_pos pos,
@@ -655,28 +702,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 		struct extent_ptr_decoded p;
 		s64 cached_sectors	= 0;
 		s64 dirty_sectors	= 0;
+		s64 ec_sectors		= 0;
 		unsigned replicas	= 0;
+		unsigned ec_redundancy	= 0;
+		unsigned i;
 
 		extent_for_each_ptr_decode(e, p, entry) {
 			s64 disk_sectors = ptr_disk_sectors(e, p, sectors);
+			s64 adjusted_disk_sectors = disk_sectors;
 
 			bch2_mark_pointer(c, e, p, disk_sectors, data_type,
 					  stats, journal_seq, flags);
 
 			if (!p.ptr.cached)
+				for (i = 0; i < p.ec_nr; i++)
+					bch2_mark_stripe_ptr(c, p.ec[i],
+							disk_sectors, flags,
+							&adjusted_disk_sectors,
+							&ec_redundancy);
+			if (!p.ptr.cached)
 				replicas++;
 
 			if (p.ptr.cached)
-				cached_sectors	+= disk_sectors;
+				cached_sectors	+= adjusted_disk_sectors;
+			else if (!p.ec_nr)
+				dirty_sectors	+= adjusted_disk_sectors;
 			else
-				dirty_sectors	+= disk_sectors;
+				ec_sectors	+= adjusted_disk_sectors;
 		}
 
 		replicas	= clamp_t(unsigned,	replicas,
 					  1, ARRAY_SIZE(stats->replicas));
+		ec_redundancy	= clamp_t(unsigned,	ec_redundancy,
+					  1, ARRAY_SIZE(stats->replicas));
 
 		stats->replicas[0].data[BCH_DATA_CACHED]	+= cached_sectors;
 		stats->replicas[replicas - 1].data[data_type]	+= dirty_sectors;
+		stats->replicas[ec_redundancy - 1].ec_data	+= ec_sectors;
 		break;
 	}
 	case BCH_RESERVATION: {
@@ -692,6 +754,78 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 	}
 }
 
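+/*
+ * Mark or unmark the buckets backing a stripe: a bucket with the stripe bit
+ * set is never considered available for reuse by the allocator:
+ */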
+static void bucket_set_stripe(struct bch_fs *c,
+			      const struct bch_stripe *v,
+			      bool enabled,
+			      struct bch_fs_usage *fs_usage,
+			      u64 journal_seq)
+{
+	unsigned i;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		const struct bch_extent_ptr *ptr = v->ptrs + i;
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bucket *g;
+		struct bucket_mark new, old;
+
+		BUG_ON(ptr_stale(ca, ptr));
+
+		rcu_read_lock();
+		g = PTR_BUCKET(ca, ptr);
+
+		old = bucket_cmpxchg(g, new, ({
+			new.stripe			= enabled;
+			if (journal_seq) {
+				new.journal_seq_valid	= 1;
+				new.journal_seq		= journal_seq;
+			}
+		}));
+		rcu_read_unlock();
+
+		BUG_ON(old.stripe == enabled);
+
+		bch2_dev_usage_update(c, ca, fs_usage, old, new);
+	}
+}
+
+static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
+			     bool inserting, struct gc_pos pos,
+			     struct bch_fs_usage *fs_usage,
+			     u64 journal_seq, unsigned flags)
+{
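+/* Try to get a bucket from the stripe currently being created: */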
+	switch (k.k->type) {
+	case BCH_STRIPE: {
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+		size_t idx = s.k->p.offset;
+		struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
+		unsigned i;
+
+		BUG_ON(!m);
+		BUG_ON(m->alive == inserting);
+
+		BUG_ON(atomic_read(&m->blocks_nonempty));
+
+		for (i = 0; i < EC_STRIPE_MAX; i++)
+			BUG_ON(atomic_read(&m->block_sectors[i]));
+
+		if (inserting) {
+			m->sectors	= le16_to_cpu(s.v->sectors);
+			m->algorithm	= s.v->algorithm;
+			m->nr_blocks	= s.v->nr_blocks;
+			m->nr_redundant	= s.v->nr_redundant;
+		}
+
+		if (inserting)
+			bch2_stripes_heap_insert(c, m, idx);
+		else
+			bch2_stripes_heap_del(c, m, idx);
+
+		bucket_set_stripe(c, s.v, inserting, fs_usage, 0);
+		break;
+	}
+	}
+}
+
 void bch2_mark_key(struct bch_fs *c,
 		   enum bkey_type type, struct bkey_s_c k,
 		   bool inserting, s64 sectors,
@@ -747,6 +881,10 @@ void bch2_mark_key(struct bch_fs *c,
 		bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
 				 pos, stats, journal_seq, flags);
 		break;
+	case BKEY_TYPE_EC:
+		bch2_mark_stripe(c, k, inserting,
+				 pos, stats, journal_seq, flags);
+		break;
 	default:
 		break;
 	}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 8fe6871..b48960f 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -185,6 +185,7 @@ static inline bool is_available_bucket(struct bucket_mark mark)
 {
 	return (!mark.owned_by_allocator &&
 		!mark.dirty_sectors &&
+		!mark.stripe &&
 		!mark.nouse);
 }
 
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 49f3ab9..9ec96db 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -19,7 +19,8 @@ struct bucket_mark {
 				gen_valid:1,
 				owned_by_allocator:1,
 				nouse:1,
-				journal_seq_valid:1;
+				journal_seq_valid:1,
+				stripe:1;
 		u16		dirty_sectors;
 		u16		cached_sectors;
 
@@ -53,6 +54,7 @@ struct bucket_array {
 struct bch_dev_usage {
 	u64			buckets[BCH_DATA_NR];
 	u64			buckets_alloc;
+	u64			buckets_ec;
 	u64			buckets_unavailable;
 
 	/* _compressed_ sectors: */
@@ -67,6 +69,7 @@ struct bch_fs_usage {
 
 	struct {
 		u64		data[BCH_DATA_NR];
+		u64		ec_data;
 		u64		persistent_reserved;
 	}			replicas[BCH_REPLICAS_MAX];
 
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
index ceb75f8..c8e0c37 100644
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
@@ -55,6 +55,19 @@ static inline struct target target_decode(unsigned target)
 }
 
 const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+						  enum bch_data_type data_type,
+						  u16 target)
+{
+	struct bch_devs_mask devs = c->rw_devs[data_type];
+	const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+	if (t)
+		bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+	return devs;
+}
+
 bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
 
 int bch2_disk_path_find(struct bch_sb_handle *, const char *);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
new file mode 100644
index 0000000..f6314aa
--- /dev/null
+++ b/fs/bcachefs/ec.c
@@ -0,0 +1,1265 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io.h"
+#include "keylist.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+#include <linux/sort.h>
+
+struct ec_bio {
+	struct bch_dev		*ca;
+	struct ec_stripe_buf	*buf;
+	size_t			idx;
+	struct bio		bio;
+};
+
+/* Stripes btree keys: */
+
+static unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+	return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+			    1 << s->csum_granularity_bits);
+}
+
+static unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+	unsigned bytes = sizeof(struct bch_stripe) +
+		sizeof(struct bch_extent_ptr) * s->nr_blocks +
+		bch_crc_bytes[s->csum_type] * s->nr_blocks * stripe_csums_per_device(s);
+	return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
+{
+	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+	void *csums = s->ptrs + s->nr_blocks;
+
+	BUG_ON(!csum_bytes);
+
+	return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	if (k.k->p.inode)
+		return "invalid stripe key";
+
+	switch (k.k->type) {
+	case BCH_STRIPE: {
+		const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+
+		if (bkey_val_bytes(k.k) < sizeof(*s))
+			return "incorrect value size";
+
+		if (bkey_val_u64s(k.k) != stripe_val_u64s(s))
+			return "incorrect value size";
+
+		return NULL;
+	}
+	default:
+		return "invalid type";
+	}
+}
+
+void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c,
+			 struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case BCH_STRIPE: {
+		const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+		unsigned i;
+
+		pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+		       s->algorithm,
+		       le16_to_cpu(s->sectors),
+		       s->nr_blocks - s->nr_redundant,
+		       s->nr_redundant,
+		       s->csum_type,
+		       1U << s->csum_granularity_bits);
+
+		for (i = 0; i < s->nr_blocks; i++)
+			pr_buf(out, " %u:%llu", s->ptrs[i].dev,
+			       (u64) s->ptrs[i].offset);
+	}
+	}
+}
+
+static int ptr_matches_stripe(struct bch_fs *c,
+			      struct bch_stripe *v,
+			      const struct bch_extent_ptr *ptr)
+{
+	unsigned i;
+
+	for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
+		const struct bch_extent_ptr *ptr2 = v->ptrs + i;
+
+		if (ptr->dev == ptr2->dev &&
+		    ptr->gen == ptr2->gen &&
+		    ptr->offset >= ptr2->offset &&
+		    ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
+			return i;
+	}
+
+	return -1;
+}
+
+static int extent_matches_stripe(struct bch_fs *c,
+				 struct bch_stripe *v,
+				 struct bkey_s_c k)
+{
+	struct bkey_s_c_extent e;
+	const struct bch_extent_ptr *ptr;
+	int idx;
+
+	if (!bkey_extent_is_data(k.k))
+		return -1;
+
+	e = bkey_s_c_to_extent(k);
+
+	extent_for_each_ptr(e, ptr) {
+		idx = ptr_matches_stripe(c, v, ptr);
+		if (idx >= 0)
+			return idx;
+	}
+
+	return -1;
+}
+
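+/*
+ * Initialize a stripe key from freshly allocated buckets; the checksum
+ * granularity is widened until the pointers and checksums fit in a bkey:
+ */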
+static void ec_stripe_key_init(struct bch_fs *c,
+			       struct bkey_i_stripe *s,
+			       struct open_buckets *blocks,
+			       struct open_buckets *parity,
+			       unsigned stripe_size)
+{
+	struct open_bucket *ob;
+	unsigned i, u64s;
+
+	bkey_stripe_init(&s->k_i);
+	s->v.sectors			= cpu_to_le16(stripe_size);
+	s->v.algorithm			= 0;
+	s->v.nr_blocks			= parity->nr + blocks->nr;
+	s->v.nr_redundant		= parity->nr;
+	s->v.csum_granularity_bits	= ilog2(c->sb.encoded_extent_max);
+	s->v.csum_type			= BCH_CSUM_CRC32C;
+	s->v.pad			= 0;
+
+	open_bucket_for_each(c, blocks, ob, i)
+		s->v.ptrs[i]			= ob->ptr;
+
+	open_bucket_for_each(c, parity, ob, i)
+		s->v.ptrs[blocks->nr + i]	= ob->ptr;
+
+	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+		BUG_ON(1 << s->v.csum_granularity_bits >=
+		       le16_to_cpu(s->v.sectors) ||
+		       s->v.csum_granularity_bits == U8_MAX);
+		s->v.csum_granularity_bits++;
+	}
+
+	set_bkey_val_u64s(&s->k, u64s);
+}
+
+/* Checksumming: */
+
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned csum_granularity = 1 << v->csum_granularity_bits;
+	unsigned csums_per_device = stripe_csums_per_device(v);
+	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
+	unsigned i, j;
+
+	if (!csum_bytes)
+		return;
+
+	BUG_ON(buf->offset);
+	BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		for (j = 0; j < csums_per_device; j++) {
+			unsigned offset = j << v->csum_granularity_bits;
+			unsigned len = min(csum_granularity, buf->size - offset);
+
+			struct bch_csum csum =
+				bch2_checksum(NULL, v->csum_type,
+					      null_nonce(),
+					      buf->data[i] + (offset << 9),
+					      len << 9);
+
+			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
+		}
+	}
+}
+
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned csum_granularity = 1 << v->csum_granularity_bits;
+	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
+	unsigned i;
+
+	if (!csum_bytes)
+		return;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		unsigned offset = buf->offset;
+		unsigned end = buf->offset + buf->size;
+
+		if (!test_bit(i, buf->valid))
+			continue;
+
+		while (offset < end) {
+			unsigned j = offset >> v->csum_granularity_bits;
+			unsigned len = min(csum_granularity, end - offset);
+			struct bch_csum csum;
+
+			BUG_ON(offset & (csum_granularity - 1));
+			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+			       ((offset + len) & (csum_granularity - 1)));
+
+			csum = bch2_checksum(NULL, v->csum_type,
+					     null_nonce(),
+					     buf->data[i] + ((offset - buf->offset) << 9),
+					     len << 9);
+
+			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
+				__bcache_io_error(c,
+					"checksum error while doing reconstruct read (%u:%u)",
+					i, j);
+				clear_bit(i, buf->valid);
+				break;
+			}
+
+			offset += len;
+		}
+	}
+}
+
+/* Erasure coding: */
+
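+/*
+ * Recover (or generate) a single block as the xor of all the others:
+ * data[failed] is overwritten with the xor of the remaining blocks.
+ */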
+static void raid5_recov(unsigned disks, unsigned bytes,
+			unsigned failed, void **data)
+{
+	unsigned i = 2, nr;
+
+	BUG_ON(failed >= disks);
+
+	swap(data[0], data[failed]);
+	memcpy(data[0], data[1], bytes);
+
+	while (i < disks) {
+		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+		xor_blocks(nr, bytes, data[0], data + i);
+		i += nr;
+	}
+
+	swap(data[0], data[failed]);
+}
+
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned bytes = le16_to_cpu(v->sectors) << 9;
+
+	switch (v->nr_redundant) {
+	case 2:
+		raid6_call.gen_syndrome(v->nr_blocks, bytes, buf->data);
+		break;
+	case 1:
+		raid5_recov(v->nr_blocks, bytes, nr_data, buf->data);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
+{
+	return nr - bitmap_weight(buf->valid, nr);
+}
+
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+	return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+}
+
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned bytes = buf->size << 9;
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		__bcache_io_error(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		return -1;
+	}
+
+	for (i = 0; i < nr_data; i++)
+		if (!test_bit(i, buf->valid))
+			failed[nr_failed++] = i;
+
+	switch (nr_failed) {
+	case 0:
+		break;
+	case 1:
+		if (test_bit(nr_data, buf->valid))
+			raid5_recov(nr_data + 1, bytes, failed[0], buf->data);
+		else
+			raid6_datap_recov(v->nr_blocks, bytes, failed[0], buf->data);
+		break;
+	case 2:
+		/* data+data failure. */
+		raid6_2data_recov(v->nr_blocks, bytes, failed[0], failed[1], buf->data);
+		break;
+
+	default:
+		BUG();
+	}
+
+	for (i = nr_data; i < v->nr_blocks; i++)
+		if (!test_bit(i, buf->valid)) {
+			ec_generate_ec(buf);
+			break;
+		}
+
+	return 0;
+}
+
+/* IO: */
+
+static void ec_block_endio(struct bio *bio)
+{
+	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+	struct bch_dev *ca = ec_bio->ca;
+	struct closure *cl = bio->bi_private;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
+		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+	bio_put(&ec_bio->bio);
+	percpu_ref_put(&ca->io_ref);
+	closure_put(cl);
+}
+
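+/*
+ * Read or write a single stripe block, splitting into multiple bios as
+ * needed; errors are recorded by clearing that block's bit in buf->valid:
+ */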
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+			unsigned rw, unsigned idx, struct closure *cl)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned offset = 0, bytes = buf->size << 9;
+	struct bch_extent_ptr *ptr = &v->ptrs[idx];
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+	if (!bch2_dev_get_ioref(ca, rw)) {
+		clear_bit(idx, buf->valid);
+		return;
+	}
+
+	while (offset < bytes) {
+		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
+					   DIV_ROUND_UP(bytes, PAGE_SIZE));
+		unsigned b = min_t(size_t, bytes - offset,
+				   nr_iovecs << PAGE_SHIFT);
+		struct ec_bio *ec_bio;
+
+		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+						       nr_iovecs,
+						       rw,
+						       GFP_KERNEL,
+						       &c->ec_bioset),
+				      struct ec_bio, bio);
+
+		ec_bio->ca			= ca;
+		ec_bio->buf			= buf;
+		ec_bio->idx			= idx;
+
+		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
+		ec_bio->bio.bi_iter.bi_size	= b;
+		ec_bio->bio.bi_end_io		= ec_block_endio;
+		ec_bio->bio.bi_private		= cl;
+
+		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset);
+
+		closure_get(cl);
+		percpu_ref_get(&ca->io_ref);
+
+		submit_bio(&ec_bio->bio);
+
+		offset += b;
+	}
+
+	percpu_ref_put(&ca->io_ref);
+}
+
+/* recovery read path: */
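+/*
+ * Reconstruct the data a failed read wanted by reading the other blocks of
+ * the stripe, verifying checksums and recomputing the missing block:
+ */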
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
+	struct btree_iter iter;
+	struct ec_stripe_buf *buf;
+	struct closure cl;
+	struct bkey_s_c k;
+	struct bch_stripe *v;
+	unsigned stripe_idx;
+	unsigned offset, end;
+	unsigned i, nr_data, csum_granularity;
+	int ret = 0, idx;
+
+	closure_init_stack(&cl);
+
+	BUG_ON(!rbio->pick.idx ||
+	       rbio->pick.idx - 1 >= rbio->pick.ec_nr);
+
+	stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;
+
+	buf = kzalloc(sizeof(*buf), GFP_NOIO);
+	if (!buf)
+		return -ENOMEM;
+
+	bch2_btree_iter_init(&iter, c, BTREE_ID_EC,
+			     POS(0, stripe_idx),
+			     BTREE_ITER_SLOTS);
+	k = bch2_btree_iter_peek_slot(&iter);
+	if (btree_iter_err(k) || k.k->type != BCH_STRIPE) {
+		__bcache_io_error(c,
+			"error doing reconstruct read: stripe not found");
+		kfree(buf);
+		return bch2_btree_iter_unlock(&iter) ?: -EIO;
+	}
+
+	bkey_reassemble(&buf->key.k_i, k);
+	bch2_btree_iter_unlock(&iter);
+
+	v = &buf->key.v;
+
+	nr_data = v->nr_blocks - v->nr_redundant;
+
+	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
+	BUG_ON(idx < 0);
+
+	csum_granularity = 1U << v->csum_granularity_bits;
+
+	offset	= rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
+	end	= offset + bio_sectors(&rbio->bio);
+
+	BUG_ON(end > le16_to_cpu(v->sectors));
+
+	buf->offset	= round_down(offset, csum_granularity);
+	buf->size	= min_t(unsigned, le16_to_cpu(v->sectors),
+				round_up(end, csum_granularity)) - buf->offset;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
+		if (!buf->data[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		struct bch_extent_ptr *ptr = v->ptrs + i;
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
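+/*
+ * On disk stripe key: nr_blocks pointers (data blocks followed by
+ * nr_redundant parity blocks), with per block checksums at csum_granularity
+ * stored after the pointers:
+ */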
+
+		if (ptr_stale(ca, ptr)) {
+			__bcache_io_error(c,
+					  "error doing reconstruct read: stale pointer");
+			clear_bit(i, buf->valid);
+			continue;
+		}
+
+		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+	}
+
+	closure_sync(&cl);
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		__bcache_io_error(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		ret = -EIO;
+		goto err;
+	}
+
+	ec_validate_checksums(c, buf);
+
+	ret = ec_do_recov(c, buf);
+	if (ret)
+		goto err;
+
+	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+		      buf->data[idx] + ((offset - buf->offset) << 9));
+err:
+	for (i = 0; i < v->nr_blocks; i++)
+		kfree(buf->data[i]);
+	kfree(buf);
+	return ret;
+}
+
+/* ec_stripe bucket accounting: */
+
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+	ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+	if (idx >= h->size) {
+		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+			return -ENOMEM;
+
+		spin_lock(&c->ec_stripes_heap_lock);
+		if (n.size > h->size) {
+			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
+			n.used = h->used;
+			swap(*h, n);
+		}
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		free_heap(&n);
+	}
+
+	if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int ec_stripe_mem_alloc(struct bch_fs *c,
+			       struct btree_iter *iter)
+{
+	size_t idx = iter->pos.offset;
+
+	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+		return 0;
+
+	bch2_btree_iter_unlock(iter);
+
+	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+		return -EINTR;
+	return -ENOMEM;
+}
+
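+/*
+ * The stripes heap is ordered by number of blocks with live data, so stripes
+ * that no longer contain any data sort to the front and can be deleted:
+ */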
+static ssize_t stripe_idx_to_delete(struct bch_fs *c)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+
+	return h->used && h->data[0].blocks_nonempty == 0
+		? h->data[0].idx : -1;
+}
+
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
+				      struct ec_stripe_heap_entry l,
+				      struct ec_stripe_heap_entry r)
+{
+	return ((l.blocks_nonempty > r.blocks_nonempty) -
+		(l.blocks_nonempty < r.blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+						   size_t i)
+{
+	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+
+	genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i;
+}
+
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
+
+	BUG_ON(!m->alive);
+	BUG_ON(m->heap_idx >= h->used);
+	BUG_ON(h->data[m->heap_idx].idx != idx);
+}
+
+static inline unsigned stripe_entry_blocks(struct ec_stripe *m)
+{
+	return atomic_read(&m->pin)
+		? UINT_MAX : atomic_read(&m->blocks_nonempty);
+}
+
+void bch2_stripes_heap_update(struct bch_fs *c,
+			      struct ec_stripe *m, size_t idx)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	bool queue_delete;
+	size_t i;
+
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	if (!m->alive) {
+		spin_unlock(&c->ec_stripes_heap_lock);
+		return;
+	}
+
+	heap_verify_backpointer(c, idx);
+
+	h->data[m->heap_idx].blocks_nonempty =
+		stripe_entry_blocks(m);
+
+	i = m->heap_idx;
+	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
+		     ec_stripes_heap_set_backpointer);
+	heap_sift_down(h, i, ec_stripes_heap_cmp,
+		       ec_stripes_heap_set_backpointer);
+
+	heap_verify_backpointer(c, idx);
+
+	queue_delete = stripe_idx_to_delete(c) >= 0;
+	spin_unlock(&c->ec_stripes_heap_lock);
+
+	if (queue_delete)
+		schedule_work(&c->ec_stripe_delete_work);
+}
+
+void bch2_stripes_heap_del(struct bch_fs *c,
+			   struct ec_stripe *m, size_t idx)
+{
+	spin_lock(&c->ec_stripes_heap_lock);
+	heap_verify_backpointer(c, idx);
+
+	m->alive = false;
+	heap_del(&c->ec_stripes_heap, m->heap_idx,
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+	spin_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_insert(struct bch_fs *c,
+			      struct ec_stripe *m, size_t idx)
+{
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	BUG_ON(heap_full(&c->ec_stripes_heap));
+
+	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+			.idx = idx,
+			.blocks_nonempty = stripe_entry_blocks(m),
+		}),
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+	m->alive = true;
+
+	heap_verify_backpointer(c, idx);
+
+	spin_unlock(&c->ec_stripes_heap_lock);
+}
+
+static void ec_stripe_delete(struct bch_fs *c, unsigned idx)
+{
+	struct btree_iter iter;
+	struct bch_stripe *v = NULL;
+	struct bkey_s_c k;
+	struct bkey_i delete;
+	u64 journal_seq = 0;
+
+	bch2_btree_iter_init(&iter, c, BTREE_ID_EC,
+			     POS(0, idx),
+			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_slot(&iter);
+	if (btree_iter_err(k) || k.k->type != BCH_STRIPE)
+		goto out;
+
+	v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL);
+	BUG_ON(!v);
+	memcpy(v, bkey_s_c_to_stripe(k).v, bkey_val_bytes(k.k));
+
+	bkey_init(&delete.k);
+	delete.k.p = iter.pos;
+
+	bch2_btree_insert_at(c, NULL, &journal_seq,
+			     BTREE_INSERT_NOFAIL|
+			     BTREE_INSERT_USE_RESERVE|
+			     BTREE_INSERT_NOUNLOCK,
+			     BTREE_INSERT_ENTRY(&iter, &delete));
+out:
+	bch2_btree_iter_unlock(&iter);
+	kfree(v);
+}
+
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+	struct bch_fs *c =
+		container_of(work, struct bch_fs, ec_stripe_delete_work);
+	ssize_t idx;
+
+	down_read(&c->gc_lock);
+
+	while (1) {
+		spin_lock(&c->ec_stripes_heap_lock);
+		idx = stripe_idx_to_delete(c);
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		if (idx < 0)
+			break;
+
+		ec_stripe_delete(c, idx);
+	}
+
+	up_read(&c->gc_lock);
+}
+
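+/*
+ * Find an unused slot in the stripes btree, make sure the corresponding in
+ * memory stripe exists and is pinned, then insert the new stripe key there:
+ */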
+static int ec_stripe_bkey_insert(struct bch_fs *c,
+				 struct bkey_i_stripe *stripe)
+{
+	struct ec_stripe *m;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	/* XXX: start pos hint */
+retry:
+	for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN,
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
+			bch2_btree_iter_unlock(&iter);
+			return -ENOSPC;
+		}
+
+		if (bkey_deleted(k.k))
+			goto found_slot;
+	}
+
+	return bch2_btree_iter_unlock(&iter) ?: -ENOSPC;
+found_slot:
+	mutex_lock(&c->ec_stripes_lock);
+	ret = ec_stripe_mem_alloc(c, &iter);
+	mutex_unlock(&c->ec_stripes_lock);
+
+	if (ret == -EINTR)
+		goto retry;
+	if (ret)
+		return ret;
+
+	m = genradix_ptr(&c->ec_stripes, iter.pos.offset);
+	atomic_inc(&m->pin);
+
+	stripe->k.p = iter.pos;
+
+	ret = bch2_btree_insert_at(c, NULL, NULL,
+				   BTREE_INSERT_NOFAIL|
+				   BTREE_INSERT_USE_RESERVE,
+				   BTREE_INSERT_ENTRY(&iter, &stripe->k_i));
+	bch2_btree_iter_unlock(&iter);
+
+	if (ret)
+		atomic_dec(&m->pin);
+
+	return ret;
+}
+
+/* stripe creation: */
+
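+/* Add a stripe pointer entry directly after @ptr, shifting later entries up: */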
+static void extent_stripe_ptr_add(struct bkey_s_extent e,
+				  struct ec_stripe_buf *s,
+				  struct bch_extent_ptr *ptr,
+				  unsigned block)
+{
+	struct bch_extent_stripe_ptr *dst = (void *) ptr;
+	union bch_extent_entry *end = extent_entry_last(e);
+
+	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
+	e.k->u64s += sizeof(*dst) / sizeof(u64);
+
+	*dst = (struct bch_extent_stripe_ptr) {
+		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+		.block		= block,
+		.idx		= s->key.k.p.offset,
+	};
+}
+
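+/*
+ * For each extent that was written into this stripe, mark the pointers to
+ * other devices cached and attach a stripe pointer entry referencing the new
+ * stripe key:
+ */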
+static int ec_stripe_update_ptrs(struct bch_fs *c,
+				 struct ec_stripe_buf *s,
+				 struct bkey *pos)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_extent e;
+	struct bch_extent_ptr *ptr;
+	BKEY_PADDED(k) tmp;
+	int ret = 0, dev, idx;
+
+	bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+			     bkey_start_pos(pos),
+			     BTREE_ITER_INTENT);
+
+	while ((k = bch2_btree_iter_peek(&iter)).k &&
+	       !btree_iter_err(k) &&
+	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+		idx = extent_matches_stripe(c, &s->key.v, k);
+		if (idx < 0) {
+			bch2_btree_iter_next(&iter);
+			continue;
+		}
+
+		dev = s->key.v.ptrs[idx].dev;
+
+		bkey_reassemble(&tmp.k, k);
+		e = bkey_i_to_s_extent(&tmp.k);
+
+		extent_for_each_ptr(e, ptr)
+			if (ptr->dev != dev)
+				ptr->cached = true;
+
+		ptr = (void *) bch2_extent_has_device(e.c, dev);
+		BUG_ON(!ptr);
+
+		extent_stripe_ptr_add(e, s, ptr, idx);
+
+		ret = bch2_btree_insert_at(c, NULL, NULL,
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_USE_RESERVE,
+				BTREE_INSERT_ENTRY(&iter, &tmp.k));
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			break;
+	}
+
+	return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/*
+ * data buckets of new stripe all written: create the stripe
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+	struct ec_stripe *ec_stripe;
+	struct bch_fs *c = s->c;
+	struct open_bucket *ob;
+	struct bkey_i *k;
+	struct bch_stripe *v = &s->stripe.key.v;
+	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+	struct closure cl;
+	int ret;
+
+	BUG_ON(s->h->s == s);
+
+	closure_init_stack(&cl);
+
+	if (s->err) {
+		bch_err(c, "error creating stripe: error writing data buckets");
+		goto err;
+	}
+
+	if (!percpu_ref_tryget(&c->writes))
+		goto err;
+
+	BUG_ON(bitmap_weight(s->blocks_allocated,
+			     s->blocks.nr) != s->blocks.nr);
+
+	ec_generate_ec(&s->stripe);
+
+	ec_generate_checksums(&s->stripe);
+
+	/* write p/q: */
+	for (i = nr_data; i < v->nr_blocks; i++)
+		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
+
+	closure_sync(&cl);
+
+	for (i = nr_data; i < v->nr_blocks; i++)
+		if (!test_bit(i, s->stripe.valid)) {
+			bch_err(c, "error creating stripe: error writing redundancy buckets");
+			goto err_put_writes;
+		}
+
+	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
+	if (ret) {
+		bch_err(c, "error creating stripe: error creating stripe key");
+		goto err_put_writes;
+	}
+
+	for_each_keylist_key(&s->keys, k) {
+		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+		if (ret)
+			break;
+	}
+
+	ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset);
+
+	atomic_dec(&ec_stripe->pin);
+	bch2_stripes_heap_update(c, ec_stripe,
+				 s->stripe.key.k.p.offset);
+
+err_put_writes:
+	percpu_ref_put(&c->writes);
+err:
+	open_bucket_for_each(c, &s->blocks, ob, i) {
+		ob->ec = NULL;
+		__bch2_open_bucket_put(c, ob);
+	}
+
+	bch2_open_buckets_put(c, &s->parity);
+
+	bch2_keylist_free(&s->keys, s->inline_keys);
+
+	mutex_lock(&s->h->lock);
+	list_del(&s->list);
+	mutex_unlock(&s->h->lock);
+
+	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
+		kvpfree(s->stripe.data[i], s->stripe.size << 9);
+	kfree(s);
+}
+
+static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = h->s;
+
+	list_add(&s->list, &h->stripes);
+	h->s = NULL;
+
+	return s;
+}
+
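+/*
+ * Drop a reference to a stripe being built; dropping the last reference
+ * triggers stripe creation:
+ */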
+static void ec_stripe_new_put(struct ec_stripe_new *s)
+{
+	BUG_ON(atomic_read(&s->pin) <= 0);
+	if (atomic_dec_and_test(&s->pin))
+		ec_stripe_create(s);
+}
+
+/* have a full bucket - hand it off to be erasure coded: */
+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct ec_stripe_new *s = ob->ec;
+
+	if (ob->sectors_free)
+		s->err = -1;
+
+	ec_stripe_new_put(s);
+}
+
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct ec_stripe_new *s = ob->ec;
+
+	s->err = -EIO;
+}
+
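+/*
+ * If the write point is allocating out of a stripe being built, return the
+ * position in the stripe buffer corresponding to the current offset in the
+ * bucket, so the write path can keep a copy of the data for computing
+ * parity:
+ */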
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	struct bch_dev *ca;
+	unsigned offset;
+
+	if (!ob)
+		return NULL;
+
+	ca	= bch_dev_bkey_exists(c, ob->ptr.dev);
+	offset	= ca->mi.bucket_size - ob->sectors_free;
+
+	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+}
+
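+/*
+ * Note an extent written via this write point, so that once the stripe is
+ * created ec_stripe_update_ptrs() can revisit it and add stripe pointers:
+ */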
+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
+			     struct bpos pos, unsigned sectors)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	struct ec_stripe_new *ec;
+
+	if (!ob)
+		return;
+
+	ec = ob->ec;
+	mutex_lock(&ec->lock);
+
+	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
+				 ARRAY_SIZE(ec->inline_keys),
+				 BKEY_U64s))
+		BUG();
+
+	bkey_init(&ec->keys.top->k);
+	ec->keys.top->k.p	= pos;
+	bch2_key_resize(&ec->keys.top->k, sectors);
+	bch2_keylist_push(&ec->keys);
+
+	mutex_unlock(&ec->lock);
+}
+
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+	unsigned l = *((const unsigned *) _l);
+	unsigned r = *((const unsigned *) _r);
+
+	return (l > r) - (l < r);
+}
+
+/* pick most common bucket size: */
+static unsigned pick_blocksize(struct bch_fs *c,
+			       struct bch_devs_mask *devs)
+{
+	struct bch_dev *ca;
+	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+	struct {
+		unsigned nr, size;
+	} cur = { 0, 0 }, best = { 0, 0 };
+
+	for_each_member_device_rcu(ca, c, i, devs)
+		sizes[nr++] = ca->mi.bucket_size;
+
+	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+	for (i = 0; i < nr; i++) {
+		if (sizes[i] != cur.size) {
+			if (cur.nr > best.nr)
+				best = cur;
+
+			cur.nr = 0;
+			cur.size = sizes[i];
+		}
+
+		cur.nr++;
+	}
+
+	if (cur.nr > best.nr)
+		best = cur;
+
+	return best.size;
+}
+
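+/*
+ * Start building a new stripe: allocate the in memory state and data
+ * buffers, taking ownership of the data and parity buckets currently held by
+ * @h; the initial pin is dropped when the stripe is moved to the pending
+ * list (ec_stripe_set_pending()):
+ */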
+int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s;
+	unsigned i;
+
+	BUG_ON(h->parity.nr != h->redundancy);
+	BUG_ON(!h->blocks.nr);
+	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
+	lockdep_assert_held(&h->lock);
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	mutex_init(&s->lock);
+	atomic_set(&s->pin, 1);
+	s->c		= c;
+	s->h		= h;
+	s->blocks	= h->blocks;
+	s->parity	= h->parity;
+
+	memset(&h->blocks, 0, sizeof(h->blocks));
+	memset(&h->parity, 0, sizeof(h->parity));
+
+	bch2_keylist_init(&s->keys, s->inline_keys);
+
+	s->stripe.offset	= 0;
+	s->stripe.size		= h->blocksize;
+	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
+
+	ec_stripe_key_init(c, &s->stripe.key,
+			   &s->blocks, &s->parity,
+			   h->blocksize);
+
+	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
+		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
+		if (!s->stripe.data[i])
+			goto err;
+	}
+
+	h->s = s;
+
+	return 0;
+err:
+	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
+		kvpfree(s->stripe.data[i], s->stripe.size << 9);
+	kfree(s);
+	return -ENOMEM;
+}
+
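+/*
+ * Allocate a stripe head for the given target/algorithm/redundancy: only
+ * devices in the target with nonzero durability are used, and the stripe
+ * blocksize is the most common bucket size among them. Returns with h->lock
+ * held:
+ */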
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
+			 unsigned algo, unsigned redundancy)
+{
+	struct ec_stripe_head *h;
+	struct bch_dev *ca;
+	unsigned i;
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return NULL;
+
+	mutex_init(&h->lock);
+	mutex_lock(&h->lock);
+	INIT_LIST_HEAD(&h->stripes);
+
+	h->target	= target;
+	h->algo		= algo;
+	h->redundancy	= redundancy;
+
+	rcu_read_lock();
+	h->devs = target_rw_devs(c, BCH_DATA_USER, target);
+
+	for_each_member_device_rcu(ca, c, i, &h->devs)
+		if (!ca->mi.durability)
+			__clear_bit(i, h->devs.d);
+
+	h->blocksize = pick_blocksize(c, &h->devs);
+
+	for_each_member_device_rcu(ca, c, i, &h->devs)
+		if (ca->mi.bucket_size == h->blocksize)
+			h->nr_active_devs++;
+
+	rcu_read_unlock();
+	list_add(&h->list, &c->ec_new_stripe_list);
+	return h;
+}
+
+void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = NULL;
+
+	if (h->s &&
+	    bitmap_weight(h->s->blocks_allocated,
+			  h->s->blocks.nr) == h->s->blocks.nr)
+		s = ec_stripe_set_pending(h);
+
+	mutex_unlock(&h->lock);
+
+	if (s)
+		ec_stripe_new_put(s);
+}
+
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+					       unsigned target,
+					       unsigned algo,
+					       unsigned redundancy)
+{
+	struct ec_stripe_head *h;
+
+	if (!redundancy)
+		return NULL;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list)
+		if (h->target		== target &&
+		    h->algo		== algo &&
+		    h->redundancy	== redundancy) {
+			mutex_lock(&h->lock);
+			goto found;
+		}
+
+	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+found:
+	mutex_unlock(&c->ec_new_stripe_lock);
+	return h;
+}
+
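+/*
+ * Stop using @ca for new stripes: drop its buckets from every stripe head,
+ * and if a stripe currently being built has a block on the device, flag it
+ * as errored and move it to the pending list so it completes and releases
+ * its buckets:
+ */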
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	unsigned i;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+		struct ec_stripe_new *s = NULL;
+
+		mutex_lock(&h->lock);
+		bch2_open_buckets_stop_dev(c, ca,
+					   &h->blocks,
+					   BCH_DATA_USER);
+		bch2_open_buckets_stop_dev(c, ca,
+					   &h->parity,
+					   BCH_DATA_USER);
+
+		if (!h->s)
+			goto unlock;
+
+		open_bucket_for_each(c, &h->s->blocks, ob, i)
+			if (ob->ptr.dev == ca->dev_idx)
+				goto found;
+		open_bucket_for_each(c, &h->s->parity, ob, i)
+			if (ob->ptr.dev == ca->dev_idx)
+				goto found;
+		goto unlock;
+found:
+		h->s->err = -1;
+		s = ec_stripe_set_pending(h);
+unlock:
+		mutex_unlock(&h->lock);
+
+		if (s)
+			ec_stripe_new_put(s);
+	}
+	mutex_unlock(&c->ec_new_stripe_lock);
+}
+
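+/*
+ * At startup, find the highest stripe index present in the EC btree and size
+ * the in memory heap and radix tree to match:
+ */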
+int bch2_fs_ec_start(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	size_t i, idx = 0;
+	int ret = 0;
+
+	bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0);
+
+	k = bch2_btree_iter_prev(&iter);
+	if (!IS_ERR_OR_NULL(k.k))
+		idx = k.k->p.offset + 1;
+	ret = bch2_btree_iter_unlock(&iter);
+	if (ret)
+		return ret;
+
+	if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx + 1),
+		       GFP_KERNEL))
+		return -ENOMEM;
+#if 0
+	ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL);
+#else
+	for (i = 0; i < idx; i++)
+		if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL))
+			return -ENOMEM;
+#endif
+	return 0;
+}
+
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+	struct ec_stripe_head *h;
+
+	while (1) {
+		mutex_lock(&c->ec_new_stripe_lock);
+		h = list_first_entry_or_null(&c->ec_new_stripe_list,
+					     struct ec_stripe_head, list);
+		if (h)
+			list_del(&h->list);
+		mutex_unlock(&c->ec_new_stripe_lock);
+		if (!h)
+			break;
+
+		BUG_ON(h->s);
+		BUG_ON(!list_empty(&h->stripes));
+		kfree(h);
+	}
+
+	free_heap(&c->ec_stripes_heap);
+	genradix_free(&c->ec_stripes);
+	bioset_exit(&c->ec_bioset);
+}
+
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+
+	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+			   BIOSET_NEED_BVECS);
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
new file mode 100644
index 0000000..bcf0652
--- /dev/null
+++ b/fs/bcachefs/ec.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "keylist_types.h"
+
+const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *,
+			 struct bkey_s_c);
+
+#define bch2_bkey_ec_ops (struct bkey_ops) {		\
+	.key_invalid	= bch2_ec_key_invalid,		\
+	.val_to_text	= bch2_ec_key_to_text,		\
+}
+
+struct bch_read_bio;
+
+struct ec_stripe_buf {
+	/* might not be buffering the entire stripe: */
+	unsigned		offset;
+	unsigned		size;
+	unsigned long		valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
+
+	void			*data[EC_STRIPE_MAX];
+
+	union {
+		struct bkey_i_stripe	key;
+		u64			pad[255];
+	};
+};
+
+struct ec_stripe_head;
+
+struct ec_stripe_new {
+	struct bch_fs		*c;
+	struct ec_stripe_head	*h;
+	struct mutex		lock;
+	struct list_head	list;
+
+	/* counts in flight writes, stripe is created when pin == 0 */
+	atomic_t		pin;
+
+	int			err;
+
+	unsigned long		blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
+
+	struct open_buckets	blocks;
+	struct open_buckets	parity;
+
+	struct keylist		keys;
+	u64			inline_keys[BKEY_U64s * 8];
+
+	struct ec_stripe_buf	stripe;
+};
+
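+/*
+ * Allocation context for building new stripes with a given target, algorithm
+ * and redundancy; @s is the stripe currently being filled, @stripes holds
+ * fully allocated stripes waiting for their data writes to complete:
+ */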
+struct ec_stripe_head {
+	struct list_head	list;
+	struct mutex		lock;
+
+	struct list_head	stripes;
+
+	unsigned		target;
+	unsigned		algo;
+	unsigned		redundancy;
+
+	struct bch_devs_mask	devs;
+	unsigned		nr_active_devs;
+
+	unsigned		blocksize;
+
+	struct dev_stripe_state	block_stripe;
+	struct dev_stripe_state	parity_stripe;
+
+	struct open_buckets	blocks;
+	struct open_buckets	parity;
+
+	struct ec_stripe_new	*s;
+};
+
+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
+			     struct bpos, unsigned);
+
+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
+					       unsigned, unsigned);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t);
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+
+void bch2_ec_flush_new_stripes(struct bch_fs *);
+
+int bch2_fs_ec_start(struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
new file mode 100644
index 0000000..00e89c3
--- /dev/null
+++ b/fs/bcachefs/ec_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include <linux/llist.h>
+
+#define EC_STRIPE_MAX	16
+
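+/* In memory state for each stripe in the EC btree, indexed by key offset: */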
+struct ec_stripe {
+	size_t			heap_idx;
+
+	u16			sectors;
+	u8			algorithm;
+
+	u8			nr_blocks;
+	u8			nr_redundant;
+
+	u8			alive;
+	atomic_t		pin;
+	atomic_t		blocks_nonempty;
+	atomic_t		block_sectors[EC_STRIPE_MAX];
+};
+
+struct ec_stripe_heap_entry {
+	size_t			idx;
+	unsigned		blocks_nonempty;
+};
+
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index df0ca1f..9bb4e102 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
 	return nr_ptrs;
 }
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *c,
-				    const struct bch_extent_ptr *ptr)
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+					   struct extent_ptr_decoded p)
 {
+	unsigned i, durability = 0;
 	struct bch_dev *ca;
 
-	if (ptr->cached)
+	if (p.ptr.cached)
 		return 0;
 
-	ca = bch_dev_bkey_exists(c, ptr->dev);
+	ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
-	if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-		return 0;
+	if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+		durability = max_t(unsigned, durability, ca->mi.durability);
 
-	return ca->mi.durability;
+	for (i = 0; i < p.ec_nr; i++) {
+		struct ec_stripe *s =
+			genradix_ptr(&c->ec_stripes, p.ec[i].idx);
+
+		if (WARN_ON(!s))
+			continue;
+
+		durability = max_t(unsigned, durability, s->nr_redundant);
+	}
+
+	return durability;
 }
 
 unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
 {
-	const struct bch_extent_ptr *ptr;
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
 	unsigned durability = 0;
 
-	extent_for_each_ptr(e, ptr)
-		durability += bch2_extent_ptr_durability(c, ptr);
+	extent_for_each_ptr_decode(e, p, entry)
+		durability += bch2_extent_ptr_durability(c, p);
 
 	return durability;
 }
@@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
 	return false;
 }
 
+static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
+					  union bch_extent_entry *entry)
+{
+	union bch_extent_entry *i = e.v->start;
+
+	if (i == entry)
+		return NULL;
+
+	while (extent_entry_next(i) != entry)
+		i = extent_entry_next(i);
+	return i;
+}
+
 union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
 					     struct bch_extent_ptr *ptr)
 {
-	union bch_extent_entry *dst;
-	union bch_extent_entry *src;
+	union bch_extent_entry *dst, *src, *prev;
+	bool drop_crc = true;
 
 	EBUG_ON(ptr < &e.v->start->ptr ||
 		ptr >= &extent_entry_last(e)->ptr);
 	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
 
-	src = to_entry(ptr + 1);
-
+	src = extent_entry_next(to_entry(ptr));
 	if (src != extent_entry_last(e) &&
-	    extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) {
-		dst = to_entry(ptr);
-	} else {
-		extent_for_each_entry(e, dst) {
-			if (dst == to_entry(ptr))
-				break;
+	    !extent_entry_is_crc(src))
+		drop_crc = false;
 
-			if (extent_entry_next(dst) == to_entry(ptr) &&
-			    extent_entry_is_crc(dst))
-				break;
+	dst = to_entry(ptr);
+	while ((prev = extent_entry_prev(e, dst))) {
+		if (extent_entry_is_ptr(prev))
+			break;
+
+		if (extent_entry_is_crc(prev)) {
+			if (drop_crc)
+				dst = prev;
+			break;
 		}
+
+		dst = prev;
 	}
 
 	memmove_u64s_down(dst, src,
@@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
 				entry->crc128.csum.lo = (__force __le64)
 					swab64((__force u64) entry->crc128.csum.lo);
 				break;
+			case BCH_EXTENT_ENTRY_stripe_ptr:
+				break;
 			}
 		}
 		break;
@@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
 	const union bch_extent_entry *entry;
 	struct bch_extent_crc_unpacked crc;
 	const struct bch_extent_ptr *ptr;
+	const struct bch_extent_stripe_ptr *ec;
 	struct bch_dev *ca;
 	bool first = true;
 
@@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
 			pr_buf(out, " ");
 
 		switch (__extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			ptr = entry_to_ptr(entry);
+			ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+				? bch_dev_bkey_exists(c, ptr->dev)
+				: NULL;
+
+			pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
+			       (u64) ptr->offset, ptr->gen,
+			       ptr->cached ? " cached" : "",
+			       ca && ptr_stale(ca, ptr)
+			       ? " stale" : "");
+			break;
 		case BCH_EXTENT_ENTRY_crc32:
 		case BCH_EXTENT_ENTRY_crc64:
 		case BCH_EXTENT_ENTRY_crc128:
@@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
 			       crc.csum_type,
 			       crc.compression_type);
 			break;
-		case BCH_EXTENT_ENTRY_ptr:
-			ptr = entry_to_ptr(entry);
-			ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
-				? bch_dev_bkey_exists(c, ptr->dev)
-				: NULL;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			ec = &entry->stripe_ptr;
 
-			pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
-			       (u64) ptr->offset, ptr->gen,
-			       ptr->cached ? " cached" : "",
-			       ca && ptr_stale(ca, ptr)
-			       ? " stale" : "");
+			pr_buf(out, "ec: idx %llu block %u",
+			       (u64) ec->idx, ec->block);
 			break;
 		default:
 			pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
@@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
 
 		f = &failed->devs[failed->nr++];
 		f->dev		= p->ptr.dev;
+		f->idx		= p->idx;
+		f->nr_failed	= 1;
+		f->nr_retries	= 0;
+	} else if (p->idx != f->idx) {
+		f->idx		= p->idx;
 		f->nr_failed	= 1;
 		f->nr_retries	= 0;
 	} else {
@@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c,
 			      const struct extent_ptr_decoded p1,
 			      const struct extent_ptr_decoded p2)
 {
-	struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
-	struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+	if (likely(!p1.idx && !p2.idx)) {
+		struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+		struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
 
-	u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
-	u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+		u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+		u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
 
-	/* Pick at random, biased in favor of the faster device: */
+		/* Pick at random, biased in favor of the faster device: */
 
-	return bch2_rand_range(l1 + l2) > l1;
+		return bch2_rand_range(l1 + l2) > l1;
+	}
+
+	if (force_reconstruct_read(c))
+		return p1.idx > p2.idx;
+
+	return p1.idx < p2.idx;
 }
 
 static int extent_pick_read_device(struct bch_fs *c,
@@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c,
 			continue;
 
 		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
-		if (f && f->nr_failed >= f->nr_retries)
+		if (f)
+			p.idx = f->nr_failed < f->nr_retries
+				? f->idx
+				: f->idx + 1;
+
+		if (!p.idx &&
+		    !bch2_dev_is_readable(ca))
+			p.idx++;
+
+		if (force_reconstruct_read(c) &&
+		    !p.idx && p.ec_nr)
+			p.idx++;
+
+		if (p.idx >= p.ec_nr + 1)
 			continue;
 
 		if (ret && !ptr_better(c, p, *pick))
@@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 			if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
 				return "invalid extent entry type";
 
-			if (extent_entry_is_crc(entry))
-				return "has crc field";
+			if (!extent_entry_is_ptr(entry))
+				return "has non ptr field";
 		}
 
 		extent_for_each_ptr(e, ptr) {
@@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
 			case BCH_EXTENT_ENTRY_crc128:
 				entry->crc128.offset += e.k->size - len;
 				break;
+			case BCH_EXTENT_ENTRY_stripe_ptr:
+				break;
 			}
 
 			if (extent_entry_is_crc(entry))
@@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 			if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
 				return "invalid extent entry type";
 
-			if (extent_entry_is_crc(entry)) {
+			switch (extent_entry_type(entry)) {
+			case BCH_EXTENT_ENTRY_ptr:
+				ptr = entry_to_ptr(entry);
+
+				reason = extent_ptr_invalid(c, e, &entry->ptr,
+							    size_ondisk, false);
+				if (reason)
+					return reason;
+				break;
+			case BCH_EXTENT_ENTRY_crc32:
+			case BCH_EXTENT_ENTRY_crc64:
+			case BCH_EXTENT_ENTRY_crc128:
 				crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
 
 				if (crc.offset + e.k->size >
@@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 					else if (nonce != crc.offset + crc.nonce)
 						return "incorrect nonce";
 				}
-			} else {
-				ptr = entry_to_ptr(entry);
-
-				reason = extent_ptr_invalid(c, e, &entry->ptr,
-							    size_ondisk, false);
-				if (reason)
-					return reason;
+				break;
+			case BCH_EXTENT_ENTRY_stripe_ptr:
+				break;
 			}
 		}
 
@@ -1756,6 +1827,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
 {
 	struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL);
 	union bch_extent_entry *pos;
+	unsigned i;
 
 	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
 		pos = e->v.start;
@@ -1773,6 +1845,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
 found:
 	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
 	__extent_entry_insert(e, pos, to_entry(&p->ptr));
+
+	for (i = 0; i < p->ec_nr; i++) {
+		p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+		__extent_entry_insert(e, pos, to_entry(&p->ec[i]));
+	}
 }
 
 /*
@@ -1827,26 +1904,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
 				      unsigned target,
 				      unsigned nr_desired_replicas)
 {
-	struct bch_extent_ptr *ptr;
+	union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
 	int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
 
 	if (target && extra > 0)
-		extent_for_each_ptr(e, ptr) {
-			int n = bch2_extent_ptr_durability(c, ptr);
+		extent_for_each_ptr_decode(e, p, entry) {
+			int n = bch2_extent_ptr_durability(c, p);
 
 			if (n && n <= extra &&
-			    !bch2_dev_in_target(c, ptr->dev, target)) {
-				ptr->cached = true;
+			    !bch2_dev_in_target(c, p.ptr.dev, target)) {
+				entry->ptr.cached = true;
 				extra -= n;
 			}
 		}
 
 	if (extra > 0)
-		extent_for_each_ptr(e, ptr) {
-			int n = bch2_extent_ptr_durability(c, ptr);
+		extent_for_each_ptr_decode(e, p, entry) {
+			int n = bch2_extent_ptr_durability(c, p);
 
 			if (n && n <= extra) {
-				ptr->cached = true;
+				entry->ptr.cached = true;
 				extra -= n;
 			}
 		}
@@ -1922,7 +2000,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
 
 			if ((extent_entry_type(en_l) !=
 			     extent_entry_type(en_r)) ||
-			    extent_entry_is_crc(en_l))
+			    !extent_entry_is_ptr(en_l))
 				return BCH_MERGE_NOMERGE;
 
 			lp = &en_l->ptr;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index d121ce5..15865b2 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -96,8 +96,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
 unsigned bch2_extent_is_compressed(struct bkey_s_c);
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *,
-				    const struct bch_extent_ptr *);
 unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
 
 bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@@ -362,20 +360,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 
 /* Iterate over pointers, with crcs: */
 
-static inline struct extent_ptr_decoded
-__extent_ptr_decoded_init(const struct bkey *k)
-{
-	return (struct extent_ptr_decoded) {
-		.crc		= bch2_extent_crc_unpack(k, NULL),
-	};
-}
-
-#define EXTENT_ITERATE_EC		(1 << 0)
-
 #define __extent_ptr_next_decode(_e, _ptr, _entry)			\
 ({									\
 	__label__ out;							\
 									\
+	(_ptr).idx	= 0;						\
+	(_ptr).ec_nr	= 0;						\
+									\
 	extent_for_each_entry_from(_e, _entry, _entry)			\
 		switch (extent_entry_type(_entry)) {			\
 		case BCH_EXTENT_ENTRY_ptr:				\
@@ -387,14 +378,16 @@ __extent_ptr_decoded_init(const struct bkey *k)
 			(_ptr).crc = bch2_extent_crc_unpack((_e).k,	\
 					entry_to_crc(_entry));		\
 			break;						\
+		case BCH_EXTENT_ENTRY_stripe_ptr:			\
+			(_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr;	\
+			break;						\
 		}							\
-									\
 out:									\
 	_entry < extent_entry_last(_e);					\
 })
 
 #define extent_for_each_ptr_decode(_e, _ptr, _entry)			\
-	for ((_ptr) = __extent_ptr_decoded_init((_e).k),		\
+	for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL),		\
 	     (_entry) = (_e).v->start;					\
 	     __extent_ptr_next_decode(_e, _ptr, _entry);		\
 	     (_entry) = extent_entry_next(_entry))
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index 5738738..a85cda0e7 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -20,14 +20,18 @@ struct bch_extent_crc_unpacked {
 };
 
 struct extent_ptr_decoded {
+	unsigned			idx;
+	unsigned			ec_nr;
 	struct bch_extent_crc_unpacked	crc;
 	struct bch_extent_ptr		ptr;
+	struct bch_extent_stripe_ptr	ec[4];
 };
 
 struct bch_io_failures {
 	u8			nr;
 	struct bch_dev_io_failures {
 		u8		dev;
+		u8		idx;
 		u8		nr_failed;
 		u8		nr_retries;
 	}			devs[BCH_REPLICAS_MAX];
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index fbd0a82..2fee2f2 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -16,6 +16,7 @@
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "extents.h"
 #include "io.h"
@@ -319,6 +320,7 @@ static void __bch2_write_index(struct bch_write_op *op)
 	struct bkey_s_extent e;
 	struct bch_extent_ptr *ptr;
 	struct bkey_i *src, *dst = keys->keys, *n, *k;
+	unsigned dev;
 	int ret;
 
 	for (src = keys->keys; src != keys->top; src = n) {
@@ -362,6 +364,10 @@ static void __bch2_write_index(struct bch_write_op *op)
 		}
 	}
 out:
+	/* If a bucket wasn't written, we can't erasure code it: */
+	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+		bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
 	bch2_open_buckets_put(c, &op->open_buckets);
 	return;
 err:
@@ -442,7 +448,8 @@ static void init_append_extent(struct bch_write_op *op,
 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
 					struct write_point *wp,
 					struct bio *src,
-					bool *page_alloc_failed)
+					bool *page_alloc_failed,
+					void *buf)
 {
 	struct bch_write_bio *wbio;
 	struct bio *bio;
@@ -453,11 +460,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
 	bio = bio_alloc_bioset(NULL, pages, 0,
 			       GFP_NOIO, &c->bio_write);
 	wbio			= wbio_init(bio);
-	wbio->bounce		= true;
 	wbio->put_bio		= true;
 	/* copy WRITE_SYNC flag */
 	wbio->bio.bi_opf	= src->bi_opf;
 
+	if (buf) {
+		bio->bi_iter.bi_size = output_available;
+		bch2_bio_map(bio, buf);
+		return bio;
+	}
+
+	wbio->bounce		= true;
+
 	/*
 	 * We can't use mempool for more than c->sb.encoded_extent_max
 	 * worth of pages, but we'd like to allocate more if we can:
@@ -622,14 +636,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 	struct bio *src = &op->wbio.bio, *dst = src;
 	struct bvec_iter saved_iter;
 	struct bkey_i *key_to_write;
+	void *ec_buf;
 	unsigned key_to_write_offset = op->insert_keys.top_p -
 		op->insert_keys.keys_p;
-	unsigned total_output = 0;
-	bool bounce = false, page_alloc_failed = false;
+	unsigned total_output = 0, total_input = 0;
+	bool bounce = false;
+	bool page_alloc_failed = false;
 	int ret, more = 0;
 
 	BUG_ON(!bio_sectors(src));
 
+	ec_buf = bch2_writepoint_ec_buf(c, wp);
+
 	switch (bch2_write_prep_encoded_data(op, wp)) {
 	case PREP_ENCODED_OK:
 		break;
@@ -639,16 +657,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 	case PREP_ENCODED_CHECKSUM_ERR:
 		goto csum_err;
 	case PREP_ENCODED_DO_WRITE:
+		if (ec_buf) {
+			dst = bch2_write_bio_alloc(c, wp, src,
+						   &page_alloc_failed,
+						   ec_buf);
+			bio_copy_data(dst, src);
+			bounce = true;
+		}
 		init_append_extent(op, wp, op->version, op->crc);
 		goto do_write;
 	}
 
-	if (op->compression_type ||
+	if (ec_buf ||
+	    op->compression_type ||
 	    (op->csum_type &&
 	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
 	    (bch2_csum_type_is_encryption(op->csum_type) &&
 	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
-		dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
+		dst = bch2_write_bio_alloc(c, wp, src,
+					   &page_alloc_failed,
+					   ec_buf);
 		bounce = true;
 	}
 
@@ -751,7 +779,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 		if (dst != src)
 			bio_advance(dst, dst_len);
 		bio_advance(src, src_len);
-		total_output += dst_len;
+		total_output	+= dst_len;
+		total_input	+= src_len;
 	} while (dst->bi_iter.bi_size &&
 		 src->bi_iter.bi_size &&
 		 wp->sectors_free &&
@@ -764,16 +793,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 
 	dst->bi_iter = saved_iter;
 
-	if (!bounce && more) {
-		dst = bio_split(src, total_output >> 9,
+	if (dst == src && more) {
+		BUG_ON(total_output != total_input);
+
+		dst = bio_split(src, total_input >> 9,
 				GFP_NOIO, &c->bio_write);
-		wbio_init(dst)->put_bio = true;
+		wbio_init(dst)->put_bio	= true;
+		/* copy WRITE_SYNC flag */
+		dst->bi_opf		= src->bi_opf;
 	}
 
 	dst->bi_iter.bi_size = total_output;
 
 	/* Free unneeded pages after compressing: */
-	if (bounce)
+	if (to_wbio(dst)->bounce)
 		while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
 			mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
 				     &c->bio_bounce_pages);
@@ -782,6 +815,10 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 
 	key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
 
+	bch2_ec_add_backpointer(c, wp,
+				bkey_start_pos(&key_to_write->k),
+				total_input >> 9);
+
 	dst->bi_end_io	= bch2_write_endio;
 	dst->bi_private	= &op->cl;
 	dst->bi_opf	= REQ_OP_WRITE;
@@ -796,10 +833,10 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 		"rewriting existing data (memory corruption?)");
 	ret = -EIO;
 err:
-	if (bounce) {
+	if (to_wbio(dst)->bounce)
 		bch2_bio_free_pages_pool(c, dst);
+	if (to_wbio(dst)->put_bio)
 		bio_put(dst);
-	}
 
 	return ret;
 }
@@ -811,6 +848,8 @@ static void __bch2_write(struct closure *cl)
 	struct write_point *wp;
 	int ret;
 again:
+	memset(&op->failed, 0, sizeof(op->failed));
+
 	do {
 		/* +1 for possible cache device: */
 		if (op->open_buckets.nr + op->nr_replicas + 1 >
@@ -825,6 +864,7 @@ static void __bch2_write(struct closure *cl)
 
 		wp = bch2_alloc_sectors_start(c,
 			op->target,
+			op->opts.erasure_code,
 			op->write_point,
 			&op->devs_have,
 			op->nr_replicas,
@@ -904,8 +944,6 @@ void bch2_write(struct closure *cl)
 
 	op->start_time = local_clock();
 
-	memset(&op->failed, 0, sizeof(op->failed));
-
 	bch2_keylist_init(&op->insert_keys, op->inline_keys);
 	wbio_init(&op->wbio.bio)->put_bio = false;
 
@@ -1576,8 +1614,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 	if (!pick_ret)
 		goto hole;
 
-	if (pick_ret < 0)
-		goto no_device;
+	if (pick_ret < 0) {
+		__bcache_io_error(c, "no device to read from");
+		goto err;
+	}
 
 	if (pick_ret > 0)
 		ca = bch_dev_bkey_exists(c, pick.ptr.dev);
@@ -1704,36 +1744,51 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 
 	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-	if (!rbio->have_ioref)
-		goto no_device_postclone;
-
 	percpu_down_read(&c->usage_lock);
 	bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
 	percpu_up_read(&c->usage_lock);
 
-	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
-		     bio_sectors(&rbio->bio));
+	if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
+		bio_inc_remaining(&orig->bio);
+		trace_read_split(&orig->bio);
+	}
 
-	bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
-	if (likely(!(flags & BCH_READ_IN_RETRY))) {
-		if (!(flags & BCH_READ_LAST_FRAGMENT)) {
-			bio_inc_remaining(&orig->bio);
-			trace_read_split(&orig->bio);
+	if (!rbio->pick.idx) {
+		if (!rbio->have_ioref) {
+			__bcache_io_error(c, "no device to read from");
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
 		}
 
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+			     bio_sectors(&rbio->bio));
+		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
 		if (unlikely(c->opts.no_data_io)) {
-			bio_endio(&rbio->bio);
-			return 0;
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				bio_endio(&rbio->bio);
+		} else {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				submit_bio(&rbio->bio);
+			else
+				submit_bio_wait(&rbio->bio);
+		}
+	} else {
+		/* Attempting reconstruct read: */
+		if (bch2_ec_read_extent(c, rbio)) {
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
 		}
 
-		submit_bio(&rbio->bio);
+		if (likely(!(flags & BCH_READ_IN_RETRY)))
+			bio_endio(&rbio->bio);
+	}
+out:
+	if (likely(!(flags & BCH_READ_IN_RETRY))) {
 		return 0;
 	} else {
 		int ret;
 
-		submit_bio_wait(&rbio->bio);
-
 		rbio->context = RBIO_CONTEXT_UNBOUND;
 		bch2_read_endio(&rbio->bio);
 
@@ -1748,22 +1803,12 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 		return ret;
 	}
 
-no_device_postclone:
-	if (!rbio->split)
-		rbio->bio.bi_end_io = rbio->end_io;
-	bch2_rbio_free(rbio);
-no_device:
-	__bcache_io_error(c, "no device to read from");
-
-	if (likely(!(flags & BCH_READ_IN_RETRY))) {
-		orig->bio.bi_status = BLK_STS_IOERR;
-
-		if (flags & BCH_READ_LAST_FRAGMENT)
-			bch2_rbio_done(orig);
-		return 0;
-	} else {
+err:
+	if (flags & BCH_READ_IN_RETRY)
 		return READ_ERR;
-	}
+
+	orig->bio.bi_status = BLK_STS_IOERR;
+	goto out_read_done;
 
 hole:
 	/*
@@ -1775,7 +1820,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 		orig->hole = true;
 
 	zero_fill_bio_iter(&orig->bio, iter);
-
+out_read_done:
 	if (flags & BCH_READ_LAST_FRAGMENT)
 		bch2_rbio_done(orig);
 	return 0;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b1f6433..6eea96a 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1071,7 +1071,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 	replicas = bch2_extent_nr_ptrs(e.c);
 
 	rcu_read_lock();
-	devs_sorted = bch2_wp_alloc_list(c, &j->wp,
+	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
 					 &c->rw_devs[BCH_DATA_JOURNAL]);
 
 	for (i = 0; i < devs_sorted.nr; i++) {
@@ -1098,8 +1098,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 		    sectors > ca->mi.bucket_size)
 			continue;
 
-		j->wp.next_alloc[ca->dev_idx] += U32_MAX;
-		bch2_wp_rescale(c, ca, &j->wp);
+		bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
 
 		ja->sectors_free = ca->mi.bucket_size - sectors;
 		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 74e92a1..4d86c4bc 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -279,11 +279,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
 	case Opt_background_compression:
 		ret = bch2_check_set_has_compressed_data(c, v);
 		break;
+	case Opt_erasure_code:
+		if (v &&
+		    !(c->sb.features & (1ULL << BCH_FEATURE_EC))) {
+			mutex_lock(&c->sb_lock);
+			c->disk_sb.sb->features[0] |=
+				cpu_to_le64(1ULL << BCH_FEATURE_EC);
+
+			bch2_write_super(c);
+			mutex_unlock(&c->sb_lock);
+		}
+		break;
 	}
 
 	return ret;
 }
 
+int bch2_opts_check_may_set(struct bch_fs *c)
+{
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < bch2_opts_nr; i++) {
+		ret = bch2_opt_check_may_set(c, i,
+				bch2_opt_get_by_id(&c->opts, i));
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
 {
 	char *opt, *name, *val;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 8f4fab7..80869e3 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -111,6 +111,9 @@ enum opt_type {
 	BCH_OPT(promote_target,		u16,	OPT_RUNTIME,		\
 		OPT_FN(bch2_opt_target),				\
 		BCH_SB_PROMOTE_TARGET,	0)				\
+	BCH_OPT(erasure_code,		u16,	OPT_RUNTIME,		\
+		OPT_BOOL(),						\
+		BCH_SB_ERASURE_CODE,		false)			\
 	BCH_OPT(inodes_32bit,		u8,	OPT_RUNTIME,		\
 		OPT_BOOL(),						\
 		BCH_SB_INODE_32BIT,		false)			\
@@ -270,6 +273,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
 		      const struct bch_option *, u64, unsigned);
 
 int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
 int bch2_parse_mount_opts(struct bch_opts *, char *);
 
 /* inode opts: */
@@ -281,7 +285,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *);
 	BCH_INODE_OPT(data_replicas,			8)	\
 	BCH_INODE_OPT(promote_target,			16)	\
 	BCH_INODE_OPT(foreground_target,		16)	\
-	BCH_INODE_OPT(background_target,		16)
+	BCH_INODE_OPT(background_target,		16)	\
+	BCH_INODE_OPT(erasure_code,			16)
 
 struct bch_io_opts {
 #define BCH_INODE_OPT(_name, _bits)	unsigned _name##_defined:1;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 1ae8133..ddfba16 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -7,6 +7,7 @@
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "dirent.h"
+#include "ec.h"
 #include "error.h"
 #include "fsck.h"
 #include "journal_io.h"
@@ -213,6 +214,11 @@ int bch2_fs_recovery(struct bch_fs *c)
 
 	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
+	err = "cannot allocate memory";
+	ret = bch2_fs_ec_start(c);
+	if (ret)
+		goto err;
+
 	bch_verbose(c, "starting mark and sweep:");
 	err = "error in recovery";
 	ret = bch2_initial_gc(c, &journal);
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 83fc9c9..0296931 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -80,9 +80,33 @@ static void extent_to_replicas(struct bkey_s_c k,
 
 		r->nr_required	= 1;
 
-		extent_for_each_ptr_decode(e, p, entry)
-			if (!p.ptr.cached)
-				r->devs[r->nr_devs++] = p.ptr.dev;
+		extent_for_each_ptr_decode(e, p, entry) {
+			if (p.ptr.cached)
+				continue;
+
+			if (p.ec_nr) {
+				r->nr_devs = 0;
+				break;
+			}
+
+			r->devs[r->nr_devs++] = p.ptr.dev;
+		}
+	}
+}
+
+static void stripe_to_replicas(struct bkey_s_c k,
+			       struct bch_replicas_entry *r)
+{
+	if (k.k->type == BCH_STRIPE) {
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+		const struct bch_extent_ptr *ptr;
+
+		r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
+
+		for (ptr = s.v->ptrs;
+		     ptr < s.v->ptrs + s.v->nr_blocks;
+		     ptr++)
+			r->devs[r->nr_devs++] = ptr->dev;
 	}
 }
 
@@ -101,6 +125,10 @@ static void bkey_to_replicas(enum bkey_type type,
 		e->data_type = BCH_DATA_USER;
 		extent_to_replicas(k, e);
 		break;
+	case BKEY_TYPE_EC:
+		e->data_type = BCH_DATA_USER;
+		stripe_to_replicas(k, e);
+		break;
 	default:
 		break;
 	}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 0715430..3dbcb6d 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -3,6 +3,7 @@
 #include "bcachefs.h"
 #include "checksum.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index a22beff..931e50e8 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -20,6 +20,7 @@
 #include "compress.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "fs.h"
 #include "fs-io.h"
@@ -364,6 +365,7 @@ static void bch2_fs_free(struct bch_fs *c)
 
 	bch2_fs_quota_exit(c);
 	bch2_fs_fsio_exit(c);
+	bch2_fs_ec_exit(c);
 	bch2_fs_encryption_exit(c);
 	bch2_fs_io_exit(c);
 	bch2_fs_btree_cache_exit(c);
@@ -544,6 +546,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	INIT_LIST_HEAD(&c->fsck_errors);
 	mutex_init(&c->fsck_error_lock);
 
+	INIT_LIST_HEAD(&c->ec_new_stripe_list);
+	mutex_init(&c->ec_new_stripe_lock);
+	mutex_init(&c->ec_stripes_lock);
+	spin_lock_init(&c->ec_stripes_heap_lock);
+
 	seqcount_init(&c->gc_pos_lock);
 
 	c->copy_gc_enabled		= 1;
@@ -612,6 +619,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_io_init(c) ||
 	    bch2_fs_encryption_init(c) ||
 	    bch2_fs_compress_init(c) ||
+	    bch2_fs_ec_init(c) ||
 	    bch2_fs_fsio_init(c))
 		goto err;
 
@@ -683,6 +691,10 @@ const char *bch2_fs_start(struct bch_fs *c)
 	if (ret)
 		goto err;
 
+	ret = bch2_opts_check_may_set(c);
+	if (ret)
+		goto err;
+
 	err = "dynamic fault";
 	if (bch2_fs_init_fault("fs_start"))
 		goto err;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 6a5da0f..188e195 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -19,6 +19,7 @@
 #include "btree_gc.h"
 #include "buckets.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "inode.h"
 #include "journal.h"
 #include "keylist.h"
@@ -188,6 +189,8 @@ sysfs_pd_controller_attribute(rebalance);
 read_attribute(rebalance_work);
 rw_attribute(promote_whole_extents);
 
+read_attribute(new_stripes);
+
 rw_attribute(pd_controllers_update_seconds);
 
 read_attribute(meta_replicas_have);
@@ -242,6 +245,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 			pr_buf(&out, "\t%s:\t\t%llu\n",
 			       bch2_data_types[type],
 			       stats.replicas[replicas].data[type]);
+		pr_buf(&out, "\terasure coded:\t%llu\n",
+		       stats.replicas[replicas].ec_data);
 		pr_buf(&out, "\treserved:\t%llu\n",
 		       stats.replicas[replicas].persistent_reserved);
 	}
@@ -310,6 +315,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 			compressed_sectors_uncompressed << 9);
 }
 
+static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
+{
+	char *out = buf, *end = buf + PAGE_SIZE;
+	struct ec_stripe_head *h;
+	struct ec_stripe_new *s;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+		out += scnprintf(out, end - out,
+				 "target %u algo %u redundancy %u:\n",
+				 h->target, h->algo, h->redundancy);
+
+		if (h->s)
+			out += scnprintf(out, end - out,
+					 "\tpending: blocks %u allocated %u\n",
+					 h->s->blocks.nr,
+					 bitmap_weight(h->s->blocks_allocated,
+						       h->s->blocks.nr));
+
+		mutex_lock(&h->lock);
+		list_for_each_entry(s, &h->stripes, list)
+			out += scnprintf(out, end - out,
+					 "\tin flight: blocks %u allocated %u pin %u\n",
+					 s->blocks.nr,
+					 bitmap_weight(s->blocks_allocated,
+						       s->blocks.nr),
+					 atomic_read(&s->pin));
+		mutex_unlock(&h->lock);
+
+	}
+	mutex_unlock(&c->ec_new_stripe_lock);
+
+	return out - buf;
+}
+
 SHOW(bch2_fs)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -369,6 +409,9 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_compression_stats)
 		return bch2_compression_stats(c, buf);
 
+	if (attr == &sysfs_new_stripes)
+		return bch2_new_stripes(c, buf);
+
 #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
 	BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
@@ -537,6 +580,8 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_rebalance_work,
 	sysfs_pd_controller_files(rebalance),
 
+	&sysfs_new_stripes,
+
 	&sysfs_internal_uuid,
 
 #define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -765,6 +810,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		"    meta:               %llu\n"
 		"    user:               %llu\n"
 		"    cached:             %llu\n"
+		"    erasure coded:      %llu\n"
 		"    available:          %lli\n"
 		"sectors:\n"
 		"    sb:                 %llu\n"
@@ -788,6 +834,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		stats.buckets[BCH_DATA_BTREE],
 		stats.buckets[BCH_DATA_USER],
 		stats.buckets[BCH_DATA_CACHED],
+		stats.buckets_ec,
 		ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
 		stats.sectors[BCH_DATA_SB],
 		stats.sectors[BCH_DATA_JOURNAL],