bcachefs: Redo data_update interface
This patch significantly cleans up and simplifies the data_update
interface. Instead of only being able to specify a single pointer by
device to rewrite, we're now able to specify any or all of the pointers
in the original extent to be rewritten, as a bitmask.
data_cmd is no more: the various pred functions now just return true if
the extent should be moved/updated. All the data_update path does is
rewrite existing replicas, or add new ones.
This fixes a bug where, with background compression on replicated
filesystems, rebalance -> data_update would drop the wrong old replica,
and then keep trying to recompress an extent pointer, each time failing
to drop the right replica. Oops.
Now, the data update path doesn't look at the io options to decide which
pointers to keep and which to drop - it only goes off of the
data_update_options passed to it.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0161b0a..f7bce89 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -89,6 +89,16 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans,
return ret;
}
+static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ ptr->cached = true;
+}
+
int bch2_data_update_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
@@ -113,6 +123,7 @@ int bch2_data_update_index_update(struct bch_write_op *op)
while (1) {
struct bkey_s_c k;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
struct bkey_i *insert;
struct bkey_i_extent *new;
const union bch_extent_entry *entry;
@@ -121,6 +132,7 @@ int bch2_data_update_index_update(struct bch_write_op *op)
bool did_work = false;
bool extending = false, should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ unsigned i;
bch2_trans_begin(&trans);
@@ -131,8 +143,7 @@ int bch2_data_update_index_update(struct bch_write_op *op)
new = bkey_i_to_extent(bch2_keylist_front(keys));
- if (bversion_cmp(k.k->version, new->k.version) ||
- !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
+ if (!bch2_extents_match(k, old))
goto nomatch;
bkey_reassemble(_insert.k, k);
@@ -146,20 +157,34 @@ int bch2_data_update_index_update(struct bch_write_op *op)
bch2_cut_back(new->k.p, insert);
bch2_cut_back(insert->k.p, &new->k_i);
- if (m->data_cmd == DATA_REWRITE) {
- struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
- bch2_bkey_has_device(bkey_i_to_s_c(insert),
- m->data_opts.rewrite_dev);
- if (!old_ptr)
- goto nomatch;
-
- if (old_ptr->cached)
- extent_for_each_ptr(extent_i_to_s(new), new_ptr)
- new_ptr->cached = true;
-
- __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
+ /*
+ * @old: extent that we read from
+ * @insert: key that we're going to update, initialized from
+ * extent currently in btree - same as @old unless we raced with
+ * other updates
+ * @new: extent with new pointers that we'll be adding to @insert
+ *
+ * First, drop rewrite_ptrs from @insert (or mark them cached):
+ */
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) {
+ /*
+ * If we're going to be adding a pointer to the
+ * same device, we have to drop the old one -
+ * otherwise, we can just mark it cached:
+ */
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev))
+ bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev);
+ else
+ bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev);
+ }
+ i++;
}
+
+ /* Add new ptrs: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
/*
@@ -177,12 +202,8 @@ int bch2_data_update_index_update(struct bch_write_op *op)
if (!did_work)
goto nomatch;
- bch2_bkey_narrow_crcs(insert,
- (struct bch_extent_crc_unpacked) { 0 });
+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, bkey_i_to_s(insert));
- bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
- op->opts.background_target,
- op->opts.data_replicas);
ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
&extending,
@@ -250,134 +271,100 @@ int bch2_data_update_index_update(struct bch_write_op *op)
return ret;
}
-void bch2_data_update_read_done(struct data_update *m, struct bch_read_bio *rbio)
+void bch2_data_update_read_done(struct data_update *m,
+ struct bch_extent_crc_unpacked crc)
{
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
- m->ptr = rbio->pick.ptr;
- m->offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- m->op.devs_have = rbio->devs_have;
- m->op.pos = rbio->data_pos;
- m->op.version = rbio->version;
- m->op.crc = rbio->pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
+ m->op.crc = crc;
+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
- if (m->data_cmd == DATA_REWRITE)
- bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+ closure_call(&m->op.cl, bch2_write, NULL, NULL);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+
+ bch2_bkey_buf_exit(&update->k, c);
+ bch2_disk_reservation_put(c, &update->op.res);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
}
int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
- enum data_cmd data_cmd,
- struct data_opts data_opts,
+ struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
struct extent_ptr_decoded p;
+ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
int ret;
+ bch2_bkey_buf_init(&m->k);
+ bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
- m->data_cmd = data_cmd;
m->data_opts = data_opts;
- m->nr_ptrs_reserved = 0;
bch2_write_op_init(&m->op, c, io_opts);
-
- if (!bch2_bkey_is_incompressible(k))
- m->op.compression_type =
- bch2_compression_opt_to_type[io_opts.background_compression ?:
- io_opts.compression];
- else
- m->op.incompressible = true;
-
+ m->op.pos = bkey_start_pos(k.k);
+ m->op.version = k.k->version;
m->op.target = data_opts.target,
m->op.write_point = wp;
-
- /*
- * op->csum_type is normally initialized from the fs/file's current
- * options - but if an extent is encrypted, we require that it stays
- * encrypted:
- */
- bkey_for_each_crc(k.k, ptrs, crc, entry)
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- m->op.nonce = crc.nonce + crc.offset;
- m->op.csum_type = crc.csum_type;
- break;
- }
-
- if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
- m->op.alloc_reserve = RESERVE_movinggc;
- } else {
- /* XXX: this should probably be passed in */
- m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
- }
-
- m->op.flags |= BCH_WRITE_PAGES_STABLE|
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_FROM_INTERNAL|
- BCH_WRITE_MOVE;
+ BCH_WRITE_MOVE|
+ m->data_opts.write_flags;
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.background_compression ?:
+ io_opts.compression];
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+ m->op.alloc_reserve = RESERVE_movinggc;
- m->op.nr_replicas = data_opts.nr_replicas;
- m->op.nr_replicas_required = data_opts.nr_replicas;
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ m->data_opts.rewrite_ptrs &= ~(1U << i);
- switch (data_cmd) {
- case DATA_ADD_REPLICAS: {
+ if (!((1U << i) & m->data_opts.rewrite_ptrs))
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
/*
- * DATA_ADD_REPLICAS is used for moving data to a different
- * device in the background, and due to compression the new copy
- * might take up more space than the old copy:
+ * op->csum_type is normally initialized from the fs/file's
+ * current options - but if an extent is encrypted, we require
+ * that it stays encrypted:
*/
-#if 0
- int nr = (int) io_opts.data_replicas -
- bch2_bkey_nr_ptrs_allocated(k);
-#endif
- int nr = (int) io_opts.data_replicas;
-
- if (nr > 0) {
- m->op.nr_replicas = m->nr_ptrs_reserved = nr;
-
- ret = bch2_disk_reservation_get(c, &m->op.res,
- k.k->size, m->op.nr_replicas, 0);
- if (ret)
- return ret;
+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+ m->op.nonce = p.crc.nonce + p.crc.offset;
+ m->op.csum_type = p.crc.csum_type;
}
- break;
- }
- case DATA_REWRITE: {
- unsigned compressed_sectors = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == data_opts.rewrite_dev) {
- if (p.ptr.cached)
- m->op.flags |= BCH_WRITE_CACHED;
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ m->op.incompressible = true;
- if (!p.ptr.cached &&
- crc_is_compressed(p.crc))
- compressed_sectors += p.crc.compressed_size;
- }
-
- if (compressed_sectors) {
- ret = bch2_disk_reservation_add(c, &m->op.res,
- k.k->size * m->op.nr_replicas,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
- break;
- }
- case DATA_PROMOTE:
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- m->op.flags |= BCH_WRITE_CACHED;
- break;
- default:
- BUG();
+ i++;
}
+ if (reserve_sectors) {
+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+ m->data_opts.extra_replicas
+ ? 0
+ : BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ return ret;
+ }
+
+ m->op.nr_replicas = m->op.nr_replicas_required =
+ hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas;
return 0;
}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 03b4ca5..ee38bd6 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -3,46 +3,37 @@
#ifndef _BCACHEFS_DATA_UPDATE_H
#define _BCACHEFS_DATA_UPDATE_H
+#include "bkey_buf.h"
#include "io_types.h"
-enum data_cmd {
- DATA_SKIP,
- DATA_SCRUB,
- DATA_ADD_REPLICAS,
- DATA_REWRITE,
- DATA_PROMOTE,
-};
+struct moving_context;
-struct data_opts {
+struct data_update_opts {
+ unsigned rewrite_ptrs;
u16 target;
- u8 rewrite_dev;
- u8 nr_replicas;
- int btree_insert_flags;
+ u8 extra_replicas;
+ unsigned btree_insert_flags;
+ unsigned write_flags;
};
struct data_update {
+ /* extent being updated: */
enum btree_id btree_id;
- enum data_cmd data_cmd;
- struct data_opts data_opts;
-
- unsigned nr_ptrs_reserved;
-
+ struct bkey_buf k;
+ struct data_update_opts data_opts;
struct moving_context *ctxt;
-
- /* what we read: */
- struct bch_extent_ptr ptr;
- u64 offset;
-
struct bch_write_op op;
};
int bch2_data_update_index_update(struct bch_write_op *);
-void bch2_data_update_read_done(struct data_update *, struct bch_read_bio *);
+void bch2_data_update_read_done(struct data_update *,
+ struct bch_extent_crc_unpacked);
+
+void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct bch_fs *, struct data_update *,
struct write_point_specifier,
- struct bch_io_opts,
- enum data_cmd, struct data_opts,
+ struct bch_io_opts, struct data_update_opts,
enum btree_id, struct bkey_s_c);
#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 4e44234..38836c1 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -25,6 +25,8 @@
#include "trace.h"
#include "util.h"
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -687,37 +689,6 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
return durability;
}
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
- unsigned target,
- unsigned nr_desired_replicas)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
- if (target && extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra &&
- !bch2_dev_in_target(c, p.ptr.dev, target)) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
-
- if (extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
-}
-
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
@@ -821,8 +792,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
/*
* Returns pointer to the next entry after the one being dropped:
*/
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -894,6 +865,14 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev);
+
+ if (ptr)
+ __bch2_bkey_drop_ptr(k, ptr);
+}
+
const struct bch_extent_ptr *
bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
{
@@ -939,6 +918,44 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
}
/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
+
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+}
+
+bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
+ struct bkey_s_c k2)
+{
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
+
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+}
+
+/*
* bch_extent_normalize - clean up an extent, dropping stale pointers etc.
*
* Returns true if @k should be dropped entirely
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 4f41f0f..3c17b81 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -577,15 +577,10 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
- unsigned, unsigned);
-
void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
- struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
@@ -607,11 +602,14 @@ do { \
} while (0)
void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 743449e..c22ce1e 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -1490,13 +1490,12 @@ static void promote_done(struct bch_write_op *wop)
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
- bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+ bch2_data_update_exit(&op->write);
promote_free(c, op);
}
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
- struct bch_fs *c = rbio->c;
struct bio *bio = &op->write.op.wbio.bio;
trace_promote(&rbio->bio);
@@ -1509,9 +1508,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- bch2_data_update_read_done(&op->write, rbio);
-
- closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, NULL);
+ bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
static struct promote_op *__promote_alloc(struct bch_fs *c,
@@ -1569,10 +1566,10 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
ret = bch2_data_update_init(c, &op->write,
writepoint_hashed((unsigned long) current),
opts,
- DATA_PROMOTE,
- (struct data_opts) {
+ (struct data_update_opts) {
.target = opts.promote_target,
- .nr_replicas = 1,
+ .extra_replicas = 1,
+ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
},
btree_id, k);
BUG_ON(ret);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index a3a486c..4060678 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,17 +38,9 @@ struct moving_io {
static void move_free(struct moving_io *io)
{
struct moving_context *ctxt = io->write.ctxt;
- struct bvec_iter_all iter;
- struct bio_vec *bv;
- bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
-
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
- if (bv->bv_page)
- __free_page(bv->bv_page);
-
+ bch2_data_update_exit(&io->write);
wake_up(&ctxt->wait);
-
kfree(io);
}
@@ -72,8 +64,7 @@ static void move_write(struct moving_io *io)
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- bch2_data_update_read_done(&io->write, &io->rbio);
- closure_call(&io->write.op.cl, bch2_write, NULL, NULL);
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@@ -135,8 +126,7 @@ static int bch2_move_extent(struct btree_trans *trans,
struct bch_io_opts io_opts,
enum btree_id btree_id,
struct bkey_s_c k,
- enum data_cmd data_cmd,
- struct data_opts data_opts)
+ struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -180,10 +170,11 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(c, &io->write, wp, io_opts,
- data_cmd, data_opts, btree_id, k);
+ data_opts, btree_id, k);
if (ret)
goto err_free_pages;
+ io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
atomic64_inc(&ctxt->stats->keys_moved);
@@ -262,8 +253,7 @@ static int __bch2_move_data(struct bch_fs *c,
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct data_opts data_opts;
- enum data_cmd data_cmd;
+ struct data_update_opts data_opts;
u64 delay, cur_inum = U64_MAX;
int ret = 0, ret2;
@@ -350,18 +340,9 @@ static int __bch2_move_data(struct bch_fs *c,
cur_inum = k.k->p.inode;
}
- switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
- case DATA_SKIP:
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, k, &io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- case DATA_PROMOTE:
- break;
- default:
- BUG();
- }
/*
* The iterator gets unlocked by __bch2_read_extent - need to
@@ -370,8 +351,8 @@ static int __bch2_move_data(struct bch_fs *c,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
- data_cmd, data_opts);
+ ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts,
+ btree_id, k, data_opts);
if (ret2) {
if (ret2 == -EINTR)
continue;
@@ -476,9 +457,9 @@ int bch2_move_data(struct bch_fs *c,
return ret;
}
-typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
- struct data_opts *);
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
@@ -492,8 +473,7 @@ static int bch2_move_btree(struct bch_fs *c,
struct btree_iter iter;
struct btree *b;
enum btree_id id;
- struct data_opts data_opts;
- enum data_cmd cmd;
+ struct data_update_opts data_opts;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -522,17 +502,8 @@ static int bch2_move_btree(struct bch_fs *c,
stats->pos = iter.pos;
- switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
- case DATA_SKIP:
+ if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- break;
- default:
- BUG();
- }
ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
if (ret == -EINTR)
@@ -562,20 +533,10 @@ static int bch2_move_btree(struct bch_fs *c,
return ret;
}
-#if 0
-static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- return DATA_SCRUB;
-}
-#endif
-
-static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
unsigned nr_good = bch2_bkey_durability(c, k);
unsigned replicas = bkey_is_btree_ptr(k.k)
@@ -583,43 +544,50 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
: io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
- return DATA_SKIP;
+ return false;
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = replicas - nr_good;
data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+ return true;
}
-static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+/*
+ * Select every pointer of @k that lives on the device being migrated
+ * (op->migrate.dev) for rewriting; returns true if any pointer was selected.
+ */
+static bool migrate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
struct bch_ioctl_data *op = arg;
+ unsigned i = 0;
- if (!bch2_bkey_has_device(k, op->migrate.dev))
- return DATA_SKIP;
-
+ data_opts->rewrite_ptrs = 0;
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- data_opts->rewrite_dev = op->migrate.dev;
- return DATA_REWRITE;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == op->migrate.dev)
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
}
-static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
-static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
@@ -648,21 +616,21 @@ static bool bformat_needs_redo(struct bkey_format *f)
return false;
}
-static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (b->version_ondisk != c->sb.version ||
btree_node_need_rewrite(b) ||
bformat_needs_redo(&b->format)) {
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- return DATA_REWRITE;
+ return true;
}
- return DATA_SKIP;
+ return false;
}
int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 6d273f6..fd55629 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -24,9 +24,8 @@ struct moving_context {
wait_queue_head_t wait;
};
-typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
- struct bkey_s_c,
- struct bch_io_opts *, struct data_opts *);
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 1e2de1e..d63b9fe 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -39,15 +39,32 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
cmp_int(l->offset, r->offset);
}
-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool copygc_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
copygc_heap *h = &c->copygc_heap;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = { 0 };
+ unsigned i = 0;
+
+ /*
+ * We need to use the journal reserve here, because
+ * - journal reclaim depends on btree key cache
+ * flushing to make forward progress,
+ * - which has to make forward progress when the
+ * journal is pre-reservation full,
+ * - and depends on allocation - meaning allocator and
+ * copygc
+ */
+
+ data_opts->rewrite_ptrs = 0;
+ data_opts->target = io_opts->background_target;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE|
+ JOURNAL_WATERMARK_copygc;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
@@ -55,12 +72,12 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
.dev = p.ptr.dev,
.offset = p.ptr.offset,
};
- ssize_t i;
+ ssize_t eytz;
if (p.ptr.cached)
continue;
- i = eytzinger0_find_le(h->data, h->used,
+ eytz = eytzinger0_find_le(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, &search);
#if 0
@@ -74,34 +91,16 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
BUG_ON(i != j);
#endif
- if (i >= 0 &&
- p.ptr.dev == h->data[i].dev &&
- p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
- p.ptr.gen == h->data[i].gen) {
- /*
- * We need to use the journal reserve here, because
- * - journal reclaim depends on btree key cache
- * flushing to make forward progress,
- * - which has to make forward progress when the
- * journal is pre-reservation full,
- * - and depends on allocation - meaning allocator and
- * copygc
- */
+ if (eytz >= 0 &&
+ p.ptr.dev == h->data[eytz].dev &&
+ p.ptr.offset < h->data[eytz].offset + ca->mi.bucket_size &&
+ p.ptr.gen == h->data[eytz].gen)
+ data_opts->rewrite_ptrs |= 1U << i;
- data_opts->target = io_opts->background_target;
- data_opts->nr_replicas = 1;
- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE|
- JOURNAL_WATERMARK_copygc;
- data_opts->rewrite_dev = p.ptr.dev;
-
- if (p.has_ec)
- data_opts->nr_replicas += p.ec.redundancy;
-
- return DATA_REWRITE;
- }
+ i++;
}
- return DATA_SKIP;
+ return data_opts->rewrite_ptrs != 0;
}
static inline int fragmentation_cmp(copygc_heap *heap,
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 1724ae3..63b24dc 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -22,62 +22,70 @@
* returns -1 if it should not be moved, or
* device of pointer that should be moved, if known, or INT_MAX if unknown
*/
-static int __bch2_rebalance_pred(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ unsigned i;
+
+ data_opts->rewrite_ptrs = 0;
+ data_opts->target = io_opts->background_target;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k))
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ !bch2_bkey_is_incompressible(k)) {
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached &&
p.crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
- return p.ptr.dev;
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
- if (io_opts->background_target)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
- return p.ptr.dev;
+ if (io_opts->background_target) {
+ const struct bch_extent_ptr *ptr;
- return -1;
+ i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached &&
+ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target))
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return data_opts->rewrite_ptrs != 0;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
- atomic64_t *counter;
- int dev;
+ struct data_update_opts update_opts = { 0 };
+ struct bkey_ptrs_c ptrs;
+ const struct bch_extent_ptr *ptr;
+ unsigned i;
- dev = __bch2_rebalance_pred(c, k, io_opts);
- if (dev < 0)
+ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
return;
- counter = dev < INT_MAX
- ? &bch_dev_bkey_exists(c, dev)->rebalance_work
- : &c->rebalance.work_unknown_dev;
-
- if (atomic64_add_return(k.k->size, counter) == k.k->size)
- rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
- data_opts->target = io_opts->background_target;
- data_opts->nr_replicas = 1;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
- } else {
- return DATA_SKIP;
+ i = 0;
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((1U << i) && update_opts.rewrite_ptrs)
+ if (atomic64_add_return(k.k->size,
+ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
+ k.k->size)
+ rebalance_wakeup(c);
+ i++;
}
}