// SPDX-License-Identifier: GPL-2.0
/*
 * io_misc.c - fallocate, fpunch, truncate, finsert/fcollapse:
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
#include "rebalance.h"
#include "subvolume.h"

/*
 * Overwrites whatever was present in the given range with zeroes: either with
 * a reservation key, or with real allocated-but-unwritten extents:
 */
int bch2_extent_fallocate(struct btree_trans *trans,
			  subvol_inum inum,
			  struct btree_iter *iter,
			  u64 sectors,
			  struct bch_io_opts opts,
			  s64 *i_sectors_delta,
			  struct write_point_specifier write_point)
{
	struct bch_fs *c = trans->c;
	struct disk_reservation disk_res = { 0 };
	struct closure cl;
	struct open_buckets open_buckets = { 0 };
	struct bkey_s_c k;
	struct bkey_buf old, new;
	unsigned sectors_allocated = 0, new_replicas;
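	/*
	 * In nocow mode writes go to already-allocated space, so fallocate
	 * has to allocate real (unwritten) extents rather than just a
	 * reservation - provided the on-disk format supports them:
	 */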
	bool unwritten = opts.nocow &&
	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
	int ret;

	bch2_bkey_buf_init(&old);
	bch2_bkey_buf_init(&new);
	closure_init_stack(&cl);

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
	new_replicas = max(0, (int) opts.data_replicas -
			   (int) bch2_bkey_nr_ptrs_fully_allocated(k));

	/*
	 * Get a disk reservation before (in the nocow case) calling
	 * into the allocator:
	 */
	ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
	if (unlikely(ret))
		goto err_noprint;

	bch2_bkey_buf_reassemble(&old, c, k);

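	/*
	 * Without unwritten extent support, represent the zeroed range with a
	 * reservation key, which reads back as zeroes:
	 */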
	if (!unwritten) {
		struct bkey_i_reservation *reservation;

		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
		reservation = bkey_reservation_init(new.k);
		reservation->k.p = iter->pos;
		bch2_key_resize(&reservation->k, sectors);
		reservation->v.nr_replicas = opts.data_replicas;
	} else {
		struct bkey_i_extent *e;
		struct bch_devs_list devs_have;
		struct write_point *wp;

		devs_have.nr = 0;

		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);

		e = bkey_extent_init(new.k);
		e->k.p = iter->pos;

		ret = bch2_alloc_sectors_start_trans(trans,
				opts.foreground_target,
				false,
				write_point,
				&devs_have,
				opts.data_replicas,
				opts.data_replicas,
				BCH_WATERMARK_normal, 0, &cl, &wp);
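		/*
		 * If the allocator couldn't make progress it took a ref on
		 * @cl; report a transaction restart, and wait on the
		 * allocator (below, with locks dropped) before retrying:
		 */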
		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
			ret = -BCH_ERR_transaction_restart_nested;
		if (ret)
			goto err;

		sectors = min_t(u64, sectors, wp->sectors_free);
		sectors_allocated = sectors;

		bch2_key_resize(&e->k, sectors);

		bch2_open_bucket_get(c, wp, &open_buckets);
		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
		bch2_alloc_sectors_done(c, wp);

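		/*
		 * The newly allocated sectors still contain stale data: mark
		 * the pointers unwritten so reads return zeroes:
		 */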
		extent_for_each_ptr(extent_i_to_s(e), ptr)
			ptr->unwritten = true;
	}

	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
				 0, i_sectors_delta, true);
err:
	if (!ret && sectors_allocated)
		bch2_increment_clock(c, sectors_allocated, WRITE);
	if (should_print_err(ret))
		bch_err_inum_offset_ratelimited(c,
			inum.inum,
			iter->pos.offset << 9,
			"%s(): error: %s", __func__, bch2_err_str(ret));
err_noprint:
	bch2_open_buckets_put(c, &open_buckets);
	bch2_disk_reservation_put(c, &disk_res);
	bch2_bkey_buf_exit(&new, c);
	bch2_bkey_buf_exit(&old, c);

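	/*
	 * The stack closure starts out with one ref; more than one means the
	 * allocator took a ref to signal us when space frees up, so drop
	 * locks and wait for it:
	 */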
	if (closure_nr_remaining(&cl) != 1) {
		bch2_trans_unlock_long(trans);
		bch2_wait_on_allocator(c, &cl);
	}

	return ret;
}

/*
 * Returns -BCH_ERR_transaction_restart if we had to drop locks:
 */
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
		   subvol_inum inum, u64 end,
		   s64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
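	/* The largest key we can make, rounded down to a whole block: */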
	unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
	struct bpos end_pos = POS(inum.inum, end);
	struct bkey_s_c k;
	int ret = 0, ret2 = 0;
	u32 snapshot;

	while (!ret ||
	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		struct disk_reservation disk_res =
			bch2_disk_reservation_init(c, 0);
		struct bkey_i delete;

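		/*
		 * Remember that we had a transaction restart, so the caller
		 * knows locks were dropped:
		 */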
		if (ret)
			ret2 = ret;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_btree_iter_set_snapshot(iter, snapshot);

		/*
		 * peek_upto() doesn't have ideal semantics for extents: it
		 * may return a key that overlaps, but starts before,
		 * iter->pos - hence the deletion key below is built from
		 * iter->pos rather than from the returned key:
		 */
		k = bch2_btree_iter_peek_upto(iter, end_pos);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (ret)
			continue;

		bkey_init(&delete.k);
		delete.k.p = iter->pos;

		/* create the biggest key we can */
		bch2_key_resize(&delete.k, max_sectors);
		bch2_cut_back(end_pos, &delete);

		ret = bch2_extent_update(trans, inum, iter, &delete,
					 &disk_res, 0, i_sectors_delta, false);
		bch2_disk_reservation_put(c, &disk_res);
	}

	return ret ?: ret2;
}

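/*
 * Convenience wrapper that runs bch2_fpunch_at() in its own transaction; a
 * trailing transaction restart only means locks were dropped, not that work
 * was lost, so it's treated as success:
 */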
int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
		s64 *i_sectors_delta)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, start),
			     BTREE_ITER_intent);

	ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		ret = 0;

	return ret;
}

/* truncate: */
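/*
 * Truncate is a logged op: a logged_op_truncate key is persisted before the
 * (non-atomic) i_size update and fpunch, so that if we crash partway through,
 * recovery finds the op key and resumes the operation:
 */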

void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);

	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
	prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
}

static int truncate_set_isize(struct btree_trans *trans,
			      subvol_inum inum,
			      u64 new_i_size,
			      bool warn)
{
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;

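	/*
	 * The comma expression sets the new size and evaluates to 0, so the
	 * ?: error chain continues on to the inode write:
	 */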
	ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
	      (inode_u.bi_size = new_i_size, 0) ?:
	      bch2_inode_write(trans, &iter, &inode_u);

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
					    struct bkey_i *op_k,
					    u64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter fpunch_iter;
	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
	bool warn_errors = i_sectors_delta != NULL;
	int ret;

	ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			truncate_set_isize(trans, inum, new_i_size, warn_errors));
	if (ret)
		goto err;

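	/*
	 * Punch from the first block past the new (block-aligned) EOF;
	 * round_up() works in bytes, iterator positions in sectors:
	 */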
	bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
			     POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
			     BTREE_ITER_intent);
	ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
	bch2_trans_iter_exit(trans, &fpunch_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		ret = 0;
err:
	if (warn_errors)
		bch_err_fn(c, ret);
	return ret;
}

int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
{
	return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
}

int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
{
	struct bkey_i_logged_op_truncate op;

	bkey_logged_op_truncate_init(&op.k_i);
	op.v.subvol = cpu_to_le32(inum.subvol);
	op.v.inum = cpu_to_le64(inum.inum);
	op.v.new_i_size = cpu_to_le64(new_i_size);

	/*
	 * Logged ops aren't atomic w.r.t. snapshot creation: if a snapshot is
	 * created while one is in progress and we then crash, the resumed op
	 * will only proceed in one of the snapshots
	 */
	down_read(&c->snapshot_create_lock);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = bch2_logged_op_start(trans, &op.k_i);
	if (ret)
		goto out;
	ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
	ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
out:
	bch2_trans_put(trans);
	up_read(&c->snapshot_create_lock);

	return ret;
}

/* finsert/fcollapse: */

void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);

	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
	prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
	prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
}

static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
			 u64 offset, s64 len, bool warn)
{
	struct btree_iter iter;
	struct bch_inode_unpacked inode_u;
	int ret;

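	/* @offset and @len are in sectors; bi_size is in bytes: */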
	offset <<= 9;
	len <<= 9;

	ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
	if (ret)
		return ret;

	if (len > 0) {
		if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
			ret = -EFBIG;
			goto err;
		}

		if (offset >= inode_u.bi_size) {
			ret = -EINVAL;
			goto err;
		}
	}

	inode_u.bi_size += len;
	inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);

	ret = bch2_inode_write(trans, &iter, &inode_u);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
					   struct bkey_i *op_k,
					   u64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
	struct bch_io_opts opts;
	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
	u64 src_offset = le64_to_cpu(op->v.src_offset);
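	/* shift > 0 means finsert (making a hole), shift < 0 fcollapse: */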
	s64 shift = dst_offset - src_offset;
	u64 len = abs(shift);
	u64 pos = le64_to_cpu(op->v.pos);
	bool insert = shift > 0;
	u32 snapshot;
	bool warn_errors = i_sectors_delta != NULL;
	int ret = 0;

	ret = bch2_inum_opts_get(trans, inum, &opts);
	if (ret)
		return ret;

	/*
	 * Check for a missing subvolume before fpunching: when resuming we
	 * don't want a missing subvolume to be a fatal error
	 */
	ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors);
	if (ret)
		return ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, 0),
			     BTREE_ITER_intent);

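	/*
	 * A small state machine, persisted via bch2_logged_op_update() so
	 * that on crash we resume at the right step; op->v.pos records how
	 * far extent shifting has gotten:
	 */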
	switch (op->v.state) {
	case LOGGED_OP_FINSERT_start:
		op->v.state = LOGGED_OP_FINSERT_shift_extents;

		if (insert) {
			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
					bch2_logged_op_update(trans, &op->k_i));
			if (ret)
				goto err;
		} else {
			bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));

			ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
				goto err;

			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					bch2_logged_op_update(trans, &op->k_i));
		}

		fallthrough;
	case LOGGED_OP_FINSERT_shift_extents:
		while (1) {
			struct disk_reservation disk_res =
				bch2_disk_reservation_init(c, 0);
			struct bkey_i delete, *copy;
			struct bkey_s_c k;
			struct bpos src_pos = POS(inum.inum, src_offset);

			bch2_trans_begin(trans);

			ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
							    warn_errors);
			if (ret)
				goto btree_err;

			bch2_btree_iter_set_snapshot(&iter, snapshot);
			bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));

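			/*
			 * When inserting, walk extents from highest offset to
			 * lowest (peek_prev) so a shifted extent never lands
			 * on one we haven't moved yet; when collapsing, walk
			 * in the other direction:
			 */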
			k = insert
				? bch2_btree_iter_peek_prev(&iter)
				: bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
			if ((ret = bkey_err(k)))
				goto btree_err;

			if (!k.k ||
			    k.k->p.inode != inum.inum ||
			    bkey_le(k.k->p, POS(inum.inum, src_offset)))
				break;

			copy = bch2_bkey_make_mut_noupdate(trans, k);
			if ((ret = PTR_ERR_OR_ZERO(copy)))
				goto btree_err;

			if (insert &&
			    bkey_lt(bkey_start_pos(k.k), src_pos)) {
				bch2_cut_front(src_pos, copy);

				/*
				 * We may be splitting a compressed extent -
				 * both halves then reference the full
				 * compressed payload, so on-disk accounting
				 * can grow:
				 */
				bch2_disk_reservation_add(c, &disk_res,
						copy->k.size *
						bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
						BCH_DISK_RESERVATION_NOFAIL);
			}

			bkey_init(&delete.k);
			delete.k.p = copy->k.p;
			delete.k.p.snapshot = snapshot;
			delete.k.size = copy->k.size;

			copy->k.p.offset += shift;
			copy->k.p.snapshot = snapshot;

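			/* Record progress, for resuming after a crash: */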
			op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);

			ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
			      bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
			      bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
			      bch2_logged_op_update(trans, &op->k_i) ?:
			      bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
			bch2_disk_reservation_put(c, &disk_res);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			pos = le64_to_cpu(op->v.pos);
		}

		op->v.state = LOGGED_OP_FINSERT_finish;

		if (!insert) {
			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
					bch2_logged_op_update(trans, &op->k_i));
		} else {
			/* We need an inode update to update bi_journal_seq for fsync: */
			ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
					bch2_logged_op_update(trans, &op->k_i));
		}

		break;
	case LOGGED_OP_FINSERT_finish:
		break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	if (warn_errors)
		bch_err_fn(c, ret);
	return ret;
}

int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
{
	return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
}

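/*
 * Entry point for both finsert and fcollapse; @offset and @len are in
 * sectors. For finsert, shifting starts from the end of the file (op.v.pos ==
 * U64_MAX, walking backwards); for fcollapse, from @offset, walking forwards:
 */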
int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
			   u64 offset, u64 len, bool insert,
			   s64 *i_sectors_delta)
{
	struct bkey_i_logged_op_finsert op;
	s64 shift = insert ? len : -len;

	bkey_logged_op_finsert_init(&op.k_i);
	op.v.subvol = cpu_to_le32(inum.subvol);
	op.v.inum = cpu_to_le64(inum.inum);
	op.v.dst_offset = cpu_to_le64(offset + shift);
	op.v.src_offset = cpu_to_le64(offset);
	op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);

	/*
	 * Logged ops aren't atomic w.r.t. snapshot creation: if a snapshot is
	 * created while one is in progress and we then crash, the resumed op
	 * will only proceed in one of the snapshots
	 */
	down_read(&c->snapshot_create_lock);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = bch2_logged_op_start(trans, &op.k_i);
	if (ret)
		goto out;
	ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
	ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
out:
	bch2_trans_put(trans);
	up_read(&c->snapshot_create_lock);

	return ret;
}