fs/xfs/scrub/newbt.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
  * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_btree.h"
 #include "xfs_btree_staging.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
 #include "xfs_defer.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/repair.h"
 #include "scrub/newbt.h"

 /*
  * Estimate proper slack values for a btree that's being reloaded.
  *
  * Under most circumstances, we'll take whatever default loading value the
  * btree bulk loading code calculates for us.  However, there are some
  * exceptions to this rule:
  *
  * (0) If someone turned one of the debug knobs.
  * (1) If this is a per-AG btree and the AG has less than 10% space free.
  * (2) If this is an inode btree and the FS has less than 10% space free.

  * In either case, format the new btree blocks almost completely full to
  * minimize space usage.
  */
 static void
 xrep_newbt_estimate_slack(
 	struct xrep_newbt	*xnr)
 {
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xfs_btree_bload	*bload = &xnr->bload;
 	uint64_t		free;
 	uint64_t		sz;

 	/*
 	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
 	 * unless someone has set them otherwise, so we just pull the values
 	 * here.
 	 */
 	bload->leaf_slack = xfs_globals.bload_leaf_slack;
 	bload->node_slack = xfs_globals.bload_node_slack;

 	if (sc->ops->type == ST_PERAG) {
 		free = sc->sa.pag->pagf_freeblks;
 		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
 	} else {
 		free = percpu_counter_sum(&sc->mp->m_fdblocks);
 		sz = sc->mp->m_sb.sb_dblocks;
 	}

 	/* No further changes if there's more than 10% free space left. */
 	if (free >= div_u64(sz, 10))
 		return;

 	/*
 	 * We're low on space; load the btrees as tightly as possible.  Leave
 	 * a couple of open slots in each btree block so that we don't end up
 	 * splitting the btrees like crazy after a mount.
 	 */
 	if (bload->leaf_slack < 0)
 		bload->leaf_slack = 2;
 	if (bload->node_slack < 0)
 		bload->node_slack = 2;
 }

 /*
  * Prepare @xnr for staging a new AG btree.
  *
  * The structure is zeroed, the reservation list is initialised empty, and the
  * scrub context, a copy of the owner info, the allocation hint and the
  * reservation type are recorded.  The bulk loader's max_dirty is set to 256K
  * converted to filesystem blocks, and the slack values are then estimated.
  */
 void
 xrep_newbt_init_ag(
 	struct xrep_newbt		*xnr,
 	struct xfs_scrub		*sc,
 	const struct xfs_owner_info	*oinfo,
 	xfs_fsblock_t			alloc_hint,
 	enum xfs_ag_resv_type		resv)
 {
 	memset(xnr, 0, sizeof(*xnr));
 	INIT_LIST_HEAD(&xnr->resv_list);

 	xnr->sc = sc;
 	xnr->resv = resv;
 	xnr->alloc_hint = alloc_hint;

 	/* Copy the owner info by value; the caller keeps its own copy. */
 	xnr->oinfo = *oinfo;

 	/* 256K, expressed in filesystem blocks. */
 	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10);

 	xrep_newbt_estimate_slack(xnr);
 }

 /* Initialize accounting resources for staging a new inode fork btree. */
 int
 xrep_newbt_init_inode(
 	struct xrep_newbt		*xnr,
 	struct xfs_scrub		*sc,
 	int				whichfork,
 	const struct xfs_owner_info	*oinfo)
 {
 	struct xfs_ifork		*ifp;

 	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
 	if (!ifp)
 		return -ENOMEM;

 	xrep_newbt_init_ag(xnr, sc, oinfo,
 			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
 			XFS_AG_RESV_NONE);
 	xnr->ifake.if_fork = ifp;
 	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
 	return 0;
 }

 /*
  * Initialize accounting resources for staging a new btree.  Callers are
  * expected to add their own reservations (and clean them up) manually.
  *
  * This is xrep_newbt_init_ag() called with XFS_RMAP_OINFO_ANY_OWNER as the
  * owner, NULLFSBLOCK as the allocation hint, and XFS_AG_RESV_NONE as the
  * reservation type.
  */
 void
 xrep_newbt_init_bare(
 	struct xrep_newbt		*xnr,
 	struct xfs_scrub		*sc)
 {
 	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
 			XFS_AG_RESV_NONE);
 }

 /*
  * Designate specific blocks to be used to build our new btree.  @pag must be
  * a passive reference.
  */
 STATIC int
 xrep_newbt_add_blocks(
 	struct xrep_newbt		*xnr,
 	struct xfs_perag		*pag,
 	const struct xfs_alloc_arg	*args)
 {
 	struct xfs_mount		*mp = xnr->sc->mp;
 	struct xrep_newbt_resv		*resv;
 	int				error;

 	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
 	if (!resv)
 		return -ENOMEM;

 	INIT_LIST_HEAD(&resv->list);
 	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 	resv->len = args->len;
 	resv->used = 0;
 	resv->pag = xfs_perag_hold(pag);

 	if (args->tp) {
 		ASSERT(xnr->oinfo.oi_offset == 0);

 		error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
 		if (error)
 			goto out_pag;
 	}

 	list_add_tail(&resv->list, &xnr->resv_list);
 	return 0;
 out_pag:
 	xfs_perag_put(resv->pag);
 	kfree(resv);
 	return error;
 }

 /*
  * Put the extent [@agbno, @agbno + @len) of the AG described by @pag into the
  * reservation pool of the new btree.  @pag must be a passive reference.
  *
  * No transaction is passed along, so no autoreap is scheduled: callers have
  * to reap this reservation themselves if the repair is cancelled.
  *
  * Returns whatever xrep_newbt_add_blocks() returns.
  */
 int
 xrep_newbt_add_extent(
 	struct xrep_newbt	*xnr,
 	struct xfs_perag	*pag,
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		len)
 {
 	struct xfs_mount	*mp = xnr->sc->mp;
 	xfs_fsblock_t		start_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
 							   agbno);
 	struct xfs_alloc_arg	extent = {
 		.fsbno		= start_fsb,
 		.len		= len,
 		.oinfo		= xnr->oinfo,
 		.resv		= xnr->resv,
 		.tp		= NULL, /* no autoreap */
 	};

 	return xrep_newbt_add_blocks(xnr, pag, &extent);
 }

 /*
  * Keep the allocation hint inside the AG that is being repaired.  If the hint
  * points at a different AG or fails xfs_verify_fsbno(), reset it to the block
  * after XFS_AGFL_BLOCK in our AG.
  */
 static inline void
 xrep_newbt_validate_ag_alloc_hint(
 	struct xrep_newbt	*xnr)
 {
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xfs_mount	*mp = sc->mp;

 	if (XFS_FSB_TO_AGNO(mp, xnr->alloc_hint) != sc->sa.pag->pag_agno ||
 	    !xfs_verify_fsbno(mp, xnr->alloc_hint))
 		xnr->alloc_hint = XFS_AGB_TO_FSB(mp, sc->sa.pag->pag_agno,
 						 XFS_AGFL_BLOCK(mp) + 1);
 }

 /*
  * Allocate disk space for a new per-AG btree.
  *
  * Keeps asking the allocator for extents near xnr->alloc_hint until
  * @nr_blocks blocks have been added to the reservation list of @xnr.  The
  * scrub context must have a perag attached (sc->sa.pag).
  *
  * Returns 0 on success, -ENOSPC if the allocator handed back NULLFSBLOCK,
  * -EFSCORRUPTED if the allocator handed back space in a different AG, or the
  * error returned by the allocator, xrep_newbt_add_blocks() or
  * xrep_defer_finish().  Reservations added before a failure stay on the list.
  */
 STATIC int
 xrep_newbt_alloc_ag_blocks(
 	struct xrep_newbt	*xnr,
 	uint64_t		nr_blocks)
 {
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xfs_mount	*mp = sc->mp;
 	int			error = 0;

 	ASSERT(sc->sa.pag != NULL);

 	while (nr_blocks > 0) {
 		/* Ask for anything between one block and all that is left. */
 		struct xfs_alloc_arg	args = {
 			.tp		= sc->tp,
 			.mp		= mp,
 			.oinfo		= xnr->oinfo,
 			.minlen		= 1,
 			.maxlen		= nr_blocks,
 			.prod		= 1,
 			.resv		= xnr->resv,
 		};
 		xfs_agnumber_t		agno;

 		/* The hint is advanced on every pass, so recheck it each time. */
 		xrep_newbt_validate_ag_alloc_hint(xnr);

 		error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
 		if (error)
 			return error;
 		if (args.fsbno == NULLFSBLOCK)
 			return -ENOSPC;

 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

 		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
 				xnr->oinfo.oi_owner);

 		/* Space from another AG cannot be used for this AG's btree. */
 		if (agno != sc->sa.pag->pag_agno) {
 			ASSERT(agno == sc->sa.pag->pag_agno);
 			return -EFSCORRUPTED;
 		}

 		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
 		if (error)
 			return error;

 		/* Next time, look right after the extent we just got. */
 		nr_blocks -= args.len;
 		xnr->alloc_hint = args.fsbno + args.len;

 		error = xrep_defer_finish(sc);
 		if (error)
 			return error;
 	}

 	return 0;
 }

 /*
  * Keep the allocation hint within the filesystem.  A hint that fails
  * xfs_verify_fsbno() is reset to the block after XFS_AGFL_BLOCK in AG 0.
  */
 static inline void
 xrep_newbt_validate_file_alloc_hint(
 	struct xrep_newbt	*xnr)
 {
 	struct xfs_mount	*mp = xnr->sc->mp;

 	if (!xfs_verify_fsbno(mp, xnr->alloc_hint))
 		xnr->alloc_hint = XFS_AGB_TO_FSB(mp, 0,
 						 XFS_AGFL_BLOCK(mp) + 1);
 }

 /*
  * Allocate disk space for our new file-based btree.
  *
  * Keeps asking the allocator for extents, starting the search at
  * xnr->alloc_hint, until @nr_blocks blocks have been added to the
  * reservation list of @xnr.  The AG of each new extent is derived from the
  * block number that the allocator returned.
  *
  * Returns 0 on success, -ENOSPC if the allocator handed back NULLFSBLOCK,
  * -EFSCORRUPTED if no perag can be found for the new extent, or the error
  * returned by the allocator, xrep_newbt_add_blocks() or xrep_defer_finish().
  * Reservations added before a failure stay on the list.
  */
 STATIC int
 xrep_newbt_alloc_file_blocks(
 	struct xrep_newbt	*xnr,
 	uint64_t		nr_blocks)
 {
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xfs_mount	*mp = sc->mp;
 	int			error = 0;

 	while (nr_blocks > 0) {
 		/* Ask for anything between one block and all that is left. */
 		struct xfs_alloc_arg	args = {
 			.tp		= sc->tp,
 			.mp		= mp,
 			.oinfo		= xnr->oinfo,
 			.minlen		= 1,
 			.maxlen		= nr_blocks,
 			.prod		= 1,
 			.resv		= xnr->resv,
 		};
 		struct xfs_perag	*pag;
 		xfs_agnumber_t		agno;

 		/* The hint is advanced on every pass, so recheck it each time. */
 		xrep_newbt_validate_file_alloc_hint(xnr);

 		error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
 		if (error)
 			return error;
 		if (args.fsbno == NULLFSBLOCK)
 			return -ENOSPC;

 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

 		trace_xrep_newbt_alloc_file_blocks(mp, agno,
 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
 				xnr->oinfo.oi_owner);

 		pag = xfs_perag_get(mp, agno);
 		if (!pag) {
 			ASSERT(0);
 			return -EFSCORRUPTED;
 		}

 		/*
 		 * The reservation takes its own hold on the perag, so our
 		 * reference is dropped whether or not the add succeeded.
 		 */
 		error = xrep_newbt_add_blocks(xnr, pag, &args);
 		xfs_perag_put(pag);
 		if (error)
 			return error;

 		/* Next time, look right after the extent we just got. */
 		nr_blocks -= args.len;
 		xnr->alloc_hint = args.fsbno + args.len;

 		error = xrep_defer_finish(sc);
 		if (error)
 			return error;
 	}

 	return 0;
 }

 /*
  * Reserve @nr_blocks blocks of disk space for the new btree.  Scrub contexts
  * without an inode get per-AG allocation; contexts with an inode get
  * file-based allocation.  Returns whatever the chosen allocator returns.
  */
 int
 xrep_newbt_alloc_blocks(
 	struct xrep_newbt	*xnr,
 	uint64_t		nr_blocks)
 {
 	if (xnr->sc->ip == NULL)
 		return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);

 	return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
 }

 /*
  * Free the unused part of a space extent that was reserved for a new ondisk
  * structure.  Returns the number of EFIs logged or a negative errno.
  *
  * @btree_committed says whether the new btree was committed.  If it was not,
  * or if no block of @resv was used, the whole extent is released by
  * committing the autoreap of @resv and 1 is returned.  Otherwise the autoreap
  * is cancelled and only the blocks after the first resv->used blocks are
  * freed: 0 is returned when there are none, 1 when a deferred free was
  * queued, or the error from queueing it.
  *
  * This function does not remove @resv from the list or free it.
  */
 STATIC int
 xrep_newbt_free_extent(
 	struct xrep_newbt	*xnr,
 	struct xrep_newbt_resv	*resv,
 	bool			btree_committed)
 {
 	struct xfs_scrub	*sc = xnr->sc;
 	xfs_agblock_t		free_agbno = resv->agbno;
 	xfs_extlen_t		free_aglen = resv->len;
 	xfs_fsblock_t		fsbno;
 	int			error;

 	if (!btree_committed || resv->used == 0) {
 		/*
 		 * If we're not committing a new btree or we didn't use the
 		 * space reservation, let the existing EFI free the entire
 		 * space extent.
 		 */
 		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
 				free_agbno, free_aglen, xnr->oinfo.oi_owner);
 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
 		return 1;
 	}

 	/*
 	 * We used space and committed the btree.  Cancel the autoreap, remove
 	 * the written blocks from the reservation, and possibly log a new EFI
 	 * to free any unused reservation space.
 	 */
 	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
 	free_agbno += resv->used;
 	free_aglen -= resv->used;

 	/* Every block of the reservation was used; nothing to give back. */
 	if (free_aglen == 0)
 		return 0;

 	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
 			free_aglen, xnr->oinfo.oi_owner);

 	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
 	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

 	/*
 	 * Use EFIs to free the reservations.  This reduces the chance
 	 * that we leak blocks if the system goes down.
 	 */
 	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
 	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
 			xnr->resv, true);
 	if (error)
 		return error;

 	return 1;
 }

 /*
  * Free all the accounting info and disk space we reserved for a new btree.
  *
  * @btree_committed is passed through to xrep_newbt_free_extent() for every
  * reservation on the list.  Deferred work is finished each time the number
  * of logged EFIs reaches XREP_MAX_ITRUNCATE_EFIS, and once more at the end if
  * any are outstanding.
  *
  * Whatever happens, the reservation list is empty when this function returns,
  * and if the scrub context has an inode, the fake inode fork is freed.
  *
  * Returns 0 on success, or the first error from freeing an extent or from
  * finishing deferred work.
  */
 STATIC int
 xrep_newbt_free(
 	struct xrep_newbt	*xnr,
 	bool			btree_committed)
 {
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xrep_newbt_resv	*resv, *n;
 	unsigned int		freed = 0;
 	int			error = 0;

 	/*
 	 * If the filesystem already went down, we can't free the blocks.  Skip
 	 * ahead to freeing the incore metadata because we can't fix anything.
 	 */
 	if (xfs_is_shutdown(sc->mp))
 		goto junkit;

 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
 		int		ret;

 		/*
 		 * The incore reservation is torn down before the return
 		 * value is examined, so it is gone even if freeing failed.
 		 */
 		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
 		list_del(&resv->list);
 		xfs_perag_put(resv->pag);
 		kfree(resv);
 		if (ret < 0) {
 			error = ret;
 			goto junkit;
 		}

 		/* Flush the deferred frees once enough have piled up. */
 		freed += ret;
 		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
 			error = xrep_defer_finish(sc);
 			if (error)
 				goto junkit;
 			freed = 0;
 		}
 	}

 	if (freed)
 		error = xrep_defer_finish(sc);

 junkit:
 	/*
 	 * If we still have reservations attached to @newbt, cleanup must have
 	 * failed and the filesystem is about to go down.  Clean up the incore
 	 * reservations and try to commit to freeing the space we used.
 	 */
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
 		list_del(&resv->list);
 		xfs_perag_put(resv->pag);
 		kfree(resv);
 	}

 	/* Clear the pointer so that the freed fork cannot be reused. */
 	if (sc->ip) {
 		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
 		xnr->ifake.if_fork = NULL;
 	}

 	return error;
 }

 /*
  * Free all the accounting info and unused disk space allocations after
  * committing a new btree.
  *
  * Calls xrep_newbt_free() with btree_committed set to true and returns its
  * result.
  */
 int
 xrep_newbt_commit(
 	struct xrep_newbt	*xnr)
 {
 	return xrep_newbt_free(xnr, true);
 }

 /*
  * Free all the accounting info and all of the disk space we reserved for a new
  * btree that we're not going to commit.  We want to try to roll things back
  * cleanly for things like ENOSPC midway through allocation.
  *
  * Calls xrep_newbt_free() with btree_committed set to false.  The return
  * value of that call is discarded.
  */
 void
 xrep_newbt_cancel(
 	struct xrep_newbt	*xnr)
 {
 	xrep_newbt_free(xnr, false);
 }

 /*
  * Feed one of the reserved btree blocks to the bulk loader.
  *
  * Takes the next unused block from the reservation at the head of the list
  * and stores its address in @ptr: as a big-endian 64-bit filesystem block
  * number if @cur has XFS_BTREE_LONG_PTRS set, otherwise as a big-endian
  * 32-bit AG block number.
  *
  * Returns -ENOSPC if there are no reservations at all or if the reservation
  * at the head of the list is already used up; otherwise returns the result
  * of xrep_defer_finish().
  */
 int
 xrep_newbt_claim_block(
 	struct xfs_btree_cur	*cur,
 	struct xrep_newbt	*xnr,
 	union xfs_btree_ptr	*ptr)
 {
 	struct xrep_newbt_resv	*resv;
 	struct xfs_mount	*mp = cur->bc_mp;
 	xfs_agblock_t		agbno;

 	/*
 	 * list_first_entry() expects a non-empty list.  With no reservations
 	 * it would hand back a pointer computed from the list head itself,
 	 * and the checks below would then read memory that is not a
 	 * reservation.  No reservations means no space.
 	 */
 	if (list_empty(&xnr->resv_list))
 		return -ENOSPC;

 	/*
 	 * The first item in the list should always have a free block unless
 	 * we're completely out.
 	 */
 	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
 	if (resv->used == resv->len)
 		return -ENOSPC;

 	/*
 	 * Peel off a block from the start of the reservation.  We allocate
 	 * blocks in order to place blocks on disk in increasing record or key
 	 * order.  The block reservations tend to end up on the list in
 	 * decreasing order, which hopefully results in leaf blocks ending up
 	 * together.
 	 */
 	agbno = resv->agbno + resv->used;
 	resv->used++;

 	/* If we used all the blocks in this reservation, move it to the end. */
 	if (resv->used == resv->len)
 		list_move_tail(&resv->list, &xnr->resv_list);

 	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
 			xnr->oinfo.oi_owner);

 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
 								agbno));
 	else
 		ptr->s = cpu_to_be32(agbno);

 	/* Relog all the EFIs. */
 	return xrep_defer_finish(xnr->sc);
 }

 /* How many reserved blocks are unused? */
 unsigned int
 xrep_newbt_unused_blocks(
 	struct xrep_newbt	*xnr)
 {
 	struct xrep_newbt_resv	*resv;
 	unsigned int		unused = 0;

 	list_for_each_entry(resv, &xnr->resv_list, list)
 		unused += resv->len - resv->used;
 	return unused;
 }
	// SPDX-License-Identifier: GPL-2.0-or-later
	/*
	* Copyright (C) 2022-2023 Oracle. All Rights Reserved.
	* Author: Darrick J. Wong <djwong@kernel.org>
	*/
	#include "xfs.h"
	#include "xfs_fs.h"
	#include "xfs_shared.h"
	#include "xfs_format.h"
	#include "xfs_trans_resv.h"
	#include "xfs_mount.h"
	#include "xfs_btree.h"
	#include "xfs_btree_staging.h"
	#include "xfs_log_format.h"
	#include "xfs_trans.h"
	#include "xfs_sb.h"
	#include "xfs_inode.h"
	#include "xfs_alloc.h"
	#include "xfs_rmap.h"
	#include "xfs_ag.h"
	#include "xfs_defer.h"
	#include "scrub/scrub.h"
	#include "scrub/common.h"
	#include "scrub/trace.h"
	#include "scrub/repair.h"
	#include "scrub/newbt.h"

	/*
	* Estimate proper slack values for a btree that's being reloaded.
	*
	* Under most circumstances, we'll take whatever default loading value the
	* btree bulk loading code calculates for us. However, there are some
	* exceptions to this rule:
	*
	* (0) If someone turned one of the debug knobs.
	* (1) If this is a per-AG btree and the AG has less than 10% space free.
	* (2) If this is an inode btree and the FS has less than 10% space free.

	* In either case, format the new btree blocks almost completely full to
	* minimize space usage.
	*/
	static void
	xrep_newbt_estimate_slack(
	struct xrep_newbt *xnr)
	{
	struct xfs_scrub *sc = xnr->sc;
	struct xfs_btree_bload *bload = &xnr->bload;
	uint64_t free;
	uint64_t sz;

	/*
	* The xfs_globals values are set to -1 (i.e. take the bload defaults)
	* unless someone has set them otherwise, so we just pull the values
	* here.
	*/
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
	free = sc->sa.pag->pagf_freeblks;
	sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
	free = percpu_counter_sum(&sc->mp->m_fdblocks);
	sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
	return;

	/*
	* We're low on space; load the btrees as tightly as possible. Leave
	* a couple of open slots in each btree block so that we don't end up
	* splitting the btrees like crazy after a mount.
	*/
	if (bload->leaf_slack < 0)
	bload->leaf_slack = 2;
	if (bload->node_slack < 0)
	bload->node_slack = 2;
	}

	/*
	 * Initialize accounting resources for staging a new AG btree.
	 *
	 * @xnr is zeroed, then the scrub context, a copy of the owner info, the
	 * allocation hint and the reservation type are recorded.  The
	 * reservation list starts out empty, the bulk loader's max_dirty is set
	 * to 256K converted to filesystem blocks, and the slack values are
	 * estimated last.
	 */
	void
	xrep_newbt_init_ag(
		struct xrep_newbt		*xnr,
		struct xfs_scrub		*sc,
		const struct xfs_owner_info	*oinfo,
		xfs_fsblock_t			alloc_hint,
		enum xfs_ag_resv_type		resv)
	{
		memset(xnr, 0, sizeof(struct xrep_newbt));
		xnr->sc = sc;
		/* @oinfo is a pointer; dereference it to copy the structure. */
		xnr->oinfo = *oinfo; /* structure copy */
		xnr->alloc_hint = alloc_hint;
		xnr->resv = resv;
		INIT_LIST_HEAD(&xnr->resv_list);
		xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
		xrep_newbt_estimate_slack(xnr);
	}

	/* Initialize accounting resources for staging a new inode fork btree. */
	int
	xrep_newbt_init_inode(
	struct xrep_newbt *xnr,
	struct xfs_scrub *sc,
	int whichfork,
	const struct xfs_owner_info *oinfo)
	{
	struct xfs_ifork *ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
	return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
	XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
	XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
	}

	/*
	 * Initialize accounting resources for staging a new btree.  Callers are
	 * expected to add their own reservations (and clean them up) manually.
	 *
	 * This is xrep_newbt_init_ag() called with XFS_RMAP_OINFO_ANY_OWNER as
	 * the owner, NULLFSBLOCK as the allocation hint, and XFS_AG_RESV_NONE as
	 * the reservation type.
	 */
	void
	xrep_newbt_init_bare(
	struct xrep_newbt *xnr,
	struct xfs_scrub *sc)
	{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
	XFS_AG_RESV_NONE);
	}

	/*
	* Designate specific blocks to be used to build our new btree. @pag must be
	* a passive reference.
	*/
	STATIC int
	xrep_newbt_add_blocks(
	struct xrep_newbt *xnr,
	struct xfs_perag *pag,
	const struct xfs_alloc_arg *args)
	{
	struct xfs_mount *mp = xnr->sc->mp;
	struct xrep_newbt_resv *resv;
	int error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
	return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
	ASSERT(xnr->oinfo.oi_offset == 0);

	error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
	if (error)
	goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
	out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
	}

	/*
	 * Put the extent [@agbno, @agbno + @len) of the AG described by @pag into
	 * the reservation pool of the new btree.  @pag must be a passive
	 * reference.
	 *
	 * No transaction is passed along, so no autoreap is scheduled: callers
	 * have to reap this reservation themselves if the repair is cancelled.
	 *
	 * Returns whatever xrep_newbt_add_blocks() returns.
	 */
	int
	xrep_newbt_add_extent(
		struct xrep_newbt	*xnr,
		struct xfs_perag	*pag,
		xfs_agblock_t		agbno,
		xfs_extlen_t		len)
	{
		struct xfs_mount	*mp = xnr->sc->mp;
		xfs_fsblock_t		start_fsb = XFS_AGB_TO_FSB(mp,
							pag->pag_agno, agbno);
		struct xfs_alloc_arg	extent = {
			.fsbno		= start_fsb,
			.len		= len,
			.oinfo		= xnr->oinfo,
			.resv		= xnr->resv,
			.tp		= NULL, /* no autoreap */
		};

		return xrep_newbt_add_blocks(xnr, pag, &extent);
	}

	/*
	 * Keep the allocation hint inside the AG that is being repaired.  If the
	 * hint points at a different AG or fails xfs_verify_fsbno(), reset it to
	 * the block after XFS_AGFL_BLOCK in our AG.
	 */
	static inline void
	xrep_newbt_validate_ag_alloc_hint(
		struct xrep_newbt	*xnr)
	{
		struct xfs_scrub	*sc = xnr->sc;
		struct xfs_mount	*mp = sc->mp;

		if (XFS_FSB_TO_AGNO(mp, xnr->alloc_hint) != sc->sa.pag->pag_agno ||
		    !xfs_verify_fsbno(mp, xnr->alloc_hint))
			xnr->alloc_hint = XFS_AGB_TO_FSB(mp, sc->sa.pag->pag_agno,
							 XFS_AGFL_BLOCK(mp) + 1);
	}

	/*
	 * Allocate disk space for a new per-AG btree.
	 *
	 * Keeps asking the allocator for extents near xnr->alloc_hint until
	 * @nr_blocks blocks have been added to the reservation list of @xnr.
	 * The scrub context must have a perag attached (sc->sa.pag).
	 *
	 * Returns 0 on success, -ENOSPC if the allocator handed back
	 * NULLFSBLOCK, -EFSCORRUPTED if the allocator handed back space in a
	 * different AG, or the error returned by the allocator,
	 * xrep_newbt_add_blocks() or xrep_defer_finish().  Reservations added
	 * before a failure stay on the list.
	 */
	STATIC int
	xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt *xnr,
	uint64_t nr_blocks)
	{
	struct xfs_scrub *sc = xnr->sc;
	struct xfs_mount *mp = sc->mp;
	int error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
	/* Ask for anything between one block and all that is left. */
	struct xfs_alloc_arg args = {
	.tp = sc->tp,
	.mp = mp,
	.oinfo = xnr->oinfo,
	.minlen = 1,
	.maxlen = nr_blocks,
	.prod = 1,
	.resv = xnr->resv,
	};
	xfs_agnumber_t agno;

	/* The hint is advanced on every pass, so recheck it each time. */
	xrep_newbt_validate_ag_alloc_hint(xnr);

	error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
	if (error)
	return error;
	if (args.fsbno == NULLFSBLOCK)
	return -ENOSPC;

	agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

	trace_xrep_newbt_alloc_ag_blocks(mp, agno,
	XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
	xnr->oinfo.oi_owner);

	/* Space from another AG cannot be used for this AG's btree. */
	if (agno != sc->sa.pag->pag_agno) {
	ASSERT(agno == sc->sa.pag->pag_agno);
	return -EFSCORRUPTED;
	}

	error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
	if (error)
	return error;

	/* Next time, look right after the extent we just got. */
	nr_blocks -= args.len;
	xnr->alloc_hint = args.fsbno + args.len;

	error = xrep_defer_finish(sc);
	if (error)
	return error;
	}

	return 0;
	}

	/*
	 * Keep the allocation hint within the filesystem.  A hint that fails
	 * xfs_verify_fsbno() is reset to the block after XFS_AGFL_BLOCK in AG 0.
	 */
	static inline void
	xrep_newbt_validate_file_alloc_hint(
		struct xrep_newbt	*xnr)
	{
		struct xfs_mount	*mp = xnr->sc->mp;

		if (!xfs_verify_fsbno(mp, xnr->alloc_hint))
			xnr->alloc_hint = XFS_AGB_TO_FSB(mp, 0,
							 XFS_AGFL_BLOCK(mp) + 1);
	}

	/*
	 * Allocate disk space for our new file-based btree.
	 *
	 * Keeps asking the allocator for extents, starting the search at
	 * xnr->alloc_hint, until @nr_blocks blocks have been added to the
	 * reservation list of @xnr.  The AG of each new extent is derived from
	 * the block number that the allocator returned.
	 *
	 * Returns 0 on success, -ENOSPC if the allocator handed back
	 * NULLFSBLOCK, -EFSCORRUPTED if no perag can be found for the new
	 * extent, or the error returned by the allocator,
	 * xrep_newbt_add_blocks() or xrep_defer_finish().  Reservations added
	 * before a failure stay on the list.
	 */
	STATIC int
	xrep_newbt_alloc_file_blocks(
	struct xrep_newbt *xnr,
	uint64_t nr_blocks)
	{
	struct xfs_scrub *sc = xnr->sc;
	struct xfs_mount *mp = sc->mp;
	int error = 0;

	while (nr_blocks > 0) {
	/* Ask for anything between one block and all that is left. */
	struct xfs_alloc_arg args = {
	.tp = sc->tp,
	.mp = mp,
	.oinfo = xnr->oinfo,
	.minlen = 1,
	.maxlen = nr_blocks,
	.prod = 1,
	.resv = xnr->resv,
	};
	struct xfs_perag *pag;
	xfs_agnumber_t agno;

	/* The hint is advanced on every pass, so recheck it each time. */
	xrep_newbt_validate_file_alloc_hint(xnr);

	error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
	if (error)
	return error;
	if (args.fsbno == NULLFSBLOCK)
	return -ENOSPC;

	agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

	trace_xrep_newbt_alloc_file_blocks(mp, agno,
	XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
	xnr->oinfo.oi_owner);

	pag = xfs_perag_get(mp, agno);
	if (!pag) {
	ASSERT(0);
	return -EFSCORRUPTED;
	}

	/*
	 * The reservation takes its own hold on the perag, so our reference
	 * is dropped whether or not the add succeeded.
	 */
	error = xrep_newbt_add_blocks(xnr, pag, &args);
	xfs_perag_put(pag);
	if (error)
	return error;

	/* Next time, look right after the extent we just got. */
	nr_blocks -= args.len;
	xnr->alloc_hint = args.fsbno + args.len;

	error = xrep_defer_finish(sc);
	if (error)
	return error;
	}

	return 0;
	}

	/*
	 * Reserve @nr_blocks blocks of disk space for the new btree.  Scrub
	 * contexts without an inode get per-AG allocation; contexts with an
	 * inode get file-based allocation.  Returns whatever the chosen
	 * allocator returns.
	 */
	int
	xrep_newbt_alloc_blocks(
		struct xrep_newbt	*xnr,
		uint64_t		nr_blocks)
	{
		if (xnr->sc->ip == NULL)
			return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);

		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	}

	/*
	 * Free the unused part of a space extent that was reserved for a new
	 * ondisk structure.  Returns the number of EFIs logged or a negative
	 * errno.
	 *
	 * @btree_committed says whether the new btree was committed.  If it was
	 * not, or if no block of @resv was used, the whole extent is released by
	 * committing the autoreap of @resv and 1 is returned.  Otherwise the
	 * autoreap is cancelled and only the blocks after the first resv->used
	 * blocks are freed: 0 is returned when there are none, 1 when a deferred
	 * free was queued, or the error from queueing it.
	 *
	 * This function does not remove @resv from the list or free it.
	 */
	STATIC int
	xrep_newbt_free_extent(
		struct xrep_newbt	*xnr,
		struct xrep_newbt_resv	*resv,
		bool			btree_committed)
	{
		struct xfs_scrub	*sc = xnr->sc;
		xfs_agblock_t		free_agbno = resv->agbno;
		xfs_extlen_t		free_aglen = resv->len;
		xfs_fsblock_t		fsbno;
		int			error;

		if (!btree_committed || resv->used == 0) {
			/*
			 * If we're not committing a new btree or we didn't use
			 * the space reservation, let the existing EFI free the
			 * entire space extent.
			 */
			trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
					free_agbno, free_aglen,
					xnr->oinfo.oi_owner);
			xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
			return 1;
		}

		/*
		 * We used space and committed the btree.  Cancel the autoreap,
		 * remove the written blocks from the reservation, and possibly
		 * log a new EFI to free any unused reservation space.
		 */
		xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
		free_agbno += resv->used;
		free_aglen -= resv->used;

		/* Every block of the reservation was used; nothing to give back. */
		if (free_aglen == 0)
			return 0;

		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);

		ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
		ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

		/*
		 * Use EFIs to free the reservations.  This reduces the chance
		 * that we leak blocks if the system goes down.
		 */
		fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
		error = xfs_free_extent_later(sc->tp, fsbno, free_aglen,
				&xnr->oinfo, xnr->resv, true);
		if (error)
			return error;

		return 1;
	}

	/*
	 * Free all the accounting info and disk space we reserved for a new
	 * btree.
	 *
	 * @btree_committed is passed through to xrep_newbt_free_extent() for
	 * every reservation on the list.  Deferred work is finished each time
	 * the number of logged EFIs reaches XREP_MAX_ITRUNCATE_EFIS, and once
	 * more at the end if any are outstanding.
	 *
	 * Whatever happens, the reservation list is empty when this function
	 * returns, and if the scrub context has an inode, the fake inode fork is
	 * freed.
	 *
	 * Returns 0 on success, or the first error from freeing an extent or
	 * from finishing deferred work.
	 */
	STATIC int
	xrep_newbt_free(
		struct xrep_newbt	*xnr,
		bool			btree_committed)
	{
		struct xfs_scrub	*sc = xnr->sc;
		/* List cursors: these must be pointers to reservations. */
		struct xrep_newbt_resv	*resv, *n;
		unsigned int		freed = 0;
		int			error = 0;

		/*
		 * If the filesystem already went down, we can't free the blocks.
		 * Skip ahead to freeing the incore metadata because we can't fix
		 * anything.
		 */
		if (xfs_is_shutdown(sc->mp))
			goto junkit;

		list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
			int		ret;

			/*
			 * The incore reservation is torn down before the return
			 * value is examined, so it is gone even if freeing
			 * failed.
			 */
			ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
			list_del(&resv->list);
			xfs_perag_put(resv->pag);
			kfree(resv);
			if (ret < 0) {
				error = ret;
				goto junkit;
			}

			/* Flush the deferred frees once enough have piled up. */
			freed += ret;
			if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
				error = xrep_defer_finish(sc);
				if (error)
					goto junkit;
				freed = 0;
			}
		}

		if (freed)
			error = xrep_defer_finish(sc);

	junkit:
		/*
		 * If we still have reservations attached to @newbt, cleanup must
		 * have failed and the filesystem is about to go down.  Clean up
		 * the incore reservations and try to commit to freeing the space
		 * we used.
		 */
		list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
			xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
			list_del(&resv->list);
			xfs_perag_put(resv->pag);
			kfree(resv);
		}

		/* Clear the pointer so that the freed fork cannot be reused. */
		if (sc->ip) {
			kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
			xnr->ifake.if_fork = NULL;
		}

		return error;
	}

	/*
	 * Free all the accounting info and unused disk space allocations after
	 * committing a new btree.
	 *
	 * Calls xrep_newbt_free() with btree_committed set to true and returns
	 * its result.
	 */
	int
	xrep_newbt_commit(
	struct xrep_newbt *xnr)
	{
	return xrep_newbt_free(xnr, true);
	}

	/*
	 * Free all the accounting info and all of the disk space we reserved for
	 * a new btree that we're not going to commit.  We want to try to roll
	 * things back cleanly for things like ENOSPC midway through allocation.
	 *
	 * Calls xrep_newbt_free() with btree_committed set to false.  The return
	 * value of that call is discarded.
	 */
	void
	xrep_newbt_cancel(
	struct xrep_newbt *xnr)
	{
	xrep_newbt_free(xnr, false);
	}

	/*
	 * Feed one of the reserved btree blocks to the bulk loader.
	 *
	 * Takes the next unused block from the reservation at the head of the
	 * list and stores its address in @ptr: as a big-endian 64-bit filesystem
	 * block number if @cur has XFS_BTREE_LONG_PTRS set, otherwise as a
	 * big-endian 32-bit AG block number.
	 *
	 * Returns -ENOSPC if there are no reservations at all or if the
	 * reservation at the head of the list is already used up; otherwise
	 * returns the result of xrep_defer_finish().
	 */
	int
	xrep_newbt_claim_block(
		struct xfs_btree_cur	*cur,
		struct xrep_newbt	*xnr,
		union xfs_btree_ptr	*ptr)
	{
		struct xrep_newbt_resv	*resv;
		struct xfs_mount	*mp = cur->bc_mp;
		xfs_agblock_t		agbno;

		/*
		 * list_first_entry() expects a non-empty list.  With no
		 * reservations it would hand back a pointer computed from the
		 * list head itself, and the checks below would then read memory
		 * that is not a reservation.  No reservations means no space.
		 */
		if (list_empty(&xnr->resv_list))
			return -ENOSPC;

		/*
		 * The first item in the list should always have a free block
		 * unless we're completely out.
		 */
		resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv,
				list);
		if (resv->used == resv->len)
			return -ENOSPC;

		/*
		 * Peel off a block from the start of the reservation.  We
		 * allocate blocks in order to place blocks on disk in increasing
		 * record or key order.  The block reservations tend to end up on
		 * the list in decreasing order, which hopefully results in leaf
		 * blocks ending up together.
		 */
		agbno = resv->agbno + resv->used;
		resv->used++;

		/* If we used all the blocks in this reservation, move it to the end. */
		if (resv->used == resv->len)
			list_move_tail(&resv->list, &xnr->resv_list);

		trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
				xnr->oinfo.oi_owner);

		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
			ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp,
						resv->pag->pag_agno, agbno));
		else
			ptr->s = cpu_to_be32(agbno);

		/* Relog all the EFIs. */
		return xrep_defer_finish(xnr->sc);
	}

	/* How many reserved blocks are unused? */
	unsigned int
	xrep_newbt_unused_blocks(
	struct xrep_newbt *xnr)
	{
	struct xrep_newbt_resv *resv;
	unsigned int unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
	unused += resv->len - resv->used;
	return unused;
	}