fs/xfs/libxfs/xfs_exchmaps.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
  * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_bmap.h"
 #include "xfs_icache.h"
 #include "xfs_quota.h"
 #include "xfs_exchmaps.h"
 #include "xfs_trace.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_error.h"
 #include "xfs_errortag.h"
 #include "xfs_health.h"
 #include "xfs_exchmaps_item.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_dir2.h"
 #include "xfs_symlink_remote.h"

 /* Slab cache from which incore exchange-mapping intent items are allocated. */
 struct kmem_cache	*xfs_exchmaps_intent_cache;

 /*
  * bmbt mappings adjacent to a pair of records.  The estimation code keeps
  * the mapping to the left and to the right of the current position in each
  * file here so that it can predict whether an exchanged mapping would merge
  * with its neighbours.
  */
 struct xfs_exchmaps_adjacent {
 	struct xfs_bmbt_irec		left1;	/* left neighbour, file1 */
 	struct xfs_bmbt_irec		right1;	/* right neighbour, file1 */
 	struct xfs_bmbt_irec		left2;	/* left neighbour, file2 */
 	struct xfs_bmbt_irec		right2;	/* right neighbour, file2 */
 };

 /* Initializer that marks all four adjacent records as holes. */
 #define ADJACENT_INIT { \
 	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
 	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
 	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
 	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
 }

 /* Information to reset reflink flag / CoW fork state after an exchange. */

 /*
  * If the reflink flag is set on either inode, make sure it has an incore CoW
  * fork, since all reflink inodes must have them.  If there's a CoW fork and it
  * has mappings in it, make sure the inodes are tagged appropriately so that
  * speculative preallocations can be GC'd if we run low of space.
  */
 static inline void
 xfs_exchmaps_ensure_cowfork(
 	struct xfs_inode	*ip)
 {
 	struct xfs_ifork	*cfork;

 	if (xfs_is_reflink_inode(ip))
 		xfs_ifork_init_cow(ip);

 	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	if (!cfork)
 		return;
 	if (cfork->if_bytes > 0)
 		xfs_inode_set_cowblocks_tag(ip);
 	else
 		xfs_inode_clear_cowblocks_tag(ip);
 }

 /*
  * Bump the on-disk inode size, if necessary, so that a freshly added mapping
  * never sits beyond EOF.  Log recovery would otherwise be confused by the
  * sudden appearance of post-EOF mappings.
  *
  * The size is raised to the end of @imap, capped at @new_isize; it is never
  * lowered.  A negative @new_isize disables the adjustment entirely.
  */
 STATIC void
 xfs_exchmaps_update_size(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*imap,
 	xfs_fsize_t		new_isize)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	xfs_fsize_t		map_end;

 	if (new_isize < 0)
 		return;

 	/* Byte offset of the end of the mapping, clamped to the new size. */
 	map_end = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
 		      new_isize);

 	if (map_end > ip->i_disk_size) {
 		trace_xfs_exchmaps_update_inode_size(ip, map_end);

 		ip->i_disk_size = map_end;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	}
 }

 /* Advance the incore state tracking after exchanging a mapping. */
 static inline void
 xmi_advance(
 	struct xfs_exchmaps_intent	*xmi,
 	const struct xfs_bmbt_irec	*irec)
 {
 	xmi->xmi_startoff1 += irec->br_blockcount;
 	xmi->xmi_startoff2 += irec->br_blockcount;
 	xmi->xmi_blockcount -= irec->br_blockcount;
 }

 /* Do we still have more mappings to exchange? */
 static inline bool
 xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
 {
 	return xmi->xmi_blockcount > 0;
 }

 /*
  * Return true if any post-operation cleanup flag (clearing a reflink flag on
  * either inode, or converting inode2 back to shortform) is still set.
  */
 static inline bool
 xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
 {
 	return (xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
 				  XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
 				  __XFS_EXCHMAPS_INO2_SHORTFORM)) != 0;
 }

 /* Check all mappings to make sure we can actually exchange them. */
 int
 xfs_exchmaps_check_forks(
 	struct xfs_mount		*mp,
 	const struct xfs_exchmaps_req	*req)
 {
 	struct xfs_ifork		*ifp1, *ifp2;
 	int				whichfork = xfs_exchmaps_reqfork(req);

 	/* No fork? */
 	ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
 	ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
 	if (!ifp1 || !ifp2)
 		return -EINVAL;

 	/* We don't know how to exchange local format forks. */
 	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
 	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
 		return -EINVAL;

 	return 0;
 }

 #ifdef CONFIG_XFS_QUOTA
 /*
  * Log the quota block count changes caused by exchanging @irec1 (currently
  * in file1) with @irec2 (currently in file2).  Only real extents move
  * blocks between the files.  The realtime or data block counter is chosen
  * based on file1.
  */
 static inline void
 xfs_exchmaps_update_quota(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi,
 	struct xfs_bmbt_irec		*irec1,
 	struct xfs_bmbt_irec		*irec2)
 {
 	int64_t				ip1_delta = 0;
 	int64_t				ip2_delta = 0;
 	unsigned int			qflag = XFS_TRANS_DQ_BCOUNT;

 	if (XFS_IS_REALTIME_INODE(xmi->xmi_ip1))
 		qflag = XFS_TRANS_DQ_RTBCOUNT;

 	/* Blocks in irec1 leave file1 and arrive in file2. */
 	if (xfs_bmap_is_real_extent(irec1)) {
 		ip2_delta += irec1->br_blockcount;
 		ip1_delta -= irec1->br_blockcount;
 	}

 	/* Blocks in irec2 leave file2 and arrive in file1. */
 	if (xfs_bmap_is_real_extent(irec2)) {
 		ip2_delta -= irec2->br_blockcount;
 		ip1_delta += irec2->br_blockcount;
 	}

 	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
 	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
 }
 #else
 # define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2)	((void)0)
 #endif

 /*
  * Decide if we want to skip this mapping from file1.
  *
  * Returns true if the walk may step over @irec without exchanging it and
  * false if the mapping must be exchanged.  Skipping is only considered when
  * the caller set XFS_EXCHMAPS_INO1_WRITTEN.  For files with a multi-fsb
  * realtime allocation unit, irec->br_blockcount may be trimmed in place so
  * that the returned decision applies to the shortened mapping only; the
  * caller advances by the trimmed length.
  */
 static inline bool
 xfs_exchmaps_can_skip_mapping(
 	struct xfs_exchmaps_intent	*xmi,
 	struct xfs_bmbt_irec		*irec)
 {
 	struct xfs_mount		*mp = xmi->xmi_ip1->i_mount;

 	/* Do not skip this mapping if the caller did not tell us to. */
 	if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
 		return false;

 	/* Do not skip mapped, written mappings. */
 	if (xfs_bmap_is_written_extent(irec))
 		return false;

 	/*
 	 * The mapping is unwritten or a hole.  It cannot be a delalloc
 	 * reservation because we already excluded those.  It cannot be an
 	 * unwritten extent with dirty page cache because we flushed the page
 	 * cache.  For files where the allocation unit is 1FSB (files on the
 	 * data dev, rt files if the extent size is 1FSB), we can safely
 	 * skip this mapping.
 	 */
 	if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
 		return true;

 	/*
 	 * For a realtime file with a multi-fsb allocation unit, the decision
 	 * is trickier because we can only swap full allocation units.
 	 * Unwritten mappings can appear in the middle of an rtx if the rtx is
 	 * partially written, but they can also appear for preallocations.
 	 *
 	 * If the mapping is a hole, skip it entirely.  Holes should align with
 	 * rtx boundaries.
 	 */
 	if (!xfs_bmap_is_real_extent(irec))
 		return true;

 	/*
 	 * All mappings below this point are unwritten.
 	 *
 	 * - If the beginning is not aligned to an rtx, trim the end of the
 	 *   mapping so that it does not cross an rtx boundary, and swap it.
 	 *
 	 * - If both ends are aligned to an rtx, skip the entire mapping.
 	 */
 	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
 		xfs_fileoff_t	new_end;

 		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
 		irec->br_blockcount = min(irec->br_blockcount,
 					  new_end - irec->br_startoff);
 		return false;
 	}
 	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
 		return true;

 	/*
 	 * All mappings below this point are unwritten, start on an rtx
 	 * boundary, and do not end on an rtx boundary.
 	 *
 	 * - If the mapping is longer than one rtx, trim the end of the mapping
 	 *   down to an rtx boundary and skip it.
 	 *
 	 * - The mapping is shorter than one rtx.  Swap it.
 	 */
 	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
 		xfs_fileoff_t	new_end;

 		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
 				mp->m_sb.sb_rextsize);
 		irec->br_blockcount = new_end - irec->br_startoff;
 		return true;
 	}

 	return false;
 }

 /*
  * Walk forward through the file ranges in @xmi until we find two different
  * mappings to exchange.  If there is work to do, return the mappings;
  * otherwise we've reached the end of the range and xmi_blockcount will be
  * zero.
  *
  * If the walk skips over a pair of mappings to the same storage, save them as
  * the left records in @adj (if provided) so that the simulation phase can
  * avoid an extra lookup.
  *
  * On success with work remaining, @irec1 and @irec2 hold mappings of equal
  * usable length (irec1->br_blockcount is trimmed to the shorter of the two)
  * starting at xmi_startoff1 and xmi_startoff2.  Returns 0 on success,
  * -EINVAL if a mapping read returns something unexpected, -EFSCORRUPTED if
  * two mappings to the same physical block differ in state, or the error
  * returned by xfs_bmapi_read().
  */
 static int
 xfs_exchmaps_find_mappings(
 	struct xfs_exchmaps_intent	*xmi,
 	struct xfs_bmbt_irec		*irec1,
 	struct xfs_bmbt_irec		*irec2,
 	struct xfs_exchmaps_adjacent	*adj)
 {
 	int				nimaps;
 	int				bmap_flags;
 	int				error;

 	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));

 	/*
 	 * Each pass that does not return advances the cursor by the (possibly
 	 * trimmed) length of irec1, whether we got here via "continue" or by
 	 * falling off the end of the loop body.
 	 */
 	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
 		/* Read mapping from the first file */
 		nimaps = 1;
 		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
 				xmi->xmi_blockcount, irec1, &nimaps,
 				bmap_flags);
 		if (error)
 			return error;
 		if (nimaps != 1 ||
 		    irec1->br_startblock == DELAYSTARTBLOCK ||
 		    irec1->br_startoff != xmi->xmi_startoff1) {
 			/*
 			 * We should never get no mapping or a delalloc mapping
 			 * or something that doesn't match what we asked for,
 			 * since the caller flushed both inodes and we hold the
 			 * ILOCKs for both inodes.
 			 */
 			ASSERT(0);
 			return -EINVAL;
 		}

 		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
 			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
 			continue;
 		}

 		/* Read mapping from the second file */
 		nimaps = 1;
 		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
 				irec1->br_blockcount, irec2, &nimaps,
 				bmap_flags);
 		if (error)
 			return error;
 		if (nimaps != 1 ||
 		    irec2->br_startblock == DELAYSTARTBLOCK ||
 		    irec2->br_startoff != xmi->xmi_startoff2) {
 			/*
 			 * We should never get no mapping or a delalloc mapping
 			 * or something that doesn't match what we asked for,
 			 * since the caller flushed both inodes and we hold the
 			 * ILOCKs for both inodes.
 			 */
 			ASSERT(0);
 			return -EINVAL;
 		}

 		/*
 		 * We can only exchange as many blocks as the smaller of the
 		 * two mapping maps.
 		 */
 		irec1->br_blockcount = min(irec1->br_blockcount,
 					   irec2->br_blockcount);

 		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
 		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);

 		/* We found something to exchange, so return it. */
 		if (irec1->br_startblock != irec2->br_startblock)
 			return 0;

 		/*
 		 * Two mappings pointing to the same physical block must not
 		 * have different states; that's filesystem corruption.  Move
 		 * on to the next mapping if they're both holes or both point
 		 * to the same physical space extent.
 		 */
 		if (irec1->br_state != irec2->br_state) {
 			xfs_bmap_mark_sick(xmi->xmi_ip1,
 					xfs_exchmaps_whichfork(xmi));
 			xfs_bmap_mark_sick(xmi->xmi_ip2,
 					xfs_exchmaps_whichfork(xmi));
 			return -EFSCORRUPTED;
 		}

 		/*
 		 * Save the mappings if we're estimating work and skipping
 		 * these identical mappings.
 		 */
 		if (adj) {
 			memcpy(&adj->left1, irec1, sizeof(*irec1));
 			memcpy(&adj->left2, irec2, sizeof(*irec2));
 		}
 	}

 	return 0;
 }

 /*
  * Exchange these two mappings.
  *
  * @irec1 is the mapping currently in file1 and @irec2 the mapping currently
  * in file2.  Quota is updated, both mappings are unmapped, and each is then
  * mapped into the other file at the offset the other one occupied.  The file
  * offsets stored in @irec1 and @irec2 are swapped as a side effect, and the
  * cursor in @xmi is advanced past the exchanged range.
  */
 static void
 xfs_exchmaps_one_step(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi,
 	struct xfs_bmbt_irec		*irec1,
 	struct xfs_bmbt_irec		*irec2)
 {
 	int				whichfork = xfs_exchmaps_whichfork(xmi);

 	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);

 	/* Remove both mappings. */
 	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
 	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);

 	/*
 	 * Re-add both mappings.  We exchange the file offsets between the two
 	 * maps and add the opposite map, which has the effect of filling the
 	 * logical offsets we just unmapped, but with the physical mapping
 	 * information exchanged.
 	 */
 	swap(irec1->br_startoff, irec2->br_startoff);
 	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
 	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);

 	/* Make sure we're not adding mappings past EOF. */
 	if (whichfork == XFS_DATA_FORK) {
 		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
 				xmi->xmi_isize1);
 		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
 				xmi->xmi_isize2);
 	}

 	/*
 	 * Advance our cursor and exit.   The caller (either defer ops or log
 	 * recovery) will log the XMD item, and if xmi_blockcount is nonzero,
 	 * it will log a new XMI item for the remainder and call us back.
 	 */
 	xmi_advance(xmi, irec1);
 }

 /* Convert inode2's leaf attr fork back to shortform, if possible.. */
 STATIC int
 xfs_exchmaps_attr_to_sf(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi)
 {
 	struct xfs_da_args	args = {
 		.dp		= xmi->xmi_ip2,
 		.geo		= tp->t_mountp->m_attr_geo,
 		.whichfork	= XFS_ATTR_FORK,
 		.trans		= tp,
 		.owner		= xmi->xmi_ip2->i_ino,
 	};
 	struct xfs_buf		*bp;
 	int			forkoff;
 	int			error;

 	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
 		return 0;

 	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, 0, &bp);
 	if (error)
 		return error;

 	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
 	if (forkoff == 0)
 		return 0;

 	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
 }

 /* Convert inode2's block dir fork back to shortform, if possible.. */
 STATIC int
 xfs_exchmaps_dir_to_sf(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi)
 {
 	struct xfs_da_args	args = {
 		.dp		= xmi->xmi_ip2,
 		.geo		= tp->t_mountp->m_dir_geo,
 		.whichfork	= XFS_DATA_FORK,
 		.trans		= tp,
 		.owner		= xmi->xmi_ip2->i_ino,
 	};
 	struct xfs_dir2_sf_hdr	sfh;
 	struct xfs_buf		*bp;
 	bool			isblock;
 	int			size;
 	int			error;

 	error = xfs_dir2_isblock(&args, &isblock);
 	if (error)
 		return error;

 	if (!isblock)
 		return 0;

 	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, &bp);
 	if (error)
 		return error;

 	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
 	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
 		return 0;

 	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
 }

 /* Convert inode2's remote symlink target back to shortform, if possible. */
 STATIC int
 xfs_exchmaps_link_to_sf(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi)
 {
 	struct xfs_inode		*ip = xmi->xmi_ip2;
 	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 	char				*buf;
 	int				error;

 	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
 	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
 		return 0;

 	/* Read the current symlink target into a buffer. */
 	buf = kmalloc(ip->i_disk_size + 1,
 			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	if (!buf) {
 		ASSERT(0);
 		return -ENOMEM;
 	}

 	error = xfs_symlink_remote_read(ip, buf);
 	if (error)
 		goto free;

 	/* Remove the blocks. */
 	error = xfs_symlink_remote_truncate(tp, ip);
 	if (error)
 		goto free;

 	/* Convert fork to local format and log our changes. */
 	xfs_idestroy_fork(ifp);
 	ifp->if_bytes = 0;
 	ifp->if_format = XFS_DINODE_FMT_LOCAL;
 	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
 free:
 	kfree(buf);
 	return error;
 }

 /* Drop the reflink flag from @ip after an exchange and log the inode core. */
 static inline void
 xfs_exchmaps_clear_reflink(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip)
 {
 	trace_xfs_reflink_unset_inode_flag(ip);

 	ip->i_diflags2 = ip->i_diflags2 & ~XFS_DIFLAG2_REFLINK;

 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }

 /*
  * Perform the cleanup work that follows an exchange: try to convert inode2
  * back to shortform (attr fork, directory, or symlink) and clear the
  * reflink flags that were scheduled for clearing.  Each flag is cleared
  * from @xmi once its work has been attempted, even if the shortform
  * conversion failed.  Returns 0 or the error from the conversion.
  */
 static int
 xfs_exchmaps_do_postop_work(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi)
 {
 	int				error = 0;

 	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
 		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) {
 			error = xfs_exchmaps_attr_to_sf(tp, xmi);
 		} else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode)) {
 			error = xfs_exchmaps_dir_to_sf(tp, xmi);
 		} else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) {
 			error = xfs_exchmaps_link_to_sf(tp, xmi);
 		}

 		/* The conversion is attempted only once. */
 		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
 		if (error)
 			return error;
 	}

 	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
 		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
 		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
 	}

 	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
 		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
 		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
 	}

 	return 0;
 }

 /*
  * Finish one step in a mapping exchange operation, possibly relogging.
  *
  * Each call either exchanges at most one pair of mappings or, once the range
  * has been fully exchanged, performs the post-operation cleanup.  Returns
  * -EAGAIN if exchange or post-op work remains and the caller should call
  * again with a new transaction, 0 when everything is finished, -EIO when
  * the XFS_ERRTAG_EXCHMAPS_FINISH_ONE error injection point fires, or
  * another negative errno from the helpers.
  */
 int
 xfs_exchmaps_finish_one(
 	struct xfs_trans		*tp,
 	struct xfs_exchmaps_intent	*xmi)
 {
 	struct xfs_bmbt_irec		irec1, irec2;
 	int				error;

 	if (xmi_has_more_exchange_work(xmi)) {
 		/*
 		 * If the operation state says that some range of the files
 		 * have not yet been exchanged, look for mappings in that range
 		 * to exchange.  If we find some mappings, exchange them.
 		 */
 		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
 		if (error)
 			return error;

 		/* The walk may have consumed the rest of the range. */
 		if (xmi_has_more_exchange_work(xmi))
 			xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);

 		/*
 		 * If the caller asked us to exchange the file sizes after the
 		 * exchange and either we just exchanged the last mappings in
 		 * the range or we didn't find anything to exchange, update the
 		 * ondisk file sizes.
 		 */
 		if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
 		    !xmi_has_more_exchange_work(xmi)) {
 			xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
 			xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;

 			xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
 			xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
 		}
 	} else if (xmi_has_postop_work(xmi)) {
 		/*
 		 * Now that we're finished with the exchange operation,
 		 * complete the post-op cleanup work.
 		 */
 		error = xfs_exchmaps_do_postop_work(tp, xmi);
 		if (error)
 			return error;
 	}

 	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
 		return -EIO;

 	/* If we still have work to do, ask for a new transaction. */
 	if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
 		trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
 		return -EAGAIN;
 	}

 	/*
 	 * If we reach here, we've finished all the exchange work and the post
 	 * operation work.  The last thing we need to do before returning to
 	 * the caller is to make sure that COW forks are set up correctly.
 	 */
 	if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
 		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
 		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
 	}

 	return 0;
 }

 /*
  * Compute how many bmbt blocks to reserve for each file.  The worst case is
  * that every exchange fills a hole with a new mapping, so a btree split may
  * be needed each time a new leaf block's worth of mappings has been added.
  */
 static inline uint64_t
 xfs_exchmaps_bmbt_blocks(
 	struct xfs_mount		*mp,
 	const struct xfs_exchmaps_req	*req)
 {
 	uint64_t			nr_splits;

 	/* One potential split per leaf block's worth of new mappings. */
 	nr_splits = howmany_64(req->nr_exchanges,
 			       XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp));

 	return nr_splits *
 			XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
 }

 /*
  * Compute how many blocks to reserve for rmap btree expansion.  No
  * reservation is made when the filesystem has no rmapbt or when file1 is a
  * realtime file.
  */
 static inline uint64_t
 xfs_exchmaps_rmapbt_blocks(
 	struct xfs_mount		*mp,
 	const struct xfs_exchmaps_req	*req)
 {
 	uint64_t			nr_splits;

 	if (!xfs_has_rmapbt(mp) || XFS_IS_REALTIME_INODE(req->ip1))
 		return 0;

 	nr_splits = howmany_64(req->nr_exchanges,
 			       XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp));

 	return nr_splits * XFS_RMAPADD_SPACE_RES(mp);
 }

 /*
  * Estimate the bmbt and rmapbt overhead required to exchange mappings.
  *
  * The worst-case bmbt growth is added to both files' block counts in @req,
  * and the bmbt and rmapbt growth for both files is added to the block
  * reservation in req->resblks.
  *
  * Returns 0 on success, -EFBIG if a file block count would overflow, or
  * -ENOSPC if the total reservation overflows or exceeds UINT_MAX blocks.
  * req->resblks is only updated on success.
  */
 static int
 xfs_exchmaps_estimate_overhead(
 	struct xfs_exchmaps_req		*req)
 {
 	struct xfs_mount		*mp = req->ip1->i_mount;
 	xfs_filblks_t			bmbt_blocks;
 	xfs_filblks_t			rmapbt_blocks;
 	xfs_filblks_t			resblks = req->resblks;

 	/*
 	 * Compute the number of bmbt and rmapbt blocks we might need to handle
 	 * the estimated number of exchanges.
 	 */
 	bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
 	rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);

 	trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);

 	/* Make sure the change in file block count doesn't overflow. */
 	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
 		return -EFBIG;
 	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
 		return -EFBIG;

 	/*
 	 * Add together the number of blocks we need to handle btree growth,
 	 * then add it to the number of blocks we need to reserve to this
 	 * transaction.  Each btree's growth is counted twice, once per file.
 	 */
 	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
 		return -ENOSPC;
 	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
 		return -ENOSPC;
 	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
 		return -ENOSPC;
 	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
 		return -ENOSPC;

 	/*
 	 * Can't actually reserve more than UINT_MAX blocks.  Check the newly
 	 * computed total rather than the caller's starting value, since the
 	 * total is what gets stored in the request below.
 	 */
 	if (resblks > UINT_MAX)
 		return -ENOSPC;

 	req->resblks = resblks;
 	trace_xfs_exchmaps_final_estimate(req);
 	return 0;
 }

 /*
  * Decide if two real mappings can be merged into one: @b2 must start where
  * @b1 ends both logically and physically, both must have the same state, and
  * the combined length must not exceed the maximum bmbt extent length.
  */
 static inline bool
 xmi_can_merge(
 	const struct xfs_bmbt_irec	*b1,
 	const struct xfs_bmbt_irec	*b2)
 {
 	/* Holes never merge with anything. */
 	if (b1->br_startblock == HOLESTARTBLOCK)
 		return false;
 	if (b2->br_startblock == HOLESTARTBLOCK)
 		return false;

 	/* Both mappings must be real extents. */
 	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
 		return false;

 	return b1->br_startoff   + b1->br_blockcount == b2->br_startoff &&
 	       b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
 	       b1->br_state			     == b2->br_state &&
 	       b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN;
 }

 /*
  * Decide if three mappings can be merged into one, which is the case if
  * their combined length fits in a single bmbt extent.  The caller must
  * ensure that none of the three mappings is a hole or a delalloc
  * reservation.
  */
 static inline bool
 xmi_can_merge_all(
 	const struct xfs_bmbt_irec	*l,
 	const struct xfs_bmbt_irec	*m,
 	const struct xfs_bmbt_irec	*r)
 {
 	return l->br_blockcount + m->br_blockcount + r->br_blockcount <=
 			XFS_MAX_BMBT_EXTLEN;
 }

 /*
  * State bits used by xmi_delta_nextents_step().  The C* bits describe the
  * current mapping (contiguous with its left/right neighbour, or a hole); the
  * N* bits describe the new mapping that will replace it.
  */
 #define CLEFT_CONTIG	0x01
 #define CRIGHT_CONTIG	0x02
 #define CHOLE		0x04
 #define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

 #define NLEFT_CONTIG	0x10
 #define NRIGHT_CONTIG	0x20
 #define NHOLE		0x40
 #define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)

 /*
  * Estimate the effect of a single exchange on mapping count.
  *
  * @curr is the mapping being removed from the file and @new the mapping that
  * replaces it; @left and @right are the neighbouring mappings.  Returns the
  * estimated change in the number of mappings in the fork, which may be
  * negative.
  */
 static inline int
 xmi_delta_nextents_step(
 	struct xfs_mount		*mp,
 	const struct xfs_bmbt_irec	*left,
 	const struct xfs_bmbt_irec	*curr,
 	const struct xfs_bmbt_irec	*new,
 	const struct xfs_bmbt_irec	*right)
 {
 	bool				lhole, rhole, chole, nhole;
 	unsigned int			state = 0;
 	int				ret = 0;

 	lhole = left->br_startblock == HOLESTARTBLOCK;
 	rhole = right->br_startblock == HOLESTARTBLOCK;
 	chole = curr->br_startblock == HOLESTARTBLOCK;
 	nhole = new->br_startblock == HOLESTARTBLOCK;

 	/* Work out how curr relates to its neighbours. */
 	if (chole)
 		state |= CHOLE;
 	if (!lhole && !chole && xmi_can_merge(left, curr))
 		state |= CLEFT_CONTIG;
 	if (!rhole && !chole && xmi_can_merge(curr, right))
 		state |= CRIGHT_CONTIG;
 	/* Too long to be one mapping: treat as contiguous on the left only. */
 	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
 	    !xmi_can_merge_all(left, curr, right))
 		state &= ~CRIGHT_CONTIG;

 	/* Work out how new would relate to the same neighbours. */
 	if (nhole)
 		state |= NHOLE;
 	if (!lhole && !nhole && xmi_can_merge(left, new))
 		state |= NLEFT_CONTIG;
 	if (!rhole && !nhole && xmi_can_merge(new, right))
 		state |= NRIGHT_CONTIG;
 	/* Too long to be one mapping: treat as contiguous on the left only. */
 	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
 	    !xmi_can_merge_all(left, new, right))
 		state &= ~NRIGHT_CONTIG;

 	/* Effect of removing curr. */
 	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
 	case CLEFT_CONTIG | CRIGHT_CONTIG:
 		/*
 		 * left/curr/right are the same mapping, so deleting curr
 		 * causes 2 new mappings to be created.
 		 */
 		ret += 2;
 		break;
 	case 0:
 		/*
 		 * curr is not contiguous with any mapping, so we remove curr
 		 * completely
 		 */
 		ret--;
 		break;
 	case CHOLE:
 		/* hole, do nothing */
 		break;
 	case CLEFT_CONTIG:
 	case CRIGHT_CONTIG:
 		/* trim either left or right, no change */
 		break;
 	}

 	/* Effect of adding new. */
 	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
 	case NLEFT_CONTIG | NRIGHT_CONTIG:
 		/*
 		 * left/new/right will become the same mapping, so adding
 		 * new causes the deletion of right.
 		 */
 		ret--;
 		break;
 	case 0:
 		/* new is not contiguous with any mapping */
 		ret++;
 		break;
 	case NHOLE:
 		/* hole, do nothing. */
 		break;
 	case NLEFT_CONTIG:
 	case NRIGHT_CONTIG:
 		/* new is absorbed into left or right, no change */
 		break;
 	}

 	trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
 			state);
 	return ret;
 }

 /*
  * Make sure that adding @delta mappings to the requested fork of @ip would
  * not overflow the extent (mapping) counter.  Negative deltas always pass.
  * Returns 0 if the new count fits, or -EFBIG if it does not.
  */
 static inline int
 xmi_ensure_delta_nextents(
 	struct xfs_exchmaps_req	*req,
 	struct xfs_inode	*ip,
 	int64_t			delta)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	int			whichfork = xfs_exchmaps_reqfork(req);
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	xfs_extnum_t		max_nextents;
 	uint64_t		new_nextents;

 	/* Shrinking the extent count can never overflow it. */
 	if (delta < 0)
 		return 0;

 	/*
 	 * Integer overflow is always an error.  The explicit cast of delta
 	 * avoids warnings about implicit casts coded into the overflow check.
 	 */
 	if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
 				&new_nextents))
 		return -EFBIG;

 	/* Error injection: pretend the limit is only ten mappings. */
 	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
 	    new_nextents > 10)
 		return -EFBIG;

 	/*
 	 * Both inodes are always promoted to large extent counts if the
 	 * superblock feature is enabled, so only the theoretical maximum
 	 * needs to be checked.
 	 */
 	max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
 					     whichfork);

 	return new_nextents > max_nextents ? -EFBIG : 0;
 }

 /* Find the next mapping after irec. */
 static inline int
 xmi_next(
 	struct xfs_inode		*ip,
 	int				bmap_flags,
 	const struct xfs_bmbt_irec	*irec,
 	struct xfs_bmbt_irec		*nrec)
 {
 	xfs_fileoff_t			off;
 	xfs_filblks_t			blockcount;
 	int				nimaps = 1;
 	int				error;

 	off = irec->br_startoff + irec->br_blockcount;
 	blockcount = XFS_MAX_FILEOFF - off;
 	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
 	if (error)
 		return error;
 	if (nrec->br_startblock == DELAYSTARTBLOCK ||
 	    nrec->br_startoff != off) {
 		/*
 		 * If we don't get the mapping we want, return a zero-length
 		 * mapping, which our estimator function will pretend is a hole.
 		 * We shouldn't get delalloc reservations.
 		 */
 		nrec->br_startblock = HOLESTARTBLOCK;
 	}

 	return 0;
 }

 /*
  * Create the slab cache for incore exchange-mapping intent items.
  * Returns 0 on success or -ENOMEM if the cache could not be created.
  */
 int __init
 xfs_exchmaps_intent_init_cache(void)
 {
 	xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
 			sizeof(struct xfs_exchmaps_intent),
 			0, 0, NULL);
 	if (xfs_exchmaps_intent_cache == NULL)
 		return -ENOMEM;

 	return 0;
 }

 /*
  * Destroy the intent item slab cache and reset the global pointer so that
  * no stale cache pointer remains.
  */
 void
 xfs_exchmaps_intent_destroy_cache(void)
 {
 	kmem_cache_destroy(xfs_exchmaps_intent_cache);
 	xfs_exchmaps_intent_cache = NULL;
 }

 /*
  * Decide if the reflink flags should be exchanged between the two files
  * after the operation.  That is only wanted when exactly one of the inodes
  * has the reflink flag set (one bit set in @reflink_state) and the request
  * covers every mapping under EOF in both files, starting at offset zero.
  */
 static inline bool
 xmi_can_exchange_reflink_flags(
 	const struct xfs_exchmaps_req	*req,
 	unsigned int			reflink_state)
 {
 	struct xfs_mount		*mp = req->ip1->i_mount;

 	return hweight32(reflink_state) == 1 &&
 	       req->startoff1 == 0 && req->startoff2 == 0 &&
 	       req->blockcount == XFS_B_TO_FSB(mp, req->ip1->i_disk_size) &&
 	       req->blockcount == XFS_B_TO_FSB(mp, req->ip2->i_disk_size);
 }


 /*
  * Allocate a new incore intent item and fill it out from @req.  Besides
  * copying the request parameters, this decides which post-operation work
  * (shortform conversion of inode2, clearing of reflink flags) and which
  * final file sizes are recorded in the intent.
  */
 struct xfs_exchmaps_intent *
 xfs_exchmaps_init_intent(
 	const struct xfs_exchmaps_req	*req)
 {
 	struct xfs_exchmaps_intent	*xmi;
 	unsigned int			reflink_state = 0;

 	xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
 			GFP_NOFS | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&xmi->xmi_list);

 	/* Copy the parameters of the request. */
 	xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;
 	xmi->xmi_ip1 = req->ip1;
 	xmi->xmi_ip2 = req->ip2;
 	xmi->xmi_startoff1 = req->startoff1;
 	xmi->xmi_startoff2 = req->startoff2;
 	xmi->xmi_blockcount = req->blockcount;

 	/* Negative sizes are ignored when the file sizes are updated. */
 	xmi->xmi_isize1 = -1;
 	xmi->xmi_isize2 = -1;

 	/* Attr fork exchanges only need the shortform conversion step. */
 	if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
 		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
 		return xmi;
 	}

 	/* Each file ends up with the other file's current size. */
 	if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
 		xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
 		xmi->xmi_isize1 = req->ip2->i_disk_size;
 		xmi->xmi_isize2 = req->ip1->i_disk_size;
 	}

 	/* Bit 0 tracks inode1's reflink flag before the op, bit 1 inode2's. */
 	if (xfs_is_reflink_inode(req->ip1))
 		reflink_state |= 1;
 	if (xfs_is_reflink_inode(req->ip2))
 		reflink_state |= 2;

 	/*
 	 * Clearing the flag on the one inode that has it effectively
 	 * exchanges the reflink flags after the operation.
 	 */
 	if (xmi_can_exchange_reflink_flags(req, reflink_state)) {
 		if (reflink_state & 1)
 			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
 		if (reflink_state & 2)
 			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
 	}

 	/* Directories and symlinks may convert back to shortform. */
 	if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
 	    S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
 		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;

 	return xmi;
 }

 /*
  * Estimate the number of exchange operations and the number of file blocks
  * in each file that will be affected by the exchange operation.
  *
  * The exchange is simulated with a temporary intent item, which is freed
  * before returning.  On success, @req has been updated with the number of
  * exchanges, the per-file block counts (realtime or regular), and the block
  * reservation including btree overhead.  Returns 0 on success, -EFBIG if an
  * extent counter or block count would overflow, or another negative errno.
  */
 int
 xfs_exchmaps_estimate(
 	struct xfs_exchmaps_req		*req)
 {
 	struct xfs_exchmaps_intent	*xmi;
 	struct xfs_bmbt_irec		irec1, irec2;
 	struct xfs_exchmaps_adjacent	adj = ADJACENT_INIT;
 	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
 	int64_t				d_nexts1, d_nexts2;
 	int				bmap_flags;
 	int				error;

 	ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));

 	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
 	xmi = xfs_exchmaps_init_intent(req);

 	/*
 	 * To guard against the possibility of overflowing the extent counters,
 	 * we have to estimate an upper bound on the potential increase in that
 	 * counter.  We can split the mapping at each end of the range, and for
 	 * each step of the exchange we can split the mapping that we're
 	 * working on if the mappings do not align.
 	 */
 	d_nexts1 = d_nexts2 = 3;

 	while (xmi_has_more_exchange_work(xmi)) {
 		/*
 		 * Walk through the file ranges until we find something to
 		 * exchange.  Because we're simulating the exchange, pass in
 		 * adj to capture skipped mappings for correct estimation of
 		 * bmbt record merges.
 		 */
 		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
 		if (error)
 			goto out_free;
 		if (!xmi_has_more_exchange_work(xmi))
 			break;

 		/* Update accounting. */
 		if (xfs_bmap_is_real_extent(&irec1))
 			ip1_blocks += irec1.br_blockcount;
 		if (xfs_bmap_is_real_extent(&irec2))
 			ip2_blocks += irec2.br_blockcount;
 		req->nr_exchanges++;

 		/* Read the next mappings from both files. */
 		error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
 		if (error)
 			goto out_free;

 		error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
 		if (error)
 			goto out_free;

 		/* Update extent count deltas. */
 		d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
 				&adj.left1, &irec1, &irec2, &adj.right1);

 		d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
 				&adj.left2, &irec2, &irec1, &adj.right2);

 		/*
 		 * Now pretend we exchanged the mappings: irec1 becomes the
 		 * new left neighbour in file2 and irec2 the new left
 		 * neighbour in file1, merged with the old one if possible.
 		 */
 		if (xmi_can_merge(&adj.left2, &irec1))
 			adj.left2.br_blockcount += irec1.br_blockcount;
 		else
 			memcpy(&adj.left2, &irec1, sizeof(irec1));

 		if (xmi_can_merge(&adj.left1, &irec2))
 			adj.left1.br_blockcount += irec2.br_blockcount;
 		else
 			memcpy(&adj.left1, &irec2, sizeof(irec2));

 		xmi_advance(xmi, &irec1);
 	}

 	/* Account for the blocks that are being exchanged. */
 	if (XFS_IS_REALTIME_INODE(req->ip1) &&
 	    xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
 		req->ip1_rtbcount = ip1_blocks;
 		req->ip2_rtbcount = ip2_blocks;
 	} else {
 		req->ip1_bcount = ip1_blocks;
 		req->ip2_bcount = ip2_blocks;
 	}

 	/*
 	 * Make sure that both forks have enough slack left in their extent
 	 * counters that the exchange operation will not overflow.
 	 */
 	trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
 	if (req->ip1 == req->ip2) {
 		/* Same inode on both sides: both deltas hit one counter. */
 		error = xmi_ensure_delta_nextents(req, req->ip1,
 				d_nexts1 + d_nexts2);
 	} else {
 		error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
 		if (error)
 			goto out_free;
 		error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
 	}
 	if (error)
 		goto out_free;

 	trace_xfs_exchmaps_initial_estimate(req);
 	error = xfs_exchmaps_estimate_overhead(req);
 out_free:
 	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
 	return error;
 }

 /* Turn on the reflink flag of @ip before an operation and log the inode core. */
 static inline void
 xfs_exchmaps_set_reflink(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip)
 {
 	trace_xfs_reflink_set_inode_flag(ip);

 	ip->i_diflags2 = ip->i_diflags2 | XFS_DIFLAG2_REFLINK;

 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }

 /*
  * If either file has shared blocks and we're exchanging data forks, we must
  * flag the other file as having shared blocks so that we get the shared-block
  * rmap functions if we need to fix up the rmaps.
  */
 void
 xfs_exchmaps_ensure_reflink(
 	struct xfs_trans			*tp,
 	const struct xfs_exchmaps_intent	*xmi)
 {
 	unsigned int				rs = 0;

 	if (xfs_is_reflink_inode(xmi->xmi_ip1))
 		rs |= 1;
 	if (xfs_is_reflink_inode(xmi->xmi_ip2))
 		rs |= 2;

 	if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
 		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);

 	if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
 		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
 }

 /*
  * Set the large extent count flag on @ip and log the inode core, unless the
  * inode already has large extent counts.
  */
 static inline void
 xfs_exchmaps_ensure_large_extent_counts(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip)
 {
 	if (!xfs_inode_has_large_extent_counts(ip)) {
 		ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	}
 }

 /*
  * Widen the extent counter fields of both inodes if necessary.  This only
  * happens when the filesystem has the large extent counts feature.
  */
 void
 xfs_exchmaps_upgrade_extent_counts(
 	struct xfs_trans			*tp,
 	const struct xfs_exchmaps_intent	*xmi)
 {
 	if (xfs_has_large_extent_counts(tp->t_mountp)) {
 		xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
 		xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
 	}
 }

 /*
  * Schedule an exchange of a range of mappings between two inodes.
  *
  * The use of file mapping exchange log intent items ensures the operation can
  * be resumed even if the system goes down.  The caller must commit the
  * transaction to start the work.
  *
  * The caller must ensure that both inodes are joined to the transaction and
  * ILOCKd; they will still be joined to the transaction at exit.
  *
  * A request with a zero block count is a no-op.
  */
 void
 xfs_exchange_mappings(
 	struct xfs_trans		*tp,
 	const struct xfs_exchmaps_req	*req)
 {
 	struct xfs_exchmaps_intent	*xmi;

 	/* Internal flags must never overlap with flags written to the log. */
 	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);

 	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
 	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
 	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
 	/* Setting file sizes makes no sense for an attr fork exchange. */
 	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
 		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
 	ASSERT(xfs_has_exchange_range(tp->t_mountp));

 	if (req->blockcount == 0)
 		return;

 	/* Queue the intent, then prepare both inodes for the exchange. */
 	xmi = xfs_exchmaps_init_intent(req);
 	xfs_exchmaps_defer_add(tp, xmi);
 	xfs_exchmaps_ensure_reflink(tp, xmi);
 	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
 }