// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
#include "xfs_errortag.h"
#include "xfs_health.h"
#include "xfs_exchmaps_item.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
#include "xfs_symlink_remote.h"

struct kmem_cache	*xfs_exchmaps_intent_cache;

/* bmbt mappings adjacent to a pair of records. */
struct xfs_exchmaps_adjacent {
	struct xfs_bmbt_irec	left1;
	struct xfs_bmbt_irec	right1;
	struct xfs_bmbt_irec	left2;
	struct xfs_bmbt_irec	right2;
};

#define ADJACENT_INIT { \
	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}

/*
 * If the reflink flag is set on either inode, make sure it has an incore CoW
 * fork, since all reflink inodes must have them.  If there's a CoW fork and it
 * has mappings in it, make sure the inodes are tagged appropriately so that
 * speculative preallocations can be GC'd if we run low on space.
 */
static inline void
xfs_exchmaps_ensure_cowfork(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*cfork;

	if (xfs_is_reflink_inode(ip))
		xfs_ifork_init_cow(ip);

	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (!cfork)
		return;
	if (cfork->if_bytes > 0)
		xfs_inode_set_cowblocks_tag(ip);
	else
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Adjust the on-disk inode size upwards if needed so that we never add
 * mappings into the file past EOF.  This is crucial so that log recovery won't
 * get confused by the sudden appearance of post-eof mappings.
 */
STATIC void
xfs_exchmaps_update_size(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fsize_t		new_isize)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_fsize_t		len;

	if (new_isize < 0)
		return;

	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
		  new_isize);

	if (len <= ip->i_disk_size)
		return;

	trace_xfs_exchmaps_update_inode_size(ip, len);

	ip->i_disk_size = len;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
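
/*
 * Worked example (hypothetical numbers): with 4k blocks, a mapping with
 * br_startoff 10 and br_blockcount 6 ends at block 16, i.e. byte 65536.  If
 * the file's i_disk_size is 40960 and new_isize is 50000, len is clamped to
 * min(65536, 50000) = 50000 and i_disk_size grows to that; if new_isize is
 * -1, the caller did not ask for a size update and we leave the size alone.
 */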

/* Advance the incore state tracking after exchanging a mapping. */
static inline void
xmi_advance(
	struct xfs_exchmaps_intent	*xmi,
	const struct xfs_bmbt_irec	*irec)
{
	xmi->xmi_startoff1 += irec->br_blockcount;
	xmi->xmi_startoff2 += irec->br_blockcount;
	xmi->xmi_blockcount -= irec->br_blockcount;
}

/* Do we still have more mappings to exchange? */
static inline bool
xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_blockcount > 0;
}

/* Do we have post-operation cleanups to perform? */
static inline bool
xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
				 XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
				 __XFS_EXCHMAPS_INO2_SHORTFORM);
}

/* Check all mappings to make sure we can actually exchange them. */
int
xfs_exchmaps_check_forks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_ifork		*ifp1, *ifp2;
	int				whichfork = xfs_exchmaps_reqfork(req);

	/* No fork? */
	ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
	ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
	if (!ifp1 || !ifp2)
		return -EINVAL;

	/* We don't know how to exchange local format forks. */
	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_exchmaps_update_quota(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int64_t			ip1_delta = 0, ip2_delta = 0;
	unsigned int		qflag;

	qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
						      XFS_TRANS_DQ_BCOUNT;

	if (xfs_bmap_is_real_extent(irec1)) {
		ip1_delta -= irec1->br_blockcount;
		ip2_delta += irec1->br_blockcount;
	}

	if (xfs_bmap_is_real_extent(irec2)) {
		ip1_delta += irec2->br_blockcount;
		ip2_delta -= irec2->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
}
#else
# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2)	((void)0)
#endif
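
/*
 * Worked example for the quota update above (hypothetical numbers): if irec1
 * is a written 8-block mapping and irec2 is a hole, only the first branch
 * fires, so ip1's block count is charged -8 and ip2's +8.  If both mappings
 * are real and the same length, the deltas cancel to zero and the dquot
 * counters are unchanged, which is what we want for a pure exchange.
 */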

/* Decide if we want to skip this mapping from file1. */
static inline bool
xfs_exchmaps_can_skip_mapping(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec)
{
	struct xfs_mount	*mp = xmi->xmi_ip1->i_mount;

	/* Do not skip this mapping if the caller did not tell us to. */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
		return false;

	/* Do not skip mapped, written mappings. */
	if (xfs_bmap_is_written_extent(irec))
		return false;

	/*
	 * The mapping is unwritten or a hole.  It cannot be a delalloc
	 * reservation because we already excluded those.  It cannot be an
	 * unwritten extent with dirty page cache because we flushed the page
	 * cache.  For files where the allocation unit is 1FSB (files on the
	 * data dev, rt files if the extent size is 1FSB), we can safely
	 * skip this mapping.
	 */
	if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
		return true;

	/*
	 * For a realtime file with a multi-fsb allocation unit, the decision
	 * is trickier because we can only swap full allocation units.
	 * Unwritten mappings can appear in the middle of an rtx if the rtx is
	 * partially written, but they can also appear for preallocations.
	 *
	 * If the mapping is a hole, skip it entirely.  Holes should align with
	 * rtx boundaries.
	 */
	if (!xfs_bmap_is_real_extent(irec))
		return true;

	/*
	 * All mappings below this point are unwritten.
	 *
	 * - If the beginning is not aligned to an rtx, trim the end of the
	 *   mapping so that it does not cross an rtx boundary, and swap it.
	 *
	 * - If both ends are aligned to an rtx, skip the entire mapping.
	 */
	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
		xfs_fileoff_t	new_end;

		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
		irec->br_blockcount = min(irec->br_blockcount,
					  new_end - irec->br_startoff);
		return false;
	}
	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
		return true;

	/*
	 * All mappings below this point are unwritten, start on an rtx
	 * boundary, and do not end on an rtx boundary.
	 *
	 * - If the mapping is longer than one rtx, trim the end of the mapping
	 *   down to an rtx boundary and skip it.
	 *
	 * - The mapping is shorter than one rtx.  Swap it.
	 */
	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
		xfs_fileoff_t	new_end;

		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
				       mp->m_sb.sb_rextsize);
		irec->br_blockcount = new_end - irec->br_startoff;
		return true;
	}

	return false;
}
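
/*
 * Worked example of the rtx trimming above (hypothetical numbers, with
 * sb_rextsize = 4): an unwritten mapping at br_startoff 6 with
 * br_blockcount 10 does not start on an rtx boundary, so it is trimmed to
 * end at the next boundary (blocks 6-7) and exchanged.  The walk then
 * revisits the remainder at block 8, which is rtx-aligned; the 8 remaining
 * blocks form two whole rtxes and are skipped outright, whereas a 2-block
 * tail shorter than one rtx would be exchanged.
 */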

/*
 * Walk forward through the file ranges in @xmi until we find two different
 * mappings to exchange.  If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and xmi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them as
 * the left records in @adj (if provided) so that the simulation phase can
 * avoid an extra lookup.
 */
static int
xfs_exchmaps_find_mappings(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2,
	struct xfs_exchmaps_adjacent	*adj)
{
	int				nimaps;
	int				bmap_flags;
	int				error;

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));

	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
		/* Read mapping from the first file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
				       xmi->xmi_blockcount, irec1, &nimaps,
				       bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec1->br_startblock == DELAYSTARTBLOCK ||
		    irec1->br_startoff != xmi->xmi_startoff1) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
			continue;
		}

		/* Read mapping from the second file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
				       irec1->br_blockcount, irec2, &nimaps,
				       bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec2->br_startblock == DELAYSTARTBLOCK ||
		    irec2->br_startoff != xmi->xmi_startoff2) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		/*
		 * We can only exchange as many blocks as the smaller of the
		 * two mappings.
		 */
		irec1->br_blockcount = min(irec1->br_blockcount,
					   irec2->br_blockcount);

		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);

		/* We found something to exchange, so return it. */
		if (irec1->br_startblock != irec2->br_startblock)
			return 0;

		/*
		 * Two mappings pointing to the same physical block must not
		 * have different states; that's filesystem corruption.  Move
		 * on to the next mapping if they're both holes or both point
		 * to the same physical space extent.
		 */
		if (irec1->br_state != irec2->br_state) {
			xfs_bmap_mark_sick(xmi->xmi_ip1,
					   xfs_exchmaps_whichfork(xmi));
			xfs_bmap_mark_sick(xmi->xmi_ip2,
					   xfs_exchmaps_whichfork(xmi));
			return -EFSCORRUPTED;
		}

		/*
		 * Save the mappings if we're estimating work and skipping
		 * these identical mappings.
		 */
		if (adj) {
			memcpy(&adj->left1, irec1, sizeof(*irec1));
			memcpy(&adj->left2, irec2, sizeof(*irec2));
		}
	}

	return 0;
}

/* Exchange these two mappings. */
static void
xfs_exchmaps_one_step(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int				whichfork = xfs_exchmaps_whichfork(xmi);

	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);

	/* Remove both mappings. */
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);

	/*
	 * Re-add both mappings.  We exchange the file offsets between the two
	 * maps and add the opposite map, which has the effect of filling the
	 * logical offsets we just unmapped, but with the physical mapping
	 * information exchanged.
	 */
	swap(irec1->br_startoff, irec2->br_startoff);
	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);

	/* Make sure we're not adding mappings past EOF. */
	if (whichfork == XFS_DATA_FORK) {
		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
					 xmi->xmi_isize1);
		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
					 xmi->xmi_isize2);
	}

	/*
	 * Advance our cursor and exit.  The caller (either defer ops or log
	 * recovery) will log the XMD item, and if *blockcount is nonzero, it
	 * will log a new XMI item for the remainder and call us back.
	 */
	xmi_advance(xmi, irec1);
}
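
/*
 * A sketch of the offset swap above with hypothetical values: if irec1 is
 * (startoff 0, startblock A) in file1 and irec2 is (startoff 100,
 * startblock B) in file2, then after swap() irec1 is (100, A) and irec2 is
 * (0, B).  Mapping irec2 into file1 puts B at file1 offset 0, and mapping
 * irec1 into file2 puts A at file2 offset 100 -- the same logical ranges we
 * just unmapped, now pointing at each other's physical space.
 */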

/* Convert inode2's leaf attr fork back to shortform, if possible. */
STATIC int
xfs_exchmaps_attr_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_attr_geo,
		.whichfork	= XFS_ATTR_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_buf		*bp;
	int			forkoff;
	int			error;

	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
		return 0;

	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
				    &bp);
	if (error)
		return error;

	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
	if (forkoff == 0)
		return 0;

	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}

/* Convert inode2's block dir fork back to shortform, if possible. */
STATIC int
xfs_exchmaps_dir_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_dir_geo,
		.whichfork	= XFS_DATA_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_dir2_sf_hdr	sfh;
	struct xfs_buf		*bp;
	int			size;
	int			error = 0;

	if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
		return error;

	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino,
				    &bp);
	if (error)
		return error;

	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
		return 0;

	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}

/* Convert inode2's remote symlink target back to shortform, if possible. */
STATIC int
xfs_exchmaps_link_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_inode	*ip = xmi->xmi_ip2;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	char			*buf;
	int			error;

	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
		return 0;

	/* Read the current symlink target into a buffer. */
	buf = kmalloc(ip->i_disk_size + 1, GFP_KERNEL | __GFP_NOLOCKDEP);
	if (!buf) {
		ASSERT(0);
		return -ENOMEM;
	}

	error = xfs_symlink_remote_read(ip, buf);
	if (error)
		goto free;

	/* Remove the blocks. */
	error = xfs_symlink_remote_truncate(tp, ip);
	if (error)
		goto free;

	/* Convert fork to local format and log our changes. */
	xfs_idestroy_fork(ifp);
	ifp->if_bytes = 0;
	ifp->if_format = XFS_DINODE_FMT_LOCAL;
	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
free:
	kfree(buf);
	return error;
}

/* Clear the reflink flag after an exchange. */
static inline void
xfs_exchmaps_clear_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_unset_inode_flag(ip);

	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Finish whatever work might come after an exchange operation. */
static int
xfs_exchmaps_do_postop_work(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
		int	error = 0;

		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
			error = xfs_exchmaps_attr_to_sf(tp, xmi);
		else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_dir_to_sf(tp, xmi);
		else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_link_to_sf(tp, xmi);
		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
		if (error)
			return error;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	return 0;
}

/* Finish one step in a mapping exchange operation, possibly relogging. */
int
xfs_exchmaps_finish_one(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_bmbt_irec		irec1, irec2;
	int				error;

	if (xmi_has_more_exchange_work(xmi)) {
		/*
		 * If the operation state says that some range of the files
		 * have not yet been exchanged, look for mappings in that range
		 * to exchange.  If we find some mappings, exchange them.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
		if (error)
			return error;

		if (xmi_has_more_exchange_work(xmi))
			xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);

		/*
		 * If the caller asked us to exchange the file sizes after the
		 * exchange and either we just exchanged the last mappings in
		 * the range or we didn't find anything to exchange, update the
		 * ondisk file sizes.
		 */
		if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
		    !xmi_has_more_exchange_work(xmi)) {
			xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
			xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;

			xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
			xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
		}
	} else if (xmi_has_postop_work(xmi)) {
		/*
		 * Now that we're finished with the exchange operation,
		 * complete the post-op cleanup work.
		 */
		error = xfs_exchmaps_do_postop_work(tp, xmi);
		if (error)
			return error;
	}

	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
		return -EIO;

	/* If we still have work to do, ask for a new transaction. */
	if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
		trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
		return -EAGAIN;
	}

	/*
	 * If we reach here, we've finished all the exchange work and the post
	 * operation work.  The last thing we need to do before returning to
	 * the caller is to make sure that COW forks are set up correctly.
	 */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
	}

	return 0;
}
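
/*
 * A minimal sketch of the relogging contract above, assuming a hypothetical
 * caller drove this directly instead of going through deferred ops:
 *
 *	do {
 *		error = xfs_exchmaps_finish_one(tp, xmi);
 *		if (error == -EAGAIN)
 *			error = xfs_trans_roll(&tp);
 *	} while (error == -EAGAIN);
 *
 * In practice the defer ops machinery performs the equivalent loop (and
 * re-joins the inodes after each roll), logging an XMD item for the finished
 * step and a new XMI item for the remaining work each time we return -EAGAIN.
 */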

/*
 * Compute the amount of bmbt blocks we should reserve for each file.  In the
 * worst case, each exchange will fill a hole with a new mapping, which could
 * result in a btree split every time we add a new leaf block.
 */
static inline uint64_t
xfs_exchmaps_bmbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	return howmany_64(req->nr_exchanges,
			  XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
			XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
}
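
/*
 * Worked example (hypothetical geometry): with 1000 estimated exchanges and
 * 250 contiguous bmbt records per block, howmany_64() yields 4 potential new
 * leaf blocks; multiplying by XFS_EXTENTADD_SPACE_RES() then reserves enough
 * space for a full-height btree split each time one of those leaves is
 * added.
 */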

/* Compute the space we should reserve for the rmap btree expansions. */
static inline uint64_t
xfs_exchmaps_rmapbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	if (!xfs_has_rmapbt(mp))
		return 0;
	if (XFS_IS_REALTIME_INODE(req->ip1))
		return 0;

	return howmany_64(req->nr_exchanges,
			  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
			XFS_RMAPADD_SPACE_RES(mp);
}

/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
int
xfs_exchmaps_estimate_overhead(
	struct xfs_exchmaps_req	*req)
{
	struct xfs_mount	*mp = req->ip1->i_mount;
	xfs_filblks_t		bmbt_blocks;
	xfs_filblks_t		rmapbt_blocks;
	xfs_filblks_t		resblks = req->resblks;

	/*
	 * Compute the number of bmbt and rmapbt blocks we might need to handle
	 * the estimated number of exchanges.
	 */
	bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
	rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);

	trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);

	/* Make sure the change in file block count doesn't overflow. */
	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
		return -EFBIG;
	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
		return -EFBIG;

	/*
	 * Add together the number of blocks we need to handle btree growth
	 * (the bmbt can grow in both files, so it is counted once per file),
	 * then add it to the number of blocks we need to reserve to this
	 * transaction.
	 */
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;

	/* Can't actually reserve more than UINT_MAX blocks. */
	if (resblks > UINT_MAX)
		return -ENOSPC;

	req->resblks = resblks;
	trace_xfs_exchmaps_final_estimate(req);
	return 0;
}

/* Decide if we can merge two real mappings. */
static inline bool
xmi_can_merge(
	const struct xfs_bmbt_irec	*b1,
	const struct xfs_bmbt_irec	*b2)
{
	/* Don't merge holes. */
	if (b1->br_startblock == HOLESTARTBLOCK ||
	    b2->br_startblock == HOLESTARTBLOCK)
		return false;

	/* Don't merge delalloc reservations or other unreal mappings. */
	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
		return false;

	if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
	    b1->br_state == b2->br_state &&
	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		return true;

	return false;
}
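
/*
 * Example (hypothetical values): (startoff 0, startblock 100, blockcount 8)
 * and (startoff 8, startblock 108, blockcount 8) in the same state merge into
 * one 16-block record; the same pair with a gap in either the logical or
 * physical range, mismatched unwritten state, or a combined length over
 * XFS_MAX_BMBT_EXTLEN does not.
 */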

/*
 * Decide if we can merge three mappings.  The caller must ensure that none of
 * the three mappings is a hole or a delalloc reservation.
 */
static inline bool
xmi_can_merge_all(
	const struct xfs_bmbt_irec	*l,
	const struct xfs_bmbt_irec	*m,
	const struct xfs_bmbt_irec	*r)
{
	xfs_filblks_t	new_len;

	new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
	return new_len <= XFS_MAX_BMBT_EXTLEN;
}

#define CLEFT_CONTIG	0x01
#define CRIGHT_CONTIG	0x02
#define CHOLE		0x04
#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

#define NLEFT_CONTIG	0x10
#define NRIGHT_CONTIG	0x20
#define NHOLE		0x40
#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)

/* Estimate the effect of a single exchange on mapping count. */
static inline int
xmi_delta_nextents_step(
	struct xfs_mount		*mp,
	const struct xfs_bmbt_irec	*left,
	const struct xfs_bmbt_irec	*curr,
	const struct xfs_bmbt_irec	*new,
	const struct xfs_bmbt_irec	*right)
{
	bool			lhole, rhole, chole, nhole;
	unsigned int		state = 0;
	int			ret = 0;

	lhole = left->br_startblock == HOLESTARTBLOCK;
	rhole = right->br_startblock == HOLESTARTBLOCK;
	chole = curr->br_startblock == HOLESTARTBLOCK;
	nhole = new->br_startblock == HOLESTARTBLOCK;

	if (chole)
		state |= CHOLE;
	if (!lhole && !chole && xmi_can_merge(left, curr))
		state |= CLEFT_CONTIG;
	if (!rhole && !chole && xmi_can_merge(curr, right))
		state |= CRIGHT_CONTIG;
	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
	    !xmi_can_merge_all(left, curr, right))
		state &= ~CRIGHT_CONTIG;

	if (nhole)
		state |= NHOLE;
	if (!lhole && !nhole && xmi_can_merge(left, new))
		state |= NLEFT_CONTIG;
	if (!rhole && !nhole && xmi_can_merge(new, right))
		state |= NRIGHT_CONTIG;
	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
	    !xmi_can_merge_all(left, new, right))
		state &= ~NRIGHT_CONTIG;

	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
	case CLEFT_CONTIG | CRIGHT_CONTIG:
		/*
		 * left/curr/right are the same mapping, so deleting curr
		 * causes 2 new mappings to be created.
		 */
		ret += 2;
		break;
	case 0:
		/*
		 * curr is not contiguous with any mapping, so we remove curr
		 * completely
		 */
		ret--;
		break;
	case CHOLE:
		/* hole, do nothing */
		break;
	case CLEFT_CONTIG:
	case CRIGHT_CONTIG:
		/* trim either left or right, no change */
		break;
	}

	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
	case NLEFT_CONTIG | NRIGHT_CONTIG:
		/*
		 * left/curr/right will become the same mapping, so adding
		 * curr causes the deletion of right.
		 */
		ret--;
		break;
	case 0:
		/* new is not contiguous with any mapping */
		ret++;
		break;
	case NHOLE:
		/* hole, do nothing. */
		break;
	case NLEFT_CONTIG:
	case NRIGHT_CONTIG:
		/* new is absorbed into left or right, no change */
		break;
	}

	trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
					       state);
	return ret;
}
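
/*
 * Worked example: suppose curr is a written mapping whose left and right
 * neighbors are both contiguous with it (CLEFT_CONTIG | CRIGHT_CONTIG) and
 * the incoming new mapping is a hole (NHOLE).  Removing curr splits one
 * incore record into two (ret += 2) and mapping in the hole changes nothing,
 * so the step costs two extra records.  Conversely, if curr is a hole and
 * new bridges both neighbors (NLEFT_CONTIG | NRIGHT_CONTIG), ret is -1
 * because left, new, and right collapse into a single record.
 */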

/* Make sure we don't overflow the extent (mapping) counters. */
static inline int
xmi_ensure_delta_nextents(
	struct xfs_exchmaps_req	*req,
	struct xfs_inode	*ip,
	int64_t			delta)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			whichfork = xfs_exchmaps_reqfork(req);
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	uint64_t		new_nextents;
	xfs_extnum_t		max_nextents;

	if (delta < 0)
		return 0;

	/*
	 * It's always an error if the delta causes integer overflow.  delta
	 * needs an explicit cast here to avoid warnings about implicit casts
	 * coded into the overflow check.
	 */
	if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
			       &new_nextents))
		return -EFBIG;

	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
	    new_nextents > 10)
		return -EFBIG;

	/*
	 * We always promote both inodes to have large extent counts if the
	 * superblock feature is enabled, so we only need to check against the
	 * theoretical maximum.
	 */
	max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
					     whichfork);
	if (new_nextents > max_nextents)
		return -EFBIG;

	return 0;
}

/* Find the next mapping after irec. */
static inline int
xmi_next(
	struct xfs_inode		*ip,
	int				bmap_flags,
	const struct xfs_bmbt_irec	*irec,
	struct xfs_bmbt_irec		*nrec)
{
	xfs_fileoff_t	off;
	xfs_filblks_t	blockcount;
	int		nimaps = 1;
	int		error;

	off = irec->br_startoff + irec->br_blockcount;
	blockcount = XFS_MAX_FILEOFF - off;
	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
	if (error)
		return error;
	if (nrec->br_startblock == DELAYSTARTBLOCK ||
	    nrec->br_startoff != off) {
		/*
		 * If we don't get the mapping we want, return a zero-length
		 * mapping, which our estimator function will pretend is a
		 * hole.  We shouldn't get delalloc reservations.
		 */
		nrec->br_startblock = HOLESTARTBLOCK;
	}

	return 0;
}

int __init
xfs_exchmaps_intent_init_cache(void)
{
	xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
			sizeof(struct xfs_exchmaps_intent),
			0, 0, NULL);

	return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_exchmaps_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_exchmaps_intent_cache);
	xfs_exchmaps_intent_cache = NULL;
}

/*
 * Decide if we will exchange the reflink flags between the two files after
 * the exchange.  The only time we want to do this is if we're exchanging all
 * mappings under EOF and the inode reflink flags have different states.
 */
static inline bool
xmi_can_exchange_reflink_flags(
	const struct xfs_exchmaps_req	*req,
	unsigned int			reflink_state)
{
	struct xfs_mount	*mp = req->ip1->i_mount;

	if (hweight32(reflink_state) != 1)
		return false;
	if (req->startoff1 != 0 || req->startoff2 != 0)
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
		return false;
	return true;
}
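
/*
 * Example: reflink_state is the two-bit mask built by the caller (bit 0 for
 * ip1, bit 1 for ip2), so hweight32() == 1 means exactly one of the files is
 * reflinked (a state of 1 or 2); a state of 0 or 3 means the flags already
 * agree and there is nothing to exchange.
 */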

/* Allocate and initialize a new incore intent item from a request. */
struct xfs_exchmaps_intent *
xfs_exchmaps_init_intent(
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;
	unsigned int			rs = 0;

	xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
				GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&xmi->xmi_list);
	xmi->xmi_ip1 = req->ip1;
	xmi->xmi_ip2 = req->ip2;
	xmi->xmi_startoff1 = req->startoff1;
	xmi->xmi_startoff2 = req->startoff2;
	xmi->xmi_blockcount = req->blockcount;
	xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
	xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;

	if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
		return xmi;
	}

	if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
		xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
		xmi->xmi_isize1 = req->ip2->i_disk_size;
		xmi->xmi_isize2 = req->ip1->i_disk_size;
	}

	/* Record the state of each inode's reflink flag before the op. */
	if (xfs_is_reflink_inode(req->ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(req->ip2))
		rs |= 2;

	/*
	 * Figure out if we're clearing the reflink flags (which effectively
	 * exchanges them) after the operation.
	 */
	if (xmi_can_exchange_reflink_flags(req, rs)) {
		if (rs & 1)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
		if (rs & 2)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
	    S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;

	return xmi;
}

/*
 * Estimate the number of exchange operations and the number of file blocks
 * in each file that will be affected by the exchange operation.
 */
int
xfs_exchmaps_estimate(
	struct xfs_exchmaps_req		*req)
{
	struct xfs_exchmaps_intent	*xmi;
	struct xfs_bmbt_irec		irec1, irec2;
	struct xfs_exchmaps_adjacent	adj = ADJACENT_INIT;
	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
	int64_t				d_nexts1, d_nexts2;
	int				bmap_flags;
	int				error;

	ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
	xmi = xfs_exchmaps_init_intent(req);

	/*
	 * To guard against the possibility of overflowing the extent counters,
	 * we have to estimate an upper bound on the potential increase in that
	 * counter.  We can split the mapping at each end of the range, and for
	 * each step of the exchange we can split the mapping that we're
	 * working on if the mappings do not align.
	 */
	d_nexts1 = d_nexts2 = 3;

	while (xmi_has_more_exchange_work(xmi)) {
		/*
		 * Walk through the file ranges until we find something to
		 * exchange.  Because we're simulating the exchange, pass in
		 * adj to capture skipped mappings for correct estimation of
		 * bmbt record merges.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
		if (error)
			goto out_free;
		if (!xmi_has_more_exchange_work(xmi))
			break;

		/* Update accounting. */
		if (xfs_bmap_is_real_extent(&irec1))
			ip1_blocks += irec1.br_blockcount;
		if (xfs_bmap_is_real_extent(&irec2))
			ip2_blocks += irec2.br_blockcount;
		req->nr_exchanges++;

		/* Read the next mappings from both files. */
		error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
		if (error)
			goto out_free;

		error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
		if (error)
			goto out_free;

		/* Update extent count deltas. */
		d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left1, &irec1, &irec2, &adj.right1);

		d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left2, &irec2, &irec1, &adj.right2);

		/* Now pretend we exchanged the mappings. */
		if (xmi_can_merge(&adj.left2, &irec1))
			adj.left2.br_blockcount += irec1.br_blockcount;
		else
			memcpy(&adj.left2, &irec1, sizeof(irec1));

		if (xmi_can_merge(&adj.left1, &irec2))
			adj.left1.br_blockcount += irec2.br_blockcount;
		else
			memcpy(&adj.left1, &irec2, sizeof(irec2));

		xmi_advance(xmi, &irec1);
	}

	/* Account for the blocks that are being exchanged. */
	if (XFS_IS_REALTIME_INODE(req->ip1) &&
	    xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
		req->ip1_rtbcount = ip1_blocks;
		req->ip2_rtbcount = ip2_blocks;
	} else {
		req->ip1_bcount = ip1_blocks;
		req->ip2_bcount = ip2_blocks;
	}

	/*
	 * Make sure that both forks have enough slack left in their extent
	 * counters that the exchange operation will not overflow.
	 */
	trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
	if (req->ip1 == req->ip2) {
		error = xmi_ensure_delta_nextents(req, req->ip1,
						  d_nexts1 + d_nexts2);
	} else {
		error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
		if (error)
			goto out_free;
		error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
	}
	if (error)
		goto out_free;

	trace_xfs_exchmaps_initial_estimate(req);
	error = xfs_exchmaps_estimate_overhead(req);
out_free:
	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
	return error;
}

/* Set the reflink flag before an operation. */
static inline void
xfs_exchmaps_set_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_set_inode_flag(ip);

	ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/*
 * If either file has shared blocks and we're exchanging data forks, we must
 * flag the other file as having shared blocks so that we get the shared-block
 * rmap functions if we need to fix up the rmaps.
 */
void
xfs_exchmaps_ensure_reflink(
	struct xfs_trans			*tp,
	const struct xfs_exchmaps_intent	*xmi)
{
	unsigned int		rs = 0;

	if (xfs_is_reflink_inode(xmi->xmi_ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(xmi->xmi_ip2))
		rs |= 2;

	if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);

	if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
}

/* Set the large extent count flag before an operation if needed. */
static inline void
xfs_exchmaps_ensure_large_extent_counts(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	if (xfs_inode_has_large_extent_counts(ip))
		return;

	ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Widen the extent counter fields of both inodes if necessary. */
void
xfs_exchmaps_upgrade_extent_counts(
	struct xfs_trans			*tp,
	const struct xfs_exchmaps_intent	*xmi)
{
	if (!xfs_has_large_extent_counts(tp->t_mountp))
		return;

	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
}

/*
 * Schedule the exchange of a range of mappings from one inode to another.
 *
 * The use of file mapping exchange log intent items ensures the operation can
 * be resumed even if the system goes down.  The caller must commit the
 * transaction to start the work.
 *
 * The caller must ensure the inodes are joined to the transaction and ILOCKd;
 * they will still be joined to the transaction at exit.
 */
void
xfs_exchange_mappings(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;

	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);

	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
	ASSERT(xfs_has_exchange_range(tp->t_mountp));

	if (req->blockcount == 0)
		return;

	xmi = xfs_exchmaps_init_intent(req);
	xfs_exchmaps_defer_add(tp, xmi);
	xfs_exchmaps_ensure_reflink(tp, xmi);
	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
}
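
/*
 * A rough usage sketch, with hypothetical surrounding code (count_fsb is an
 * assumed caller variable; real callers such as the exchange-range path also
 * handle quota reservation, dqattach, and freeze protection):
 *
 *	struct xfs_exchmaps_req	req = {
 *		.ip1		= ip1,
 *		.ip2		= ip2,
 *		.startoff1	= 0,
 *		.startoff2	= 0,
 *		.blockcount	= count_fsb,
 *	};
 *
 *	error = xfs_exchmaps_estimate(&req);	(sets req.resblks)
 *	if (error)
 *		return error;
 *	(allocate a transaction reserving req.resblks blocks, then ILOCK
 *	 and join both inodes)
 *	xfs_exchange_mappings(tp, &req);
 *	error = xfs_trans_commit(tp);
 */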