| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (c) 2021-2024 Oracle. All Rights Reserved. |
| * Author: Darrick J. Wong <djwong@kernel.org> |
| */ |
| #include "xfs.h" |
| #include "xfs_fs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_log_format.h" |
| #include "xfs_trans.h" |
| #include "xfs_inode.h" |
| #include "xfs_ialloc.h" |
| #include "xfs_quota.h" |
| #include "xfs_bmap.h" |
| #include "xfs_bmap_btree.h" |
| #include "xfs_trans_space.h" |
| #include "xfs_dir2.h" |
| #include "xfs_exchrange.h" |
| #include "xfs_exchmaps.h" |
| #include "xfs_defer.h" |
| #include "scrub/scrub.h" |
| #include "scrub/common.h" |
| #include "scrub/repair.h" |
| #include "scrub/trace.h" |
| #include "scrub/tempfile.h" |
| #include "scrub/tempexch.h" |
| #include "scrub/xfile.h" |
| |
| /* |
| * Create a temporary file for reconstructing metadata, with the intention of |
| * atomically exchanging the temporary file's contents with the file that's |
| * being repaired. |
| */ |
| int |
| xrep_tempfile_create( |
| struct xfs_scrub *sc, |
| uint16_t mode) |
| { |
| struct xfs_mount *mp = sc->mp; |
| struct xfs_trans *tp = NULL; |
| struct xfs_dquot *udqp = NULL; |
| struct xfs_dquot *gdqp = NULL; |
| struct xfs_dquot *pdqp = NULL; |
| struct xfs_trans_res *tres; |
| struct xfs_inode *dp = mp->m_rootip; |
| xfs_ino_t ino; |
| unsigned int resblks; |
| bool is_dir = S_ISDIR(mode); |
| int error; |
| |
| if (xfs_is_shutdown(mp)) |
| return -EIO; |
| if (xfs_is_readonly(mp)) |
| return -EROFS; |
| |
| ASSERT(sc->tp == NULL); |
| ASSERT(sc->tempip == NULL); |
| |
| /* |
| * Make sure that we have allocated dquot(s) on disk. The temporary |
| * inode should be completely root owned so that we don't fail due to |
| * quota limits. |
| */ |
| error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, |
| XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); |
| if (error) |
| return error; |
| |
| if (is_dir) { |
| resblks = XFS_MKDIR_SPACE_RES(mp, 0); |
| tres = &M_RES(mp)->tr_mkdir; |
| } else { |
| resblks = XFS_IALLOC_SPACE_RES(mp); |
| tres = &M_RES(mp)->tr_create_tmpfile; |
| } |
| |
| error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, |
| &tp); |
| if (error) |
| goto out_release_dquots; |
| |
| /* Allocate inode, set up directory. */ |
| error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); |
| if (error) |
| goto out_trans_cancel; |
| error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0, |
| 0, false, &sc->tempip); |
| if (error) |
| goto out_trans_cancel; |
| |
| /* Change the ownership of the inode to root. */ |
| VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID; |
| VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID; |
| sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); |
| xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); |
| |
| /* |
| * Mark our temporary file as private so that LSMs and the ACL code |
| * don't try to add their own metadata or reason about these files. |
| * The file should never be exposed to userspace. |
| */ |
| VFS_I(sc->tempip)->i_flags |= S_PRIVATE; |
| VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; |
| |
| if (is_dir) { |
| error = xfs_dir_init(tp, sc->tempip, dp); |
| if (error) |
| goto out_trans_cancel; |
| } |
| |
| /* |
| * Attach the dquot(s) to the inodes and modify them incore. |
| * The inode's ids cannot have changed since the new inode has |
| * been locked ever since it was created. |
| */ |
| xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); |
| |
| /* |
| * Put our temp file on the unlinked list so it's purged automatically. |
| * All file-based metadata being reconstructed using this file must be |
| * atomically exchanged with the original file because the contents |
| * here will be purged when the inode is dropped or log recovery cleans |
| * out the unlinked list. |
| */ |
| error = xfs_iunlink(tp, sc->tempip); |
| if (error) |
| goto out_trans_cancel; |
| |
| error = xfs_trans_commit(tp); |
| if (error) |
| goto out_release_inode; |
| |
| trace_xrep_tempfile_create(sc); |
| |
| xfs_qm_dqrele(udqp); |
| xfs_qm_dqrele(gdqp); |
| xfs_qm_dqrele(pdqp); |
| |
| /* Finish setting up the incore / vfs context. */ |
| xfs_setup_iops(sc->tempip); |
| xfs_finish_inode_setup(sc->tempip); |
| |
| sc->temp_ilock_flags = 0; |
| return 0; |
| |
| out_trans_cancel: |
| xfs_trans_cancel(tp); |
| out_release_inode: |
| /* |
| * Wait until after the current transaction is aborted to finish the |
| * setup of the inode and release the inode. This prevents recursive |
| * transactions and deadlocks from xfs_inactive. |
| */ |
| if (sc->tempip) { |
| xfs_finish_inode_setup(sc->tempip); |
| xchk_irele(sc, sc->tempip); |
| } |
| out_release_dquots: |
| xfs_qm_dqrele(udqp); |
| xfs_qm_dqrele(gdqp); |
| xfs_qm_dqrele(pdqp); |
| |
| return error; |
| } |
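| |
| /* |
| * Illustrative sketch only: a repair function might drive the tempfile |
| * helpers in this file roughly as follows. Transaction setup, error |
| * handling, and the actual rebuild steps are omitted, and the names and |
| * ordering below are assumptions rather than a copy of any particular |
| * repairer: |
| * |
| *	error = xrep_tempfile_create(sc, S_IFREG); |
| *	xrep_tempfile_ilock(sc); |
| *	error = xrep_tempfile_prealloc(sc, 0, len); |
| *	error = xrep_tempfile_copyin(sc, 0, len, fill_fn, fill_data); |
| *	... exchange the rebuilt contents with the file being repaired ... |
| *	xrep_tempfile_rele(sc); |
| */ |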
| |
| /* Take IOLOCK_EXCL on the temporary file, maybe. */ |
| bool |
| xrep_tempfile_iolock_nowait( |
| struct xfs_scrub *sc) |
| { |
| if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { |
| sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Take the temporary file's IOLOCK while holding a different inode's IOLOCK. |
| * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock |
| * to avoid deadlocks and lockdep complaints. |
| */ |
| int |
| xrep_tempfile_iolock_polled( |
| struct xfs_scrub *sc) |
| { |
| int error = 0; |
| |
| while (!xrep_tempfile_iolock_nowait(sc)) { |
| if (xchk_should_terminate(sc, &error)) |
| return error; |
| delay(1); |
| } |
| |
| return 0; |
| } |
| |
| /* Release IOLOCK_EXCL on the temporary file. */ |
| void |
| xrep_tempfile_iounlock( |
| struct xfs_scrub *sc) |
| { |
| xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); |
| sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; |
| } |
| |
| /* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ |
| void |
| xrep_tempfile_ilock( |
| struct xfs_scrub *sc) |
| { |
| sc->temp_ilock_flags |= XFS_ILOCK_EXCL; |
| xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); |
| } |
| |
| /* Try to grab ILOCK_EXCL on the temporary file. */ |
| bool |
| xrep_tempfile_ilock_nowait( |
| struct xfs_scrub *sc) |
| { |
| if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { |
| sc->temp_ilock_flags |= XFS_ILOCK_EXCL; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* Unlock ILOCK_EXCL on the temporary file after an update. */ |
| void |
| xrep_tempfile_iunlock( |
| struct xfs_scrub *sc) |
| { |
| xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); |
| sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; |
| } |
| |
| /* Release the temporary file. */ |
| void |
| xrep_tempfile_rele( |
| struct xfs_scrub *sc) |
| { |
| if (!sc->tempip) |
| return; |
| |
| if (sc->temp_ilock_flags) { |
| xfs_iunlock(sc->tempip, sc->temp_ilock_flags); |
| sc->temp_ilock_flags = 0; |
| } |
| |
| xchk_irele(sc, sc->tempip); |
| sc->tempip = NULL; |
| } |
| |
| /* |
| * Make sure that the given range of the data fork of the temporary file is |
| * mapped to written blocks. The caller must ensure that both inodes are |
| * joined to the transaction. |
| */ |
| int |
| xrep_tempfile_prealloc( |
| struct xfs_scrub *sc, |
| xfs_fileoff_t off, |
| xfs_filblks_t len) |
| { |
| struct xfs_bmbt_irec map; |
| xfs_fileoff_t end = off + len; |
| int error; |
| |
| ASSERT(sc->tempip != NULL); |
| ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); |
| |
| for (; off < end; off = map.br_startoff + map.br_blockcount) { |
| int nmaps = 1; |
| |
| /* |
| * If we have a real extent mapping this block then we're |
| * in ok shape. |
| */ |
| error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, |
| XFS_DATA_FORK); |
| if (error) |
| return error; |
| if (nmaps == 0) { |
| ASSERT(nmaps != 0); |
| return -EFSCORRUPTED; |
| } |
| |
| if (xfs_bmap_is_written_extent(&map)) |
| continue; |
| |
| /* |
| * If we find a delalloc reservation then something is very |
| * very wrong. Bail out. |
| */ |
| if (map.br_startblock == DELAYSTARTBLOCK) |
| return -EFSCORRUPTED; |
| |
| /* |
| * Make sure this block has a real zeroed extent allocated to |
| * it. |
| */ |
| nmaps = 1; |
| error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, |
| XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, |
| &nmaps); |
| if (error) |
| return error; |
| if (nmaps != 1) |
| return -EFSCORRUPTED; |
| |
| trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); |
| |
| /* Commit new extent and all deferred work. */ |
| error = xfs_defer_finish(&sc->tp); |
| if (error) |
| return error; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Write data to each block of a file. The given range of the tempfile's data |
| * fork must already be populated with written extents. |
| */ |
| int |
| xrep_tempfile_copyin( |
| struct xfs_scrub *sc, |
| xfs_fileoff_t off, |
| xfs_filblks_t len, |
| xrep_tempfile_copyin_fn prep_fn, |
| void *data) |
| { |
| LIST_HEAD(buffers_list); |
| struct xfs_mount *mp = sc->mp; |
| struct xfs_buf *bp; |
| xfs_fileoff_t flush_mask; |
| xfs_fileoff_t end = off + len; |
| loff_t pos = XFS_FSB_TO_B(mp, off); |
| int error = 0; |
| |
| ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); |
| |
| /* Flush buffers to disk every 512K */ |
| flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; |
| |
| for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { |
| struct xfs_bmbt_irec map; |
| int nmaps = 1; |
| |
| /* Read block mapping for this file block. */ |
| error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); |
| if (error) |
| goto out_err; |
| if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { |
| error = -EFSCORRUPTED; |
| goto out_err; |
| } |
| |
| /* Get the metadata buffer for this offset in the file. */ |
| error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, |
| XFS_FSB_TO_DADDR(mp, map.br_startblock), |
| mp->m_bsize, 0, &bp); |
| if (error) |
| goto out_err; |
| |
| trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); |
| |
| /* Read in a block's worth of data from the xfile. */ |
| error = prep_fn(sc, bp, data); |
| if (error) { |
| xfs_trans_brelse(sc->tp, bp); |
| goto out_err; |
| } |
| |
| /* Queue buffer, and flush if we have too much dirty data. */ |
| xfs_buf_delwri_queue_here(bp, &buffers_list); |
| xfs_trans_brelse(sc->tp, bp); |
| |
| if (!(off & flush_mask)) { |
| error = xfs_buf_delwri_submit(&buffers_list); |
| if (error) |
| goto out_err; |
| } |
| } |
| |
| /* |
| * Write the new blocks to disk. If the delwri list isn't empty after |
| * that, then something went wrong and we have to fail. This should |
| * never happen, but we'll check anyway. |
| */ |
| error = xfs_buf_delwri_submit(&buffers_list); |
| if (error) |
| goto out_err; |
| |
| if (!list_empty(&buffers_list)) { |
| ASSERT(list_empty(&buffers_list)); |
| error = -EIO; |
| goto out_err; |
| } |
| |
| return 0; |
| |
| out_err: |
| xfs_buf_delwri_cancel(&buffers_list); |
| return error; |
| } |
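| |
| /* |
| * Minimal sketch of a copyin callback, assuming the rebuilt contents have |
| * been staged in a flat in-memory buffer passed through @data. Real |
| * callers typically stream the staged data out of an xfile instead; the |
| * structure and function names here are hypothetical: |
| * |
| *	struct fill_ctx { |
| *		const char	*buf; |
| *		size_t		pos; |
| *	}; |
| * |
| *	STATIC int |
| *	fill_block(struct xfs_scrub *sc, struct xfs_buf *bp, void *data) |
| *	{ |
| *		struct fill_ctx	*fc = data; |
| *		size_t		len = BBTOB(bp->b_length); |
| * |
| *		memcpy(bp->b_addr, fc->buf + fc->pos, len); |
| *		fc->pos += len; |
| *		return 0; |
| *	} |
| */ |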
| |
| /* |
| * Set the temporary file's size. Caller must join the tempfile to the scrub |
| * transaction and is responsible for adjusting block mappings as needed. |
| */ |
| int |
| xrep_tempfile_set_isize( |
| struct xfs_scrub *sc, |
| unsigned long long isize) |
| { |
| if (sc->tempip->i_disk_size == isize) |
| return 0; |
| |
| sc->tempip->i_disk_size = isize; |
| i_size_write(VFS_I(sc->tempip), isize); |
| return xrep_tempfile_roll_trans(sc); |
| } |
| |
| /* |
| * Roll a repair transaction involving the temporary file. Caller must join |
| * both the temporary file and the file being scrubbed to the transaction. |
| * This function returns with both inodes joined to a new scrub transaction, |
| * or the usual negative errno. |
| */ |
| int |
| xrep_tempfile_roll_trans( |
| struct xfs_scrub *sc) |
| { |
| int error; |
| |
| xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); |
| error = xrep_roll_trans(sc); |
| if (error) |
| return error; |
| |
| xfs_trans_ijoin(sc->tp, sc->tempip, 0); |
| return 0; |
| } |
| |
| /* Enable file content exchanges. */ |
| int |
| xrep_tempexch_enable( |
| struct xfs_scrub *sc) |
| { |
| if (sc->flags & XREP_FSGATES_EXCHANGE_RANGE) |
| return 0; |
| |
| if (!xfs_has_exchange_range(sc->mp)) |
| return -EOPNOTSUPP; |
| |
| trace_xchk_fsgates_enable(sc, XREP_FSGATES_EXCHANGE_RANGE); |
| |
| sc->flags |= XREP_FSGATES_EXCHANGE_RANGE; |
| return 0; |
| } |
| |
| /* |
| * Fill out the mapping exchange request in preparation for atomically |
| * committing the contents of a metadata file that we've rebuilt in the temp |
| * file. |
| */ |
| STATIC int |
| xrep_tempexch_prep_request( |
| struct xfs_scrub *sc, |
| int whichfork, |
| struct xrep_tempexch *tx) |
| { |
| struct xfs_exchmaps_req *req = &tx->req; |
| |
| memset(tx, 0, sizeof(struct xrep_tempexch)); |
| |
| /* COW forks don't exist on disk. */ |
| if (whichfork == XFS_COW_FORK) { |
| ASSERT(0); |
| return -EINVAL; |
| } |
| |
| /* Both files should have the relevant forks. */ |
| if (!xfs_ifork_ptr(sc->ip, whichfork) || |
| !xfs_ifork_ptr(sc->tempip, whichfork)) { |
| ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); |
| ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL); |
| return -EINVAL; |
| } |
| |
| /* Exchange all mappings in both forks. */ |
| req->ip1 = sc->tempip; |
| req->ip2 = sc->ip; |
| req->startoff1 = 0; |
| req->startoff2 = 0; |
| switch (whichfork) { |
| case XFS_ATTR_FORK: |
| req->flags |= XFS_EXCHMAPS_ATTR_FORK; |
| break; |
| case XFS_DATA_FORK: |
| /* Always exchange sizes when exchanging data fork mappings. */ |
| req->flags |= XFS_EXCHMAPS_SET_SIZES; |
| break; |
| } |
| req->blockcount = XFS_MAX_FILEOFF; |
| |
| return 0; |
| } |
| |
| /* |
| * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip |
| * this if quota enforcement is disabled or if both inodes' dquots are the |
| * same. |
| */ |
| STATIC int |
| xrep_tempexch_reserve_quota( |
| struct xfs_scrub *sc, |
| const struct xrep_tempexch *tx) |
| { |
| struct xfs_trans *tp = sc->tp; |
| const struct xfs_exchmaps_req *req = &tx->req; |
| int64_t ddelta, rdelta; |
| int error; |
| |
| /* |
| * Don't bother with a quota reservation if we're not enforcing them |
| * or the two inodes have the same dquots. |
| */ |
| if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || |
| (req->ip1->i_udquot == req->ip2->i_udquot && |
| req->ip1->i_gdquot == req->ip2->i_gdquot && |
| req->ip1->i_pdquot == req->ip2->i_pdquot)) |
| return 0; |
| |
| /* |
| * Quota reservation for each file comes from two sources. First, we |
| * need to account for any net gain in mapped blocks during the |
| * exchange. Second, we need reservation for the gross gain in mapped |
| * blocks so that we don't trip over any quota block reservation |
| * assertions. We must reserve the gross gain because the quota code |
| * subtracts from bcount the number of blocks that we unmap; it does |
| * not add that quantity back to the quota block reservation. |
| */ |
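| /* |
| * Worked example with made-up numbers: if ip1 currently maps 10 data |
| * blocks and ip2 maps 25, the exchange is a net gain of 15 blocks for |
| * ip1, so we reserve 15 + 10 = 25 blocks against ip1's dquots; the |
| * extra 10 covers the blocks that the quota code deducts from the |
| * reservation when ip1's old mappings are removed. Likewise we reserve |
| * 0 + 25 = 25 blocks against ip2's dquots. |
| */ |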
| ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); |
| rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); |
| error = xfs_trans_reserve_quota_nblks(tp, req->ip1, |
| ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, |
| true); |
| if (error) |
| return error; |
| |
| ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); |
| rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); |
| return xfs_trans_reserve_quota_nblks(tp, req->ip2, |
| ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, |
| true); |
| } |
| |
| /* |
| * Prepare an existing transaction for an atomic file contents exchange. |
| * |
| * This function fills out the mapping exchange request and resource estimation |
| * structures in preparation for exchanging the contents of a metadata file |
| * that has been rebuilt in the temp file. Next, it reserves space and quota |
| * for the transaction. |
| * |
| * The caller must hold ILOCK_EXCL of the scrub target file and the temporary |
| * file. The caller must join both inodes to the transaction with no unlock |
| * flags, and is responsible for dropping both ILOCKs when appropriate. Only |
| * use this when those ILOCKs cannot be dropped. |
| */ |
| int |
| xrep_tempexch_trans_reserve( |
| struct xfs_scrub *sc, |
| int whichfork, |
| struct xrep_tempexch *tx) |
| { |
| int error; |
| |
| ASSERT(sc->tp != NULL); |
| xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); |
| xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); |
| |
| error = xrep_tempexch_prep_request(sc, whichfork, tx); |
| if (error) |
| return error; |
| |
| error = xfs_exchmaps_estimate(&tx->req); |
| if (error) |
| return error; |
| |
| error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0); |
| if (error) |
| return error; |
| |
| return xrep_tempexch_reserve_quota(sc, tx); |
| } |
| |
| /* |
| * Exchange file mappings (and hence file contents) between the file being |
| * repaired and the temporary file. Returns with both inodes locked and joined |
| * to a clean scrub transaction. |
| */ |
| int |
| xrep_tempexch_contents( |
| struct xfs_scrub *sc, |
| struct xrep_tempexch *tx) |
| { |
| int error; |
| |
| ASSERT(sc->flags & XREP_FSGATES_EXCHANGE_RANGE); |
| |
| xfs_exchange_mappings(sc->tp, &tx->req); |
| error = xfs_defer_finish(&sc->tp); |
| if (error) |
| return error; |
| |
| /* |
| * If we exchanged the ondisk sizes of two metadata files, we must |
| * exchange the incore sizes as well. |
| */ |
| if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) { |
| loff_t temp; |
| |
| temp = i_size_read(VFS_I(sc->ip)); |
| i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); |
| i_size_write(VFS_I(sc->tempip), temp); |
| } |
| |
| return 0; |
| } |
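| |
| /* |
| * Illustrative sketch only: once a rebuild has finished, the exchange |
| * helpers above are meant to be used as a pair. Assuming the scrub |
| * transaction already has both inodes joined and ILOCKed, a caller might |
| * do something like this (names and error handling are placeholders): |
| * |
| *	struct xrep_tempexch	tx; |
| * |
| *	error = xrep_tempexch_enable(sc); |
| *	if (error) |
| *		return error; |
| *	error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &tx); |
| *	if (error) |
| *		return error; |
| *	return xrep_tempexch_contents(sc, &tx); |
| */ |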