blob: 7791336ca82092517ef5ff394fb416b8c0574b6b [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_quota.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_dir2.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
#include "scrub/trace.h"
#include "scrub/tempfile.h"
#include "scrub/tempexch.h"
#include "scrub/xfile.h"
/*
 * Create a temporary file for reconstructing metadata, with the intention of
 * atomically exchanging the temporary file's contents with the file that's
 * being repaired.
 */
int
xrep_tempfile_create(
	struct xfs_scrub	*sc,
	uint16_t		mode)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = NULL;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	struct xfs_inode	*dp = mp->m_rootip;
	xfs_ino_t		ino;
	unsigned int		resblks;
	bool			is_dir = S_ISDIR(mode);
	int			error;

	/* No point creating a scratch file on a dead or readonly fs. */
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_is_readonly(mp))
		return -EROFS;

	ASSERT(sc->tp == NULL);
	ASSERT(sc->tempip == NULL);

	/*
	 * Make sure that we have allocated dquot(s) on disk.  The temporary
	 * inode should be completely root owned so that we don't fail due to
	 * quota limits.
	 */
	error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
	if (error)
		return error;

	/* Pick the space/log reservation that matches the new file type. */
	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, 0);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_IALLOC_SPACE_RES(mp);
		tres = &M_RES(mp)->tr_create_tmpfile;
	}

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	/* Allocate inode, set up directory. */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (error)
		goto out_trans_cancel;
	error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
			0, false, &sc->tempip);
	if (error)
		goto out_trans_cancel;

	/*
	 * Change the ownership of the inode to root.  Also clear the realtime
	 * flags so the tempfile allocates from the data device.
	 */
	VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
	VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);

	/*
	 * Mark our temporary file as private so that LSMs and the ACL code
	 * don't try to add their own metadata or reason about these files.
	 * The file should never be exposed to userspace.
	 */
	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;

	if (is_dir) {
		error = xfs_dir_init(tp, sc->tempip, dp);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * The ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);

	/*
	 * Put our temp file on the unlinked list so it's purged automatically.
	 * All file-based metadata being reconstructed using this file must be
	 * atomically exchanged with the original file because the contents
	 * here will be purged when the inode is dropped or log recovery cleans
	 * out the unlinked list.
	 */
	error = xfs_iunlink(tp, sc->tempip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	trace_xrep_tempfile_create(sc);

	/* The inode now holds dquot references; drop our own. */
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	/* Finish setting up the incore / vfs context. */
	xfs_setup_iops(sc->tempip);
	xfs_finish_inode_setup(sc->tempip);

	sc->temp_ilock_flags = 0;
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (sc->tempip) {
		xfs_finish_inode_setup(sc->tempip);
		xchk_irele(sc, sc->tempip);
	}
out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);
	return error;
}
/* Take IOLOCK_EXCL on the temporary file, maybe. */
bool
xrep_tempfile_iolock_nowait(
	struct xfs_scrub	*sc)
{
	if (!xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL))
		return false;

	/* Record the lock so that teardown knows to release it. */
	sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
	return true;
}
/*
 * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
 * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
 * to avoid deadlocks and lockdep complaints.
 */
int
xrep_tempfile_iolock_polled(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	for (;;) {
		/* Keep trying until we get the lock or scrub is cancelled. */
		if (xrep_tempfile_iolock_nowait(sc))
			return 0;
		if (xchk_should_terminate(sc, &error))
			return error;
		delay(1);
	}
}
/* Release IOLOCK_EXCL on the temporary file. */
void
xrep_tempfile_iounlock(
	struct xfs_scrub	*sc)
{
	/* Forget the lock state, then drop the lock itself. */
	sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
	xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
}
/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
void
xrep_tempfile_ilock(
	struct xfs_scrub	*sc)
{
	/*
	 * NOTE(review): the flag is recorded before the blocking lock call,
	 * the opposite order from the _nowait variants above.  Presumably
	 * this is fine because xfs_ilock() cannot fail, but confirm that the
	 * ordering is intentional.
	 */
	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
	xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
}
/* Try to grab ILOCK_EXCL on the temporary file. */
bool
xrep_tempfile_ilock_nowait(
	struct xfs_scrub	*sc)
{
	if (!xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL))
		return false;

	/* Record the lock so that teardown knows to release it. */
	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
	return true;
}
/* Unlock ILOCK_EXCL on the temporary file after an update. */
void
xrep_tempfile_iunlock(
	struct xfs_scrub	*sc)
{
	/* Forget the lock state, then drop the lock itself. */
	sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
	xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
}
/* Release the temporary file. */
void
xrep_tempfile_rele(
	struct xfs_scrub	*sc)
{
	unsigned int		lockmode;

	if (sc->tempip == NULL)
		return;

	/* Drop whatever locks we still hold before letting go of the inode. */
	lockmode = sc->temp_ilock_flags;
	sc->temp_ilock_flags = 0;
	if (lockmode)
		xfs_iunlock(sc->tempip, lockmode);

	xchk_irele(sc, sc->tempip);
	sc->tempip = NULL;
}
/*
 * Make sure that the given range of the data fork of the temporary file is
 * mapped to written blocks.  The caller must ensure that both inodes are
 * joined to the transaction.
 */
int
xrep_tempfile_prealloc(
	struct xfs_scrub	*sc,
	xfs_fileoff_t		off,
	xfs_filblks_t		len)
{
	struct xfs_bmbt_irec	map;
	xfs_fileoff_t		end = off + len;
	int			error;

	ASSERT(sc->tempip != NULL);
	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));

	/* Walk the range one mapping at a time. */
	for (; off < end; off = map.br_startoff + map.br_blockcount) {
		int		nmaps = 1;

		/*
		 * If we have a real extent mapping this block then we're
		 * in ok shape.
		 */
		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
				XFS_DATA_FORK);
		if (error)
			return error;
		/* bmapi_read should always return at least one mapping. */
		if (nmaps == 0) {
			ASSERT(nmaps != 0);
			return -EFSCORRUPTED;
		}
		if (xfs_bmap_is_written_extent(&map))
			continue;

		/*
		 * If we find a delalloc reservation then something is very
		 * very wrong.  Bail out.
		 */
		if (map.br_startblock == DELAYSTARTBLOCK)
			return -EFSCORRUPTED;

		/*
		 * Make sure this block has a real zeroed extent allocated to
		 * it.
		 */
		nmaps = 1;
		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
				&nmaps);
		if (error)
			return error;
		if (nmaps != 1)
			return -EFSCORRUPTED;

		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);

		/* Commit new extent and all deferred work. */
		error = xfs_defer_finish(&sc->tp);
		if (error)
			return error;
	}
	return 0;
}
/*
 * Write data to each block of a file.  The given range of the tempfile's data
 * fork must already be populated with written extents.
 */
int
xrep_tempfile_copyin(
	struct xfs_scrub	*sc,
	xfs_fileoff_t		off,
	xfs_filblks_t		len,
	xrep_tempfile_copyin_fn	prep_fn,
	void			*data)
{
	LIST_HEAD(buffers_list);
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*bp;
	xfs_fileoff_t		flush_mask;
	xfs_fileoff_t		end = off + len;
	/*
	 * NOTE(review): pos is advanced each iteration but never read in
	 * this function -- possibly vestigial; confirm before removing.
	 */
	loff_t			pos = XFS_FSB_TO_B(mp, off);
	int			error = 0;

	/* Only regular files can be filled block by block like this. */
	ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));

	/* Flush buffers to disk every 512K */
	flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;

	for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
		struct xfs_bmbt_irec	map;
		int			nmaps = 1;

		/* Read block mapping for this file block. */
		error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
		if (error)
			goto out_err;
		/* Preallocation should have left only written extents here. */
		if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
			error = -EFSCORRUPTED;
			goto out_err;
		}

		/* Get the metadata buffer for this offset in the file. */
		error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
				XFS_FSB_TO_DADDR(mp, map.br_startblock),
				mp->m_bsize, 0, &bp);
		if (error)
			goto out_err;

		trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);

		/* Read in a block's worth of data from the xfile. */
		error = prep_fn(sc, bp, data);
		if (error) {
			xfs_trans_brelse(sc->tp, bp);
			goto out_err;
		}

		/* Queue buffer, and flush if we have too much dirty data. */
		xfs_buf_delwri_queue_here(bp, &buffers_list);
		xfs_trans_brelse(sc->tp, bp);
		if (!(off & flush_mask)) {
			error = xfs_buf_delwri_submit(&buffers_list);
			if (error)
				goto out_err;
		}
	}

	/*
	 * Write the new blocks to disk.  If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail.  This should
	 * never happen, but we'll check anyway.
	 */
	error = xfs_buf_delwri_submit(&buffers_list);
	if (error)
		goto out_err;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		error = -EIO;
		goto out_err;
	}
	return 0;

out_err:
	/* Drop any buffers still sitting on the delwri list. */
	xfs_buf_delwri_cancel(&buffers_list);
	return error;
}
/*
 * Set the temporary file's size.  Caller must join the tempfile to the scrub
 * transaction and is responsible for adjusting block mappings as needed.
 */
int
xrep_tempfile_set_isize(
	struct xfs_scrub	*sc,
	unsigned long long	isize)
{
	/* Only update and roll the transaction if the size actually changes. */
	if (sc->tempip->i_disk_size != isize) {
		sc->tempip->i_disk_size = isize;
		i_size_write(VFS_I(sc->tempip), isize);
		return xrep_tempfile_roll_trans(sc);
	}
	return 0;
}
/*
 * Roll a repair transaction involving the temporary file.  Caller must join
 * both the temporary file and the file being scrubbed to the transaction.
 * This function returns with both inodes joined to a new scrub transaction,
 * or the usual negative errno.
 */
int
xrep_tempfile_roll_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/* Log the tempfile core so that the roll commits our changes. */
	xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
	error = xrep_roll_trans(sc);
	if (!error)
		xfs_trans_ijoin(sc->tp, sc->tempip, 0);
	return error;
}
/* Enable file content exchanges. */
int
xrep_tempexch_enable(
	struct xfs_scrub	*sc)
{
	/* Nothing to do if the fsgate is already open. */
	if (!(sc->flags & XREP_FSGATES_EXCHANGE_RANGE)) {
		if (!xfs_has_exchange_range(sc->mp))
			return -EOPNOTSUPP;

		trace_xchk_fsgates_enable(sc, XREP_FSGATES_EXCHANGE_RANGE);
		sc->flags |= XREP_FSGATES_EXCHANGE_RANGE;
	}
	return 0;
}
/*
 * Fill out the mapping exchange request in preparation for atomically
 * committing the contents of a metadata file that we've rebuilt in the temp
 * file.
 */
STATIC int
xrep_tempexch_prep_request(
	struct xfs_scrub	*sc,
	int			whichfork,
	struct xrep_tempexch	*tx)
{
	struct xfs_exchmaps_req	*req;

	memset(tx, 0, sizeof(*tx));
	req = &tx->req;

	/* COW forks don't exist on disk. */
	if (whichfork == XFS_COW_FORK) {
		ASSERT(0);
		return -EINVAL;
	}

	/* Both files should have the relevant forks. */
	if (xfs_ifork_ptr(sc->ip, whichfork) == NULL ||
	    xfs_ifork_ptr(sc->tempip, whichfork) == NULL) {
		ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
		ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
		return -EINVAL;
	}

	/* Exchange all mappings in both forks. */
	req->ip1 = sc->tempip;
	req->ip2 = sc->ip;
	req->startoff1 = 0;
	req->startoff2 = 0;
	req->blockcount = XFS_MAX_FILEOFF;

	if (whichfork == XFS_ATTR_FORK) {
		req->flags |= XFS_EXCHMAPS_ATTR_FORK;
	} else if (whichfork == XFS_DATA_FORK) {
		/* Always exchange sizes when exchanging data fork mappings. */
		req->flags |= XFS_EXCHMAPS_SET_SIZES;
	}
	return 0;
}
/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
 * this if quota enforcement is disabled or if both inodes' dquots are the
 * same.  The qretry structure must be initialized to zeroes before the first
 * call to this function.
 *
 * NOTE(review): no "qretry" structure appears in this function or its
 * signature; the sentence above looks stale -- confirm against the callers.
 */
STATIC int
xrep_tempexch_reserve_quota(
	struct xfs_scrub	*sc,
	const struct xrep_tempexch	*tx)
{
	struct xfs_trans	*tp = sc->tp;
	const struct xfs_exchmaps_req	*req = &tx->req;
	int64_t			ddelta, rdelta;
	int			error;

	/*
	 * Don't bother with a quota reservation if we're not enforcing them
	 * or the two inodes have the same dquots.
	 */
	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
	    (req->ip1->i_udquot == req->ip2->i_udquot &&
	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
	     req->ip1->i_pdquot == req->ip2->i_pdquot))
		return 0;

	/*
	 * Quota reservation for each file comes from two sources.  First, we
	 * need to account for any net gain in mapped blocks during the
	 * exchange.  Second, we need reservation for the gross gain in mapped
	 * blocks so that we don't trip over any quota block reservation
	 * assertions.  We must reserve the gross gain because the quota code
	 * subtracts from bcount the number of blocks that we unmap; it does
	 * not add that quantity back to the quota block reservation.
	 */
	/* ip1's reservation: net gain from ip2, plus ip1's own gross count. */
	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
			true);
	if (error)
		return error;

	/* And the mirror-image reservation for ip2. */
	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
			true);
}
/*
* Prepare an existing transaction for an atomic file contents exchange.
*
* This function fills out the mapping exchange request and resource estimation
* structures in preparation for exchanging the contents of a metadata file
* that has been rebuilt in the temp file. Next, it reserves space and quota
* for the transaction.
*
* The caller must hold ILOCK_EXCL of the scrub target file and the temporary
* file. The caller must join both inodes to the transaction with no unlock
* flags, and is responsible for dropping both ILOCKs when appropriate. Only
* use this when those ILOCKs cannot be dropped.
*/
int
xrep_tempexch_trans_reserve(
struct xfs_scrub *sc,
int whichfork,
struct xrep_tempexch *tx)
{
int error;
ASSERT(sc->tp != NULL);
xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
error = xrep_tempexch_prep_request(sc, whichfork, tx);
if (error)
return error;
error = xfs_exchmaps_estimate(&tx->req);
if (error)
return error;
error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
if (error)
return error;
return xrep_tempexch_reserve_quota(sc, tx);
}
/*
 * Exchange file mappings (and hence file contents) between the file being
 * repaired and the temporary file.  Returns with both inodes locked and joined
 * to a clean scrub transaction.
 */
int
xrep_tempexch_contents(
	struct xfs_scrub	*sc,
	struct xrep_tempexch	*tx)
{
	int			error;

	/* The exchange-range fsgate must have been enabled by now. */
	ASSERT(sc->flags & XREP_FSGATES_EXCHANGE_RANGE);

	xfs_exchange_mappings(sc->tp, &tx->req);
	/* Commit all deferred work created by the mapping exchange. */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/*
	 * If we exchanged the ondisk sizes of two metadata files, we must
	 * exchange the incore sizes as well.
	 */
	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
		loff_t	temp;

		temp = i_size_read(VFS_I(sc->ip));
		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
		i_size_write(VFS_I(sc->tempip), temp);
	}
	return 0;
}